From 25eaf06ce6c131eec40163d2c10bf6ac05bf01ab Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:15:19 -0500 Subject: [PATCH] Import 2.1.99pre2 --- Documentation/00-INDEX | 18 +- Documentation/ARM-README | 8 +- Documentation/IO-APIC.txt | 6 +- Documentation/binfmt_misc.txt | 6 +- Documentation/cdrom/aztcd | 18 +- Documentation/cdrom/cdrom-standard.tex | 58 +- Documentation/cdrom/cdu31a | 4 +- Documentation/cdrom/cm206 | 24 +- Documentation/cdrom/gscd | 6 +- Documentation/cdrom/ide-cd | 12 +- Documentation/cdrom/isp16 | 2 +- Documentation/cdrom/mcdx | 2 +- Documentation/cdrom/sbpcd | 38 +- Documentation/cdrom/sjcd | 6 +- Documentation/cdrom/sonycd535 | 2 +- Documentation/devices.tex | 14 +- Documentation/digiboard.txt | 29 +- Documentation/exception.txt | 10 +- Documentation/filesystems/00-INDEX | 9 + Documentation/filesystems/affs.txt | 6 +- Documentation/filesystems/coda.txt | 72 +- Documentation/filesystems/fat_cvf.txt | 12 +- Documentation/filesystems/ntfs.txt | 4 +- Documentation/filesystems/romfs.txt | 4 +- Documentation/filesystems/umsdos.txt | 34 +- Documentation/filesystems/vfat.txt | 14 +- Documentation/filesystems/vfs.txt | 6 +- Documentation/ftape.txt | 6 +- Documentation/hayes-esp.txt | 2 +- Documentation/ide.txt | 2 +- Documentation/isdn/INTERFACE | 14 +- Documentation/isdn/README | 32 +- Documentation/isdn/README.HiSax | 20 +- Documentation/isdn/README.act2000 | 8 +- Documentation/isdn/README.audio | 6 +- Documentation/isdn/README.avmb1 | 2 +- Documentation/isdn/README.concap | 68 +- Documentation/isdn/README.icn | 10 +- Documentation/isdn/README.pcbit | 14 +- Documentation/isdn/README.sc | 111 +- Documentation/isdn/README.x25 | 10 +- Documentation/isdn/syncPPP.FAQ | 16 +- Documentation/java.txt | 2 +- Documentation/joystick.txt | 8 +- Documentation/locks.txt | 2 +- Documentation/m68k/framebuffer.txt | 13 +- Documentation/m68k/kernel-options.txt | 22 +- Documentation/mandatory.txt | 10 +- Documentation/mca.txt | 8 +- 
Documentation/networking/00-INDEX | 53 +- Documentation/networking/6pack.txt | 32 +- Documentation/networking/DLINK.txt | 4 +- Documentation/networking/PLIP.txt | 13 +- Documentation/networking/alias.txt | 4 +- Documentation/networking/baycom.txt | 6 +- Documentation/networking/cops.txt | 12 +- Documentation/networking/cs89x0.txt | 20 +- Documentation/networking/de4x5.txt | 4 +- Documentation/networking/eql.txt | 12 +- Documentation/networking/ethertap.txt | 10 +- Documentation/networking/filter.txt | 10 +- Documentation/networking/ip-sysctl.txt | 4 +- Documentation/networking/ipddp.txt | 42 +- Documentation/networking/lapb-module.txt | 4 +- Documentation/networking/ltpc.txt | 12 +- Documentation/networking/multicast.txt | 2 +- Documentation/networking/net-modules.txt | 16 +- Documentation/networking/policy-routing.txt | 2 +- Documentation/networking/pt.txt | 2 +- Documentation/networking/routing.txt | 8 +- Documentation/networking/shaper.txt | 4 +- Documentation/networking/soundmodem.txt | 4 +- Documentation/networking/wan-router.txt | 32 +- Documentation/networking/wanpipe.txt | 23 +- Documentation/networking/wavelan.txt | 25 +- Documentation/networking/x25-iface.txt | 4 +- Documentation/networking/x25.txt | 2 +- Documentation/networking/z8530drv.txt | 16 +- Documentation/nfsroot.txt | 6 +- Documentation/oops-tracing.txt | 2 +- Documentation/paride.txt | 4 +- Documentation/pci.txt | 6 +- Documentation/powerpc/00-INDEX | 2 +- Documentation/ramdisk.txt | 8 +- Documentation/riscom8.txt | 6 +- Documentation/scsi.txt | 2 +- Documentation/smp | 4 +- Documentation/smp.tex | 20 +- Documentation/sound/AWE32 | 2 +- Documentation/sound/Opti | 14 +- Documentation/sound/Soundblaster | 4 +- Documentation/sound/mwave | 6 +- Documentation/sound/ultrasound | 4 +- Documentation/specialix.txt | 10 +- Documentation/spinlocks.txt | 4 +- Documentation/stallion.txt | 8 +- Documentation/svga.txt | 16 +- Documentation/sysctl/kernel.txt | 10 +- Documentation/sysctl/vm.txt | 47 +- 
Documentation/transname.txt | 32 +- Documentation/unicode.txt | 4 +- arch/alpha/math-emu/ieee-math.c | 30 +- arch/i386/kernel/irq.c | 2 +- arch/i386/kernel/smp.c | 8 +- arch/i386/mm/init.c | 8 +- arch/sparc/mm/sun4c.c | 1 + drivers/block/ide.c | 3 - drivers/macintosh/imstt.c | 2 - drivers/scsi/scsi.c | 1 + fs/ext2/namei.c | 1 + include/linux/if_ec.h | 47 + include/linux/netdevice.h | 1 + include/linux/netlink.h | 2 +- include/linux/pkt_cls.h | 117 ++ include/linux/pkt_sched.h | 280 ++- include/linux/rtnetlink.h | 26 +- include/linux/socket.h | 14 +- include/linux/sysctl.h | 1 - include/linux/wanrouter.h | 8 +- include/net/dst.h | 4 + include/net/ip.h | 13 +- include/net/ip_fib.h | 7 +- include/net/ip_masq.h | 5 - include/net/pkt_cls.h | 83 + include/net/pkt_sched.h | 237 ++- include/net/snmp.h | 1 + include/net/sock.h | 2 +- include/net/tcp.h | 56 +- mm/page_alloc.c | 16 +- mm/vmscan.c | 37 +- net/Config.in | 5 + net/bridge/Makefile | 2 +- net/core/Makefile | 5 +- net/core/dev.c | 2 + net/core/rtnetlink.c | 30 +- net/core/sock.c | 2 + net/econet/Makefile | 23 + net/econet/econet.c | 1108 +++++++++++ net/ethernet/Makefile | 2 +- net/ipv4/af_inet.c | 7 +- net/ipv4/fib_rules.c | 25 +- net/ipv4/fib_semantics.c | 28 +- net/ipv4/ip_output.c | 384 ++-- net/ipv4/ipmr.c | 5 +- net/ipv4/proc.c | 19 +- net/ipv4/route.c | 24 +- net/ipv4/tcp.c | 31 +- net/ipv4/tcp_input.c | 158 +- net/ipv4/tcp_ipv4.c | 141 +- net/ipv4/tcp_output.c | 52 +- net/ipv4/tcp_timer.c | 121 +- net/ipv6/ip6_fib.c | 13 +- net/ipv6/ip6_output.c | 3 +- net/ipv6/proc.c | 12 +- net/ipv6/route.c | 10 +- net/ipv6/tcp_ipv6.c | 240 +-- net/ipx/af_ipx.c | 14 +- net/netsyms.c | 46 +- net/sched/Config.in | 31 +- net/sched/Makefile | 83 +- net/sched/cls_api.c | 432 +++++ net/sched/cls_fw.c | 97 + net/sched/cls_route.c | 99 + net/sched/cls_rsvp.c | 42 + net/sched/cls_rsvp.h | 670 +++++++ net/sched/cls_rsvp6.c | 43 + net/sched/cls_u32.c | 704 +++++++ net/sched/estimator.c | 184 ++ net/sched/police.c | 196 ++ 
net/sched/sch_api.c | 994 ++++++++++ net/sched/sch_cbq.c | 1911 +++++++++++++++---- net/sched/sch_csz.c | 402 +++- net/sched/sch_fifo.c | 199 +- net/sched/sch_generic.c | 496 ++--- net/sched/sch_prio.c | 281 ++- net/sched/sch_red.c | 275 ++- net/sched/sch_sfq.c | 247 ++- net/sched/sch_tbf.c | 318 ++- net/sched/sch_teql.c | 475 +++++ net/unix/Makefile | 2 +- net/wanrouter/Makefile | 3 +- net/wanrouter/wanmain.c | 19 +- net/x25/af_x25.c | 5 +- 183 files changed, 10629 insertions(+), 2517 deletions(-) create mode 100644 include/linux/if_ec.h create mode 100644 include/linux/pkt_cls.h create mode 100644 include/net/pkt_cls.h create mode 100644 net/econet/Makefile create mode 100644 net/econet/econet.c create mode 100644 net/sched/cls_api.c create mode 100644 net/sched/cls_fw.c create mode 100644 net/sched/cls_route.c create mode 100644 net/sched/cls_rsvp.c create mode 100644 net/sched/cls_rsvp.h create mode 100644 net/sched/cls_rsvp6.c create mode 100644 net/sched/cls_u32.c create mode 100644 net/sched/estimator.c create mode 100644 net/sched/police.c create mode 100644 net/sched/sch_api.c create mode 100644 net/sched/sch_teql.c diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 393057142ea3..fd986e95ca93 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -68,12 +68,12 @@ magic-number.txt - list of magic numbers used to mark/protect kernel data structures. mandatory.txt - info on the linux implementation of Sys V mandatory file locking. +mca.txt + - info on supporting Micro Channel Architecture (e.g. PS/2) systems. md.txt - info on boot arguments for the multiple devices driver memory.txt - info on typical Linux memory problems. -mca.txt - - info on supporting Micro Channel Architecture (e.g. PS/2) systems. 
modules.txt - short guide on how to make kernel parts into loadable modules nbd.txt @@ -84,12 +84,14 @@ nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem oops-tracing.txt - how to decode those nasty internal kernel error dump messages. -pcwd-watchdog.txt - - info and sample code for using with the PC Watchdog reset card. paride.txt - information about the parallel port IDE subsystem. parport.txt - how to use the parallel-port driver. +pci.txt + - info on the PCI subsystem for device driver authors +pcwd-watchdog.txt + - info and sample code for using with the PC Watchdog reset card. powerpc/ - directory with info on using linux with the PowerPC. ramdisk.txt @@ -104,8 +106,12 @@ serial-console.txt - how to set up linux with a serial line console as the default. smart-config.txt - description of the Smart Config makefile feature. +smp + - how to setup the kernel for SMP smp.tex - TeX document describing implementation of Multiprocessor Linux +sound/ + - directory with info on sound card support specialix.txt - info on hardware/driver for specialix IO8+ multiport serial card. spinlocks.txt @@ -114,6 +120,10 @@ stallion.txt - info on using the Stallion multiport serial driver. svga.txt - short guide on selecting video modes at boot via VGA BIOS. +sysctl/ + - directory with info on the /proc/sys/* files +sysrq.txt + - info on the magic SysRq key transname.txt - how to use name translation to ease use of diskless systems. unicode.txt diff --git a/Documentation/ARM-README b/Documentation/ARM-README index 0524ba63e5a1..2ffd1113c03a 100644 --- a/Documentation/ARM-README +++ b/Documentation/ARM-README @@ -89,12 +89,12 @@ Bug reports etc to linux@arm.uk.linux.org. Patches will not be included into future kernels unless they come to me (or the relevant person concerned). - When sending bug reports, please ensure that they contain all relevent + When sending bug reports, please ensure that they contain all relevant information, eg. 
the kernel messages that were printed before/during the problem, what you were doing, etc. - For patches, please include some explaination as to what the patch does - and why (if relevent). + For patches, please include some explanation as to what the patch does + and why (if relevant). Modules ------- @@ -153,7 +153,7 @@ ST506 hard drives big external 5.25" FH 64MB drive (who could ever want more :-) ). I've just got 240K/s off it (a dd with bs=128k); thats about half of what - RiscOS gets; but its a heck of a lot better than the 50K/s I was getting + RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting last week :-) Known bug: Drive data errors can cause a hang; including cases where diff --git a/Documentation/IO-APIC.txt b/Documentation/IO-APIC.txt index 84ab57c19981..13565b69c6fd 100644 --- a/Documentation/IO-APIC.txt +++ b/Documentation/IO-APIC.txt @@ -43,7 +43,7 @@ running: A) if your board is unlisted, then mail to linux-smp to get it into either the white or the blacklist B) if your board is blacklisted, then figure out the apropriate - pirq= option to get your system boot + pirq= option to get your system to boot pirq= lines look like the following in /etc/lilo.conf: @@ -107,9 +107,9 @@ card (IRQ11) in Slot3, and have Slot1 empty: slots.] generally, it's always possible to find out the correct pirq= settings, just -permutate all IRQ numbers properly ... it will take some time though. An +permute all IRQ numbers properly ... it will take some time though. An 'incorrect' pirq line will cause the booting process to hang, or a device -wont function properly (if it's inserted as eg. a module). +won't function properly (if it's inserted as eg. a module). If you have 2 PCI buses, then you can use up to 8 pirq values. 
Although such boards tend to have a good configuration and will be included in the diff --git a/Documentation/binfmt_misc.txt b/Documentation/binfmt_misc.txt index 6a8b87aff9e1..1f405c589e15 100644 --- a/Documentation/binfmt_misc.txt +++ b/Documentation/binfmt_misc.txt @@ -1,8 +1,8 @@ Kernel Support for miscellaneous (your favourite) Binary Formats v1.1 ===================================================================== -This Kernel feature allows to invoke almost (for restrictions see below) every -program by simply typing its name in the shell. +This Kernel feature allows you to invoke almost (for restrictions see below) +every program by simply typing its name in the shell. This includes for example compiled Java(TM), Python or Emacs programs. To achieve this you must tell binfmt_misc which interpreter has to be invoked @@ -34,7 +34,7 @@ Here is what the fields mean: There are some restrictions: - the whole register string may not exceed 255 characters - - the magic must resist in the first 128 bytes of the file, i.e. + - the magic must reside in the first 128 bytes of the file, i.e. offset+size(magic) has to be less than 128 - the interpreter string may not exceed 127 characters diff --git a/Documentation/cdrom/aztcd b/Documentation/cdrom/aztcd index 0056ce9143c3..fa3081cfb16d 100644 --- a/Documentation/cdrom/aztcd +++ b/Documentation/cdrom/aztcd @@ -202,7 +202,7 @@ configured and mail me (see 6.) the appropriate information. 5.1 MULTISESSION SUPPORT Multisession support for CD's still is a myth. I implemented and tested a basic support for multisession and XA CDs, but I still have not enough CDs and appli- -cations to test it rigourously. So if you'd like to help me, please contact me +cations to test it rigorously. So if you'd like to help me, please contact me (Email address see below). As of version 1.4 and newer you can enable the multisession support in aztcd.h by setting AZT_MULTISESSION to 1. 
Doing so will cause the ISO9660-filesystem to deal with multisession CDs, ie. redirect @@ -375,7 +375,7 @@ If this still does not help, the finite state machine in azt_poll(). The most important are the status messages, look how they are defined and try to understand, if they make sense in the context where they appear. With a CD-ROM inserted the status - should always be 8, except in aztcd_open(). Try to open the tray, insert a + should always be 8, except in aztcd_open(). Try to open the tray, insert an audio disk, insert no disk or reinsert the CD-ROM and check, if the status bits change accordingly. The status bits are the most likely point, where the drive manufacturers may implement changes. @@ -400,7 +400,7 @@ following: that the ACMD_SOFT_RESET is issued in any case, by substituting the if-statement 'if ( ...=AFL_OP_OK)' by 'if (1)'. -If you succeed, please mail may the exact version string of your drive and +If you succeed, please mail me the exact version string of your drive and the code modifications, you have made together with a short explanation. If you don't succeed, you may mail me the output of the debugging messages. But remember, they are only useful, if they are exact and complete and you @@ -439,13 +439,13 @@ d) I did not get information about changing drive mode. So I doubt, that the code around function azt_poll() case AZT_S_MODE does work. In my test I have not been able to switch to reading in raw mode. For reading raw mode, Aztech uses a different command than for cooked mode, which I only have implemen- -ted in the ioctl-section but not in the section which is used by the ISO9660- +ted in the ioctl-section but not in the section which is used by the ISO9660. The driver was developed on an AST PC with Intel 486/DX2, 8MB RAM, 340MB IDE hard disk and on an AST PC with Intel Pentium 60MHz, 16MB RAM, 520MB IDE running Linux kernel version 1.0.9 from the LST 1.8 Distribution. The kernel was compiled with gcc.2.5.8. 
My CD-ROM drive is an Aztech CDA268-01A. My -drive says, that it has Firmware Version AZT26801A1.3. It came with a ISA-bus +drive says, that it has Firmware Version AZT26801A1.3. It came with an ISA-bus interface card and works with polled I/O without DMA and without interrupts. The code for all other drives was 'remote' tested and debugged by a number of volunteers on the Internet. @@ -508,7 +508,7 @@ You have to set the correct permissions for cdplay *and* for /dev/mcd0 or /dev/aztcd0 in order to use it. Remember, that you should not have /dev/cdrom mounted, when you're playing audio CDs. -This program is just a hack for testing the ioctl-functions in aztcd.c, I will +This program is just a hack for testing the ioctl-functions in aztcd.c. I will not maintain it, so if you run into problems, discard it or have a look into the source code 'cdplay.c'. The program does only contain a minimum of user protection and input error detection. If you use the commands in the wrong @@ -517,11 +517,11 @@ or even hang your machine. If you get STEN_LOW, STEN_LOW_WAIT or segment violati error messages when using cdplay, after that, the system might not be stable any more, so you'd better reboot. As the ioctl-functions run in kernel mode, most normal Linux-multitasking protection features do not work. By using -uninitialized 'wild' pointers etc., it is easy to write to other users data and -program areas, destroy kernel tables etc.. So if you experiment with ioctls +uninitialized 'wild' pointers etc., it is easy to write to other users' data +and program areas, destroy kernel tables etc.. So if you experiment with ioctls as always when you are doing systems programming and kernel hacking, you should have a backup copy of your system in a safe place (and you also -should try before, how to restore from a backup copy)! +should try restoring from a backup copy first)! 
A reworked and improved version called 'cdtester.c', which has yet more features for testing CDROM-drives can be found in diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex index 1db560b158a0..6d6e74bdcd9b 100644 --- a/Documentation/cdrom/cdrom-standard.tex +++ b/Documentation/cdrom/cdrom-standard.tex @@ -45,15 +45,15 @@ presumably \end{itemize} The openness of \linux, and the many different types of available hardware has allowed \linux\ to support many different hardware devices. -Unfortunatly, the very openness that has allowed \linux\ to support +Unfortunately, the very openness that has allowed \linux\ to support all these different devices has also allowed the behavior of each device driver to differ significantly from one device to another. -This divergence of behavior has been the very significant for \cdrom\ +This divergence of behavior has been very significant for \cdrom\ devices; the way a particular drive reacts to a `standard' $ioctl()$ call varies greatly from one device driver to another. To avoid making their drivers totally inconsistent, the writers of \linux\ \cdrom\ drivers generally created new device drivers by understanding, copying, -and then changing an existing one. Unfortunatly, this practice did not +and then changing an existing one. Unfortunately, this practice did not maintain uniform behavior across all the \linux\ \cdrom\ drivers. This document describes an effort to establish Uniform behavior across @@ -85,7 +85,7 @@ which was expressed through \cdromh, it appeared to be a rather wild set of commands and data formats.\footnote{I cannot recollect what kernel version I looked at, then, presumably 1.2.13 and 1.3.34---the latest kernel that I was indirectly involved in.} It seemed that many -features of the software interface had been added to accomodate the +features of the software interface had been added to accommodate the capabilities of a particular drive, in an {\fo ad hoc\/} manner. 
More importantly, it appeared that the behavior of the `standard' commands was different for most of the different drivers: \eg, some drivers @@ -93,7 +93,7 @@ close the tray if an $open()$ call occurs when the tray is open, while others do not. Some drivers lock the door upon opening the device, to prevent an incoherent file system, but others don't, to allow software ejection. Undoubtedly, the capabilities of the different drives vary, -but even when two drives have the same capability their driver's +but even when two drives have the same capability their drivers' behavior was usually different. I decided to start a discussion on how to make all the \linux\ \cdrom\ @@ -109,7 +109,7 @@ hardware will allow). The goal of the \UCD\ is {\em not\/} to alienate driver developers who have not yet taken steps to support this effort. The goal of \UCD\ is -simply is give people writing application programs for \cdrom\ drives +simply to give people writing application programs for \cdrom\ drives {\em one\/} \linux\ \cdrom\ interface with consistent behavior for all \cdrom\ devices. In addition, this also provides a consistent interface between the low-level device driver code and the \linux\ kernel. Care @@ -147,14 +147,14 @@ software-level, that separates the $ioctl()$ and $open()$ implementation from the actual hardware implementation. Note that this effort has made few changes which will effect a user's application programs. The greatest change involved moving the contents of the various low-level -\cdrom\ driver's header files to the kernel's cdrom directory. This was +\cdrom\ drivers' header files to the kernel's cdrom directory. This was done to help ensure that the user is only presented with only one cdrom interface, the interface defined in \cdromh. \cdrom\ drives are specific enough (\ie, different from other block-devices such as floppy or hard disc drives), to define a set of common {\em \cdrom\ device operations}, $_dops$. 
-These operations are different than the classical block-device file +These operations are different from the classical block-device file operations, $_fops$. The routines for the \UCD\ interface level are implemented in the file @@ -267,7 +267,7 @@ difficult in computer programming. Note that most functions have fewer parameters than their $blkdev_fops$ counterparts. This is because very little of the -information in the structures $inode$ and $file$ are used. For most +information in the structures $inode$ and $file$ is used. For most drivers, the main parameter is the $struct$ $cdrom_device_info$, from which the major and minor number can be extracted. (Most low-level \cdrom\ drivers don't even look at the major and minor number though, @@ -291,7 +291,7 @@ struct& cdrom_device_info\ \{ \hidewidth\cr \noalign{\medskip} &int& options : 30;& options flags \cr &long& mc_flags : 2;& media-change buffer flags \cr - & int& use_count;& number of times devices is opened\cr + & int& use_count;& number of times device is opened\cr \}\cr }$$ Using this $struct$, a linked list of the registered minor devices is @@ -312,23 +312,23 @@ registration. A few registers contain variables local to the \cdrom\ drive. The flags $options$ are used to specify how the general \cdrom\ routines should behave. These various flags registers should provide enough -flexibility to adapt to the different user's wishes (and {\em not\/} the +flexibility to adapt to the different users' wishes (and {\em not\/} the `arbitrary' wishes of the author of the low-level device driver, as is the case in the old scheme). The register $mc_flags$ is used to buffer the information from $media_changed()$ to two separate queues. Other -data that is specific to minor drive, can be accessed through $handle$, +data that is specific to a minor drive, can be accessed through $handle$, which can point to a data structure specific to the low-level driver. 
The fields $use_count$, $next$, $options$ and $mc_flags$ need not be initialized. -The intermediate software layer that \cdromc\ forms will performs some +The intermediate software layer that \cdromc\ forms will perform some additional bookkeeping. The use count of the device (the number of processes that have the device opened) is registered in $use_count$. The function $cdrom_ioctl()$ will verify the appropriate user-memory regions for read and write, and in case a location on the CD is transferred, it will `sanitize' the format by making requests to the low-level drivers in a standard format, and translating all formats between the -user-software and low level drivers. This relieves much of the drivers +user-software and low level drivers. This relieves much of the drivers' memory checking and format checking and translation. Also, the necessary structures will be declared on the program stack. @@ -469,7 +469,7 @@ addressing mode), whatever the calling software requested. But sanitization goes even further: the low-level implementation may return the requested information in $CDROM_MSF$ format if it wishes so (setting the $ms_info\rightarrow addr_format$ field appropriately, of -course) and the routines in \cdromc\ will make the transform if +course) and the routines in \cdromc\ will make the transformation if necessary. The return value is 0 upon success. \subsection{$Int\ get_mcn(struct\ cdrom_device_info * cdi, struct\ @@ -498,7 +498,7 @@ driver to time out. Some of the \cdrom-$ioctl$s defined in \cdromh\ can be implemented by the routines described above, and hence the function $cdrom_ioctl$ will use those. However, most $ioctl$s deal with -audio-control. We have decided to leave these accessed through a +audio-control. We have decided to leave these to be accessed through a single function, repeating the arguments $cmd$ and $arg$. Note that the latter is of type $void*{}$, rather than $unsigned\ long\ int$. 
The routine $cdrom_ioctl()$ does do some useful things, @@ -532,7 +532,7 @@ problem here could be the fact that audio-frames are 2352 bytes long, so either the audio-file-system should ask for 75264 bytes at once (the least common multiple of 512 and 2352), or the drivers should bend their backs to cope with this incoherence (to which I would be -opposed). Furthermore, it it very difficult for the hardware to find +opposed). Furthermore, it is very difficult for the hardware to find the exact frame boundaries, since there are no synchronization headers in audio frames. Once these issues are resolved, this code should be standardized in \cdromc. @@ -562,7 +562,7 @@ CDC_LOCK& can lock and unlock the door\cr CDC_SELECT_SPEED& can select speed, in units of $\sim$150\,kB/s\cr CDC_SELECT_DISC& drive is juke-box\cr CDC_MULTI_SESSION& can read sessions $>\rm1$\cr -CDC_MCN& can read Medium Catalog Number\cr +CDC_MCN& can read Media Catalog Number\cr CDC_MEDIA_CHANGED& can report if disc has changed\cr CDC_PLAY_AUDIO& can perform audio-functions (play, pause, etc)\cr CDC_RESET& hard reset device\cr @@ -724,12 +724,12 @@ modes of operation can be set: \begin{description} \item[$CDO_AUTO_CLOSE \mathrel| CDO_USE_FFLAGS \mathrel| CDO_LOCK$] This is the default setting. (With $CDO_CHECK_TYPE$ it will be better, in the -future.) If the device is not yet opened by any other process, and it +future.) If the device is not yet opened by any other process, and if the device is being opened for data ($O_NONBLOCK$ is not set) and the tray is found to be open, an attempt to close the tray is made. Then, it is verified that a disc is in the drive and, if $CDO_CHECK_TYPE$ is set, that it contains tracks of type `data mode 1.' Only if all tests -are passed, the return value is zero. The door is locked to prevent file +are passed is the return value zero. The door is locked to prevent file system corruption. 
If the drive is opened for audio ($O_NONBLOCK$ is set), no actions are taken and a value of 0 will be returned. \item[$CDO_AUTO_CLOSE \mathrel| CDO_AUTO_EJECT \mathrel| CDO_LOCK$] This @@ -745,7 +745,7 @@ driver scheme and option flag interpretation. \newsection{Description of routines in \cdromc} Only a few routines in \cdromc\ are exported to the drivers. In this -newsection we will discuss these, as well as the functions that `take +new section we will discuss these, as well as the functions that `take over' the \cdrom\ interface to the kernel. The header file belonging to \cdromc\ is called \cdromh. Formerly, some of the contents of this file were placed in the file {\tt {ucdrom.h}}, but this file has now been @@ -833,7 +833,7 @@ not masked: \item[CDROMEJECT_SW] If $arg\not=0$, set behavior to auto-close (close tray on first open) and auto-eject (eject on last release), otherwise set behavior to non-moving on $open()$ and $release()$ calls. -\item[CDROM_GET_MCN or CDROM_GET_UPC] Get the Medium Catalog Number from a CD. +\item[CDROM_GET_MCN or CDROM_GET_UPC] Get the Media Catalog Number from a CD. \end{description} \subsubsection{$Ioctl$s routed through $audio_ioctl()$} @@ -878,7 +878,7 @@ the current flags. \item[CDROM_SELECT_SPEED] Select head-rate speed of disc specified as by $arg$ in units of standard cdrom speed (176\,kB/sec raw data or 150\,kB/sec file system data). The value 0 means `auto-select', \ie, - play audio discs at real time and data disc at maximum speed. The value + play audio discs at real time and data discs at maximum speed. The value $arg$ is checked against the maximum head rate of the drive found in the $cdrom_dops$. \item[CDROM_SELECT_DISC] Select disc numbered $arg$ from a juke-box. @@ -887,18 +887,18 @@ the current flags. \item[CDROM_MEDIA_CHANGED] Returns 1 if a disc has been changed since the last call. 
Note that calls to $cdrom_media_changed$ by the VFS are treated by an independent queue, so both mechanisms will detect - a media change once. For Juke-boxes, an extra argument $arg$ + a media change once. For juke-boxes, an extra argument $arg$ specifies the slot for which the information is given. The special value $CDSL_CURRENT$ requests that information about the currently - selected slot is returned. + selected slot be returned. \item[CDROM_DRIVE_STATUS] Returns the status of the drive by a call to $drive_status()$. Return values are defined in section~\ref{drive status}. Note that this call doesn't return information on the current playing activity of the drive; this can be polled through an - $ioctl$ call to $CDROMSUBCHNL$. For Juke-boxes, an extra argument + $ioctl$ call to $CDROMSUBCHNL$. For juke-boxes, an extra argument $arg$ specifies the slot for which (possibly limited) information is given. The special value $CDSL_CURRENT$ requests that information - about the currently selected slot is returned. + about the currently selected slot be returned. \item[CDROM_DISC_STATUS] Returns the type of the disc currently in the drive. It should be viewed as a complement to $CDROM_DRIVE_STATUS$. This $ioctl$ can provide \emph {some} information about the current @@ -996,7 +996,7 @@ $\&_fops$ to $\&cdrom_fops$. \item Change the prototypes of $_open()$ and $_release()$, and remove any strategic code (\ie, tray movement, door locking, etc.). -\item Try to recompile the drivers. We advice you to use modules, both +\item Try to recompile the drivers. We advise you to use modules, both for {\tt {cdrom.o}} and your driver, as debugging is much easier this way. \end{enumerate} @@ -1004,7 +1004,7 @@ $\&_fops$ to $\&cdrom_fops$. \newsection{Thanks} Thanks to all the people involved. 
First, Erik Andersen, who has -taken over the torch in maintaining \cdromc\ and integrating many +taken over the torch in maintaining \cdromc\ and integrating much \cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and Gerd Knorr, who were the first to implement this interface for SCSI and IDE-CD drivers and added many ideas for extension of the data diff --git a/Documentation/cdrom/cdu31a b/Documentation/cdrom/cdu31a index 6fa1d4b0fa16..c0667da09c00 100644 --- a/Documentation/cdrom/cdu31a +++ b/Documentation/cdrom/cdu31a @@ -32,7 +32,7 @@ same card). They are not software compatible. Setting Up the Hardware ----------------------- -The CDU31A driver in unable to safely tell if an interface card is +The CDU31A driver is unable to safely tell if an interface card is present that it can use because the interface card does not announce its presence in any way besides placing 4 I/O locations in memory. It used to just probe memory and attempt commands, but Linus wisely asked @@ -44,7 +44,7 @@ is, what interrupts are used, and possibly if you are on a PAS-16 soundcard. If you have the Sony CDU31A/CDU33A drive interface card, the following -diagram will help you set it up. If You have another card, you are on +diagram will help you set it up. If you have another card, you are on your own. You need to make sure that the I/O address and interrupt is not used by another card in the system. You will need to know the I/O address and interrupt you have set. Note that use of interrupts is diff --git a/Documentation/cdrom/cm206 b/Documentation/cdrom/cm206 index 1aebd7e5d07d..bc4f9e656544 100644 --- a/Documentation/cdrom/cm206 +++ b/Documentation/cdrom/cm206 @@ -14,7 +14,7 @@ Features since version 0.33 - Full audio support, that is, both workman, workbone and cdp work now reasonably. Reading TOC still takes some time. xmcd has been reported to run successfully. 
-- Made auto-probe code a little better, i hope +- Made auto-probe code a little better, I hope Features since version 0.28 --------------------------- @@ -37,8 +37,8 @@ options: Further, you must decide if you are going to specify the base port address and the interrupt request line of the adapter card cm260 as boot options for (a), module parameters for (b), use automatic - probing of these values, or hard-wire your adaptor cards settings - into the source code. If you don't care, you can choose for + probing of these values, or hard-wire your adaptor card's settings + into the source code. If you don't care, you can choose autoprobing, which is the default. In that case you can move on to the next step. @@ -48,10 +48,10 @@ Compiling the kernel make config - If you have chosen for option (a), answer yes to CONFIG_CM206 and + If you have chosen option (a), answer yes to CONFIG_CM206 and CONFIG_ISO9660_FS. - If you have chosen for option (b), answer yes to CONFIG_MODVERSIONS + If you have chosen option (b), answer yes to CONFIG_MODVERSIONS and no (!) to CONFIG_CM206 and CONFIG_ISO9660_FS. 2) then do a @@ -64,7 +64,7 @@ Compiling the kernel Using the driver as a module ---------------------------- -If you will only seldomly use the cd-rom driver, you can choose for +If you will only occasionally use the cd-rom driver, you can choose option (b), install as a loadable module. You may have to re-compile the module when you upgrade the kernel to a new version. @@ -84,7 +84,7 @@ line to be used, e.g. insmod /usr/src/linux/modules/cm206.o cm206=0x300,11 -The order of base port and irq line doesn't matter; you may specify only +The order of base port and irq line doesn't matter; if you specify only one, the other will have the value of the compiled-in default. You may also have to install the file-system module `iso9660.o', if you didn't compile that into the kernel. @@ -92,17 +92,17 @@ didn't compile that into the kernel. 
Using the driver as part of the kernel -------------------------------------- -If you have chosen for option a, you can specify the base-port +If you have chosen option (a), you can specify the base-port address and irq on the lilo boot command line, e.g.: LILO: linux cm206=0x340,11 This assumes that your linux kernel image keyword is `linux'. -If you may specify either IRQ (3--11) or base port (0x300--0x370), +If you specify either IRQ (3--11) or base port (0x300--0x370), auto probing is turned off for both settings, thus setting the other value to the compiled-in default. -Note that you can put these parameters also in the lilo configuration file: +Note that you can also put these parameters in the lilo configuration file: # linux config image = /vmlinuz @@ -122,7 +122,7 @@ the defines of CM206_IRQ and CM206_BASE. Mounting the cdrom ------------------ -1) Make sure that there is the right device installed in /dev. +1) Make sure that the right device is installed in /dev. mknod /dev/cm206cd b 32 0 @@ -159,7 +159,7 @@ If things don't work DISCLAIMER ---------- I cannot guarantee that this driver works, or that the hardware will -not be harmed, although i consider it most unlikely. +not be harmed, although I consider it most unlikely. I hope that you'll find this driver in some way useful. diff --git a/Documentation/cdrom/gscd b/Documentation/cdrom/gscd index bb25a8c60e61..560069e2a94a 100644 --- a/Documentation/cdrom/gscd +++ b/Documentation/cdrom/gscd @@ -1,7 +1,7 @@ Goldstar R420 CD-Rom device driver README For all kind of other information about the GoldStar R420 CDROM -and this Linux device driver is a WWW-URL Page installed: +and this Linux device driver see the WWW page: http://linux.rz.fh-hannover.de/~raupach @@ -44,12 +44,12 @@ Install your new kernel as usual - maybe you do it with 'make zlilo'. Before you can use the driver, you have to mknod /dev/gscd0 b 16 0 -to create the appropriate device file (once for all times). 
+to create the appropriate device file (you only need to do this once). If you use modules, you can try to insert the driver. Say: 'insmod /usr/src/linux/modules/gscd.o' or: 'insmod /usr/src/linux/modules/gscd.o gscd=
' -The driver should report his results now. +The driver should report its results. That's it! Mount a disk, i.e. 'mount -rt iso9660 /dev/gscd0 /cdrom' diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd index ecd498158106..744a81407d2e 100644 --- a/Documentation/cdrom/ide-cd +++ b/Documentation/cdrom/ide-cd @@ -22,7 +22,7 @@ This driver provides the following features: - Reading from data tracks, and mounting iso9660 filesystems. - Playing audio tracks. Most of the cdrom player programs floating - around should work; i usually use Workman. + around should work; I usually use Workman. - Multisession support. @@ -148,7 +148,7 @@ workbone, cdplayer, etc.). Lacking anything else, you could use the cdtester program in Documentation/cdrom/sbpcd. On a few drives, you can read digital audio directly using a program -such as cdda2wav. The only types of drive which i've heard support +such as cdda2wav. The only types of drive which I've heard support this are Sony and Toshiba drives. You will get errors if you try to use this function on a drive which does not support it. @@ -189,7 +189,7 @@ CDROM_NBLOCKS_BUFFER ioctl. The default is 8. TEST - This presently enables an additional ioctl which enables a user-mode + This currently enables an additional ioctl which enables a user-mode program to execute an arbitrary packet command. See the source for details. This should be left off unless you know what you're doing. @@ -271,7 +271,7 @@ b. Timeout/IRQ errors. and 15 for the secondary (0x1f0) interface.) Also be sure that you don't have some other hardware which might be conflicting with the IRQ you're using. Also check the BIOS setup for your system; - some have the ability to disable individual IRQ levels, and i've + some have the ability to disable individual IRQ levels, and I've had one report of a system which was shipped with IRQ 15 disabled by default. @@ -282,7 +282,7 @@ b. Timeout/IRQ errors. 
- If you own a Pioneer DR-A24X, you _will_ get nasty error messages on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }" The Pioneer DR-A24X cdrom drives are fairly popular these days. - Unfortunatly, these drives seem to become very confused when we perform + Unfortunately, these drives seem to become very confused when we perform the standard Linux ATA disk drive probe. If you own one of these drives, you can bypass the ATA probing which confuses these cdrom drives, by adding `append="hdX=noprobe hdX=cdrom"' to your lilo.conf file and runing @@ -377,7 +377,7 @@ f. Data corruption. /* * cdchange.c [-v] [] * - * This load a cdrom from a specified slot in a changer, and displays + * This loads a cdrom from a specified slot in a changer, and displays * information about the changer status. The drive should be unmounted before * using this program. * diff --git a/Documentation/cdrom/isp16 b/Documentation/cdrom/isp16 index 7dcf6f9c99b3..cc86533ac9f3 100644 --- a/Documentation/cdrom/isp16 +++ b/Documentation/cdrom/isp16 @@ -71,7 +71,7 @@ sound card configuration. The syntax of the command line does not allow the specification of irq when there's nothing specified for the base address and no specification of dma when there is no specification of irq. -The value 'nosip16' for drive_type, which may be used as the first +The value 'noisp16' for drive_type, which may be used as the first non-integer option value (e.g. 'isp16=noisp16'), makes sure that probing for and subsequent configuration of an ISP16-compatible card is skipped all together. This can be useful to overcome possible conflicts which diff --git a/Documentation/cdrom/mcdx b/Documentation/cdrom/mcdx index fd2c37321a97..3c0fee5e2721 100644 --- a/Documentation/cdrom/mcdx +++ b/Documentation/cdrom/mcdx @@ -1,6 +1,6 @@ This is a first attempt to create an `improved' driver for the Mitsumi drives. 
It is able to "live together" with mcd.c, if you have at least two Mitsumi -drives: each driver can use his own drive. +drives: each driver can use its own drive. To allow this "coexistence" as long as mcdx.c is not a superset of mcd.c, this driver has to use its own device files. We use MAJOR 20 for it. So, diff --git a/Documentation/cdrom/sbpcd b/Documentation/cdrom/sbpcd index 7e43d9ce47fd..3dc3a249d56d 100644 --- a/Documentation/cdrom/sbpcd +++ b/Documentation/cdrom/sbpcd @@ -4,7 +4,7 @@ CD-ROM driver for Linux. sbpcd really, really is NOT for ANY IDE/ATAPI drive! Not even if you have an "original" SoundBlaster card with an IDE interface! -So, you better have a look into README.ide if your port address is 0x1F0, +So, you'd better have a look into README.ide if your port address is 0x1F0, 0x170, 0x1E8, 0x168 or similar. I get tons of mails from IDE/ATAPI drive users - I really can't continue any more to answer them all. So, if your drive/interface information sheets @@ -18,7 +18,7 @@ LILO commands and get lucky. To make it fully clear to you: if you mail me about IDE/ATAPI drive problems, my answer is above, and I simply will discard your mail, hoping to stop the -flood and to find time to lead my 12-years old son towards happy computing. +flood and to find time to lead my 12-year-old son towards happy computing. The driver is able to drive the whole family of "traditional" AT-style (that is NOT the new "Enhanced IDE" or "ATAPI" drive standard) Matsushita, @@ -29,13 +29,13 @@ CR-574 is an IDE/ATAPI drive. The Longshine LCS-7260 is a double-speed drive which uses the "old" Matsushita command set. It is supported - with help by Serge Robyns. Vertos ("Elitegroup Computer Systems", ECS) has a similar drive - support -has started; come in contact if you have such a "Vertos 100" or "ECS-AT" +has started; get in contact if you have such a "Vertos 100" or "ECS-AT" drive. 
There exists an "IBM External ISA CD-ROM Drive" which in fact is a CR-563 with a special controller board. This drive is supported (the interface is of the "LaserMate" type), and it is possibly the best buy today (cheaper than -an internal drive, and you can use it as an internal, too - f.e. plug it into +an internal drive, and you can use it as an internal, too - e.g. plug it into a soundcard). CreativeLabs has a new drive "CD200" and a similar drive "CD200F". The latter @@ -51,7 +51,7 @@ speed". The data rate already reaches 500 kB/sec if you set SBP_BUFFER_FRAMES to 64 (it is not recommended to do that for normal "file access" usage, but it can speed up things a lot if you use something like "dd" to read from the drive; I use it for verifying self-written CDs this way). -The drive itself is able to deliver 600 kB/sec, so this has to get a point of +The drive itself is able to deliver 600 kB/sec, so this needs work; with the normal setup, the performance currently is not even as good as double-speed. @@ -63,7 +63,7 @@ and include an original log message excerpt, and try to give all information a complete idiot needs to understand your hassle already with your first mail. And if you want to say "as I have mailed you before", be sure that I don't remember your "case" by such remarks; at the moment, I have some -hundreds open correspondences about Linux CDROM questions (hope to reduce if +hundreds of open correspondences about Linux CDROM questions (hope to reduce if the IDE/ATAPI user questions disappear). 
@@ -79,7 +79,7 @@ specify the type "SBPRO 2" and the true CDROM port address with it, not the If you have a sound card which needs a "configuration driver" instead of jumpers for interface types and addresses (like Mozart cards) - those drivers get invoked before the DOS CDROM driver in your CONFIG.SYS, typical -names are "cdsetup.sys" and "mztinit.sys" -, let the sound driver do the +names are "cdsetup.sys" and "mztinit.sys" - let the sound driver do the CDROM port configuration (the leading comments in linux/drivers/sound/mad16.c are just for you!). Hannu Savolainen's mad16.c code is able to set up my Mozart card - I simply had to add @@ -184,10 +184,10 @@ To install: 1. Setup your hardware parameters. Though the driver does "auto-probing" at a lot of (not all possible!) addresses, this step is recommended for - every-day use. You should let sbpcd auto-probe once and use the reported + everyday use. You should let sbpcd auto-probe once and use the reported address if a drive got found. The reported type may be incorrect; it is correct if you can mount a data CD. There is no choice for you with the - type; only one is the right, the others are deadly wrong. + type; only one is right, the others are deadly wrong. a. Go into /usr/src/linux/drivers/cdrom/sbpcd.h and configure it for your hardware (near the beginning): @@ -229,7 +229,7 @@ To install: second, third, or fourth controller installed, do not say "y" to the secondary Matsushita CD-ROM questions. -3. Then do a "make dep", then make the kernel image ("make zlilo" or else). +3. Then do a "make dep", then make the kernel image ("make zlilo" or similar). 4. Make the device file(s). This step usually already has been done by the MAKEDEV script. @@ -242,7 +242,7 @@ To install: mknod /dev/sbpcd3 b 25 3 to make the node(s). 
- The "first found" drive gets MINOR 0 (regardless to its jumpered ID), the + The "first found" drive gets MINOR 0 (regardless of its jumpered ID), the "next found" (at the same cable) gets MINOR 1, ... For a second interface board, you have to make nodes like @@ -297,21 +297,21 @@ No DMA and no IRQ is used. To reduce or increase the amount of kernel messages, edit sbpcd.c and play with the "DBG_xxx" switches (initialization of the variable "sbpcd_debug"). -Don't forget to reflect what you do; enabling all DBG_xxx switches at once +Don't forget to reflect on what you do; enabling all DBG_xxx switches at once may crash your system, and each message line is accompanied by a delay. The driver uses the "variable BLOCK_SIZE" feature. To use it, you have to specify "block=2048" as a mount option. Doing this will disable the direct execution of a binary from the CD; you have to copy it to a device with the -standard BLOCK_SIZE (1024) before. So, do not use this if your system is +standard BLOCK_SIZE (1024) first. So, do not use this if your system is directly "running from the CDROM" (like some of YGGDRASIL's installation variants). There are CDs on the market (like the german "unifix" Linux distribution) which MUST get handled with a block_size of 1024. Generally, one can say all the CDs which hold files of the name YMTRANS.TBL are defective; do not use block=2048 with those. -Within sbpcd.h, you will find some "#define"s (f.e. EJECT and JUKEBOX). With -that, you can configure the driver for some special things. +Within sbpcd.h, you will find some "#define"s (e.g. EJECT and JUKEBOX). With +these, you can configure the driver for some special things. You can use the appended program "cdtester" to set the auto-eject feature during runtime. Jeff Tranter's "eject" utility can do this, too (and more) for you. @@ -344,7 +344,7 @@ o.k., but you will get I/O errors during mount). 
In that case, use the "kernel command line" feature and specify address & type at boot time to find out the right setup. -For every-day use, address and type should get configured within sbpcd.h. That +For everyday use, address and type should get configured within sbpcd.h. That will stop the auto-probing due to success with the first try. The kernel command "sbpcd=0" suppresses each auto-probing and causes @@ -373,7 +373,7 @@ Almost all of the "SoundBlaster compatible" cards behave like the no-sound interfaces, i.e. need SBPRO 0! With "original" SB Pro cards, an initial setting of CD_volume through the -sound cards MIXER register gets done. +sound card's MIXER register gets done. If you are using a "compatible" sound card of types "LaserMate" or "SPEA", you can set SOUND_BASE (in sbpcd.h) to get it done with your card, too... @@ -385,8 +385,8 @@ Workman, WorkBone, xcdplayer, cdplayer and the nice little tool "cdplay" (see README.aztcd from the Aztech driver package) should work. The program CDplayer likes to talk to "/dev/mcd" only, xcdplayer wants -"/dev/rsr0", workman loves "/dev/sr0" or "/dev/cdrom" - so, do the appropriate -links for using them without the need of supplying parameters. +"/dev/rsr0", workman loves "/dev/sr0" or "/dev/cdrom" - so, make the +appropriate links to use them without the need to supply parameters. Copying audio tracks: diff --git a/Documentation/cdrom/sjcd b/Documentation/cdrom/sjcd index 2ebaf4f4d328..74a14847b93a 100644 --- a/Documentation/cdrom/sjcd +++ b/Documentation/cdrom/sjcd @@ -1,10 +1,10 @@ -- Documentation/cdrom/sjcd 80% of the work takes 20% of the time, 20% of the work takes 80% of the time... - (Murphy law) + (Murphy's law) Once started, training can not be stopped... - (StarWars) + (Star Wars) This is the README for the sjcd cdrom driver, version 1.6. @@ -13,7 +13,7 @@ cdrom drive. It will grow as the questions arise. ;-) For info on configuring the ISP16 sound card look at Documentation/cdrom/isp16. 
The driver should work with any of the Panasonic, Sony or Mitsumi style -CDROM interface. +CDROM interfaces. The cdrom interface on Media Magic's soft configurable sound card ISP16, which used to be included in the driver, is now supported in a separate module. This initialisation module will probably also work with other interfaces diff --git a/Documentation/cdrom/sonycd535 b/Documentation/cdrom/sonycd535 index b7bf48d8afdf..a931d5093a54 100644 --- a/Documentation/cdrom/sonycd535 +++ b/Documentation/cdrom/sonycd535 @@ -35,7 +35,7 @@ REQUIREMENTS - Drive must be set up as unit 1. Only the first unit will be recognized - - you must enter your interface address into + - You must enter your interface address into /usr/src/linux/drivers/cdrom/sonycd535.h and build the appropriate kernel or use the "kernel command line" parameter sonycd535=0x320 diff --git a/Documentation/devices.tex b/Documentation/devices.tex index 15e06fd2df39..baa9e81efb88 100644 --- a/Documentation/devices.tex +++ b/Documentation/devices.tex @@ -1148,11 +1148,11 @@ MAJOR NUMBER 42 IS NONCOMPLIANT. \end{devicelist} \noindent -Network Block Device is somehow similar to loopback devices: If you -read from it, it sends packet accross network asking server for -data. If you write to it, it sends packet telling server to write. It -could be used to mounting filesystems over the net, swapping over the -net, implementing block device in userland etc. +Network Block Device is somewhat similar to the loopback device: if you +read from it, it sends packets across the network asking a server for +data. If you write to it, it sends packets telling the server to write. It +could be used for mounting filesystems over the net, swapping over the +net, implementing block devices in userland etc. 
\begin{devicelist} \major{44}{}{char }{isdn4linux virtual modem -- alternate devices} @@ -1283,7 +1283,7 @@ microcontrollers} \end{devicelist} \noindent -This device is used for the interfacing to the MC683xx +This device is used for interfacing to the MC683xx microcontrollers via Background Debug Mode by use of a Parallel Port interface. PD is the Motorola Public Domain Interface and ICD is the commercial interface by P\&E. @@ -1832,7 +1832,7 @@ virtual console such as {\file /dev/tty1}, or to a serial port primary Serial ports are RS-232 serial ports and any device which simulates one, either in hardware (such as internal modems) or in software (such -as the ISDN driver.) Under Linux, each serial ports has two device +as the ISDN driver.) Under Linux, each serial port has two device names, the primary or callin device and the alternate or callout one. Each kind of device is indicated by a different letter. For any letter $X$, the names of the devices are {\file /dev/tty${X\#}$} and diff --git a/Documentation/digiboard.txt b/Documentation/digiboard.txt index ab1490ada7df..03292883469c 100644 --- a/Documentation/digiboard.txt +++ b/Documentation/digiboard.txt @@ -5,18 +5,18 @@ The Digiboard Driver for Linux supports the following boards: DigiBoard PC/Xi, PC/Xe, PC/Xeve(which is the newer, smaller Xe with a 8K window which is also known as PC/Xe(8K) and has no memory/irq - switches) You can use up to 4 cards with this driver and should work + switches) You can use up to 4 cards with this driver and it should work on other architectures than intel also. -In case you have problems with this version(1.6.1) of this driver, please +In case you have problems with this version (1.6.1) of this driver, please email directly to me as I made the last update. It you have a report about runnning it on other architectures than intel, email me, so I can document it here. 
-An version of this driver has been taken by Digiboard to make a driver +A version of this driver has been taken by Digiboard to make a driver software package which supports also PC/Xem cards and newer PCI cards -but it don't support the old PC/Xi cards and it isn't yet ported to -linux-2.1.x and may not be useable on other architectures than intel now. +but it doesn't support the old PC/Xi cards and it isn't yet ported to +linux-2.1.x and may not be usable on other architectures than intel now. It is available from ftp.digi.com/ftp.digiboard.com. You can write me if you need an patch for this driver. @@ -25,7 +25,7 @@ you need an patch for this driver. Configuring the Driver ---------------------- -The driver can be build direct into the kernel or as module. +The driver can be built direct into the kernel or as a module. The pcxx driver can be configured using the command line feature while loading the kernel with LILO or LOADLIN or, if built as a module, with arguments to insmod and modprobe or with parameters in @@ -66,14 +66,14 @@ io - I/O port address of that card. membase - Memory start address of that card. memsize - Memory size of that card, in kilobytes. If given, this value is compared against the card to verify configuration and - hinder the driver to use a misconfigured card. If the parameter + hinder the driver from using a misconfigured card. If the parameter does not match the board it is disabled with a memory size error. numports - Number of ports on this card. This is the number of devices to assign to this card or reserve if disabled. altpin - 1: swap DCD and DSR for 8-pin RJ-45 with modems. 0: don't swap DCD and DSR. other values count as 1. -verbose - 1: give nice verbose output during initialisation of the driver. +verbose - 1: give nice verbose output during initialisation of the driver, possibly helpful during board configuration. 0: normal terse output. @@ -82,19 +82,19 @@ If the io= parameter is not given, the default config is used. 
This is io=0x200 membase=0xD0000 numports=16 altpin=0 -Only parameters applicable need be specified. For example to configure +Only applicable parameters need be specified. For example to configure 2 boards, first one at 0x200 with 8 ports, rest defaults, second one at 0x120, memory at 0xD80000, altpin enabled, rest defaults, you can do this by using these parameters: modprobe pcxx io=0x200,0x120 numports=8,8 membase=,0xD80000 altpin=,1 -To disable a temporary unuseable board without changing the mapping of the +To disable a temporarily unusable board without changing the mapping of the devices following that board, you can empty the io-value for that board: modprobe pcxx io=,0x120 numports=8,8 membase=,0xD80000 altpin=,1 -The remainig board still uses ttyD8-ttyD15 and cud8-cud15. +The remaining board still uses ttyD8-ttyD15 and cud8-cud15. Example line for /etc/conf.modules for use with kerneld and as default parameters for modprobe: @@ -120,7 +120,7 @@ Card status: Enable - use that board Card type: PC/Xi - the old ones with 64/128/256/512K RAM. PC/Xe - PC/Xe(old ones with 64k mem range). - PC/Xeve - PC/Xe(newers with 8k mem range). + PC/Xeve - PC/Xe(new ones with 8k mem range). Note: This is for documentation only, the type is detected from the board. @@ -146,7 +146,8 @@ If you don't give a digi= commandline, the compiled-in defaults of board 1: io=0x200, membase=0xd0000, altpin=off and numports=16 are used. If you have the resources (io&mem) free for use, configure your board to -these settings and you should be set up fine even if yours has not 16 ports. +these settings and you should be set up fine even if yours has not got 16 +ports. Sources of Information @@ -274,7 +275,7 @@ Samples: append="digi=E,PC/Xi,D,16,200,D0000" append="digi=1,0,0,16,512,(whatever D0000 is in base 10 :) -Driver's minor device numbers are conserved. 
This means that instead of each board getting a block of 16 minors pre-assigned, it gets however many it should, with the next card following directly behind it. A system with 4 2-port PC/Xi boards will use minor numbers 0-7. diff --git a/Documentation/exception.txt b/Documentation/exception.txt index 78118f1d570b..0102bb615607 100644 --- a/Documentation/exception.txt +++ b/Documentation/exception.txt @@ -9,7 +9,7 @@ In older versions of Linux this was done with the int verify_area(int type, const void * addr, unsigned long size) function. -This function verified, that the memory area starting at address +This function verified that the memory area starting at address addr and of size size was accessible for the operation specified in type (read or write). To do this, verify_read had to look up the virtual memory area (vma) that contained the address addr. In the @@ -53,7 +53,7 @@ Where does fixup point to? Since we jump to the the contents of fixup, fixup obviously points to executable code. This code is hidden inside the user access macros. I have picked the get_user macro defined in include/asm/uacess.h as an -example. The definition is somewhat hard to follow, so lets peek at +example. The definition is somewhat hard to follow, so let's peek at the code generated by the preprocessor and the compiler. I selected the get_user call in drivers/char/console.c for a detailed examination. @@ -122,7 +122,7 @@ The preprocessor output (edited to become somewhat readable): } ); -WOW! Black GCC/assembly magic. This is impossible to follow, so lets +WOW! Black GCC/assembly magic. This is impossible to follow, so let's see what code gcc generates: > xorl %edx,%edx @@ -266,7 +266,7 @@ vma occurs? 3.) CPU calls do_page_fault 4.) do page fault calls search_exception_table (regs->eip == c017e7a5); 5.) search_exception_table looks up the address c017e7a5 in the - exception table (i.e. the contents of the ELF section __ex_table + exception table (i.e. 
the contents of the ELF section __ex_table) and returns the address of the associated fault handle code c0199ff5. 6.) do_page_fault modifies its own return address to point to the fault handle code and returns. @@ -278,7 +278,7 @@ vma occurs? The steps 8a to 8c in a certain way emulate the faulting instruction. -That's it, mostely. If you look at our example, you might ask, why +That's it, mostly. If you look at our example, you might ask why we set EAX to -EFAULT in the exception handler code. Well, the get_user macro actually returns a value: 0, if the user access was successful, -EFAULT on failure. Our original code did not test this diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 72903619e4e7..679904d0b8d1 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -2,12 +2,19 @@ - this file (info on some of the filesystems supported by linux). affs.txt - info and mount options for the Amiga Fast File System. +coda.txt + - description of the CODA filesystem. +fat_cvf.txt + - Description of the Compressed Volume Files extension to the FAT + filesystem hpfs.txt - info and mount options for the OS/2 HPFS. isofs.txt - info and mount options for the ISO9660 (CDROM) filesystem. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. +ntfs.txt + - info and mount options for the NTFS filesystem (Win NT). romfs.txt - Description of the ROMFS filesystem. smbfs.txt @@ -18,3 +25,5 @@ umsdos.txt - info on the umsdos extensions to the msdos filesystem. 
vfat.txt - info on using the VFAT filesystem used in Win NT and Win 95 +vfs.txt + - Overview of the Virtual File System diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt index 4c5d7fa7a4ba..85d5a58b1b24 100644 --- a/Documentation/filesystems/affs.txt +++ b/Documentation/filesystems/affs.txt @@ -120,8 +120,8 @@ Symbolic links Although the Amiga and Linux file systems resemble each other, there are some, not always subtle, differences. One of them becomes apparent with symbolic links. While Linux has a file system with exactly one -root directory, the Amiga has a seperate root directory for each -file system (i. e. partition, floppy disk, ...). With the Amiga, +root directory, the Amiga has a separate root directory for each +file system (e.g. partition, floppy disk, ...). With the Amiga, these entities are called "volumes". They have symbolic names which can be used to access them. Thus, symbolic links can point to a different volume. AFFS turns the volume name into a directory name @@ -156,7 +156,7 @@ fs/affs/Changes. Filenames are truncated to 30 characters without warning (this can be changed by setting the compile-time option AFFS_NO_TRUNCATE -ina include/linux/amigaffs.h). +in include/linux/amigaffs.h). Case is ignored by the affs in filename matching, but Linux shells do care about the case. Example (with /mnt being an affs mounted fs): diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index f049c02a600a..0c88ce6dc914 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -28,7 +28,7 @@ kernel support. This document describes the communication between Venus and kernel level file system code needed for the operation of the Coda filesys- - tem. This version document is meant to describe the current interface + tem. This document version is meant to describe the current interface (version 1.0) as well as improvements we envisage. 
______________________________________________________________________ @@ -161,7 +161,7 @@ kernel support. client cache and makes remote procedure calls to Coda file servers and related servers (such as authentication servers) to service these requests it receives from the operating system. When Venus has - serviced a request it replies to the operating system with appropiate + serviced a request it replies to the operating system with appropriate return codes, and other data related to the request. Optionally the kernel support for Coda may maintain a minicache of recently processed requests to limit the number of interactions with Venus. Venus @@ -218,10 +218,10 @@ kernel support. as applicable in the operating system. These differ very significantly among operating systems, but share features such as facilities to read/write and create and remove objects. The Coda FS layer services - such VFS requests in by invoking on or more well defined services + such VFS requests by invoking one or more well defined services offered by the cache manager Venus. When the replies from Venus have come back to the FS driver, servicing of the VFS call continues and - finishes with a reply to the kernels VFS. Finally the VFS layer + finishes with a reply to the kernel's VFS. Finally the VFS layer returns to the process. As a result of this design a basic interface exposed by the FS driver @@ -277,7 +277,7 @@ kernel support. FS Driver in kernel memory on behalf of P and copied to user memory in Venus. - The FS Driver while servicing P makes upcall's to Venus. Such an + The FS Driver while servicing P makes upcalls to Venus. Such an upcall is dispatched to Venus by creating a message structure. The structure contains the identification of P, the message sequence number, the size of the request and a pointer to the data in kernel @@ -289,7 +289,7 @@ kernel support. synchronization objects. 
In the upcall routine the message structure is filled in, flags are set to 0, and it is placed on the _p_e_n_d_i_n_g queue. The routine calling upcall is responsible for allocating the - data buffer; it's structure will be described in the next section. + data buffer; its structure will be described in the next section. A facility must exist to notify Venus that the message has been created, and implemented using available synchronization objects in @@ -323,15 +323,15 @@ kernel support. +o The message is a _d_o_w_n_c_a_l_l. A downcall is a request from Venus to the FS Driver. The FS driver processes the request immediately - (usually a cache eviction or replacement) and when finishes + (usually a cache eviction or replacement) and when it finishes sendmsg_to_kernel returns. Now P awakes and continues processing upcall. There are some - subtleties to take account off. First P will determine if it was woken + subtleties to take account of. First P will determine if it was woken up in upcall by a signal from some other source (for example an attempt to terminate P) or as is normally the case by Venus in its sendmsg_to_kernel call. In the normal case, the upcall routine will - deallocate message structure and return. The FS routine can proceed + deallocate the message structure and return. The FS routine can proceed with its processing. @@ -344,7 +344,7 @@ kernel support. In case P is woken up by a signal and not by Venus, it will first look at the flags field. If the message is not yet READ, the process P can - handle it's signal without notifying Venus. If Venus has READ, and + handle its signal without notifying Venus. If Venus has READ, and the request should not be processed, P can send Venus a signal message to indicate that it should disregard the previous message. Such signals are put in the queue at the head, and read first by Venus. If @@ -407,7 +407,7 @@ kernel support. Before going on let us elucidate the role of the various fields. 
The inputArgs start with the opcode which defines the type of service requested from Venus. There are approximately 30 upcalls at present - which we will discuss. The unique field labels the inputArg with + which we will discuss. The unique field labels the inputArg with a unique number which will identify the message uniquely. A process and process group id are passed. Finally the credentials of the caller are included. @@ -421,9 +421,9 @@ kernel support. 44..11.. DDaattaa ssttrruuccttuurreess sshhaarreedd bbyy tthhee kkeerrnneell aanndd VVeennuuss - The CodaCred structure defines a variety of user and group id's as + The CodaCred structure defines a variety of user and group ids as they are set for the calling process. The vuid_t and guid_t are 32 bit - unsigned integers. It also defines group member ship in an array. On + unsigned integers. It also defines group membership in an array. On Unix the CodaCred has proven sufficient to implement good security semantics for Coda but the structure may have to undergo modification for the Windows environment when these mature. @@ -462,7 +462,7 @@ kernel support. to be prefixed to identify the Coda cell; this will probably take the form of a Ipv6 size IP address naming the Coda cell through DNS. - The next important structure shared between Venus and the kernel are + The next important structure shared between Venus and the kernel is the attributes of the file. The following structure is used to exchange information. It has room for future extensions such as support for device files (currently not present in Coda). @@ -514,7 +514,7 @@ kernel support. Coda specific requests can be made by application through the pioctl interface. The pioctl is implemented as an ordinary ioctl on a - ficticious file /coda/.CONTROL. The piocl call opens this file, gets + fictitious file /coda/.CONTROL. The pioctl call opens this file, gets a file handle and makes the ioctl call. Finally it closes the file. 
The kernel involvement in this is limited to providing the facility to @@ -614,7 +614,7 @@ kernel support. The name of the object is an 8 bit character string of maximum length CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.) - It is extremely important to realize that Venus bitwise or's the field + It is extremely important to realize that Venus bitwise ors the field cfs_lookup.vtype with CFS_NOCACHE to indicate that the object should not be put in the kernel name cache. @@ -650,11 +650,11 @@ kernel support. DDeessccrriippttiioonn This call returns the attributes of the file identified by fid. - EErrrroorrss Errors can occur if the object with fid does not exist, are + EErrrroorrss Errors can occur if the object with fid does not exist, is unaccessible or if the caller does not have permission to fetch attributes. - NNoottee Many kernel FS drivers (Linux, NT and Windows 95 need to acquire + NNoottee Many kernel FS drivers (Linux, NT and Windows 95) need to acquire the attributes as well as the Fid for the instantiation of an internal "inode" or "FileHandle". A significant improvement in performance on such systems could be made by combining the _l_o_o_k_u_p and _g_e_t_a_t_t_r calls @@ -689,7 +689,7 @@ kernel support. in BSD style. Attributes not to be changed are set to -1, apart from vtype which is set to VNON. Other are set to the value to be assigned. The only attributes which the FS driver may request to change are the - mode, ownner, groupid, atime, mtime and ctime. The return value + mode, owner, groupid, atime, mtime and ctime. The return value indicates success or failure. EErrrroorrss A variety of errors can occur. The object may not exist, may @@ -719,7 +719,7 @@ kernel support. DDeessccrriippttiioonn Verify if access to the object identified by VFid for operations described by flags is permitted. The result indicates if access will be granted. 
It is important to remember that Coda uses - ACL's to enforce protection and that ultimately the servers, not the + ACLs to enforce protection and that ultimately the servers, not the clients enforce the security of the system. The result of this call will depend on wether a _t_o_k_e_n is held by the user. @@ -851,7 +851,7 @@ kernel support. DDeessccrriippttiioonn This call creates a link to the sourceFid in the directory identified by destFid with name tname. The source must reside in the - targets parent, i.e. the source must be have parent destFid, i.e. Coda + target's parent, i.e. the source must have parent destFid, i.e. Coda does not support cross directory hard links. Only the return value is relevant. It indicates success or the type of failure. @@ -1015,7 +1015,7 @@ kernel support. EErrrroorrss NNOOTTEE Currently the cfs_open_out structure is not properly adapted to - deal with the windows case. It might be best to implement two + deal with the Windows case. It might be best to implement two upcalls, one to open aiming at a container file name, the other at a container file inode. @@ -1051,7 +1051,7 @@ kernel support. fetching the data in Venus vproc_vfscalls. This seems silly. If a file is being closed, the data in the container file is to be the new data. Here again the execp flag might be in play to create confusion: - presently Venus might think a file can be flushed from the cache when + currently Venus might think a file can be flushed from the cache when it is still memory mapped. This needs to be understood. 0wpage @@ -1059,7 +1059,7 @@ kernel support. 44..1177.. iiooccttll - SSuummmmaarryy Do an ioctl on a file. This includes the piocl interface. + SSuummmmaarryy Do an ioctl on a file. This includes the pioctl interface. AArrgguummeennttss @@ -1091,7 +1091,7 @@ kernel support. EErrrroorrss NNOOTTEE Another bogus parameter. flags is not used. What is the - business about PREFETCHING in the Venus' code? 
+ business about PREFETCHING in the Venus code? 0wpage @@ -1154,8 +1154,8 @@ kernel support. DDeessccrriippttiioonn Read directory entries from VFid starting at offset and - read at most count bytes. Returns the data into data and indicates - the size returned size. + read at most count bytes. Returns the data in data and returns + the size in size. EErrrroorrss @@ -1196,7 +1196,7 @@ kernel support. NNOOTTEE This operation is not used. However, it is extremely useful since it can be used to deal with read/write memory mapped files. - These can be "pinned" in the Venus cache using vget and release with + These can be "pinned" in the Venus cache using vget and released with inactive. 0wpage @@ -1219,8 +1219,8 @@ kernel support. oouutt none - DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This - should be called as part of kernel level fsync type calls. The + DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This + should be called as part of kernel level fsync type calls. The result indicates if the synching was successful. EErrrroorrss @@ -1452,7 +1452,7 @@ kernel support. 4. the cnode of the object The lookup call in the Coda FS Driver may request the cnode of the - desired object from the cache, by passing it's name, directory and the + desired object from the cache, by passing its name, directory and the CodaCred's of the caller. The cache will return the cnode or indicate that it cannot be found. The Coda FS Driver must be careful to invalidate cache entries when it modifies or removes objects. @@ -1496,7 +1496,7 @@ kernel support. DDeessccrriippttiioonn Remove all entries in the cache carrying the Cred. This - call is issued when tokes for a user expire or are flushed. + call is issued when tokens for a user expire or are flushed. 55..44.. ZZAAPPFFIILLEE @@ -1567,7 +1567,7 @@ kernel support. DDeessccrriippttiioonn Flush the attribute for the file. 
If it is a dir (odd - vnode), purge its children from the namecache remove the file from the + vnode), purge its children from the namecache and remove the file from the namecache. @@ -1589,7 +1589,7 @@ kernel support. DDeessccrriippttiioonn This routine replaces a ViceFid in the name cache with another. It is added to allow Venus during reintegration to replace locally allocated temp fids while disconnected with global fids even - when the reference count on those fids are not zero. + when the reference counts on those fids are not zero. 0wpage @@ -1629,7 +1629,7 @@ kernel support. 66..11.. RReeqquuiirreemmeennttss - The following requirements should be accomodated: + The following requirements should be accommodated: 1. The message queueus should have open and close routines. On Unix the opening of the character devices are such routines. @@ -1659,7 +1659,7 @@ kernel support. 6. All memory held by cnodes can be freed without relying on upcalls. - 7. Unmounting the file system can be done without relying on upcalss. + 7. Unmounting the file system can be done without relying on upcalls. 8. Mounting the Coda filesystem should fail gracefully if Venus cannot get the rootfid or the attributes of the rootfid. The latter is diff --git a/Documentation/filesystems/fat_cvf.txt b/Documentation/filesystems/fat_cvf.txt index 9e7062e13e1e..ef598932c518 100644 --- a/Documentation/filesystems/fat_cvf.txt +++ b/Documentation/filesystems/fat_cvf.txt @@ -34,7 +34,7 @@ like compression and decompression silently. - BMAP problems - CVF filesystems cannot do bmap. It's impossible by principle. Thus + CVF filesystems cannot do bmap. It's impossible in principle. Thus all actions that require bmap do not work (swapping, writable mmapping). 
Read-only mmapping works because the FAT driver has a hack for this situation :) Well, with some tricks writable mmapping could work, @@ -66,7 +66,7 @@ driver's standard options: cvf_format=xxx Forces the driver to use the CVF module "xxx" instead of auto-detection. - This is only necessary if the CVF format is not recognized corrrectly + This is only necessary if the CVF format is not recognized correctly because of bugs or incompatibilities in the CVF modules. (It skips the detect_cvf call.) "xxx" may be the text "none" (without the quotes) to inhibit using any of the loaded CVF modules, just in case a CVF @@ -80,7 +80,7 @@ driver's standard options: misinterpretation by the FAT driver, which would recognize the text after a comma as a FAT driver option and might get confused or print strange error messages. The documentation for the CVF module should - offer a different seperation symbol, for example the dot ".", which + offer a different separation symbol, for example the dot ".", which is only valid inside the string "yyy". @@ -109,11 +109,11 @@ to introduce the module to the FAT/CVF-FAT driver. It contains... - cvf_version: - A version id which must be uniqe. Choose one. + A version id which must be unique. Choose one. - cvf_version_text: A human readable version string that should be one short word describing the CVF format the module implements. This text is used - for the cvf_format option. This name must also be uniqe. + for the cvf_format option. This name must also be unique. - flags: Bit coded flags, currently only used for a readpage/mmap hack that provides both mmap and readpage functionality. If CVF_USE_READPAGE @@ -178,7 +178,7 @@ int unregister_cvf_format(struct cvf_format*cvf_format); This is usually called in cleanup_module. Return value =0 means success. An error only occurs if you try to unregister a CVF format that has not been previously registered. The code uses the version id - to distinguish the modules, so be sure to keep it uniqe. 
+ to distinguish the modules, so be sure to keep it unique. 5. CVS Modules ------------------------------------------------------------------------------ diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index 504408e52fc8..d9789eadbdea 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt @@ -5,9 +5,9 @@ To mount an NTFS volume, use the filesystem type 'ntfs'. The driver currently works only in read-only mode, with no fault-tolerance supported. If you enable the experimental write support, make sure you can recover from a complete loss of data. For ftdisk support, -limit success was reported with volume sets on top of the md driver, +limited success was reported with volume sets on top of the md driver, although mirror and stripe sets should work as well - if the md -driver can be talked into using the same lay-out as Windows NT. +driver can be talked into using the same layout as Windows NT. The ntfs driver supports the following mount options: iocharset=name Character set to use when returning file names. diff --git a/Documentation/filesystems/romfs.txt b/Documentation/filesystems/romfs.txt index 32a14e68bb37..7df713201bc9 100644 --- a/Documentation/filesystems/romfs.txt +++ b/Documentation/filesystems/romfs.txt @@ -22,7 +22,7 @@ genromfs. It is available via anonymous ftp on sunsite.unc.edu and its mirrors, in the /pub/Linux/system/recovery/ directory. As the name suggests, romfs could be also used (space-efficiently) on -various read-only medias, like (E)EPROM disks if someone will have the +various read-only media, like (E)EPROM disks if someone will have the motivation.. :) However, the main purpose of romfs is to have a very small kernel, @@ -79,7 +79,7 @@ The first eight bytes identify the filesystem, even for the casual inspector. After that, in the 3rd longword, it contains the number of bytes accessible from the start of this filesystem. 
The 4th longword is the checksum of the first 512 bytes (or the number of bytes -accessible, whichever is smallest). The applied algorithm is the same +accessible, whichever is smaller). The applied algorithm is the same as in the AFFS filesystem, namely a simple sum of the longwords (assuming bigendian quantities again). For details, please consult the source. This algorithm was chosen because although it's not quite diff --git a/Documentation/filesystems/umsdos.txt b/Documentation/filesystems/umsdos.txt index 9e658b166abc..9dda08913e62 100644 --- a/Documentation/filesystems/umsdos.txt +++ b/Documentation/filesystems/umsdos.txt @@ -14,13 +14,13 @@ one into a useful one. It gives you: - long file name - Permissions and owner + long file names + Permissions and owners Links - Special files (devices, pipe...) - All is need to be a linux root fs. + Special files (devices, pipes...) + All that is needed to be a linux root fs. -There is plenty of documentation on it in the source. A formated document +There is plenty of documentation on it in the source. A formatted document made from those comments is available from sunsite.unc.edu:/pub/Linux/system/Filesystems/umsdos. @@ -32,21 +32,21 @@ mount -t umsdos /dev/hda3 /mnt ^ ---------| -All option are passed to the msdos drivers. Option like uid,gid etc are +All options are passed to the msdos drivers. Option like uid,gid etc are given to msdos. The default behavior of Umsdos is to do the same thing as the msdos driver mostly passing commands to it without much processing. Again, this is the default. After doing the mount on a DOS partition, nothing special -happen. This is why all mount options are passed to the Msdos fs driver. +happens. This is why all mount options are passed to the msdos fs driver. -Umsdos use a special DOS file --linux-.--- to store the information +Umsdos uses a special DOS file --linux-.--- to store the information which can't be handle by the normal MsDOS file system. This is the trick. 
--linux-.--- is optional. There is one per directory. **** If --linux-.--- is missing, then Umsdos process the directory the - same way the msdos driver do. Short file name, no goodies, default + same way the msdos driver does. Short file names, no goodies, default owner and permissions. So each directory may have or not this --linux-.--- @@ -59,7 +59,7 @@ Now, how to get those --linux-.---. $5 per directory. Add any applicable taxes. \end joke_section -A utility umssync creates those. The kernel maintain them. It is available +A utility umssync creates those. The kernel maintains them. It is available from the same directory above (sunsite) in the file umsdos_progs-0.7.tar.gz. A compiled version is available in umsdos_progs-0.7.bin.tar.gz. @@ -69,20 +69,20 @@ umssync . This will promote this directory (a recursive option is available) to full umsdos capabilities (long name ...). A ls -l before and after won't show -much difference however. The file which were there are still there. But now +much difference however. The files which were there are still there. But now you can do all this: chmod 644 * - chown you.your_groupe * + chown you.your_group * ls >THIS_IS.A.VERY.LONG.NAME ln -s toto tata ls -l -Once a directory is promoted, all subdirectory created will inherit that +Once a directory is promoted, all subdirectories created will inherit that promotion. -What happen if you boot DOS and create files in those promoted directories ? -Umsdos won't notice new files, but will signal removed file (it won't crash). +What happens if you boot DOS and create files in those promoted directories ? +Umsdos won't notice new files, but will signal removed files (it won't crash). Using umssync in /etc/rc will make sure the DOS directory is in sync with the --linux-.---. @@ -95,8 +95,8 @@ after the "mount -a": (You put one for each umsdos mount point in the fstab) This will insure nice operation. 
A umsdos.fsck is in the making, -so you will be allowed to managed umsdos partition in the same way -other filesystem are, using the generic fsck front end. +so you will be allowed to manage umsdos partitions in the same way +other filesystems are, using the generic fsck front end. Hope this helps! diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index 61f44a87c74c..1a0da9d40dc6 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -71,7 +71,7 @@ POSSIBLE PROBLEMS * vfat_valid_longname does not properly checked reserved names. * When a volume name is the same as a directory name in the root directory of the filesystem, the directory name sometimes shows - up empty an empty file. + up as an empty file. * autoconv option does not work correctly. BUG REPORTS @@ -103,7 +103,7 @@ but it appears to be so. The extended FAT file system is almost identical to the FAT file system used in DOS versions up to and including 6.223410239847 :-). The significant change has been the addition of long file names. -Theses names support up to 255 characters including spaces and lower +These names support up to 255 characters including spaces and lower case characters as opposed to the traditional 8.3 short names. Here is the description of the traditional FAT entry in the current @@ -142,7 +142,7 @@ directory entries for any files with extended names. (Any name which legally fits within the old 8.3 encoding scheme does not have extra entries.) I call these extra entries slots. Basically, a slot is a specially formatted directory entry which holds up to 13 characters of -a files extended name. Think of slots as additional labeling for the +a file's extended name. Think of slots as additional labeling for the directory entry of the file to which they correspond. Microsoft prefers to refer to the 8.3 entry for a file as its alias and the extended slot directory entries as the file name. 
@@ -163,7 +163,7 @@ The C structure for a slot directory entry follows: If the layout of the slots looks a little odd, it's only because of Microsoft's efforts to maintain compatibility with old software. The slots must be disguised to prevent old software from -panicing. To this end, a number of measures are taken: +panicking. To this end, a number of measures are taken: 1) The attribute byte for a slot directory entry is always set to 0x0f. This corresponds to an old directory entry with @@ -206,9 +206,9 @@ the following: sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + name[i] } - 3) If there is in the final slot, a Unicode NULL (0x0000) is stored - after the final character. After that, all unused characters in - the final slot are set to Unicode 0xFFFF. + 3) If there is free space in the final slot, a Unicode NULL (0x0000) + is stored after the final character. After that, all unused + characters in the final slot are set to Unicode 0xFFFF. Finally, note that the extended name is stored in Unicode. Each Unicode character takes two bytes. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 0644c2e2fe5d..7f75f4770de1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -5,7 +5,7 @@ A Brief Overview of the Virtual File System Noone else seems to be writing this, so here's a quick description of what I've learned while writing lofs... -The VFS relatively simple, but it is nice not to have to browse through +The VFS is relatively simple, but it is nice not to have to browse through pages of code to determine what is expected when writing a filesystem. Hopefully this helps anyone attempting such a feat, as well as clearing up a few important points/dependencies. 
@@ -133,9 +133,9 @@ struct inode_operations int (*follow_link) (struct inode *,struct inode *,int,int,struct inode **); [optional] - The follow_link function is only nescessary if a filesystem uses a really + The follow_link function is only necessary if a filesystem uses a really twisted form of symbolic links - namely if the symbolic link comes from a - foriegn filesystem that makes no sense.... + foreign filesystem that makes no sense.... I threw this one out - too much redundant code! int (*readpage) (struct inode *, struct page *); [optional] diff --git a/Documentation/ftape.txt b/Documentation/ftape.txt index 28c3912ed41d..db53cd828aac 100644 --- a/Documentation/ftape.txt +++ b/Documentation/ftape.txt @@ -6,7 +6,7 @@ floppy tape device driver that comes with the Linux kernel. This document deals with ftape-3.04 and later. Please read the section "Changes" for the most striking differences between version 3.04 and 2.08; the latter was the version of ftape delivered with the kernel -until kernel version 2.030 and 2.1.57. ftape-3.x developed as the +until kernel version 2.0.30 and 2.1.57. ftape-3.x developed as the re-unification of ftape-2.x and zftape. zftape was developed in parallel with the stock ftape-2.x driver sharing the same hardware support but providing an enhanced file system interface. zftape also @@ -54,7 +54,7 @@ A minus 1. Ftape documentation ============================== Unluckily, the ftape-HOWTO is out of date. This really needs to be -changed. Up to data documentation as well as recent development +changed. Up to date documentation as well as recent development versions of ftape and useful links to related topics can be found at the ftape home page at @@ -245,7 +245,7 @@ C. 
Boot and load time configuration insmod ftape.o ft_tracing=4 or by editing the file `/etc/conf.modules' in which case they take - affect each time when the module is loaded with `modprobe' (please + effect each time when the module is loaded with `modprobe' (please refer to the modules documentation, i.e. `modules.txt' and the respective manual pages). Thus, you should add a line diff --git a/Documentation/hayes-esp.txt b/Documentation/hayes-esp.txt index 62a9c7451674..3cd744c215fe 100644 --- a/Documentation/hayes-esp.txt +++ b/Documentation/hayes-esp.txt @@ -51,7 +51,7 @@ be specified by using the irq= option. The format is: irq=[0x100],[0x140],[0x180],[0x200],[0x240],[0x280],[0x300],[0x380] The address in brackets is the base address of the card. The IRQ of -nonexistant cards can be set to 0. If and IRQ of a card that does exist is set +nonexistent cards can be set to 0. If an IRQ of a card that does exist is set to 0, the driver will attempt to guess at the correct IRQ. For example, to set the IRQ of the card at address 0x300 to 12, the insmod command would be: diff --git a/Documentation/ide.txt b/Documentation/ide.txt index a6e7e551e585..7ee08c752bf3 100644 --- a/Documentation/ide.txt +++ b/Documentation/ide.txt @@ -221,7 +221,7 @@ If you always get timeout errors, interrupts from the drive are probably not making it to the host. Check how you have the hardware jumpered and make sure it matches what the driver expects (see the configuration instructions above). If you have a PCI system, also check the BIOS -setup; i've had one report of a system which was shipped with IRQ 15 +setup; I've had one report of a system which was shipped with IRQ 15 disabled by the BIOS. 
The kernel is able to execute binaries directly off of the cdrom, diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE index 5b25c9597088..cf938f67f0b4 100644 --- a/Documentation/isdn/INTERFACE +++ b/Documentation/isdn/INTERFACE @@ -53,7 +53,7 @@ Description of the Interface between Linklevel and Hardwarelevel ***CHANGE0.6: New since this version. Also to be preset by the HL-driver. With this value the HL-driver - tells to the LL the maximum size of a data-packet it will accept. + tells the LL the maximum size of a data-packet it will accept. unsigned long features; @@ -70,8 +70,8 @@ Description of the Interface between Linklevel and Hardwarelevel ***CHANGE0.7.4: New field. To be preset by the HL-driver, if it supports sk_buff's. The driver - should put here the amount of additional space needed in sk-buff's for - its internal purposes. Drivers not supporting sk_buff's should put + should put here the amount of additional space needed in sk_buff's for + its internal purposes. Drivers not supporting sk_buff's should initialize this field to 0. void (*rcvcallb_skb)(int, int, struct sk_buff *) @@ -211,7 +211,7 @@ Description of the Interface between Linklevel and Hardwarelevel All commands will be performed by calling the function command() described above from within the LL. The field command of the struct-parameter will - contain the desired command, the field driver always is set to the + contain the desired command, the field driver is always set to the appropriate driver-Id. Until now, the following commands are defined: @@ -436,7 +436,7 @@ Description of the Interface between Linklevel and Hardwarelevel arg = unused. para = unused. -3. Description of the events to be signaled by the HL-driver to th LL. +3. Description of the events to be signaled by the HL-driver to the LL. All status-changes are signaled via calling the previously described function statcallb(). 
The field command of the struct isdn_cmd has @@ -520,7 +520,7 @@ Description of the Interface between Linklevel and Hardwarelevel remote-station has initiated establishment) The HL driver should call this when the logical l2/l3 protocol - connection on top of the physical B-channel is esatblished . + connection on top of the physical B-channel is established. Parameter: driver = driver-Id @@ -624,7 +624,7 @@ Description of the Interface between Linklevel and Hardwarelevel With this call, the HL-driver delivers CAUSE-messages to the LL. Currently the LL does not use this messages. Their contents is simply logged via kernel-messages. Therefore, currently the format of the - messages is currently completely free. However they should be printable. + messages is completely free. However they should be printable. Parameter: driver = driver-Id diff --git a/Documentation/isdn/README b/Documentation/isdn/README index 1d94afb68909..cfe4beae43b6 100644 --- a/Documentation/isdn/README +++ b/Documentation/isdn/README @@ -62,7 +62,7 @@ README for the ISDN-subsystem read: raw D-channel-messages (format: depends on driver). ioctl: depends on driver, i.e. for the ICN-driver, the base-address of the ports and the shared memory on the card can be set and read - also the boot-code an the protocol software can be loaded into + also the boot-code and the protocol software can be loaded into the card. O N L Y !!! for debugging (no locking against other devices): @@ -74,7 +74,7 @@ README for the ISDN-subsystem 128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator: The functionality is almost the same as that of a serial device - (the line-discs are handled by the kernel, which lets you run + (the line-discs are handled by the kernel), which lets you run SLIP, CSLIP and asynchronous PPP through the devices. We have tested Seyon, minicom, CSLIP (uri-dip) PPP and mgetty (compiled with NO_FAX), XCept. 
@@ -96,7 +96,7 @@ README for the ISDN-subsystem ATI Return "ISDN for Linux...". ATI0 " ATI1 " - ATI2 Report of last connection. + ATI2 Report of last connection. ATO On line (data mode). ATQ0 Enable result codes (default). ATQ1 Disable result codes (default). @@ -107,9 +107,9 @@ README for the ISDN-subsystem ATZ Load registers and EAZ/MSN from Profile. AT&Bx Set Send-Packet-size to x (max. 4000) The real packet-size may be limited by the - low-level-driver used. i.e.: the HiSax-Module- + low-level-driver used. e.g. the HiSax-Module- limit is 2000. You will get NO Error-Message, - if you set it to higher Values, because at the + if you set it to higher values, because at the time of giving this command the corresponding driver may not be selected (see "Automatic Assignment") however the size of outgoing packets @@ -245,7 +245,7 @@ README for the ISDN-subsystem 19 0 Service-Octet-2 20 0 Bit coded register (readonly) Service-Octet-1 of last call. - Bit mapping is the same like register 18 + Bit mapping is the same as register 18 21 0 Bit coded register (readonly) Set on incoming call (during RING) to octet 3 of calling party number IE (Numbering plan) @@ -263,17 +263,17 @@ README for the ISDN-subsystem All inactive physical lines are listening to all EAZs for incoming calls and are NOT assigned to a specific tty or network interface. When an incoming call is detected, the driver looks first for a network - interfaces and then for an opened tty which: + interface and then for an opened tty which: 1. is configured for the same EAZ. 2. has the same protocol settings for the B-channel. 3. (only for network interfaces if the security flag is set) contains the caller number in its access list. 4. Either the channel is not bound exclusively to another Net-interface, or - it is bound AND the other checks apply to exact this Interface. + it is bound AND the other checks apply to exactly this Interface. 
(For usage of the bind-features, refer to the isdnctrl-man-page) - Only when a matching interface or tty is found, the call is accepted + Only when a matching interface or tty is found is the call accepted and the "connection" between the low-level-layer and the link-level-layer is established and kept until the end of the connection. In all other cases no connection is established. Isdn4linux can be @@ -309,7 +309,7 @@ README for the ISDN-subsystem 4. Device-inodes - The major and minor-numbers and its names are described in + The major and minor numbers and their names are described in Documentation/devices.txt. The major-numbers are: 43 for the ISDN-tty's. @@ -357,7 +357,7 @@ README for the ISDN-subsystem i) Setup the interface with ifconfig as usual, and set a route to it. - j) (optional) If you run X11 and have Tcl/Tk-wish Version4.0, you can use + j) (optional) If you run X11 and have Tcl/Tk-wish Version 4.0, you can use the script tools/tcltk/isdnmon. You can add actions for line-status changes. See the comments at the beginning of the script for how to do that. There are other tty-based tools in the tools-subdirectory @@ -399,7 +399,7 @@ README for the ISDN-subsystem "isdnctrl secure off" - Switch of secure operation (default). + Switch off secure operation (default). "isdnctrl ihup [on|off]" Switch the hang-up-timer for incoming calls on or off. @@ -434,15 +434,15 @@ README for the ISDN-subsystem Selects the type of packet-encapsulation. The encapsulation can be changed only while an interface is down. - At the moment th following Values are supported: + At the moment the following values are supported: rawip (Default) Selects raw-IP-encapsulation. This means, MAC-headers are stripped off. ip IP with type-field. Same as IP but the type-field of the MAC-header is preserved. - x25iface x25 interface encapsulation (first byte semantics as defined in + x25iface X.25 interface encapsulation (first byte semantics as defined in ../networking/x25-iface.txt). 
Use this for running the linux - x25 network protocol stack (AF_X25 sockets) on top of isdn. + X.25 network protocol stack (AF_X25 sockets) on top of isdn. cisco-h A special-mode for communicating with a Cisco, which is configured to do "hdlc" ethernet No stripping. Packets are sent with full MAC-header. @@ -483,7 +483,7 @@ README for the ISDN-subsystem dial out using a specific Card or even preserve a specific Channel for Dialout of a specific net-interface. This can be done with the above command. Replace by whatever you assigned while loading the - module. The is counting from zero. the upper Limit + module. The is counting from zero. The upper Limit depends on the card used. At the Moment no card supports more than 2 Channels, so the upper limit is one. diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax index 21f849cbd62b..4b49e0939b23 100644 --- a/Documentation/isdn/README.HiSax +++ b/Documentation/isdn/README.HiSax @@ -66,7 +66,7 @@ It can be configured using the command line feature while loading the kernel with LILO or LOADLIN or, if built as a module, using insmod/modprobe with parameters. There is also some config needed before you compile the kernel and/or -modules. It is enclose in the normal "make [menu]config" target at the +modules. It is included in the normal "make [menu]config" target at the kernel. Don't forget it, especially to select the right D-channel protocol. Please note: All PnP cards need to be configured with isapnp and will work @@ -152,7 +152,7 @@ Card types: At the moment IRQ sharing is not possible. Please make sure that your IRQ is free and enabled for ISA use. Note: For using the ELSA PCMCIA you need the cardmanager under MSDOS for -enabling in the moment, then boot linux with loadlin. +enabling at the moment, then boot linux with loadlin. 
Examples for module loading @@ -272,7 +272,7 @@ At the moment, debugging messages are enabled with the hisaxctrl tool: hisaxctrl DebugCmd - default is HiSax, if you didn't specified one. + default is HiSax, if you didn't specify one. DebugCmd is 1 for generic debugging 11 for layer 1 development debugging @@ -309,18 +309,18 @@ With DebugCmd set to 11: With DebugCmd set to 13: 1 Warnings (default: on) - 2 l3 protocol discriptor errors + 2 l3 protocol descriptor errors 4 l3 state machine 8 charge info debugging (1TR6) For example, 'hisaxctrl HiSax 1 0x3ff' enables full generic debugging. Because of some obscure problems with some switch equipment, the delay -between CONNECT message and sending the first data on th B-channel is now +between the CONNECT message and sending the first data on the B-channel is now configurable with hisaxctrl 2 - in ms Value between 50 an 800 ms are recommended. + in ms Value between 50 and 800 ms is recommended. Warning @@ -389,7 +389,7 @@ Original from Juergen Quade, new version KKe. Attention NEW VERSION, the old leased line syntax won't work !!! You can use HiSax to connect your Linux-Box via an ISDN leased line -to i.e. the internet: +to e.g. the Internet: 1. Build a kernel which includes the HiSax driver either as a module or as part of the kernel. @@ -407,7 +407,7 @@ to i.e. the internet: vi /etc/lilo.conf lilo - Your lilo.conf _might_ look as the following: + Your lilo.conf _might_ look like the following: # LILO configuration-file # global section @@ -449,7 +449,7 @@ to i.e. the internet: /sbin/isdnctrl secure isdn0 on /sbin/isdnctrl huptimeout isdn0 0 /sbin/isdnctrl l2_prot isdn0 hdlc - # Attention you must not set a outgoing number !!! This won't work !!! + # Attention you must not set an outgoing number !!! This won't work !!! # The incomming number is LEASED0 for the first card, LEASED1 for the # second and so on. /sbin/isdnctrl addphone isdn0 in LEASED0 @@ -465,7 +465,7 @@ to i.e. 
the internet: /sbin/hisaxctrl HiSax 5 1 Remarks: -a) If you have a CISCO don´t forget to switch off the KEEP ALIVE option! +a) If you have a CISCO don't forget to switch off the KEEP ALIVE option! Here an example script: #!/bin/sh diff --git a/Documentation/isdn/README.act2000 b/Documentation/isdn/README.act2000 index 953582229e55..155095db162e 100644 --- a/Documentation/isdn/README.act2000 +++ b/Documentation/isdn/README.act2000 @@ -36,7 +36,7 @@ IRQ is configured by software. Possible values are: 3, 5, 7, 10, 11, 12, 15 and none (polled mode) -The ACT2000 driver either may be build into kernel or as a module. +The ACT2000 driver may either be built into the kernel or as a module. Initialization depends on how the driver is built: Driver built into the kernel: @@ -78,11 +78,11 @@ Driver built as module: act_bus=b act_port=p act_irq=i act_id=idstring - where b, p, i and idstring have the same meanings like parameters + where b, p, i and idstring have the same meanings as the parameters described for the builtin version above. Using the "actctrl"-utility, the same features apply to the modularized - version like to the kernel-builtin one. (i.e. loading of firmware and + version as to the kernel-builtin one. (i.e. loading of firmware and configuring the D-channel protocol) Loading the firmware into the card: @@ -90,7 +90,7 @@ Loading the firmware into the card: The firmware is supplied together with the isdn4k-utils package. 
It can be found in the subdirectory act2000/firmware/ - Assumed you have installed the utility-package correctly, the firmware + Assuming you have installed the utility-package correctly, the firmware will be downloaded into the card using the following command: actctrl -d idstring load /etc/isdn/bip11.btl diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio index c01a116bcf2d..d7b845162fa7 100644 --- a/Documentation/isdn/README.audio +++ b/Documentation/isdn/README.audio @@ -22,7 +22,7 @@ Commands for enabling/disabling audio mode: Commands supported in audio mode: -All audio mode commands have the one of the following form: +All audio mode commands have one of the following forms: AT+Vxx? Show current setting. AT+Vxx=? Show possible settings. @@ -89,8 +89,8 @@ General behavior and description of data formats/protocol. End of audio data. (i.e. caused by a hangup of the remote side) Emulator stops recording, responding with VCON. - Abort recording, (send by appl.) Emulator - stops recording, sends DLE,ETX. + Abort recording, (send by appl.) Emulator + stops recording, sends DLE,ETX. Escape sequence for DLE in data stream. 0 Touchtone "0" received. ... diff --git a/Documentation/isdn/README.avmb1 b/Documentation/isdn/README.avmb1 index 68bdf0985e47..342532786841 100644 --- a/Documentation/isdn/README.avmb1 +++ b/Documentation/isdn/README.avmb1 @@ -26,7 +26,7 @@ To use the card you need the t4-files to download the firmware. AVM GmbH provides several t4-files for the different D-channel protocols (b1.t4 for Euro-ISDN). Install these file in /lib/isdn. 
-If you not compile the driver as modules, you have to add the +If you do not compile the driver as modules, you have to add the card(s) and load them after booting: avmcapictrl add 0x150 15 diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap index b6ff4d2d1e97..1e46096caff1 100644 --- a/Documentation/isdn/README.concap +++ b/Documentation/isdn/README.concap @@ -10,7 +10,7 @@ Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol". This is currently only used inside the isdn subsystem. But it might also be useful to other kinds of network devices. Thus, if you want -to suggest changes that improve usability or performace of the +to suggest changes that improve usability or performance of the interface, please let me know. I'm willing to include them in future releases (even if I needed to adapt the current isdn code to the changed interface). @@ -25,14 +25,14 @@ Thus, a device driver for a certain type of hardware must support several different encapsulation protocols at once. The isdn device driver did already support several different -encapsulation protocols. The encapsulation protocol is configuered by a +encapsulation protocols. The encapsulation protocol is configured by a user space utility (isdnctrl). The isdn network interface code then uses several case statements which select appropriate actions -depending on the currently configuered encapsulation protocol. +depending on the currently configured encapsulation protocol. In contrast, LAN network interfaces always used a single encapsulation protocol which is unique to the hardware type of the interface. The LAN -encapsulation is usually done by just sticking a header at the data. Thus, +encapsulation is usually done by just sticking a header on the data. 
Thus, traditional linux network device drivers used to process the encapsulation protocol directly (usually by just providing a hard_header() method in the device structure) using some hardware type specific support @@ -46,13 +46,13 @@ the requirements for complex WAN encapsulations. Many Encapsulation protocols used on top of WAN connections will not just -stick a header at the data. They also might need to set up or release +stick a header on the data. They also might need to set up or release the WAN connection. They also might want to send other data for their -private purpose over the wire. I.e. ppp does a lot of link level -negotiation before the first peace of user data can be transmitted. +private purpose over the wire, e.g. ppp does a lot of link level +negotiation before the first piece of user data can be transmitted. Such encapsulation protocols for WAN devices are typically more complex -than encapsulation protocols for lan devices. Thus, network interfaces -code for typical WAN devices also tends to be more more complex. +than encapsulation protocols for lan devices. Thus, network interface +code for typical WAN devices also tends to be more complex. In order to support Linux' x25 PLP implementation on top of @@ -65,22 +65,22 @@ protocol, complexity could be reduced and maintainability could be increased. -Likewise, a same encapsulation protocol will frequently be needed by -several different interfaces of even different hardware type. I.e. the -synchronous ppp implementaion used by the isdn driver and the -asyncronous ppp implemntation used by the ppp driver have a lot of +Likewise, a similar encapsulation protocol will frequently be needed by +several different interfaces of even different hardware type, e.g. the +synchronous ppp implementation used by the isdn driver and the +asynchronous ppp implementation used by the ppp driver have a lot of similar code in them. 
By cleanly separating the encapsulation protocol from the hardware specific interface stuff such code could be shared better in future. -When operating over dial-up-connections (i.e. telephone lines via modem, +When operating over dial-up-connections (e.g. telephone lines via modem, non-permanent virtual circuits of wide area networks, ISDN) many -encapsulation protocols will need to control the connection. Therfore, +encapsulation protocols will need to control the connection. Therefore, some basic connection control primitives are supported. The type and semantics of the connection (i.e the ISO layer where connection service is provided) is outside our scope and might be different depending on -the encapsulation protocol used. I.e. for a ppp module using our service +the encapsulation protocol used, e.g. for a ppp module using our service on top of a modem connection a connect_request will result in dialing a (somewhere else configured) remote phone number. For an X25-interface module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt) @@ -88,7 +88,7 @@ a connect_request will ask for establishing a reliable lapb datalink connection. -The encapsulation protocol currently provides the follwing +The encapsulation protocol currently provides the following service primitives to the network device. - create a new encapsulation protocol instance @@ -121,7 +121,7 @@ struct concap_proto_ops{ struct device *ndev, struct concap_device_ops *dops); - /* inactivate an encapsulation protocol instance. The encapsulation + /* deactivate an encapsulation protocol instance. The encapsulation protocol may not call any *dops methods after this. */ int (*close)(struct concap_proto *cprot); @@ -145,24 +145,24 @@ The data structures are defined in the header file include/linux/concap.h. 
A Network interface using encapsulation protocols must also provide some service primitives to the encapsulation protocol: -- request data beeing submitted by lower layer (device hardware) -- request a connection beeing set up by lower layer -- request a connection beeing released by lower layer +- request data being submitted by lower layer (device hardware) +- request a connection being set up by lower layer +- request a connection being released by lower layer -The encapsulations protocol accesses those primitives via callbacks +The encapsulation protocol accesses those primitives via callbacks provided by the network interface within a struct concap_device_ops. struct concap_device_ops{ - /* to request data is submitted by device*/ + /* to request data be submitted by device */ int (*data_req)(struct concap_proto *, struct sk_buff *); /* Control methods must be set to NULL by devices which do not - support connection control.*/ - /* to request a connection is set up */ + support connection control. */ + /* to request a connection be set up */ int (*connect_req)(struct concap_proto *); - /* to request a connection is released */ + /* to request a connection be released */ int (*disconn_req)(struct concap_proto *); }; @@ -172,7 +172,7 @@ because the encapsulation protocol directly calls netif_rx(). -An encapsulation protocol itsself is actually the +An encapsulation protocol itself is actually the struct concap_proto{ struct device *net_dev; /* net device using our service */ struct concap_device_ops *dops; /* callbacks provided by device */ @@ -189,7 +189,7 @@ struct concap_proto{ Most of this is filled in when the device requests the protocol to be reset (opend). The network interface must provide the net_dev and -dops pointers. Other concap_proto members should be considerd private +dops pointers. Other concap_proto members should be considered private data that are only accessed by the pops callback functions. 
Likewise, a concap proto should access the network device's private data only by means of the callbacks referred to by the dops pointer. @@ -217,21 +217,21 @@ The concept of the concap proto might help to reuse protocol code and reduce the complexity of certain network interface implementations. The trade off is that it introduces yet another procedure call layer when processing the protocol. This has of course some impact on -performace. However, typically the concap interface will be used by +performance. However, typically the concap interface will be used by devices attached to slow lines (like telephone, isdn, leased synchronous -lines). For such slow lines, the overhead is probably neglectable. +lines). For such slow lines, the overhead is probably negligible. This might no longer hold for certain high speed WAN links (like ATM). If general linux network interfaces explicitly supported concap -protocols (i.e. by a member struct concap_proto* in struct device) +protocols (e.g. by a member struct concap_proto* in struct device) then the interface of the service function could be changed by passing a pointer of type (struct device*) instead of type (struct concap_proto*). Doing so would make many of the service -functions compatible to network device support fuctions. i.e. +functions compatible to network device support functions. -i.e. instead of the concap protocol's service function +e.g. instead of the concap protocol's service function int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); @@ -252,7 +252,7 @@ The device's data request function could also be defined as This might even allow for some protocol stacking. And the network interface might even register the same data_req() function directly as its hard_start_xmit() method when a zero layer encapsulation -protocol is configured. 
Thus, eliminating the performance penalty of the concap interface when a trivial concap protocol is used. Nevertheless, the device remains able to support encapsulation protocol configuration. diff --git a/Documentation/isdn/README.icn b/Documentation/isdn/README.icn index cb8908d58767..155e87457ae8 100644 --- a/Documentation/isdn/README.icn +++ b/Documentation/isdn/README.icn @@ -62,8 +62,8 @@ Setting up the IO-address dipswitches for the ICN-ISDN-card: 1 1 1 0 0x368 1 1 1 1 NOT ALLOWED! -The ICN driver either may be build into kernel or as a module. Initialization -depends on how the drive is built: +The ICN driver may be built into the kernel or as a module. Initialization +depends on how the driver is built: Driver built into the kernel: @@ -102,7 +102,7 @@ Driver built as module: portbase=p membase=m icn_id=idstring [icn_id2=idstring2] - where p, m, idstring1 and idstring2 have the same meanings like + where p, m, idstring1 and idstring2 have the same meanings as the parameters described for the kernel-version above. When using the ICN double card (4B), you MUST define TWO idstrings. @@ -127,12 +127,12 @@ Loading the firmware into the card: pc_1t_ca.bin - Image of firmware for german 1TR6 protocol. pc_eu_ca.bin - Image if firmware for EDSS1 (Euro-ISDN) protocol. - Assumed you have installed the utility-package correctly, the firmware + Assuming you have installed the utility-package correctly, the firmware will be downloaded into the 2B-card using the following command: icnctrl -d Idstring load /etc/isdn/loadpg.bin /etc/isdn/pc_XX_ca.bin - where XX is either "1t" or "eu", depending of the D-Channel protocol + where XX is either "1t" or "eu", depending on the D-Channel protocol used on your S0-bus and Idstring is the Name of the card, given during insmod-time or (for kernel-builtin driver) on the kernel commandline. 
diff --git a/Documentation/isdn/README.pcbit b/Documentation/isdn/README.pcbit index e93562b2c33b..fb696422f911 100644 --- a/Documentation/isdn/README.pcbit +++ b/Documentation/isdn/README.pcbit @@ -16,20 +16,20 @@ ftp://ftp.di.fc.ul.pt/pub/systems/Linux/isdn Known Limitations: -- The board reset proceeding is at the moment incorrect and will only +- The board reset procedure is at the moment incorrect and will only allow you to load the firmware after a hard reset. -- Only HDLC in B-channels is supported at the moment. There is now -current support to X.25 in B or D channels nor LAPD in B -channels. The main reason is that this two other protocol modes have, +- Only HDLC in B-channels is supported at the moment. There is no +current support for X.25 in B or D channels nor LAPD in B +channels. The main reason is that these two other protocol modes have, to my knowledge, very little use. If you want to see them implemented *do* send me a mail. -- The driver often triggers errors in the board that i and the +- The driver often triggers errors in the board that I and the manufacturer believe to be caused by bugs in the firmware. The current -version includes several proceedings for error recovery that should +version includes several procedures for error recovery that should allow normal operation. Plans for the future include cooperation with -the manufacturer in order to solve this problems. +the manufacturer in order to solve this problem. Information/hints/help can be obtained in the linux isdn mailing list (isdn4linux@hub-wue.franken.de) or directly from me. diff --git a/Documentation/isdn/README.sc b/Documentation/isdn/README.sc index 0ea8ca165ebc..b70db7a630b1 100644 --- a/Documentation/isdn/README.sc +++ b/Documentation/isdn/README.sc @@ -9,7 +9,7 @@ Speaking of guarantees, THIS IS BETA SOFTWARE and as such contains bugs and defects either known or unknown. Use this software at your own risk. There is NO SUPPORT for this software. 
Some help may be available through the web site or the mailing list but such support is totally at -our own option and without warrantee. If you choose to assume all and +our own option and without warranty. If you choose to assume all and total risk by using this driver, we encourage you to join the beta mailing list. @@ -17,7 +17,7 @@ To join the Linux beta mailing list, send a message to: majordomo@spellcast.com with the words "subscribe linux-beta" as the only contents of the message. Do not include a signature. If you choose to remove yourself from this list at a later date, send another message to -the same address with the words "unsubscribe linux-beta" as it's only +the same address with the words "unsubscribe linux-beta" as its only contents. TABLE OF CONTENTS @@ -42,7 +42,7 @@ TABLE OF CONTENTS --------------- The revision 2 Linux driver for SpellCaster ISA ISDN adapters is built -upon ISDN4Linux available seperately or as included in Linux 2.0 and later. +upon ISDN4Linux available separately or as included in Linux 2.0 and later. The driver will support a maximum of 4 adapters in any one system of any type including DataCommute/BRI, DataCommute/PRI and TeleCommute/BRI for a maximum of 92 channels for host. The driver is supplied as a module in @@ -74,14 +74,14 @@ include: allow us to utilize all of the available RAM on the adapter through only one 16K page. - Better detection of available upper memory. The probing routines - have been improved to better detect avaialble shared RAM pages and + have been improved to better detect available shared RAM pages and used pages are now locked. - Decreased loading time and a wider range of I/O ports probed. We have significantly reduced the amount of time it takes to load the driver and at the same time doubled the number of I/O ports - probed increasing the likelyhood of finding an adapter. + probed increasing the likelihood of finding an adapter. 
- We now support all ISA adapter models with a single driver instead - of seperate drivers for each model. The revision 2 driver supports + of separate drivers for each model. The revision 2 driver supports the DataCommute/BRI, DataCommute/PRI and TeleCommute/BRI in any combination up to a maximum of four adapters per system. - On board PPP protocol support has been removed in favour of the @@ -115,7 +115,7 @@ must ensure that the following software is installed, configuraed and running: 2.1 Unpacking and installing the driver - 1. As root, create a directory in a convienient place. We suggest + 1. As root, create a directory in a convenient place. We suggest /usr/src/spellcaster. 2. Unpack the archive with : @@ -170,36 +170,38 @@ reserved for ISA use only. 2.6 How to setup ISDN4Linux with the driver -There are two main configurations which you can use with the driver: +There are three main configurations which you can use with the driver: A) Basic HDLC connection B) PPP connection C) MLPPP connection -It should be mentioned here that you may also use a tty connection if you desire. -The Documentation directory of the isdn4linux subsystem offers a good documentation -on this feature. +It should be mentioned here that you may also use a tty connection if you +desire. The Documentation directory of the isdn4linux subsystem offers good +documentation on this feature. A) 10 steps to the establishment of a basic HDLC connection ----------------------------------------------------------- - please open the isdn-hdlc file in the examples directory and follow along... - This file is a script used to configure a BRI ISDN TA to establish a basic HDLC - connection between its two channels. There two network interfaces which are - created and two routes added between the channels. + This file is a script used to configure a BRI ISDN TA to establish a + basic HDLC connection between its two channels. Two network + interfaces are created and two routes added between the channels. 
- i) using the isdnctrl utitity, add an interface with "addif" and name it "isdn0" + i) using the isdnctrl utility, add an interface with "addif" and + name it "isdn0" ii) add the outgoing and inbound telephone numbers iii) set the Layer 2 protocol to hdlc - iv) set the eaz of the interface to be the phone number of that specific channel + iv) set the eaz of the interface to be the phone number of that + specific channel v) to turn the callback features off, set the callback to "off" and the callback delay (cbdelay) to 0. vi) the hangup timeout can be set to a specified number of seconds - vii) the hangup upon incomming call can be set on or off - viii) use the ifconfig command to bring-up the network interface with a specific - IP address and point to point address - viv) add a route to the IP address through the isdn0 interface + vii) the hangup upon incoming call can be set on or off + viii) use the ifconfig command to bring up the network interface with + a specific IP address and point to point address + ix) add a route to the IP address through the isdn0 interface x) a ping should result in the establishment of the connection @@ -208,13 +210,15 @@ B) Establishment of a PPP connection - please open the isdn-ppp file in the examples directory and follow along... - This file is a script used to configure a BRI ISDN TA to establish a PPP connection - between the two channels. The file is almost identical to the HDLC connection - example except that the packet ecapsulation type has to be set. + This file is a script used to configure a BRI ISDN TA to establish a + PPP connection between the two channels. The file is almost + identical to the HDLC connection example except that the packet + encapsulation type has to be set. - use the same procedure as in the HDLC connection from steps i) to iii) then, - after the Layer 2 protocol is set, set the encapsulation "encap" to syncppp. - With this done, the rest of the steps, iv) to x) can be followed from above. 
+ use the same procedure as in the HDLC connection from steps i) to + iii) then, after the Layer 2 protocol is set, set the encapsulation + "encap" to syncppp. With this done, the rest of the steps, iv) to x) + can be followed from above. Then, the ipppd (ippp daemon) must be setup: @@ -223,52 +227,55 @@ B) Establishment of a PPP connection xiii) set the mru size to 2000 xiv) link the two /dev interfaces to the daemon -NOTE: A "*" in the inbound telephone number specifies that a call can be accepted - on any number. +NOTE: A "*" in the inbound telephone number specifies that a call can be +accepted on any number. C) Establishment of a MLPPP connection -------------------------------------- - please open the isdn-mppp file in the examples directory and follow along... - This file is a script used to configure a BRI ISDN TA to accept a Multi Link PPP - connection. + This file is a script used to configure a BRI ISDN TA to accept a + Multi Link PPP connection. - i) using the isdnctrl utitity, add an interface with "addif" and name it "ippp0" + i) using the isdnctrl utility, add an interface with "addif" and + name it "ippp0" ii) add the inbound telephone number - iii) set the Layer 2 protocol to hdlc and the Layer 3 protocol to trans (transparent) + iii) set the Layer 2 protocol to hdlc and the Layer 3 protocol to + trans (transparent) iv) set the packet encapsulation to syncppp - v) set the eaz of the interface to be the phone number of that specific channel + v) set the eaz of the interface to be the phone number of that + specific channel - vi) to turn the callback features off, set the callback to "off" and + vi) to turn the callback features off, set the callback to "off" and the callback delay (cbdelay) to 0. 
vi) the hangup timeout can be set to a specified number of seconds - vii) the hangup upon incomming call can be set on or off + vii) the hangup upon incoming call can be set on or off viii) add a slave interface and name it "ippp32" for example - viv) set the similar parameters for the ippp32 interface - x) use the ifconfig command to bring-up the ippp0 interface with a specific - IP address and point to point address + ix) set the similar parameters for the ippp32 interface + x) use the ifconfig command to bring up the ippp0 interface with a + specific IP address and point to point address xi) add a route to the IP address through the ippp0 interface xii) use the ipppd function found in /sbin/ipppd to set the following: xiii) take out (minus) bsd compression xiv) set the mru size to 2000 xv) add (+) the multi-link function "+mp" - xv) link the two /dev interfaces to the daemon + xvi) link the two /dev interfaces to the daemon -NOTE: To use the MLPPP connection to dial OUT to a MLPPP connection, change the - inbound telephone numbers to the outgoing telephone numbers of the MLPPP - host. +NOTE: To use the MLPPP connection to dial OUT to a MLPPP connection, change +the inbound telephone numbers to the outgoing telephone numbers of the MLPPP +host. 3. Beta Change Summaries and Miscellaneous Notes ------------------------------------------------ -When using the "scctrl" utility to upload firmware revisions on the board, please -note that the byte count displayed at the end of the operation may be different -than the total number of bytes in the "dcbfwn.nn.sr" file. Please disregard the -displayed byte count. - -It was noted that in Beta Release 1, the module would fail to load and result in a -segmentation fault when insmod"ed". This problem was created when one of the -isdn4linux parameters, (isdn_ctrl, data field) was filled in. In some cases, this -data field was NULL, and was left unchecked, so when it was referenced.. segv. 
-The bug has been fixed around line 63-68 of event.c. +When using the "scctrl" utility to upload firmware revisions on the board, +please note that the byte count displayed at the end of the operation may be +different from the total number of bytes in the "dcbfwn.nn.sr" file. Please +disregard the displayed byte count. + +It was noted that in Beta Release 1, the module would fail to load and result +in a segmentation fault when 'insmod'ed. This problem was created when one of +the isdn4linux parameters, (isdn_ctrl, data field) was filled in. In some +cases, this data field was NULL, and was left unchecked, so when it was +referenced... segv. The bug has been fixed around line 63-68 of event.c. diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25 index 3737bc97f62e..cc1b701208c0 100644 --- a/Documentation/isdn/README.x25 +++ b/Documentation/isdn/README.x25 @@ -3,7 +3,7 @@ X25 support within isdn4linux This is experimental code and should be used with linux version 2.1.72. -or later. Use it completely on your own risk. +or later. Use it completely at your own risk. As new versions appear, the stuff described here might suddenly change @@ -161,7 +161,7 @@ is needed to set up x25 routes. I.e. x25route add 01 will cause all x.25 connections to the destination x.25-address -"01" beeing routed to your created isdn network interface. +"01" to be routed to your created isdn network interface. There are currently no real x25 applications available. However, for @@ -185,14 +185,14 @@ ix25test start This will set up a sample configuration using the isdnloop and hisax driver and create some isdn network interfaces. It is recommended that all other isdn drivers and the -x25 module is unloaded before calling this script. +x25 module are unloaded before calling this script. Known problems and deficiencies: The isdnloop HL driver apparently has problems to re-establish a -connection that has been hang up from the outgoing device. 
You have to +connection that has been hung up from the outgoing device. You have to unload the isdnloop driver after the faked isdn-connection is closed and insmod it again. With the Hisax driver, this problem is not present. @@ -210,7 +210,7 @@ The latter problem could be reproduced by using hisax as well as the isdnloop driver. It seems that it is not caused by the isdn code. Somehow, the inode of a socket is freed while a process still refers the socket's wait queue. This causes problems when the process tries to -remove itsself from the wait queue (refered by the dangling +remove itself from the wait queue (referred by the dangling sock->sleep pointer) before returning from a select() system call. - Henner diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ index 6813818e0b18..3257a4bc0786 100644 --- a/Documentation/isdn/syncPPP.FAQ +++ b/Documentation/isdn/syncPPP.FAQ @@ -1,8 +1,8 @@ simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' ------------------------------------------------------------------- -Q01: what's pppd,ipppd, syncPPP , asyncPPP ?? -Q02: error message "this systems lacks PPP support" +Q01: what's pppd, ipppd, syncPPP, asyncPPP ?? +Q02: error message "this system lacks PPP support" Q03: strange information using 'ifconfig' Q04: MPPP?? What's that and how can I use it ... Q05: I tried MPPP but it doesn't work @@ -16,7 +16,7 @@ Q12: How can I reduce login delay? ------------------------------------------------------------------- -Q01: pppd,ipppd, syncPPP , asyncPPP .. what is that ? +Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ? what should I use? A: The pppd is for asynchronous PPP .. asynchronous means here, the framing is character based. (e.g when @@ -45,7 +45,7 @@ A: The pppd is for asynchronous PPP .. asynchronous means -- Q02: when I start the ipppd .. 
I only get the - error message "this systems lacks PPP support" + error message "this system lacks PPP support" A: check that at least the device 'ippp0' exists. (you can check this e.g with the program 'ifconfig') The ipppd NEEDS this device under THIS name .. @@ -123,7 +123,7 @@ A: (from Alexanter Strauss: ) -- -Q08: A wanna talk to remote machines, which need +Q08: I wanna talk to remote machines, which need a different configuration. The only way I found to do this is to kill the ipppd and start a new one with another config to connect @@ -152,14 +152,14 @@ A: When starting, the ipppd calls functions which may Q10: I wanna use dynamic IP address assignment ... How must I configure the network device. -A: At least you must have a routing, which forwards +A: At least you must have a route which forwards a packet to the ippp network-interface to trigger the dial-on-demand. - A default routing to the ippp-interface will work. + A default route to the ippp-interface will work. Now you must choose a dummy IP address for your interface. If for some reason you can't set the default - routing to the ippp interface, you may take any + route to the ippp interface, you may take any address of the subnet from which you expect your dynamic IP number and set a 'network route' for this subnet to the ippp interface. diff --git a/Documentation/java.txt b/Documentation/java.txt index a5439f730ec2..1b30c1183e3e 100644 --- a/Documentation/java.txt +++ b/Documentation/java.txt @@ -18,7 +18,7 @@ other program after you have done the following: nonstandard classes (not included in the same directory as the application itself). -2) You have to compile BINFMT_MISC either as module or into +2) You have to compile BINFMT_MISC either as a module or into the kernel (CONFIG_BINFMT_MISC) and set it up properly. 
If you choose to compile it as a module, you will have to insert it manually with modprobe/insmod, as kerneld diff --git a/Documentation/joystick.txt b/Documentation/joystick.txt index 00b620e052a1..eada60d0df76 100644 --- a/Documentation/joystick.txt +++ b/Documentation/joystick.txt @@ -15,7 +15,7 @@ stories are also welcome. 2. Usage ~~~~~~~~ If you enable the joystick driver in the kernel configuration, all -connected joystick should be found automatically. If that doesn't work, you +connected joysticks should be found automatically. If that doesn't work, you can pass the joystick driver the following kernel command line arguments: js=0xXX,0xYY @@ -120,7 +120,7 @@ Version 0.4 Linux 0.99.6 and fixed race condition in js_read. port started ALL the joystick one shots. If the one that we are reading is short enough and the first one to be read, the second one will return - bad data if it's one shot has not expired when + bad data if its one shot has not expired when the joystick port is written for the second time. Thus solves the mystery delay problem in 0.2! Version 0.5 Upgraded the driver to the 0.99.9 kernel, added @@ -155,7 +155,7 @@ Version 0.9 Ported to 2.1.x Better ioctl names. Kept binary compatibility. Removed 'save_busy'. Just set busy to 1. 
Version 0.9.0 Based on 0.7.3 - New read function that allows two axes have same value + New read function that allows two axes to have the same value New joystick calibration code Real support for 3-axis joysticks CPU speed independent timeouts @@ -180,7 +180,7 @@ Version 1.0.1 Complete rewrite Version 1.0.2 Works, many bugs fixed, more yet to come Version 1.0.3 Tail cutting logic changes & fixes Fix in js_do_bh - no more zero values for axes - Lost event changest & fixes + Lost event changes & fixes Version 1.0.4 Kernel command line & module configuration support Better cli()/sti() handling Linux 2.1.25 select => poll changes diff --git a/Documentation/locks.txt b/Documentation/locks.txt index 3911417b2fb1..91a01676172a 100644 --- a/Documentation/locks.txt +++ b/Documentation/locks.txt @@ -67,7 +67,7 @@ arises. Until an updated version of mount(8) becomes available you may have to apply this patch to the mount sources (based on the version distributed with Rick -Faiths util-linux-2.5 package): +Faith's util-linux-2.5 package): *** mount.c.orig Sat Jun 8 09:14:31 1996 --- mount.c Sat Jun 8 09:13:02 1996 diff --git a/Documentation/m68k/framebuffer.txt b/Documentation/m68k/framebuffer.txt index b533f0202216..c5b77d077339 100644 --- a/Documentation/m68k/framebuffer.txt +++ b/Documentation/m68k/framebuffer.txt @@ -86,14 +86,15 @@ which data structures they work. Here's just a brief overview: - You can request and change variable information about the hardware, like visible and virtual geometry, depth, color map format, timing, and so on. - If you try to change that informations, the driver maybe will round up some + If you try to change that information, the driver maybe will round up some values to meet the hardware's capabilities (or return EINVAL if that isn't possible). - You can get and set parts of the color map. Communication is done with 16 - bit per color part (red, green, blue, transparency) to support all existing - hardware. 
The driver does all the computations needed to bring it into the - hardware (round it down to less bits, maybe throw away transparency). + bits per color part (red, green, blue, transparency) to support all + existing hardware. The driver does all the computations needed to apply + it to the hardware (round it down to less bits, maybe throw away + transparency). All this hardware abstraction makes the implementation of application programs easier and more portable. E.g. the X server works completely on /dev/fb* and @@ -113,8 +114,8 @@ much trouble... 3. Frame Buffer Resolution Maintenance -------------------------------------- -Frame buffer resolutions are maintained using the utility `fbset'. It allows to -change the video mode properties of the current resolution. It's main usage is +Frame buffer resolutions are maintained using the utility `fbset'. It can +change the video mode properties of the current resolution. Its main usage is to change the current video mode, e.g. during boot up in one of your /etc/rc.* or /etc/init.d/* files. diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index 7b611ef55cc6..8d3994d424c8 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt @@ -87,7 +87,7 @@ Valid names are: /dev/adc: -> 0x1c20 (third ACSI device) /dev/add: -> 0x1c30 (forth ACSI device) -The last for names are available only if the kernel has been compiled +The last four names are available only if the kernel has been compiled with Atari and ACSI support. The name must be followed by a decimal number, that stands for the @@ -114,8 +114,8 @@ consequences: If, for example, you have a symbolic link from /dev/fd to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format, you cannot use this name for specifying the root device, because the kernel cannot see this symlink before mounting the root FS and it -isn't in the table above. 
If you would use it, the root device weren't -set at all, without error message. Another example: You cannot use a +isn't in the table above. If you use it, the root device will not be +set at all, without an error message. Another example: You cannot use a partition on e.g. the sixth SCSI disk as the root filesystem, if you want to specify it by name. This is, because only the devices up to /dev/sde are in the table above, but not /dev/sdf. Although, you can @@ -561,7 +561,7 @@ thresholds. Syntax: ataflop=[,[,[,]]] The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This - setting affects how much buffers are reserved and which formats are + setting affects how many buffers are reserved and which formats are probed (see also below). The default is 1 (HD). Only one drive type can be selected. If you have two disk drives, select the "better" type. @@ -586,12 +586,12 @@ defaults depend on whether TT-style or Falcon-style SCSI is used. Below, defaults are noted as n/m, where the first value refers to TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given for one parameter, an error message is printed and that one setting is -ignored (other aren't affected). +ignored (others aren't affected). : - This is the maximum number of SCSI commands queued internal to the + This is the maximum number of SCSI commands queued internally to the Atari SCSI driver. A value of 1 effectively turns off the driver - internal multitasking (if it makes problems). Legal values are >= + internal multitasking (if it causes problems). Legal values are >= 1. can be as high as you like, but values greater than times the number of SCSI targets (LUNs) you have don't make sense. Default: 16/8. @@ -632,7 +632,7 @@ ignored (other aren't affected). 0 means turn off tagged queuing support, all other values > 0 mean use tagged queuing for targets that support it. Default: currently off, but this may change when tagged queuing handling has been - proofed to be reliable. 
+ proved to be reliable. Tagged queuing means that more than one command can be issued to one LUN, and the SCSI device itself orders the requests so they @@ -689,7 +689,7 @@ even if you have less alternate RAM. 0 stands for never swap to ST-RAM, even if it's small enough compared to the rest of memory. If ST-RAM swapping is enabled, the kernel usually uses all free -ST-RAM as swap "device". (If the kernel resides in ST-RAM, the region +ST-RAM as swap "device". If the kernel resides in ST-RAM, the region allocated by it is obviously never used for swapping :-) You can also limit this amount by specifying the second parameter, , if you want to use parts of ST-RAM as normal system memory. is @@ -852,8 +852,8 @@ Syntax: clock:x x = clock input in MHz for WD33c93 chip. Normal values would be from 8 through 20. The default value depends on your hostadapter(s), -default for the A3000 internal controller is 14, for the A2091 its 8 -and for the GVP hostadapters its either 8 or 14, depending on the +default for the A3000 internal controller is 14, for the A2091 it's 8 +and for the GVP hostadapters it's either 8 or 14, depending on the hostadapter and the SCSI-clock jumper present on some GVP hostadapters. diff --git a/Documentation/mandatory.txt b/Documentation/mandatory.txt index 1ef2788d8858..bc449d49eee5 100644 --- a/Documentation/mandatory.txt +++ b/Documentation/mandatory.txt @@ -19,7 +19,7 @@ troublesome) is access to a user's mailbox. The mail user agent and the mail transfer agent must guard against updating the mailbox at the same time, and prevent reading the mailbox while it is being updated. -In a perfect world all process would use and honour a cooperative, or +In a perfect world all processes would use and honour a cooperative, or "advisory" locking scheme. However, the world isn't perfect, and there's a lot of poorly written code out there. @@ -47,8 +47,8 @@ scheme is defined by the System V Interface Definition (SVID) Version 3. 2. 
Marking a file for mandatory locking --------------------------------------- -A file is marked as a candidate for mandatory by setting the group-id bit in -its file mode but removing the group-execute bit. This is an otherwise +A file is marked as a candidate for mandatory locking by setting the group-id +bit in its file mode but removing the group-execute bit. This is an otherwise meaningless combination, and was chosen by the System V implementors so as not to break existing user programs. @@ -103,7 +103,7 @@ agree. 2. If a process has locked a region of a file with a mandatory read lock, then other processes are permitted to read from that region. If any of these processes attempts to write to the region it will block until the lock is - released, unless the process has opened the file opened with the O_NONBLOCK + released, unless the process has opened the file with the O_NONBLOCK flag in which case the system call will return immediately with the error status EAGAIN. @@ -145,7 +145,7 @@ better still fix the system calls yourself and submit a patch to me or Linus. 6. Warning! ----------- -Not even root can override a mandatory lock, so runaway process can wreak +Not even root can override a mandatory lock, so runaway processes can wreak havoc if they lock crucial files. The way around it is to change the file permissions (remove the setgid bit) before trying to read or write to it. Of course, that might be a bit tricky if the system is hung :-( diff --git a/Documentation/mca.txt b/Documentation/mca.txt index 62b777de490e..23df12b57d65 100644 --- a/Documentation/mca.txt +++ b/Documentation/mca.txt @@ -57,7 +57,7 @@ into memory. mca_read_stored_pos() accesses that data. mca_read_pos() and mca_write_pos() are also available for (safer) direct POS access, but their use is _highly_ discouraged. mca_write_pos() is particularly dangerous, as it is possible for adapters to be put in inconsistent -states (i.e. 
sharing IO address, etc) and may result in crashes, toasted +states (e.g. sharing IO address, etc) and may result in crashes, toasted hardware, and operator injury. User level drivers (such as the AGX X server) can use /proc/mca to find @@ -124,7 +124,7 @@ Your typical proc function will look something like this: } Some of the standard MCA information will already be printed, so don't -bother repeating it. Don't try putting in more that 3K of information. +bother repeating it. Don't try putting in more than 3K of information. Enable this function with: mca_set_adapter_procfn( slot, dev_getinfo, dev ); @@ -132,8 +132,8 @@ Enable this function with: Disable it with: mca_set_adapter_procfn( slot, NULL, NULL ); -It is also recommended that, even if you don't write a proc function, to -set the name of the adapter (i.e. "PS/2 ESDI Controller") via +It is also recommended, even if you don't write a proc function, to +set the name of the adapter (e.g. "PS/2 ESDI Controller") via mca_set_adapter_name( int slot, char* name ). Up to 30 characters are used. diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index 6bc7c31ace8b..aa3c596fcd86 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -2,8 +2,14 @@ - this file 3c505.txt - information on the 3Com EtherLink Plus (3c505) driver. +6pack.txt + - info on the 6pack protocol, an alternative to KISS for AX.25 Configurable - info on some of the configurable network parameters +DLINK.txt + - info on the D-Link DE-600/DE-620 parallel port pocket adapters +PLIP.txt + - PLIP: The Parallel Line Internet Protocol device driver alias.txt - info on using alias network devices arcnet-hardware.txt @@ -12,18 +18,58 @@ arcnet.txt - info on the using the arcnet driver itself. 
ax25.txt - info on using AX.25 and NET/ROM code for Linux +baycom.txt + - info on the driver for Baycom style amateur radio modems +cops.txt + - info on the COPS LocalTalk Linux driver +cs89x0.txt + - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver +de4x5.txt + - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver +depca.txt + - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver +dgrs.txt + - the Digi International RightSwitch SE-X Ethernet driver +eql.txt + - serial IP load balancing +ethertap.txt + - the Ethertap user space packet reception and transmission driver +ewrk3.txt + - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver +filter.txt + - Linux Socket Filtering framerelay.txt - info on using Frame Relay/Data Link Connection Identifier (DLCI). +ip-sysctl.txt + - /proc/sys/net/ipv4/* variables +ip_dynaddr.txt + - IP dynamic address hack e.g. for auto-dialup links +ipddp.txt + - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation lapb-module.txt - - programming information on the LAPB module. + - programming information of the LAPB module. +ltpc.txt + - the Apple or Farallon LocalTalk PC card driver +multicast.txt + - Behaviour of cards under Multicast ncsa-telnet - notes on how NCSA telnet (DOS) breaks with MTU discovery enabled. net-modules.txt - info and "insmod" parameters for all network driver modules. +policy-routing.txt + - IP policy-based routing ppp.txt - info on what software you should use to run PPP. +pt.txt + - the Gracilis Packetwin AX.25 device driver +routing.txt + - the new routing mechanism shaper.txt - info on the module that can shape/limit transmitted traffic. +smc9.txt + - the driver for SMC's 9000 series of Ethernet cards +soundmodem.txt + - Linux driver for soundcards as AX.25 modems tcp.txt - short blurb on how TCP output takes place. tulip.txt @@ -32,10 +78,13 @@ vortex.txt - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) e'net cards. 
wan-router.txt - Wan router documentation +wanpipe.txt + - WANPIPE(tm) Multiprotocol WAN Driver for Linux WAN Router +wavelan.txt + - AT&T GIS (nee NCR) WaveLAN card: An Ethernet-like radio transceiver x25.txt - general info on X.25 development. x25-iface.txt - description of the X.25 Packet Layer to LAPB device interface. z8530drv.txt - info about Linux driver for Z8530 based HDLC cards for AX.25 - diff --git a/Documentation/networking/6pack.txt b/Documentation/networking/6pack.txt index 11339d36b8e6..2f8446381d14 100644 --- a/Documentation/networking/6pack.txt +++ b/Documentation/networking/6pack.txt @@ -9,20 +9,20 @@ Last update: April 7, 1998 1. What is 6pack, and what are the advantages to KISS? -6pack is a transmission protocol for the data exchange between the PC and +6pack is a transmission protocol for data exchange between the PC and the TNC over a serial line. It can be used as an alternative to KISS. 6pack has two major advantages: -- The PC is given the full control over the radio +- The PC is given full control over the radio channel. Special control data is exchanged between the PC and the TNC so - that the PC knows at any time if the TNC is receiving data, if an TNC + that the PC knows at any time if the TNC is receiving data, if a TNC buffer underrun or overrun has occurred, if the PTT is set and so on. This control data is processed at a higher priority than normal data, so a data stream can be interrupted at any time to issue an - important event. This helps to improve the channel access and timing algorithms - as everything is computed in the PC. It would even be possible to experiment with - something completely different than the known CSMA and DAMA channel access - methods. + important event. This helps to improve the channel access and timing + algorithms as everything is computed in the PC. It would even be possible + to experiment with something completely different from the known CSMA and + DAMA channel access methods. 
This kind of real-time control is especially important to supply several TNCs that are connected between each other and the PC by a daisy chain (however, this feature is not supported yet by the Linux 6pack driver). @@ -55,15 +55,15 @@ To be able to use 6pack, a special firmware for the TNC is needed. The EPROM of a newly bought TNC does not contain 6pack, so you will have to program an EPROM yourself. The image file for 6pack EPROMs should be available on any packet radio box where PC/FlexNet can be found. The name of -the file is 6pack.bin. This file is copyrighted and maintainend by the FlexNet +the file is 6pack.bin. This file is copyrighted and maintained by the FlexNet team. It can be used under the terms of the license that comes along with PC/FlexNet. Please do not ask me about the internals of this file as I don't know anything about it. I used a textual description of the 6pack protocol to program the Linux driver. TNCs contain a 64kByte EPROM, the lower half of which is used for -TheFirmware/KISS. The upper half is either empty or is sometimes -programmed with a software called TAPR. In the latter case, the TNC +the firmware/KISS. The upper half is either empty or is sometimes +programmed with software called TAPR. In the latter case, the TNC is supplied with a DIP switch so you can easily change between the two systems. When programming a new EPROM, one of the systems is replaced by 6pack. It is useful to replace TAPR, as this software is rarely used @@ -76,7 +76,7 @@ the TNC correctly. 5. Building and installing the 6pack driver -The driver has been tested with kernel version 2.1.90. Using with older +The driver has been tested with kernel version 2.1.90. Use with older kernels may lead to a compilation error because the interface to a kernel function has been changed in the 2.1.8x kernels. @@ -93,7 +93,7 @@ How to turn on 6pack support: To use the driver, the kissattach program delivered with the AX.25 utilities has to be modified. 
-- Do a cd to the directory that keeps the kissattach sources. Edit the +- Do a cd to the directory that holds the kissattach sources. Edit the kissattach.c file. At the top, insert the following lines: #ifndef N_6PACK @@ -110,8 +110,8 @@ has to be modified. Installing the driver: -- Do an insmod 6pack. Look at your - /var/log/messages file to check if the module has printed its initialization message. +- Do an insmod 6pack. Look at your /var/log/messages file to check if the + module has printed its initialization message. - Do a spattach as you would launch kissattach when starting a KISS port. Check if the kernel prints the message '6pack: TNC found'. @@ -130,7 +130,7 @@ Note that the connect and status LEDs of the TNC are controlled in a different way than they are when the TNC is used with PC/FlexNet. When using FlexNet, the connect LED is on if there is a connection; the status LED is on if there is data in the buffer of the PC's AX.25 engine that has to be -transmitted. Under LinuX, the 6pack layer is beyond the AX.25 layer, +transmitted. Under Linux, the 6pack layer is beyond the AX.25 layer, so the 6pack driver doesn't know anything about connects or data that has not yet been transmitted. Therefore the LEDs are controlled as they are in KISS mode: The connect LED is turned on if data is transferred @@ -143,7 +143,7 @@ When testing the driver with 2.0.3x kernels and operating with data rates on the radio channel of 9600 Baud or higher, the driver may, on certain systems, sometimes print the message '6pack: bad checksum', which is due to data loss if the other station sends two -or more subsequent packets. I have been told that this is due tu a problem +or more subsequent packets. I have been told that this is due to a problem with the serial driver of 2.0.3x kernels. I don't know yet if the problem still exists with 2.1.x kernels, as I have heard that the serial driver code has been changed with 2.1.x. 
diff --git a/Documentation/networking/DLINK.txt b/Documentation/networking/DLINK.txt index dba5dbcc5f23..6caabdf4bb7c 100644 --- a/Documentation/networking/DLINK.txt +++ b/Documentation/networking/DLINK.txt @@ -52,7 +52,7 @@ Released 1994-06-13 3. FILES IN THIS RELEASE. README.DLINK This file. - de600.c The Source (,may it be with You :-) for the DE-600 + de600.c The Source (may it be with You :-) for the DE-600 de620.c ditto for the DE-620 de620.h Macros for de620.c @@ -78,7 +78,7 @@ Released 1994-06-13 modify "linux/drivers/net/CONFIG" accordingly, or adjust the parameters in the "tuning" section in the sources. - If you are going to use the drivers a loadable modules, do _not_ + If you are going to use the drivers as loadable modules, do _not_ enable them while doing "make config", but instead make sure that the drivers are included in "linux/drivers/net/MODULES". diff --git a/Documentation/networking/PLIP.txt b/Documentation/networking/PLIP.txt index b58cea27f2bb..c2bb44fdb5b8 100644 --- a/Documentation/networking/PLIP.txt +++ b/Documentation/networking/PLIP.txt @@ -29,11 +29,11 @@ Advantages of PLIP It's cheap, it's available everywhere, and it's easy. The PLIP cable is all that's needed to connect two Linux boxes, and it -can be build for very few bucks. +can be built for very few bucks. -Connecting two Linux boxes takes only a seconds decision and a few -minutes work, no need to search for a [supported] netcard. This might -even be especially important in the case of notebooks, where netcard +Connecting two Linux boxes takes only a second's decision and a few +minutes' work, no need to search for a [supported] netcard. This might +even be especially important in the case of notebooks, where netcards are not easily available. Not requiring a netcard also means that apart from connecting the @@ -45,7 +45,7 @@ Disadvantages of PLIP Doesn't work over a modem, like SLIP and PPP. Limited range, 15 m. Can only be used to connect three (?) Linux boxes. 
Doesn't connect to -an exiting ethernet. Isn't standard (not even de facto standard, like +an existing ethernet. Isn't standard (not even de facto standard, like SLIP). Performance @@ -150,7 +150,8 @@ Each octet is sent as To start a transfer the transmitting machine outputs a nibble 0x08. The raises the ACK line, triggering an interrupt in the receiving -machine. The receiving machine disables +machine. The receiving machine disables interrupts and raises its own ACK +line. Restated: diff --git a/Documentation/networking/alias.txt b/Documentation/networking/alias.txt index ef04159360fe..c6c0b0cf3a35 100644 --- a/Documentation/networking/alias.txt +++ b/Documentation/networking/alias.txt @@ -11,7 +11,7 @@ o Alias creation. 200.1.1.1 alias for eth0 ... # ifconfig eth0:0 200.1.1.1 etc,etc.... - ~~ -> request alias #0 creation (if it not exists) for eth0 + ~~ -> request alias #0 creation (if not yet exists) for eth0 and routing stuff also ... # route add -host 200.1.1.1 dev eth0:0 (if same IP network as main device) @@ -28,7 +28,7 @@ o Alias deletion. Alias (re-)configuring - Aliases are no real devices, but should be able to configure and + Aliases are not real devices, but programs` should be able to configure and refer to them as usual (ifconfig, route, etc). Relationship with main device diff --git a/Documentation/networking/baycom.txt b/Documentation/networking/baycom.txt index cdb0a9a500ee..8d7547688dcb 100644 --- a/Documentation/networking/baycom.txt +++ b/Documentation/networking/baycom.txt @@ -43,7 +43,7 @@ but it is now a true kernel network interface. Installation is therefore simple. Once installed, four interfaces named bc[0-3] are available. sethdlc from the ax25 utilities may be used to set driver states etc. 
Users of userland AX.25 stacks may use the net2kiss utility (also available -in the ax25 utilities package) to converts packets of a network interface +in the ax25 utilities package) to convert packets of a network interface to a KISS stream on a pseudo tty. There's also a patch available from me for WAMPES which allows attaching a kernel network interface directly. @@ -72,7 +72,7 @@ first parallel port (LPT1 under DOS). options=1 instructs the driver to use the software DCD algorithm (see below). The channel access parameters can be set with sethdlc -a or kissparms. -Note that both utilities interpret the values slightly different. +Note that both utilities interpret the values slightly differently. Hardware DCD versus Software DCD @@ -93,7 +93,7 @@ par96: the software DCD algorithm for this type of modem is rather poor. feeds the DCD input of the PAR96 modem, the use of the hardware DCD circuitry is recommended. -picpar: the picpar modem features a builtin DCD hardware, which is highly +picpar: the picpar modem features builtin DCD hardware, which is highly recommended. diff --git a/Documentation/networking/cops.txt b/Documentation/networking/cops.txt index ec5d05c67dca..23723dfe5208 100644 --- a/Documentation/networking/cops.txt +++ b/Documentation/networking/cops.txt @@ -1,12 +1,12 @@ Text File for the COPS LocalTalk Linux driver (cops.c). By Jay Schulist -This driver has teo modes and they are: Dayna mode and Tangent mode. +This driver has two modes and they are: Dayna mode and Tangent mode. Each mode corresponds with the type of card. It has been found that there are 2 main types of cards and all other cards are the same and just have different names or only have minor differences such as more IO ports. As this driver is tested it will -become more clear on exactly what cards are supported. +become more clear exactly what cards are supported. Right now these cards are known to work with the COPS driver. 
The LT-200 cards work in a somewhat more limited capacity than the @@ -20,8 +20,8 @@ DAYNA driver mode: Other cards possibly supported mode unkown though: Dayna DL2000 (Full length) -The COPS driver defaults to using Dayna mode. To change the drivers -mode if you build a driver with a dual support use board_type=1 or +The COPS driver defaults to using Dayna mode. To change the driver's +mode if you built a driver with dual support use board_type=1 or board_type=2 for Dayna or Tangent with insmod. ** Operation/loading of the driver. @@ -52,12 +52,12 @@ it work with the cops.c driver. dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033" lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" -* For multiple cards, Ethernet and Localtalk. +* For multiple cards, Ethernet and LocalTalk. eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033" lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" * For multiple LocalTalk cards, and an Ethernet card. -* Order seems to matters here, Ethernet last. +* Order seems to matter here, Ethernet last. lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1" lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2" eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk" diff --git a/Documentation/networking/cs89x0.txt b/Documentation/networking/cs89x0.txt index ab89d645ebe1..895df60df8ac 100644 --- a/Documentation/networking/cs89x0.txt +++ b/Documentation/networking/cs89x0.txt @@ -40,22 +40,6 @@ TABLE OF CONTENTS 6.3.2 Crystal's Bulletin Board Service -8.3 OBTAINING THE LATEST DRIVER VERSION - -You can obtain the latest CS89XX drivers and support software from Crystal's -BBS or Web site. - - -8.3.1 CRYSTAL'S WEB SITE - -Crystal Semiconductor maintains a web page at http://www.crystal.com with the -the latest drivers and technical publications. 
- - -8.3.2 CRYSTAL'S BULLETIN BOARD SERVICE - - - 1.0 CRYSTAL LAN CS8900/CS8920 ETHERNET ADAPTERS =============================================================================== @@ -172,7 +156,7 @@ with the following "default" settings: 10BASE-T (10BASE-T only adapter) You should only change the default configuration settings if conflicts with -another adapter exists. To change the adapter's configuration, run the +another adapter exist. To change the adapter's configuration, run the CS8900/20 Setup Utility. @@ -388,7 +372,7 @@ Example: 5.1 KNOWN DEFECTS and LIMITATIONS Refer to the RELEASE.TXT file distributed as part of this archive for a list of -know defects, driver limitations, and work arounds. +known defects, driver limitations, and work arounds. 5.2 TESTING THE ADAPTER diff --git a/Documentation/networking/de4x5.txt b/Documentation/networking/de4x5.txt index dc6a99a7ad33..d82b19c74b6a 100644 --- a/Documentation/networking/de4x5.txt +++ b/Documentation/networking/de4x5.txt @@ -82,7 +82,7 @@ To unload a module, turn off the associated interface(s) 'ifconfig eth?? down' then 'rmmod de4x5'. - Automedia detection is included so that in principal you can disconnect + Automedia detection is included so that in principle you can disconnect from, e.g. TP, reconnect to BNC and things will still work (after a pause whilst the driver figures out where its media went). My tests using ping showed that it appears to work.... @@ -118,7 +118,7 @@ debt to for the testing and feedback that helped get this feature working. So far we have tested KINGSTON, SMC8432, SMC9332 (with the latest SROM complying with the SROM spec V3: their first was - broken), ZNYX342 and LinkSys. ZYNX314 (dual 21041 MAC) and ZNYX 315 + broken), ZNYX342 and LinkSys. ZNYX314 (dual 21041 MAC) and ZNYX 315 (quad 21041 MAC) cards also appear to work despite their incorrectly wired IRQs. 
diff --git a/Documentation/networking/eql.txt b/Documentation/networking/eql.txt index 08cded5980d4..0694f528c66e 100644 --- a/Documentation/networking/eql.txt +++ b/Documentation/networking/eql.txt @@ -15,7 +15,7 @@ 1. Introduction Which is worse? A huge fee for a 56K leased line or two phone lines? - Its probably the former. If you find yourself craving more bandwidth, + It's probably the former. If you find yourself craving more bandwidth, and have a ISP that is flexible, it is now possible to bind modems together to work as one point-to-point link to increase your bandwidth. All without having to have a special black box on either @@ -64,7 +64,7 @@ -rw-r--r-- guru/ncm 2195 Jan 10 21:48 1995 eql-1.1/eql_enslave.c ______________________________________________________________________ - Unpack a recent kernel (something after 1.1.92) Someplace convenient + Unpack a recent kernel (something after 1.1.92) someplace convenient like say /usr/src/linux-1.1.92.eql. Use symbolic links to point /usr/src/linux to this development directory. @@ -250,13 +250,13 @@ One version of the scheduler was able to push 5.3 K/s through the 28800 and 14400 connections, but when the priorities on the links were - very wide apart (57600 vs. 14400) The "faster" modem received all + very wide apart (57600 vs. 14400) the "faster" modem received all traffic and the "slower" modem starved. - 5. Tester's Reports + 5. Testers' Reports - Some people have experimented with the eql device with newer kernels + Some people have experimented with the eql device with newer kernels (than 1.1.75). I have since updated the driver to patch cleanly in newer kernels because of the removal of the old "slave- balancing" driver config option. @@ -469,7 +469,7 @@ I've installed your patch and it works great. I have trialed it over twin SL/IP lines, just over null modems, but I was able to data at over 48Kb/s [ISDN link -Simon]. 
I managed a - transfer of upto 7.5 Kbyte/s on one go, but averaged around + transfer of up to 7.5 Kbyte/s on one go, but averaged around 6.4 Kbyte/s, which I think is pretty cool. :) diff --git a/Documentation/networking/ethertap.txt b/Documentation/networking/ethertap.txt index b60f39c2246f..8b7ab4be43cc 100644 --- a/Documentation/networking/ethertap.txt +++ b/Documentation/networking/ethertap.txt @@ -8,14 +8,14 @@ Introduction Ethertap provides packet reception and transmission for user space programs. It can be viewed as a simple ethernet device, -which instead of recieving packets from a network wire, it recieves +which, instead of receiving packets from a network wire, receives them from user space. -Ethertap can be used for anything from Appletalk to IPX to even +Ethertap can be used for anything from AppleTalk to IPX to even building bridging tunnels. It also has many other general purpose uses. -Ethertap also can do ARP for you. Although this is not enabled per +Ethertap also can do ARP for you, although this is not enabled by default. SetUp @@ -36,7 +36,7 @@ proper IP number for your situation.) If you want your Ethertap device to ARP for you would ifconfig the interface like this: ifconfig tap* 192.168.1.1 arp -Remember that the you need to have a corresponding /dev/tap* file +Remember that you need to have a corresponding /dev/tap* file for each tap* device you need to ifconfig. Now Ethertap should be ready to use. @@ -65,7 +65,7 @@ The routing on our box would be C code for a Simple program using an EtherTap device ==================================================== -This code is just excepts from a real program, so some parts are missing +This code is just excerpts from a real program, so some parts are missing but the important stuff is below.
void main (void) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 60deb1771bd3..370cb92f866a 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -18,16 +18,16 @@ creating filters. LSF is much simpler that BPF. One does not have to worry about devices or anything like that. You simply create your filter code, send it to the kernel via the SO_ATTACH_FILTER ioctl and -if you filter code passes the kernel check on it, you then +if your filter code passes the kernel check on it, you then immediately begin filtering data on that socket. You can also detach filters from your socket via the SO_DETACH_FILTER ioctl. This will probably not be used much since when you close a socket that has a filter on it the -filter is automagicly removed. The other less common case -may be adding a differnt filter on the same socket you had another -filter that is still running, the kernel takes care of removing -the old one and placing your new one in its place, assumming your +filter is automagically removed. The other less common case +may be adding a different filter on the same socket where you had another +filter that is still running: the kernel takes care of removing +the old one and placing your new one in its place, assuming your filter has passed the checks, otherwise if it fails the old filter will remain on that socket. diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e202b4f80a2e..ab794ef77d56 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -49,7 +49,7 @@ ip_bootp_relay - BOOLEAN not to this host as local ones. It is supposed, that BOOTP relay deamon will catch and forward such packets. - default FASLE + default FALSE Not Implemented Yet. @@ -110,7 +110,7 @@ TCP variables: tcp_syn_retries - INTEGER Number of times initial SYNs for an TCP connection attempt will - be retransmitted. 
Should not be higher that 255. + be retransmitted. Should not be higher than 255. tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. diff --git a/Documentation/networking/ipddp.txt b/Documentation/networking/ipddp.txt index 6ed2d93f559e..f49b1690988d 100644 --- a/Documentation/networking/ipddp.txt +++ b/Documentation/networking/ipddp.txt @@ -1,42 +1,42 @@ Text file for ipddp.c: - Appletalk-IP Decapsulation and Appletalk-IP Encapsulation + AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation This text file writen by Jay Schulist Introduction ------------ -Appletalk-IP (IPDDP) is the method computers connected to Appletalk -networks can communicate via IP. Appletalk-IP is simply IP datagrams -inside Appletalk packets. +AppleTalk-IP (IPDDP) is the method computers connected to AppleTalk +networks can use to communicate via IP. AppleTalk-IP is simply IP datagrams +inside AppleTalk packets. Through this driver you can either allow your Linux box to communicate -IP over an Appletalk network or you can provide IP gatewaying functions -for you Appletalk users. +IP over an AppleTalk network or you can provide IP gatewaying functions +for your AppleTalk users. -You can currently Encapsulate or Decapsulate Appletalk-IP on LocalTalk, +You can currently Encapsulate or Decapsulate AppleTalk-IP on LocalTalk, EtherTalk and PPPTalk. The only limit on the protocol is that of what -the kernel Appletalk layer and drivers are available. +kernel AppleTalk layer and drivers are available. Each mode requires its own user space software. -Compiling Appletalk-IP Decapsulation/Encapsulation +Compiling AppleTalk-IP Decapsulation/Encapsulation ================================================= -Appletalk-IP Decapsulation needs to be compiled into your kernel. You +AppleTalk-IP Decapsulation needs to be compiled into your kernel. You will need to turn on Appletalk-IP driver support. 
Then you will need to -select ONE of the two options; IP to Appletalk-IP Encapsulation support or -Appletalk-IP to IP Decapsulation support. If you compile the driver -staticly you will only be able to use the driver for the function you have +select ONE of the two options; IP to AppleTalk-IP Encapsulation support or +AppleTalk-IP to IP Decapsulation support. If you compile the driver +statically you will only be able to use the driver for the function you have enabled in the kernel. If you compile the driver as a module you can select what mode you want it to run in via a module loading param. -ipddp_mode=1 for Appletalk-IP Encapsulation and ipddp_mode=2 for -Appletalk-IP to IP Decapsulation. +ipddp_mode=1 for AppleTalk-IP Encapsulation and ipddp_mode=2 for +AppleTalk-IP to IP Decapsulation. Basic instructions for user space tools ======================================= -To enable Appletalk-IP Decapsulation/Encapsulation you will need the +To enable AppleTalk-IP Decapsulation/Encapsulation you will need the proper tools. You can get the tools for Decapsulation from http://spacs1.spacs.k12.wi.us/~jschlst/MacGate and for Encapsulation from http://www.maths.unm.edu/~bradford/ltpc.html @@ -46,7 +46,7 @@ need to consult the supporting documentation for each set of tools. Decapsulation - You will need to download a software package called MacGate. In this distribution there will be a tool called MacRoute -which enabled you to add routes to the kernel for your Macs by hand. +which enables you to add routes to the kernel for your Macs by hand. Also the tool MacRegGateWay is included to register the proper IP Gateway and IP addresses for your machine. Included in this distribution is a patch to netatalk-1.4b2+asun2.0a17.2 (available from @@ -55,13 +55,13 @@ but it allows automatic adding and deleting of routes for Macs. (Handy for locations with large Mac installations) Encapsulation - You will need to download a software daemon called ipddpd. 
-This software expects there to be and Appletalk-IP gateway on the network. +This software expects there to be an AppleTalk-IP gateway on the network. You will also need to add the proper routes to route your Linux box's IP traffic out the ipddp interface. Common Uses of ipddp.c ---------------------- -Of course Appletalk-IP Decapsulation and Encapsulation, but specificly +Of course AppleTalk-IP Decapsulation and Encapsulation, but specifically Decapsulation is being used most for connecting LocalTalk networks to IP networks. Although it has been used on EtherTalk networks to allow Macs that are only able to tunnel IP over EtherTalk. @@ -70,9 +70,9 @@ Encapsulation has been used to allow a Linux box stuck on a LocalTalk network to use IP. It should work equally well if you are stuck on an EtherTalk only network. -Further Assisatance +Further Assistance ------------------- You can contact me (Jay Schulist ) with any questions reguarding Decapsulation or Encapsulation. Bradford W. Johnson originally wrote the ipddp.c driver for IP -encapsulation in Appletalk. +encapsulation in AppleTalk. diff --git a/Documentation/networking/lapb-module.txt b/Documentation/networking/lapb-module.txt index 938d09787b4e..2b564a5f5018 100644 --- a/Documentation/networking/lapb-module.txt +++ b/Documentation/networking/lapb-module.txt @@ -2,7 +2,7 @@ Jonathan Naylor 29.12.96 -The LAPB module will be a seperately compiled module for use by any parts of +The LAPB module will be a separately compiled module for use by any parts of the Linux operating system that require a LAPB service. This document defines the interfaces to, and the services provided by this module. The term module in this context does not imply that the LAPB module is a @@ -73,7 +73,7 @@ be unacknowledged by the remote end, the value of the window is between 1 and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB link. -The mode variable is a bit field is used for setting (at present) three values.
+The mode variable is a bit field used for setting (at present) three values. The bit fields have the following meanings: Bit Meaning diff --git a/Documentation/networking/ltpc.txt b/Documentation/networking/ltpc.txt index 477f9fc6bace..50a39ea881f3 100644 --- a/Documentation/networking/ltpc.txt +++ b/Documentation/networking/ltpc.txt @@ -1,8 +1,8 @@ This is the ALPHA version of the ltpc driver. In order to use it, you will need at least version 1.3.3 of the -netatalk package, and the Apple or Farallon Localtalk PC card. -There are a number of different Localtalk cards for the PC; this +netatalk package, and the Apple or Farallon LocalTalk PC card. +There are a number of different LocalTalk cards for the PC; this driver applies only to the one with the 65c02 processor chip on it. To include it in the kernel, select the CONFIG_LTPC switch in the @@ -20,7 +20,7 @@ The driver will autoprobe, and you should see a message like: at bootup. The appropriate netatalk configuration depends on whether you are -attached to a network that includes appletalk routers or not. If, +attached to a network that includes AppleTalk routers or not. If, like me, you are simply connecting to your home Macintoshes and printers, you need to set up netatalk to "seed". The way I do this is to have the lines @@ -29,14 +29,14 @@ dummy -seed -phase 2 -net 2000 -addr 2000.26 -zone "1033" ltalk0 -seed -phase 1 -net 1033 -addr 1033.27 -zone "1033" in my atalkd.conf. What is going on here is that I need to fool -netatalk into thinking that there are two appletalk interfaces +netatalk into thinking that there are two AppleTalk interfaces present -- otherwise it refuses to seed. This is a hack, and a more permanent solution would be to alter the netatalk code. Note that the dummy driver needs to accept multicasts also -- earlier versions of dummy.c may need to be patched. 
-If you are attached to an extended appletalk network, with routers on +If you are attached to an extended AppleTalk network, with routers on it, then you don't need to fool around with this -- the appropriate line in atalkd.conf is @@ -75,7 +75,7 @@ board. Set the switches so as not to conflict with other hardware. IP: Many people are interested in this driver in order to use IP -when Localtalk, but no Ethernet, is available. While the code to do +when LocalTalk, but no Ethernet, is available. While the code to do this is not strictly speaking part of this driver, an experimental version is available which seems to work under kernel 2.0.xx. It is not yet functional in the 2.1.xx kernels. diff --git a/Documentation/networking/multicast.txt b/Documentation/networking/multicast.txt index b7e234e0394c..a8539007aaa0 100644 --- a/Documentation/networking/multicast.txt +++ b/Documentation/networking/multicast.txt @@ -52,6 +52,6 @@ wd YES YES YES Hardware znet YES YES YES Software -PROMISC = This multicasts mode is in fact promiscuous mode. Avoid using +PROMISC = This multicast mode is in fact promiscuous mode. Avoid using cards who go PROMISC on any multicast in a multicast kernel. (#) = Hardware multicast support is not used yet. diff --git a/Documentation/networking/net-modules.txt b/Documentation/networking/net-modules.txt index f0f3ab4700c2..3830a83513d2 100644 --- a/Documentation/networking/net-modules.txt +++ b/Documentation/networking/net-modules.txt @@ -2,31 +2,31 @@ Wed 2-Aug-95 Linux network driver modules - Do not mistake this to "README.modules" at the top-level + Do not mistake this for "README.modules" at the top-level directory! That document tells about modules in general, while this one tells only about network device driver modules. This is a potpourri of INSMOD-time(*) configuration options (if such exists) and their default values of various modules - on Linux network drivers collection. + in the Linux network drivers collection. 
Some modules have also hidden (= non-documented) tunable values. - Choice of not documenting them is based on general belief, that - the less user needs to know, the better. (There are things that - driver developer can use, others should not confuse themselves.) + The choice of not documenting them is based on general belief, that + the less the user needs to know, the better. (There are things that + driver developers can use, others should not confuse themselves.) In many cases it is highly preferred that insmod:ing is done ONLY with defining an explicit address for the card, AND BY NOT USING AUTO-PROBING! - Now most cards have some explicitly defined base address, they + Now most cards have some explicitly defined base address that they are compiled with (to avoid auto-probing, among other things). If that compiled value does not match your actual configuration, - do use "io=0xXXX" -parameter for the insmod, and give there + do use the "io=0xXXX" -parameter for the insmod, and give there a value matching your environment. If you are adventurous, you can ask the driver to autoprobe - by using "io=0" parameter, however it is potentially dangerous + by using the "io=0" parameter, however it is a potentially dangerous thing to do in a live system. (If you don't know where the card is located, you can try autoprobing, and after possible crash recovery, insmod with proper IO-address..) diff --git a/Documentation/networking/policy-routing.txt b/Documentation/networking/policy-routing.txt index aa0d8f9f621f..9cf967737142 100644 --- a/Documentation/networking/policy-routing.txt +++ b/Documentation/networking/policy-routing.txt @@ -127,7 +127,7 @@ Applications It is funny, but pretty useless algorithm. I listed it just to show power of new routing code. -5. All the variaty of combinations...... +5. All the variety of combinations...... 
GATED diff --git a/Documentation/networking/pt.txt b/Documentation/networking/pt.txt index 63f40b31abea..cae844d3f8c1 100644 --- a/Documentation/networking/pt.txt +++ b/Documentation/networking/pt.txt @@ -49,7 +49,7 @@ As an example, here is my /etc/rc.net /sbin/route add -host 44.136.8.95 dev pt0b /sbin/route add -host 44.255.255.255 dev pt0b -This version of the driver comes under the GNU GPL. If you have one on my +This version of the driver comes under the GNU GPL. If you have one of my previous (non-GPL) versions of the driver, please update to this one. I hope that this all works well for you. I would be pleased to hear how diff --git a/Documentation/networking/routing.txt b/Documentation/networking/routing.txt index 42d4218ca13c..32dfeb2cd3c9 100644 --- a/Documentation/networking/routing.txt +++ b/Documentation/networking/routing.txt @@ -18,18 +18,18 @@ The directory ftp.inr.ac.ru:/ip-routing contains: NEWS for user. -- Policy based routing. Routing decisions are made on the base +- Policy based routing. Routing decisions are made on the basis not only of destination address, but also source address, TOS and incoming interface. - Complete set of IP level control messages. - Now Linux is the only in the world OS comlying to RFC requirements. + Now Linux is the only OS in the world complying to RFC requirements. Great win 8) - New interface addressing paradigm. Assignment of address ranges to interface, multiple prefixes etc. etc. Do not bother, it is compatible with old one. Moreover: -- You more need not make "route add aaa.bbb.ccc... eth0", - it is made automatically. +- You no longer need to do "route add aaa.bbb.ccc... eth0", + it is done automatically. - "Abstract" UNIX sockets and security enhancements. It is necessary to use TIRPC and TLI emulation library.
diff --git a/Documentation/networking/shaper.txt b/Documentation/networking/shaper.txt index 24393a6fa9ad..53ff5aefff4d 100644 --- a/Documentation/networking/shaper.txt +++ b/Documentation/networking/shaper.txt @@ -29,8 +29,8 @@ for normal use. Gotchas: - The shaper shapes transmitted traffic. Its rather impossible to -shape received traffic except at the end (or a router) transmiting it. + The shaper shapes transmitted traffic. It's rather impossible to +shape received traffic except at the end (or a router) transmitting it. Gated/routed/rwhod/mrouted all see the shaper as an additional device and will treat it as such unless patched. Note that for mrouted you can run diff --git a/Documentation/networking/soundmodem.txt b/Documentation/networking/soundmodem.txt index cecd206cda8f..f50985720997 100644 --- a/Documentation/networking/soundmodem.txt +++ b/Documentation/networking/soundmodem.txt @@ -16,10 +16,10 @@ use only. The Interface of the driver -The driver provides a kernel network drivers named sm[0-3]. sethdlc +The driver provides kernel network drivers named sm[0-3]. sethdlc from the ax25 utilities may be used to set driver states etc. Users of userland AX.25 stacks may use the net2kiss utility (also available -in the ax25 utilities package) to converts packets of a network interface +in the ax25 utilities package) to convert packets of a network interface to a KISS stream on a pseudo tty. There's also a patch available from me for WAMPES which allows attaching a kernel network interface directly. diff --git a/Documentation/networking/wan-router.txt b/Documentation/networking/wan-router.txt index 7f64cbc72a67..1ffeb14bc4cb 100644 --- a/Documentation/networking/wan-router.txt +++ b/Documentation/networking/wan-router.txt @@ -30,8 +30,8 @@ devices are notoriously expensive, with prices as much as 2 - 5 times higher then the price of a typical PC box. 
Alternatively, considering robustness and multitasking capabilities of Linux, -an internal router can be build (most routers use some sort of stripped down -Unix-like operating system anyway). With number of relatively inexpensive WAN +an internal router can be built (most routers use some sort of stripped down +Unix-like operating system anyway). With a number of relatively inexpensive WAN interface cards available on the market, a perfectly usable router can be built for less than half a price of an external router. Yet a Linux box acting as a router can still be used for other purposes, such as firewalling, @@ -39,37 +39,37 @@ running FTP, WWW or DNS server, etc. This kernel module introduces the notion of a WAN Link Driver (WLD) to Linux operating system and provides generic hardware-independent services for such -drivers. Why existing Linux network device interface can not be used for -this purpose? Well, it can. However, there are few key differences between -typical network interface (i.e. ethernet) and WAN link. +drivers. Why can existing Linux network device interface not be used for +this purpose? Well, it can. However, there are a few key differences between +a typical network interface (e.g. ethernet) and a WAN link. Many WAN protocols, such as X.25 and frame relay, allow for multiple logical connections (known as `virtual circuits' in X.25 terminology) over a single physical link. Each such virtual circuit may (and almost always does) lead -to diffrent geographical location and, therefore, different network. As a +to a different geographical location and, therefore, different network. As a result, it is the virtual circuit, not the physical link, that represents a route and, therefore, a network interface in Linux terms. To further complicate things, virtual cuircits are usually volatile in nature (excluding so called `permanent' virtual circuits or PVCs). 
With almost no -time required to set up and tear down virtual circuit, it is highly desirable +time required to set up and tear down a virtual circuit, it is highly desirable to implement on-demand connections in order to minimize network charges. So -unlike typical network driver, the WAN driver must be able to handle multiple -network interfaces and cope with multiple virtual circuits come into existance +unlike a typical network driver, the WAN driver must be able to handle multiple +network interfaces and cope as multiple virtual circuits come into existence and go away dynamically. Last, but not least, WAN configuration is much more complex than that of say ethernet and may well amount to several dozens of parameters. Some of them are "link-wide" while others are virtual circuit-specific. The same holds true for WAN statistics which is by far more extensive and extremely useful -when troubleshooting WAN connections. Extending ifconfig utility to suite +when troubleshooting WAN connections. Extending the ifconfig utility to suit these needs may be possible, but does not seem quite reasonable. Therefore, a WAN configuration utility and corresponding application programmer's interface is needed for this purpose. -Most of these problems are taken care of by this module. It's goal is to -provide user with more-or-less standard look and feel for all WAN devices and -assist WAN device driver writer by providing common services, such as: +Most of these problems are taken care of by this module. 
Its goal is to +provide a user with more-or-less standard look and feel for all WAN devices and +assist a WAN device driver writer by providing common services, such as: o User-level interface via /proc filesystem o Centralized configuration @@ -77,7 +77,7 @@ assist WAN device driver writer by providing common services, such as: o Network interface management (dynamic creation/destruction) o Protocol encapsulation/decapsulation -To ba able to use Linux WAN Router you will also need a WAN Tools package +To be able to use the Linux WAN Router you will also need a WAN Tools package available from ftp.sangoma.com/pub/linux/vX.Y.Z/wantools-X.Y.Z.tgz @@ -112,12 +112,12 @@ ACKNOLEGEMENTS This product is based on the WANPIPE(tm) Multiprotocol WAN Router developed by Sangoma Technologies Inc. for Linux 1.2.x. Release of Linux 2.0 in summer 1996 commanded adequate changes to the WANPIPE code to take full advantage of -new Linux features. Instead of continuing developing proprietory interface +new Linux features. Instead of continuing developing proprietary interface specific to Sangoma WAN cards, we decided to put all hardware-independent code into a separate module and define two levels of interfaces - one for user- level applications and another for kernel-level WAN drivers. -Many usefull ideas concerning hardware-independent interface implementation +Many useful ideas concerning hardware-independent interface implementation were given by Mike McLagan and his implementation of the Frame Relay router and drivers for Sangoma cards (dlci/sdla). diff --git a/Documentation/networking/wanpipe.txt b/Documentation/networking/wanpipe.txt index 5288c825998c..0be0c5dc12fd 100644 --- a/Documentation/networking/wanpipe.txt +++ b/Documentation/networking/wanpipe.txt @@ -9,25 +9,26 @@ Copyright (c) 1995-1997 Sangoma Technologies Inc.
INTRODUCTION -WANPIPE(tm) is a family of intelligent muliprotocol WAN communication adapters +WANPIPE(tm) is a family of intelligent multiprotocol WAN communication adapters for personal computers (ISA bus) designed to provide PC connectivity to various communication links, such as leased lines and public data networks, at -speeds up to T1/E1 using variety of synchronous communications protocols, +speeds up to T1/E1 using a variety of synchronous communications protocols, including frame relay, PPP, X.25, SDLC, etc. -WANPIPE driver together with Linux WAN Router module allows you to build -relatively inexpensive, yet high-prformance multiprotocol WAN router. For -more information about Linux WAN Router please read file -Documentation/networking/wan-router.txt. You must also obtain WAN Tools -package to be able to use Linux WAN Router and WANPIPE driver. The package +WANPIPE driver together with Linux WAN Router module allows you to build a +relatively inexpensive, yet high-performance multiprotocol WAN router. For +more information about the Linux WAN Router please read the file +Documentation/networking/wan-router.txt. You must also obtain the WAN Tools +package to be able to use the Linux WAN Router and WANPIPE driver. The package is available via the Internet from Sangoma Technologies' anonymous FTP server: ftp.sangoma.com/pub/linux/wantools-X.Y.Z.tgz or ftp.sangoma.com/pub/linux/wanpipe-X.Y.Z.tgz -The name of the package differ only due to naming convention. The functionalityof wantools and wanpipe packages are the same. The latest version of WAN -Drivers is wanpipe-2.0.0. +The names of the packages differ only due to naming convention. The +functionality of wantools and wanpipe packages are the same. The latest +version of the WAN Drivers is wanpipe-2.0.0. For technical questions and/or comments please e-mail to jaspreet@sangoma.com. For general inquiries please contact Sangoma Technologies Inc. 
by @@ -74,7 +75,7 @@ drivers/net: sdladrv.c SDLA support module source code sdla_fr.c SDLA Frame Relay source code sdla_ppp.c SDLA PPP source code - sdla_x25.c SDLA X25 source code + sdla_x25.c SDLA X.25 source code sdlamain.c SDLA support source code include/linux: @@ -137,7 +138,7 @@ REVISION HISTORY o Added support for synchronous PPP o Added support for S503 adapter o Added API for executing adapter commands - o Fixed a re-entrancy problem in frame relaty driver + o Fixed a re-entrancy problem in frame relay driver o Changed interface between SDLA driver and protocol support modules o Updated frame relay firmware diff --git a/Documentation/networking/wavelan.txt b/Documentation/networking/wavelan.txt index 5849f287b225..19093932304a 100644 --- a/Documentation/networking/wavelan.txt +++ b/Documentation/networking/wavelan.txt @@ -1,18 +1,19 @@ Sun Jul 2 01:38:33 EST 1995 -1. At present the driver autoprobes for a WaveLAN card only at I/O address 0x390. - The version of the card that I use (NCR) supports four I/O addresses (selectable - via a pair of DIP switches). If you want the driver to autoprobe a different - subset of the four valid addresses then you will need to edit - .../drivers/net/wavelan.c (near line 714) and change the initialisation of the - `iobase[]' array. Normally, I use a LILO configuration file directive to - obviate the need for autoprobing entirely, a course of action I heartily - recommend. +1. At present the driver autoprobes for a WaveLAN card only at I/O address + 0x390. The version of the card that I use (NCR) supports four I/O addresses + (selectable via a pair of DIP switches). If you want the driver to + autoprobe a different subset of the four valid addresses then you will need + to edit .../drivers/net/wavelan.c (near line 714) and change the + initialisation of the `iobase[]' array. Normally, I use a LILO + configuration file directive to obviate the need for autoprobing entirely, + a course of action I heartily recommend. -2. 
By default, the driver uses the Network ID (NWID) stored in the card's Parameter - Storage Area (PSA). However, the PSA NWID can be overridden by a value passed - explicitly as the third numeric argument to LILO's "ether=" directive, either - at the LILO prompt at boot time or within LILO's configuration file. +2. By default, the driver uses the Network ID (NWID) stored in the card's + Parameter Storage Area (PSA). However, the PSA NWID can be overridden by a + value passed explicitly as the third numeric argument to LILO's "ether=" + directive, either at the LILO prompt at boot time or within LILO's + configuration file. For example, the following line from such a LILO configuration file would auto-configure the IRQ value, set the I/O base to 0x390 and set the NWID to 0x4321, all on a WaveLAN card labelled "eth0": diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt index fab57cf3240a..14af1b8862d3 100644 --- a/Documentation/networking/x25-iface.txt +++ b/Documentation/networking/x25-iface.txt @@ -7,8 +7,8 @@ Layer and the X.25 device driver. They are designed to allow for the easy setting of the LAPB mode from within the Packet Layer. The X.25 device driver will be coded normally as per the Linux device driver -standards, most X.25 device drivers will be moderately similar to the -already existing Eethernet device drivers. However unlike those drivers, the +standards. Most X.25 device drivers will be moderately similar to the +already existing Ethernet device drivers. However unlike those drivers, the X.25 device driver has a state associated with it, and this information needs to be passed to and from the Packet Layer for proper operation. 
diff --git a/Documentation/networking/x25.txt b/Documentation/networking/x25.txt index 67f17e64419e..6ae6b8ef705a 100644 --- a/Documentation/networking/x25.txt +++ b/Documentation/networking/x25.txt @@ -19,7 +19,7 @@ To confuse matters a little, an 802.2 LLC implementation for Linux is being written which will allow X.25 to be run over an Ethernet (or Token Ring) and conform with the JNT "Pink Book", this will have a different interface to the Packet Layer but there will be no confusion since the class of device -being served by the LLC will be completely seperate from LAPB. The LLC +being served by the LLC will be completely separate from LAPB. The LLC implementation is being done as part of another protocol project (SNA) and by a different author. diff --git a/Documentation/networking/z8530drv.txt b/Documentation/networking/z8530drv.txt index b566181a91e6..e2e5b5a130b8 100644 --- a/Documentation/networking/z8530drv.txt +++ b/Documentation/networking/z8530drv.txt @@ -232,7 +232,7 @@ will print a skeleton z8530drv.conf for the OptoSCC to stdout. gencfg 2 0x300 2 4 5 -4 0 7 4915200 0x10 -does the same for the BAYCOM USCC card. I my opinion it is much easier +does the same for the BAYCOM USCC card. In my opinion it is much easier to edit scc_config.h... @@ -318,9 +318,9 @@ the kernel AX.25. ======================= Since the TTY driver (aka KISS TNC emulation) is gone you need -to emulate the old behaviour. The cost using these programs is -that you probably need to compile the kernel AX.25, regardless -if you actually use it or not. First setup your /etc/ax25/axports, +to emulate the old behaviour. The cost of using these programs is +that you probably need to compile the kernel AX.25, regardless of whether +you actually use it or not. First setup your /etc/ax25/axports, for example: 9k6 dl0tha-9 9600 255 4 9600 baud port (scc3) @@ -406,7 +406,7 @@ NoSpace - number of times a buffer could not get allocated An overrun is abnormal. 
If lots of these occur, the product of baudrate and number of interfaces is too high for the processing -power of you computer. NoSpace errors are unlikely caused by the +power of your computer. NoSpace errors are unlikely to be caused by the driver or the kernel AX.25. @@ -559,7 +559,7 @@ txoff: group: It is possible to build special radio equipment to use more than - one frequency on the same bad, e.g. using several receivers and + one frequency on the same band, e.g. using several receivers and only one transmitter that can be switched between frequencies. Also, you can connect several radios that are active on the same band. In these cases, it is not possible, or not a good idea, to @@ -617,7 +617,7 @@ I got reports that the driver has problems on some 386-based systems. (i.e. Amstrad) Those systems have a bogus AT bus timing which will lead to delayed answers on interrupts. You can recognize these problems by looking at the output of Sccstat for the suspected -port. See if it shows under- and overruns you own such a system. +port. If it shows under- and overruns you own such a system. Delayed processing of received data: This depends on @@ -634,7 +634,7 @@ Delayed processing of received data: This depends on - using information from rxecho or kissbridge. -Kernel panics: please read to /linux/README and find out if it +Kernel panics: please read /linux/README and find out if it really occurred within the scc driver. If you cannot solve a problem, send me diff --git a/Documentation/nfsroot.txt b/Documentation/nfsroot.txt index 1d42ec6fe631..79b08fab5b4d 100644 --- a/Documentation/nfsroot.txt +++ b/Documentation/nfsroot.txt @@ -81,7 +81,7 @@ ip=:::::: This parameter tells the kernel how to configure IP addresses of devices and also how to set up the IP routing table. 
It was originally called `nfsaddrs', - but now the boot-time IP configuration works independently on NFS, so it + but now the boot-time IP configuration works independently of NFS, so it was renamed to `ip' and the old name remained as an alias for compatibility reasons. @@ -106,14 +106,14 @@ ip=:::::: the address of the server is used which answered the RARP or BOOTP request. - IP address of a gateway if the server in on a different + IP address of a gateway if the server is on a different subnet. If this entry is empty no gateway is used and the server is assumed to be on the local network, unless a value has been received by BOOTP. Netmask for local network interface. If this is empty, the netmask is derived from the client IP address assuming - classful addressing, unless overriden in BOOTP reply. + classful addressing, unless overridden in BOOTP reply. Name of the client. If empty, the client IP address is used in ASCII-notation, or the value received by BOOTP. diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 2fb21a7268e7..6e685aa64220 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -41,7 +41,7 @@ and then you get a better idea of what happens than with the gdb disassembly. Now, the trick is just then to combine all the data you have: the C -sources (and general knowledge of what it _should_ do, the assembly +sources (and general knowledge of what it _should_ do), the assembly listing and the code disassembly (and additionally the register dump you also get from the "oops" message - that can be useful to see _what_ the corrupted pointers were, and when you have the assembler listing you can diff --git a/Documentation/paride.txt b/Documentation/paride.txt index 923377cad69e..89941584a800 100644 --- a/Documentation/paride.txt +++ b/Documentation/paride.txt @@ -146,10 +146,10 @@ will need them to identify the devices. 
If you happen to be using a MicroSolutions backpack device, you will also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolution's +the last two digits of the drive's serial number (but read MicroSolutions' documentation about this). -As an example, lets assume that you have a MicroSolutions PD/CD drive +As an example, let's assume that you have a MicroSolutions PD/CD drive with unit ID number 36 connected to the parallel port at 0x378, a SyQuest EZ-135 connected to the chained port on the PD/CD drive and also an Imation Superdisk connected to port 0x278. You could give the following diff --git a/Documentation/pci.txt b/Documentation/pci.txt index 8dbd3b2725f0..d40bfaf38ec2 100644 --- a/Documentation/pci.txt +++ b/Documentation/pci.txt @@ -22,7 +22,7 @@ ID, it should use: In case you want to do some complex matching, look at pci_devices -- it's a linked list of pci_dev structures for all PCI devices in the system. - All these methods return pointer to a pci_dev structure which is used as a + All these methods return a pointer to a pci_dev structure which is used as a parameter for many other PCI functions. The rest of them accept bus and device/function numbers which can be found in pci_dev->bus->number and pci_dev->devfn. Feel free to use all other fields of the pci_dev structure, but @@ -34,8 +34,8 @@ machine. 2. How to access PCI config space ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can use pci_(read|write)_config_(byte|word|dword) to access the config -space of device represented by pci_dev. All these functions return 0 when -successfull or an error code (PCIBIOS_...) which can be translated to text +space of a device represented by pci_dev. All these functions return 0 when +successful or an error code (PCIBIOS_...) which can be translated to a text string by pcibios_strerror. Most drivers expect that accesses to valid PCI devices don't fail. 
diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX index 9e553e6c8dde..6cd33d720d07 100644 --- a/Documentation/powerpc/00-INDEX +++ b/Documentation/powerpc/00-INDEX @@ -1,5 +1,5 @@ Index of files in Documentation/powerpc. If you think something about -Linux/PPC needs an entry here, needs correction of you've written one +Linux/PPC needs an entry here, needs correction or you've written one please mail me. Cort Dougan (cort@cs.nmt.edu) diff --git a/Documentation/ramdisk.txt b/Documentation/ramdisk.txt index 5b48c398714b..a00eb0a6468a 100644 --- a/Documentation/ramdisk.txt +++ b/Documentation/ramdisk.txt @@ -31,7 +31,7 @@ command has changed. Also, the new ramdisk supports up to 16 ramdisks out of the box, and can be reconfigured in rd.c to support up to 255 ramdisks. To use multiple ramdisk support with your system, run 'mknod /dev/ramX b 1 X' and chmod -(to change it's permissions) it to your liking. The default /dev/ram(disk) +(to change its permissions) it to your liking. The default /dev/ram(disk) uses minor #1, so start with ram2 and go from there. The old "ramdisk=" has been changed to "ramdisk_size=" @@ -42,7 +42,7 @@ The new ramdisk also has the ability to load compressed ramdisk images, allowing one to squeeze more programs onto an average installation or rescue floppy disk. -Notes: You may have "dev/ram" or "/dev/ramdisk" or both. They are +Notes: You may have "/dev/ram" or "/dev/ramdisk" or both. They are equivalent from the standpoint of this document. Also, the new ramdisk is a config option. When running "make config", make sure you enable ramdisk support for the kernel you intend to use the ramdisk with. @@ -113,7 +113,7 @@ size). Bit 14 indicates that a ramdisk is to be loaded, and bit 15 indicates whether a prompt/wait sequence is to be given before trying to read the ramdisk. Since the ramdisk dynamically grows as data is being written into it, a size field is no longer required. 
Bits 11 -to 13 are not presently used and may as well be zero. These numbers +to 13 are not currently used and may as well be zero. These numbers are no magical secrets, as seen below: ./arch/i386/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF @@ -160,7 +160,7 @@ of RAM. If using a spare disk partition instead of /dev/ram, then this restriction does not apply. a) Decide on the ramdisk size that you want. Say 2MB for this example. - Create it by writing to the ramdisk device. (This step is not presently + Create it by writing to the ramdisk device. (This step is not currently required, but may be in the future.) It is wise to zero out the area (esp. for disks) so that maximal compression is achieved for the unused blocks of the image that you are about to create. diff --git a/Documentation/riscom8.txt b/Documentation/riscom8.txt index 92c20238a761..a51dafefff43 100644 --- a/Documentation/riscom8.txt +++ b/Documentation/riscom8.txt @@ -13,20 +13,20 @@ Misc. notes for RISCom/8 serial driver, in no particular order :) as module use insmod options "iobase=0xXXX iobase1=0xXXX iobase2=..." 2) The driver partially supports famous 'setserial' program, you can use almost - any it option, exclude port & irq settings. + any of its options, excluding port & irq settings. 3) There are some misc. defines at the beginning of riscom8.c, please read the comments and try to change some of them in case of problems. 4) I consider the current state of the driver as BETA. - If you REALLY think you found the bug, send me e-mail, I hope I'll + If you REALLY think you found a bug, send me e-mail, I hope I'll fix it. For any other problems please ask support@sdlcomm.com. 5) SDL Communications WWW page is http://www.sdlcomm.com. 6) You can use the script at the end of this file to create RISCom/8 devices. -7) Minors number for 1-st board are 0-7, for second 8-15, etc. +7) Minor numbers for first board are 0-7, for second 8-15, etc. 22 Apr 1996. 
diff --git a/Documentation/scsi.txt b/Documentation/scsi.txt index 1ce285f0685d..69d8388c7659 100644 --- a/Documentation/scsi.txt +++ b/Documentation/scsi.txt @@ -5,7 +5,7 @@ understand your options, we should first define a few terms. The scsi-core contains the core of scsi support. Without it you can do nothing with any of the other scsi drivers. The scsi core -support can be a module (scsi_mod.o), or it can be build into the kernel. +support can be a module (scsi_mod.o), or it can be built into the kernel. If the core is a module, it must be the first scsi module loaded, and if you unload the modules, it will have to be the last one unloaded. diff --git a/Documentation/smp b/Documentation/smp index a0c4ab77e81b..5706552b03cb 100644 --- a/Documentation/smp +++ b/Documentation/smp @@ -8,7 +8,7 @@ kernel images on hand. Edit /etc/lilo.conf to create an entry for another kernel image called "linux-smp" or something. The next time you compile the kernel, when running a SMP kernel, -edit linux/Makefile and change "MAKE=make" "MAKE=make -jN" +edit linux/Makefile and change "MAKE=make" to "MAKE=make -jN" (where N = number of CPU + 1, or if you have tons of memory/swap you can just use "-j" without a number). Feel free to experiment with this one. @@ -20,4 +20,4 @@ Example: If you are using some Compaq MP compliant machines you will need to set the operating system in the BIOS settings to "Unixware" - don't ask me -why Compaq's dont work otherwise. +why Compaqs don't work otherwise. diff --git a/Documentation/smp.tex b/Documentation/smp.tex index ea582088c34d..e67f0897854e 100644 --- a/Documentation/smp.tex +++ b/Documentation/smp.tex @@ -39,11 +39,11 @@ The Intel pentium processors have a wide variety of inbuilt facilities for supporting multiprocessing, including hardware cache coherency, built in interprocessor interrupt handling and a set of atomic test and set, exchange and similar operations. 
The cache coherency in particular makes the -operating systems job far easier. +operating system's job far easier. The specification defines a detailed configuration structure in ROM that the boot up processor can read to find the full configuration of the -processors and busses. It also defines a procedure for starting up the +processors and buses. It also defines a procedure for starting up the other processors. @@ -53,7 +53,7 @@ locking and protection of its own tables to prevent two processes updating them at once and for example allocating the same memory block. There are two strategies for this within current Unix and Unixlike kernels. Traditional unix systems from the earliest of days use a scheme of 'Coarse -Grained Locking' where the entire kernel is protected as a small number of +Grained Locking' where the entire kernel is protected by a small number of locks only. Some modern systems use fine grained locking. Because fine grained locking has more overhead it is normally used only on multiprocessor kernels and real time kernels. In a real time kernel the @@ -64,7 +64,7 @@ Within the Linux kernel certain guarantees are made. No process running in kernel mode will be pre-empted by another kernel mode process unless it voluntarily sleeps. This ensures that blocks of kernel code are effectively atomic with respect to other processes and greatly simplifies -many operation. Secondly interrupts may pre-empt a kernel running process, +many operations. Secondly interrupts may pre-empt a kernel running process, but will always return to that process. A process in kernel mode may disable interrupts on the processor and guarantee such an interruption will not occur. The final guarantee is that an interrupt will not be pre-empted @@ -124,7 +124,7 @@ some processors to set each CPU up correctly. These functions will probably need to be modified in existing kernels to cope with this. 
-Each additional CPU the calls the architecture specific function +Each additional CPU then calls the architecture specific function {\tt \bf void smp\_callin(void)} @@ -142,7 +142,7 @@ they will run when they have no real work to process. \subsubsection{Scheduling} -The kernel scheduler implements a simple but very and effective task +The kernel scheduler implements a simple but very effective task scheduler. The basic structure of this scheduler is unchanged in the multiprocessor kernel. A processor field is added to each task, and this maintains the number of the processor executing a given task, or a magic @@ -185,7 +185,7 @@ that are provided by the processor specification functionality. These are {\tt \bf int smp\_processor\_id(void) } -which returns the identity of the process the call is executed upon. This +which returns the identity of the processor the call is executed upon. This call is assumed to be valid at all times. This may mean additional tests are needed during initialisation. @@ -203,7 +203,7 @@ work. Refer to the processor specific code documentation for more details. \subsection{Architecture Specific Code For the Intel MP Port} -The architecture specific code for the intel port splits fairly cleanly +The architecture specific code for the Intel port splits fairly cleanly into four sections. Firstly the initialisation code used to boot the system, secondly the message handling and support code, thirdly the interrupt and kernel syscall entry function handling and finally the @@ -286,7 +286,7 @@ spinlock it spins continually on the lock with interrupts disabled. This causes a specific deadlock problem. The lock owner may need to send an invalidate request to the rest of the processors and wait for these to complete before continuing. A processor spinning on the lock would not be -able to do thus. Thus the loop of the spinlock tests and handles invalidate +able to do this. Thus the loop of the spinlock tests and handles invalidate requests. 
If the invalidate bit for the spinning CPU is set the processor invalidates its TLB and atomically clears the bit. When the spinlock is obtained that processor will take an IPI and in the IPI test the bit and @@ -341,6 +341,6 @@ architecture which does not cover the 80386/80387 processor pair. \ The /proc filesystem support is changed so that the /proc/cpuinfo file contains a column for each processor present. This information is extracted -from the data save by smp\_store\_cpu\_info(). +from the data saved by smp\_store\_cpu\_info(). \end{document} diff --git a/Documentation/sound/AWE32 b/Documentation/sound/AWE32 index d85a7ad537aa..351f88a3855d 100644 --- a/Documentation/sound/AWE32 +++ b/Documentation/sound/AWE32 @@ -18,7 +18,7 @@ course with the suitable values for the parameters): alias char-major-14 sb post-install sb modprobe "-k" "adlib_card" options sb io=0x220 irq=5 dma=1 dma16=5 mpu_io=0x330 -options adlib_card io=0x388 # FM synthetiser +options adlib_card io=0x388 # FM synthesiser and then these two commands can be issued: diff --git a/Documentation/sound/Opti b/Documentation/sound/Opti index 90d4245081b2..bc8b15290611 100644 --- a/Documentation/sound/Opti +++ b/Documentation/sound/Opti @@ -24,7 +24,7 @@ and resources specified in /etc/isapnp.conf agree. Compiling the sound driver -------------------------- I highly recommend that you build a modularized sound driver. -This document does not cover sound-driver which is built in +This document does not cover a sound-driver which is built in the kernel. Sound card support should be enabled as a module (chose m). @@ -95,7 +95,7 @@ request to loading the main sound module. The main sound module contains only common code which is needed by all the sound drivers, and the driver for /dev/sndstat. -The sound module in it's turn will request loading of a sub-driver +The sound module in its turn will request loading of a sub-driver for mixer, audio, midi or synthesizer device. 
The first 3 are supported by the mad16 driver. The synth device is supported by the opl3 driver. @@ -105,7 +105,7 @@ if more than one card is installed. options sb mad16=1 -This is left for historical reason. If you enable the +This is left for historical reasons. If you enable the config option 'Support MIDI in older MAD16 based cards (requires SB)' or if you use an older mad16 driver it will force loading of the SoundBlaster driver. This option tells the SB driver not to look @@ -154,11 +154,11 @@ If you do use the sound card it is important that you load the mad16 driver (use "modprobe mad16" to prevent auto-unloading) before the cdrom is accessed the first time. -Using the sound driver built-in the kernel may help here. but... +Using the sound driver built-in to the kernel may help here, but... Most new systems have a PnP bios and also two IDE controllers. The IDE controller on the sound card may be needed only on older systems (which have only one IDE controller) but these systems -also do not have a PnP bios - requiring isapnptoosl and a modularized +also do not have a PnP bios - requiring isapnptools and a modularized driver. Known problems @@ -167,8 +167,8 @@ Known problems 2. On my system the codec cannot capture companded sound samples. (eg., recording from /dev/audio). When any companded capture is - requested I get a stereo-16 bit samples instead. Playback of - companded samples work well. Apparently this problem is not common + requested I get stereo-16 bit samples instead. Playback of + companded samples works well. Apparently this problem is not common to all C931 based cards. I do not know how to identify cards that have this problem. diff --git a/Documentation/sound/Soundblaster b/Documentation/sound/Soundblaster index 9ca8db777eb6..8fe9d42ef69e 100644 --- a/Documentation/sound/Soundblaster +++ b/Documentation/sound/Soundblaster @@ -4,7 +4,7 @@ insmod uart401 insmod sb ... This loads the driver for the soundblaster and assorted clones. 
Cards that -are covered by other drivers should not be using with this driver. +are covered by other drivers should not be using this driver. The soundblaster module takes the following arguments @@ -34,5 +34,5 @@ OS. IBM are being difficult about documenting how to load this firmware. Avance Logic ALS007 -This card isnt currently supported. I have patches to merge however that +This card isn't currently supported. I have patches to merge however that add limited support. diff --git a/Documentation/sound/mwave b/Documentation/sound/mwave index ad2a22965e13..8bf73a039fcf 100644 --- a/Documentation/sound/mwave +++ b/Documentation/sound/mwave @@ -36,7 +36,7 @@ The steps, then: BootGUI=0 [Note msdos.sys IS a text file but it needs to be 'unhidden' and make - read-writable before it can be eddited] + read-writable before it can be edited] Edit Config .sys to have multiple config menus. I have one for win95, and five for linux. Like this: @@ -123,7 +123,7 @@ initrd image, and has a parm file named LINDOC3.PAR in c:\linux\boot\parms: # c:\linux\boot\zImage.krn # first value must be the filename of the Linux-kernel root=/dev/hda3 # the device which gets mounted as root FS -ro # Other kernel agruments go here +ro # Other kernel arguments go here apm=off doc=yes 3 @@ -170,7 +170,7 @@ Default=SBPRO Reboot to Win95 and choose Linux. When booted, use sndconfig to configure the sound modules and VOILA - ThinkPad sound with Linux. -Now the gottchas - You can either have CD sound OR Mixers but not both. Thats a +Now the gotchas - You can either have CD sound OR Mixers but not both. That's a problem with the SB1.5(CD sound) or SBPRO(Mixers) settings. No-one knows why this is! diff --git a/Documentation/sound/ultrasound b/Documentation/sound/ultrasound index 552dff4ae649..51abe84a75ae 100644 --- a/Documentation/sound/ultrasound +++ b/Documentation/sound/ultrasound @@ -3,7 +3,7 @@ insmod sound insmod ad1848 insmod gus io=* irq=* dma=* ... 
-This loads the driver for the Gravis Ultrasound familily of soundcards. +This loads the driver for the Gravis Ultrasound family of soundcards. The gus module takes the following arguments @@ -22,7 +22,7 @@ no_wave_dma option This option defaults to a value of 0, which allows the Ultrasound wavetable DSP to use DMA for playback and downloading samples. This is the same as the old behaviour. If set to 1, no DMA is needed for downloading samples, -and allows owners of a GUS MAX to make use of simultanious digital audio +and allows owners of a GUS MAX to make use of simultaneous digital audio (/dev/dsp), MIDI, and wavetable playback. diff --git a/Documentation/specialix.txt b/Documentation/specialix.txt index cb4eeb7cb742..158699b1a857 100644 --- a/Documentation/specialix.txt +++ b/Documentation/specialix.txt @@ -44,10 +44,10 @@ instead of in a manual that can get lost. Ever misplace your Linux kernel sources? And the manual of one of the boards in your computer? -Adresses and interrupts -======================= +Addresses and interrupts +======================== -Addres dip switch settings: +Address dip switch settings: The dip switch sets bits 2-9 of the IO address. 9 8 7 6 5 4 3 2 @@ -111,7 +111,7 @@ The Specialix card uses a 25MHz crystal (in times two mode, which in fact is a divided by two mode). This is not enough to reach the rated 115k2 on all ports at the same time. With this clock rate you can only do 37% of this rate. This means that at 115k2 on all ports you are -going to loose characters (The chip cannot handle that many incoming +going to lose characters (The chip cannot handle that many incoming bits at this clock rate.) (Yes, you read that correctly: there is a limit to the number of -=bits=- per second that the chip can handle.) @@ -129,7 +129,7 @@ got: +++[0d]ATQ0V1H0[0d][0d][8a]O[cb][0d][8a] The three characters that have the "^^^" under them have suffered a bit error in the highest bit. 
In conclusion: I've tested it, and found -that it simply DOESN"T work for me. I also suspect that this is also +that it simply DOESN'T work for me. I also suspect that this is also caused by the baud rate being just a little bit out of tune. diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt index 1d749b9425dd..53f9362f5c90 100644 --- a/Documentation/spinlocks.txt +++ b/Documentation/spinlocks.txt @@ -6,7 +6,7 @@ On Fri, 2 Jan 1998, Doug Ledford wrote: > SMP safe as well as UP safe during interrupts and other manipulating > routines. So far, I've added a spin_lock variable to things like my queue > structs. Now, from what I recall, there are some spin lock functions I can -> use to lock these spin locks frmo other use as oppossed to a (nasty) +> use to lock these spin locks from other use as opposed to a (nasty) > save_flags(); cli(); stuff; restore_flags(); construct. Where do I find > these routines and go about making use of them? Do they only lock on a > per-processor basis or can they also lock say an interrupt routine from @@ -25,7 +25,7 @@ See . The basic version is: ... critical section here .. spin_unlock_irqrestore(&xxx_lock, flags); -and the above is always safe. It will disable interrupt _locally_, but the +and the above is always safe. It will disable interrupts _locally_, but the spinlock itself will guarantee the global lock, so it will guarantee that there is only one thread-of-control within the region(s) protected by that lock. diff --git a/Documentation/stallion.txt b/Documentation/stallion.txt index 5f1a95977f38..6c80ff368695 100644 --- a/Documentation/stallion.txt +++ b/Documentation/stallion.txt @@ -94,7 +94,7 @@ the kernel or modules. When the new kernel is booted, or the loadable module loaded then the driver will emit some kernel trace messages about whether the configured -boards where detected or not. Depending on how your system logger is set +boards were detected or not. 
Depending on how your system logger is set up these may come out on the console, or just be logged to /var/adm/messages. You should check the messages to confirm that all is well. @@ -141,7 +141,7 @@ addressing limit). The higher than 1Mb memory addresses are fully supported by this driver. Just enter the address as you normally would for a lower than 1Mb address -(in the drivers board configuration structure). +(in the driver's board configuration structure). @@ -165,7 +165,7 @@ ONboard boards is software programmable, but not on the Brumby boards. The intelligent boards also need to have their "firmware" code downloaded to them. This is done via a user level application supplied in the driver -utility package called "stlload". Compile this program where ever you dropped +utility package called "stlload". Compile this program wherever you dropped the package files, by typing "make". In its simplest form you can then type ./stlload -i cdk.sys in this directory and that will download board 0 (assuming board 0 is an @@ -220,7 +220,7 @@ intentional, obviously this is the easiest way to emulate its behavior! Since this driver tries to emulate the standard serial ports as much as possible, most system utilities should work as they do for the standard -COM ports. Most importantly "stty" works as expected and "setserial" can be +COM ports. Most importantly "stty" works as expected and "setserial" can also be used (excepting the ability to auto-configure the I/O and IRQ addresses of boards). Higher baud rates are supported in the usual fashion through setserial or using the CBAUDEX extensions. Note that the EasyIO and diff --git a/Documentation/svga.txt b/Documentation/svga.txt index aa8f106029e8..362eb4fefd02 100644 --- a/Documentation/svga.txt +++ b/Documentation/svga.txt @@ -5,8 +5,8 @@ 1. Intro ~~~~~~~~ This small document describes the "Video Mode Selection" feature which -allows to use various special video modes supported by the video BIOS. 
Due -to usage of the BIOS, the selection is limited to the boot time (before the +allows the use of various special video modes supported by the video BIOS. Due +to usage of the BIOS, the selection is limited to boot time (before the kernel decompression starts) and works only on 80X86 machines. The video mode to be used is selected by a kernel parameter which can be @@ -58,16 +58,16 @@ of chipsets is turned off by default (see CONFIG_VIDEO_SVGA in chapter 4 to see how to enable it if you really want) as it's inherently unreliable due to absolutely insane PC design. - "0 0F00 80x25" tells that the first menu item (the menu items are numbered + "0 0F00 80x25" means that the first menu item (the menu items are numbered from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the next section for a description of mode ID's). - encourages you to write the item number or mode ID + encourages you to enter the item number or mode ID you wish to set and press . If the computer complains something about -"Unknown mode ID", it tries to explain you that it isn't possible to set such +"Unknown mode ID", it is trying to tell you that it isn't possible to set such a mode. It's also possible to press only which leaves the current mode. - The mode list usually contains only few basic modes and some VESA modes. In + The mode list usually contains a few basic modes and some VESA modes. In case your chipset has been detected, some chipset-specific modes are shown as well (some of these might be missing or unusable on your machine as different BIOSes are often shipped with the same card and the mode numbers depend purely @@ -173,7 +173,7 @@ in setup.S, but it's better to upgrade the boot loader...) CONFIG_VIDEO_LOCAL - enables inclusion of "local modes" in the list. The local modes are added automatically to the beginning of the list not depending -by hardware configuration. The local modes are listed in the source text after +on hardware configuration. 
The local modes are listed in the source text after the "local_mode_table:" line. The comment before this line describes the format of the table (which also includes a video card name to be displayed on the top of the menu). @@ -201,7 +201,7 @@ your kernel with the video mode set directly via the kernel parameter. In either case, please send me a bug report containing what _exactly_ happens and how do the configuration switches affect the behaviour of the bug. - If you start Linux from the M$-DOS, you might also use some DOS tools for + If you start Linux from M$-DOS, you might also use some DOS tools for video mode setting. In this case, you must specify the 0x0f04 mode ("leave current settings") to Linux, because if you don't and you use any non-standard mode, Linux will switch to 80x25 automatically. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 15615f36481a..b935b4ca976a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -43,7 +43,7 @@ When the value in this file is 0, ctrl-alt-del is trapped and sent to the init(1) program to handle a graceful restart. When, however, the value is > 0, Linux's reaction to a Vulcan Nerve Pinch (tm) will be an immediate reboot, without even -syncing it's dirty buffers. +syncing its dirty buffers. Note: when a program (like dosemu) has the keyboard in 'raw' mode, the ctrl-alt-del is intercepted by the program before it @@ -68,7 +68,7 @@ struct { Dentries are dynamically allocated and deallocated, and nr_dentry seems to be 0 all the time. Hence it's safe to assume that only nr_unused, age_limit and want_pages are -used. Nr_unused seems to be exactly what it's name says. +used. Nr_unused seems to be exactly what its name says. 
Age_limit is the age in seconds after which dcache entries can be reclaimed when memory is short and want_pages is nonzero when shrink_dcache_pages() has been called and the @@ -101,7 +101,7 @@ The three values in file-nr denote the number of allocated file handles, the number of used file handles and the maximum number of file handles. When the allocated filehandles come close to the maximum, but the number of actually used ones is -far behind, you've encountered a peek in your filehandle usage +far behind, you've encountered a peak in your filehandle usage and you don't need to increase the maximum. ============================================================== @@ -112,7 +112,7 @@ As with filehandles, the kernel allocates the inode structures dynamically, but can't free them yet... The value in inode-max denotes the maximum number of inode -handlers. This value should be 3-4 times larger as the value +handlers. This value should be 3-4 times larger than the value in file-max, since stdin, stdout and network sockets also need an inode struct to handle them. When you regularly run out of inodes, you need to increase this value. @@ -126,7 +126,7 @@ nr_free_inodes and preshrink. Nr_inodes stands for the number of inodes the system has allocated, this can be slightly more than inode-max because -Linux allocates them one pagefull at a time. +Linux allocates them one pageful at a time. Nr_free_inodes represents the number of free inodes (?) and preshrink is nonzero when the nr_inodes > inode-max and the diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 348e002e862a..774da3d5f44f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -62,7 +62,7 @@ of the buffer still have to be written to disk (as opposed to a clean buffer, which can just be forgotten about). 
Setting this to a high value means that Linux can delay disk writes for a long time, but it also means that it will have -to do a lot I/O at once when memory becomes short. A low +to do a lot of I/O at once when memory becomes short. A low value will spread out disk I/O more evenly. The second parameter (ndirty) gives the maximum number of @@ -94,7 +94,8 @@ buffermem: The three values in this file correspond to the values in the struct buffer_mem. It controls how much memory should -be used for buffer memory. +be used for buffer memory. The percentage is calculated +as a percentage of total system memory. The values are: min_percent -- this is the minumum percentage of memory @@ -111,29 +112,9 @@ freepages: This file contains the values in the struct freepages. That struct contains three members: min, low and high. -These numbers are used by the VM subsystem to keep a reasonable -number of pages on the free page list, so that programs can -allocate new pages without having to wait for the system to -free used pages first. The actual freeing of pages is done -by kswapd, a kernel daemon. - -min -- when the number of free pages reaches this - level, only the kernel can allocate memory - for _critical_ tasks only -low -- when the number of free pages drops below - this level, kswapd is woken up immediately -high -- this is kswapd's target, when more than - pages are free, kswapd will stop swapping. - -When the number of free pages is between low and high, -and kswapd hasn't run for swapout_interval jiffies, then -kswapd is woken up too. See swapout_interval for more info. - -When free memory is always low on your system, and kswapd has -trouble keeping up with allocations, you might want to -increase these values, especially high and perhaps low. -I've found that a 1:2:4 relation for these values tend to work -rather well in a heavily loaded system. 
+These variables are currently unused (?), but they're +very likely to be abused for something else in the near +future, so don't yet remove it from the source... ============================================================== @@ -209,23 +190,23 @@ typedef struct swap_control_v5 } swap_control_v5; -------------------------------------------------------------- -The first four variables are used to keep track of Linux' +The first four variables are used to keep track of Linux's page aging. Page aging is a bookkeeping method to keep track of which pages of memory are used often, and which pages can be swapped out without consequences. When a page is swapped in, it starts at sc_page_initial_age -(default 3) and when the page is scanned by kswapd, it's age +(default 3) and when the page is scanned by kswapd, its age is adjusted according to the following scheme: -- if the page was used since the last time we scanned, it's - age is increased sc_page_advance (default 3) up to a maximum +- if the page was used since the last time we scanned, its + age is increased by sc_page_advance (default 3) up to a maximum of sc_max_page_age (default 20) -- else (it wasn't used) it's age is decreased sc_page_decline +- else (it wasn't used) its age is decreased by sc_page_decline (default 1) And when a page reaches age 0, it's ready to be swapped out. The next four variables can be used to control kswapd's -agressiveness in swapping out pages. +aggressiveness in swapping out pages. sc_age_cluster_fract is used to calculate how many pages from a process are to be scanned by kswapd. The formula used is @@ -236,10 +217,10 @@ represented by sc_age_cluster_min, this is done so kswapd will also scan small processes. The values of sc_pageout_weight and sc_bufferout_weight are -used to control the how many tries kswapd will do in order +used to control how many tries kswapd will make in order to swapout one page / buffer. 
These values can be used to finetune the ratio between user pages and buffer/cache memory. -When you find that your Linux system is swapping out too much +When you find that your Linux system is swapping out too many process pages in order to satisfy buffer memory demands, you might want to either increase sc_bufferout_weight, or decrease the value of sc_pageout_weight. diff --git a/Documentation/transname.txt b/Documentation/transname.txt index d0a877141468..b778a1153f76 100644 --- a/Documentation/transname.txt +++ b/Documentation/transname.txt @@ -14,8 +14,8 @@ contents on the clients and on the server, but have to be replicated This duplication causes very large efforts in practise, since at least the /etc directory has to be duplicated for every client. Even in /etc many files are identical, for example sendmail.cf, initrc scripts and -others. Maintaining a large pool means to ensure coherence amoung the -duplicates. Classical methods like symlinks are unconvenient +others. Maintaining a large pool requires means to ensure coherence among +the duplicates. Classical methods like symlinks are inconvenient for this task because they have to be valid in the view of mounted filesystems at all clients, not at the server. @@ -28,17 +28,17 @@ create two different files named /etc/config#host=myserver# and file /etc/config (without the #...=...# suffix). On host "myclient", the corresponding other file will appear as /etc/config. So you can access the right file contents under the _same_ name, depending -on which host you are working. +on which host you are working on. -A similar concept can be found in elder HP-UX versions, but with -so-called "hidden directories" which don't allow contemporary viewing +A similar concept can be found in older HP-UX versions, but with +so-called "hidden directories" which don't allow contemporary viewing of all versions by default. 
In contrast, transname shows all context-dependent files in the dir listing and they can be edited using the fully qualified name. -Transname was developped for and is used at our Linux pool at the -University of Stuttgart with good results. Maintainance of the pool is -at a minimum, and adding new clients is a child's play. No worry with +Transname was developed for and is used at our Linux pool at the +University of Stuttgart with good results. Maintenance of the pool is +at a minimum, and adding new clients is child's play. No worry with keeping up mail configurations, newly installed tools, changed /etc/services, /etc/shells, /etc/resolv.conf and many, many others. In contrast to a sophisticated symlink solution, adding a new file to the /etc directory @@ -46,7 +46,7 @@ is seen immediately by all clients. An example for the use of linux-2.0-transname.patch: -For example, you can make your /etc/fstab context-dependend. If you want +For example, you can make your /etc/fstab context-dependent. If you want to do that, you should create an /etc/fstab#ktype=default# for the server and an /etc/fstab#ktype=diskless# for all clients. This is because your clients may not yet know their own hostname when they attempt to mount @@ -54,7 +54,7 @@ the root filesystem. You can compile in the kerneltypes "default" and "diskless" into different kernels for servers and clients. Of course, if your clients boot via bootp and know their names when mounting the root, you can use /etc/fstab#host=myclient# instead. But at least servers -booting from disk normally dont know their hostname at root mount time, +booting from disk normally don't know their hostname at root mount time, so you can mix methods and use /etc/fstab#ktype=default# for the server, /etc/fstab#ktype=diskless# for the majority of the clients and /etc/fstab#host=myclient# for some specific client, because translation @@ -73,7 +73,7 @@ five types of default translations are supported: Others may be added in future. 
-The current translation are displayed at boot time in the kernel messages +The current translations are displayed at boot time in the kernel messages for easier debugging, and can be retrieved by reading /proc/sys/kernel/nametrans which is a special file containing the currently valid translations. @@ -105,7 +105,7 @@ back to the default translations by executing echo "" > /proc/sys/kernel/nametrans Another drawback is that administration tools currently are not aware of -context-dependend files, so you cannot switch between contexts inside +context-dependent files, so you cannot switch between contexts inside one tool session. However, you can simulate administration sessions on the server as if they were running on some client. To do this, you have to set an environment variable NAMETRANS which has to be the @@ -131,7 +131,7 @@ exists a template filename like /etc/mtab#host=CREATE#. As soon as a process running on "mango" tries to create a file /etc/mtab, the version /etc/mtab#host=mango# is created instead (which appears in turn as hardlinked to /etc/mtab). Note that if you want to make /etc/fstab -context-dependend, you should execute "touch /etc/mtab#host=CREATE#" and +context-dependent, you should execute "touch /etc/mtab#host=CREATE#" and "touch /etc/mtab.tmp#host=CREATE#", because mount, umount and others running on different hosts would otherwise try to create one shared /etc/mtab which would result in a clash. Also one should execute @@ -155,11 +155,11 @@ or "make xconfig", just go to the section "filesystems". Take a look at the help texts that are associated with the transname options, they tell you further hints not mentioned in this README. Then build your kernel as usual, install it with a *new* kernel-filename, add a *new* entry to -/etc/lilo.conf and run lilo. **DONT CHANGE** any configuration files for the +/etc/lilo.conf and run lilo. **DON'T CHANGE** any configuration files for the first reboot! 
Just reboot the new kernel and play a little bit around with -creating context-dependend filenames in your home directory. +creating context-dependent filenames in your home directory. Try all modes including setting NAMETRANS to different values. As an example for the changes necessary on our LST-1.8-based Linux pool, @@ -247,7 +247,7 @@ where /usr/sbin/rpc.mountd has the contents #!/bin/sh exec /usr/bin/env - NAMETRANS= "`/usr/bin/env`" $0.notrans $* -Of course, that could be improved, but is a quick hack to get things work. +Of course, that could be improved, but is a quick hack to get things to work. Enjoy, diff --git a/Documentation/unicode.txt b/Documentation/unicode.txt index b9f44a6eb585..61242c097559 100644 --- a/Documentation/unicode.txt +++ b/Documentation/unicode.txt @@ -62,14 +62,14 @@ being a hacker-driven OS it seems this is a brilliant linguistic hack worth supporting. Hence I have chosen to add it to the list in the Linux Zone. -Several glyph forms for the Klingon alphabet has been proposed. +Several glyph forms for the Klingon alphabet have been proposed. However, since the set of symbols appear to be consistent throughout, with only the actual shapes being different, in keeping with standard Unicode practice these differences are considered font variants. Klingon has an alphabet of 26 characters, a positional numeric writing system with 10 digits, and is written left-to-right, top-to-bottom. -Punctuation appears to be only used in Latin transliteration; it is +Punctuation appears to be only used in Latin transliteration; it appears customary to write each sentence on its own line, and centered. Space has been reserved for punctuation should it prove necessary. 
diff --git a/arch/alpha/math-emu/ieee-math.c b/arch/alpha/math-emu/ieee-math.c index 59d7dfa6dbf7..b3d89638995b 100644 --- a/arch/alpha/math-emu/ieee-math.c +++ b/arch/alpha/math-emu/ieee-math.c @@ -733,19 +733,23 @@ ieee_CVTQT (int f, unsigned long a, unsigned long *b) * FPCR_INV if invalid operation occurred, etc. */ unsigned long -ieee_CVTTQ (int f, unsigned long a, unsigned long *b) +ieee_CVTTQ (int f, unsigned long a, unsigned long *pb) { unsigned int midway; - unsigned long ov, uv, res = 0; + unsigned long ov, uv, res, b; fpclass_t a_type; EXTENDED temp; - *b = 0; a_type = extend_ieee(a, &temp, DOUBLE); + + b = 0x7fffffffffffffff; + res = FPCR_INV; if (a_type == NaN || a_type == INFTY) - return FPCR_INV; + goto out; + + res = 0; if (a_type == QNaN) - return 0; + goto out; if (temp.e > 0) { ov = 0; @@ -757,7 +761,7 @@ ieee_CVTTQ (int f, unsigned long a, unsigned long *b) if (ov || (temp.f[1] & 0xffc0000000000000)) res |= FPCR_IOV | FPCR_INE; } - if (temp.e < 0) { + else if (temp.e < 0) { while (temp.e < 0) { ++temp.e; uv = temp.f[0] & 1; /* save sticky bit */ @@ -765,7 +769,8 @@ ieee_CVTTQ (int f, unsigned long a, unsigned long *b) temp.f[0] |= uv; } } - *b = ((temp.f[1] << 9) | (temp.f[0] >> 55)) & 0x7fffffffffffffff; + b = (temp.f[1] << 9) | (temp.f[0] >> 55); + /* * Notice: the fraction is only 52 bits long. Thus, rounding * cannot possibly result in an integer overflow. 
@@ -776,18 +781,18 @@ ieee_CVTTQ (int f, unsigned long a, unsigned long *b) midway = (temp.f[0] & 0x003fffffffffffff) == 0; if ((midway && (temp.f[0] & 0x0080000000000000)) || !midway) - ++*b; + ++b; } break; case ROUND_PINF: if ((temp.f[0] & 0x007fffffffffffff) != 0) - ++*b; + ++b; break; case ROUND_NINF: if ((temp.f[0] & 0x007fffffffffffff) != 0) - --*b; + --b; break; case ROUND_CHOP: @@ -798,8 +803,11 @@ ieee_CVTTQ (int f, unsigned long a, unsigned long *b) res |= FPCR_INE; if (temp.s) { - *b = -*b; + b = -b; } + +out: + *pb = b; return res; } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 75527666f2cc..95568c73ea87 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -725,7 +725,7 @@ static inline void trigger_pending_irqs(unsigned int irq) { if (irq_events[irq] && !ipi_pending[irq]) { ipi_pending[irq] = 1; - send_IPI(smp_processor_id(), IO_APIC_VECTOR(irq)); + send_IPI(APIC_DEST_SELF, IO_APIC_VECTOR(irq)); } } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 7a7a6caa5498..f5f92c2baf43 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -150,6 +150,7 @@ extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, }; int mp_current_pci_id = 0; +unsigned long mp_lapic_addr = 0; /* #define SMP_DEBUG */ @@ -270,9 +271,8 @@ __initfunc(static int smp_read_mpc(struct mp_config_table *mpc)) printk("APIC at: 0x%lX\n",mpc->mpc_lapic); - /* check the local APIC address */ - if ((char *)phys_to_virt((unsigned long)mpc->mpc_lapic) != APIC_BASE) - panic("unexpected APIC address"); + /* save the local APIC address, it might be non-default */ + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. 
@@ -453,7 +453,7 @@ __initfunc(int smp_scan_config(unsigned long base, unsigned long length)) */ cfg=pg0[0]; - pg0[0] = ((unsigned long)APIC_BASE | 7); + pg0[0] = (mp_lapic_addr | 7); local_flush_tlb(); boot_cpu_id = GET_APIC_ID(*((volatile unsigned long *) APIC_ID)); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index caa366e3a354..c33c53b9ae46 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -256,6 +256,7 @@ __initfunc(unsigned long paging_init(unsigned long start_mem, unsigned long end_ } #ifdef __SMP__ { + extern unsigned long mp_lapic_addr; pte_t pte; unsigned long apic_area = (unsigned long)APIC_BASE; @@ -266,10 +267,13 @@ __initfunc(unsigned long paging_init(unsigned long start_mem, unsigned long end_ if (smp_found_config) { /* - * Map the local APIC to FEE00000. + * Map the local APIC to FEE00000. (it's only the default + * value, thanks to Steve Hsieh for finding this out. We + * now save the real local-APIC physical address in smp_scan(), + * and use it here) */ pg_table = pte_offset((pmd_t *)pg_dir, apic_area); - pte = mk_pte(__va(apic_area), PAGE_KERNEL); + pte = mk_pte(__va(mp_lapic_addr), PAGE_KERNEL); set_pte(pg_table, pte); /* diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index b6d6d4e564ff..d247e1f2dafa 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -8,6 +8,7 @@ * Copyright (C) 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ +#include #include #include #include diff --git a/drivers/block/ide.c b/drivers/block/ide.c index febc57ffaba9..111186eba2c4 100644 --- a/drivers/block/ide.c +++ b/drivers/block/ide.c @@ -1125,9 +1125,6 @@ struct request **ide_get_queue (kdev_t dev) static void do_hwgroup_request (ide_hwgroup_t *hwgroup) { if (hwgroup->handler == NULL) { - ide_hwif_t *hgif = hwgroup->hwif; - ide_hwif_t *hwif = hgif; - del_timer(&hwgroup->timer); ide_get_lock(&ide_lock, ide_intr, hwgroup); hwgroup->active = 1; diff --git a/drivers/macintosh/imstt.c b/drivers/macintosh/imstt.c index 
32513a5e2858..0cf18edc7123 100644 --- a/drivers/macintosh/imstt.c +++ b/drivers/macintosh/imstt.c @@ -11,8 +11,6 @@ */ #include -#include - #include #include #include diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 97eefddd9bb0..8dcf42ee94a0 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -248,6 +248,7 @@ static struct dev_info device_list[] = {"SONY","CD-ROM CDU-55S","1.0i", BLIST_NOLUN}, {"SONY","CD-ROM CDU-561","1.7x", BLIST_NOLUN}, {"TANDBERG","TDC 3600","U07", BLIST_NOLUN}, /* Locks up if polled for lun != 0 */ +{"TEAC","CD-R55S","1.0H", BLIST_NOLUN}, /* Locks up if polled for lun != 0 */ {"TEAC","CD-ROM","1.06", BLIST_NOLUN}, /* causes failed REQUEST SENSE on lun 1 * for seagate controller, which causes * SCSI code to reset bus.*/ diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index d4ddd14102fd..cc1fd35a5db1 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -572,6 +572,7 @@ static int empty_dir (struct inode * inode) ext2_warning (inode->i_sb, "empty_dir", "bad directory (dir #%lu) - no `.' or `..'", inode->i_ino); + brelse (bh); return 1; } offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); diff --git a/include/linux/if_ec.h b/include/linux/if_ec.h new file mode 100644 index 000000000000..4883f16a7a98 --- /dev/null +++ b/include/linux/if_ec.h @@ -0,0 +1,47 @@ +/* Definitions for Econet sockets. */ + +#ifndef __LINUX_IF_EC +#define __LINUX_IF_EC + +/* User visible stuff. Glibc provides its own but libc5 folk will use these */ + +struct ec_addr +{ + unsigned char station; /* Station number. */ + unsigned char net; /* Network number. */ +}; + +struct sockaddr_ec +{ + unsigned short sec_family; + unsigned char port; /* Port number. */ + unsigned char cb; /* Control/flag byte. */ + unsigned char type; /* Type of message. 
*/ + struct ec_addr addr; + unsigned long cookie; +}; + +#define ECTYPE_PACKET_RECEIVED 0 /* Packet received */ +#define ECTYPE_TRANSMIT_STATUS 0x10 /* Transmit completed, + low nibble holds status */ + +#define ECTYPE_TRANSMIT_OK 1 +#define ECTYPE_TRANSMIT_NOT_LISTENING 2 +#define ECTYPE_TRANSMIT_NET_ERROR 3 +#define ECTYPE_TRANSMIT_NO_CLOCK 4 +#define ECTYPE_TRANSMIT_LINE_JAMMED 5 +#define ECTYPE_TRANSMIT_NOT_PRESENT 6 + +#ifdef __KERNEL__ + +struct econet_opt +{ + unsigned char cb; + unsigned char port; + unsigned char station; + unsigned char net; +}; + +#endif + +#endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e795ccb499b9..aa9568e837f4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -256,6 +256,7 @@ struct device struct Qdisc *qdisc; struct Qdisc *qdisc_sleeping; + struct Qdisc *qdisc_list; unsigned long tx_queue_len; /* Max frames per queue allowed */ /* Pointers to interface service routines. */ diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 83d9fae1bd62..bd4f9dd9b966 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -161,7 +161,7 @@ __nlmsg_put(struct sk_buff *skb, pid_t pid, u32 seq, int type, int len) } #define NLMSG_PUT(skb, pid, seq, type, len) \ -({ if (skb_tailroom(skb) < NLMSG_SPACE(len)) goto nlmsg_failure; \ +({ if (skb_tailroom(skb) < (int)NLMSG_SPACE(len)) goto nlmsg_failure; \ __nlmsg_put(skb, pid, seq, type, len); }) extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h new file mode 100644 index 000000000000..0c38cac8673f --- /dev/null +++ b/include/linux/pkt_cls.h @@ -0,0 +1,117 @@ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +struct tc_police +{ + __u32 index; + int action; +#define TC_POLICE_UNSPEC (-1) +#define TC_POLICE_OK 0 +#define TC_POLICE_RECLASSIFY 1 +#define TC_POLICE_SHOT 2 + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; 
+ struct tc_ratespec peakrate; +}; + +enum +{ + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, +}; + +#define TCA_POLICE_MAX TCA_POLICE_PEAKRATE + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum +{ + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, +}; + +#define TCA_U32_MAX TCA_U32_POLICE + +struct tc_u32_key +{ + __u32 mask; + __u32 val; + int off; + int offmask; +}; + +struct tc_u32_sel +{ + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __u16 offmask; + __u16 off; + short offoff; + + short hoff; + __u32 hmask; + + struct tc_u32_key keys[0]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum +{ + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, +}; + +#define TCA_RSVP_MAX TCA_RSVP_POLICE + +struct tc_rsvp_gpi +{ + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo +{ + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; +}; + +#endif diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index b72ca41c1d96..4ec170dbd27a 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -1,15 +1,17 @@ #ifndef __LINUX_PKT_SCHED_H #define __LINUX_PKT_SCHED_H -#define PSCHED_TC_INIT 1 -#define PSCHED_TC_DESTROY 2 -#define PSCHED_TC_ATTACH 3 -#define PSCHED_TC_DETACH 4 +/* Logical priority bands not depending on specific packet scheduler. 
+ Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. -/* "Logical" priority bands, not depending of concrete packet scheduler. - Every scheduler will map them to real traffic classes, if it have - no more precise machanism. + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. */ #define TC_PRIO_BESTEFFORT 0 @@ -19,75 +21,257 @@ #define TC_PRIO_INTERACTIVE 6 #define TC_PRIO_CONTROL 7 +#define TC_PRIO_MAX 15 -struct pschedctl +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { - int command; - int handle; - int child; - int ifindex; - char id[IFNAMSIZ]; - int arglen; - char args[0]; + __u64 bytes; /* NUmber of enqueues bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; }; -/* CBQ section */ +struct tc_estimator +{ + char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. 
qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) + +struct tc_ratespec +{ + unsigned char cell_log; + unsigned char __reserved; + unsigned short feature; + short addend; + unsigned short mpu; + __u32 rate; +}; -#define CBQ_MAXPRIO 8 -#define CBQ_MAXLEVEL 8 +/* FIFO section */ + +struct tc_fifo_qopt +{ + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 + +struct tc_prio_qopt +{ + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; /* CSZ section */ -struct cszctl +struct tc_csz_qopt { - int flow_id; - int handle; - unsigned long rate; - unsigned long max_bytes; - unsigned long depth; - unsigned long L_tab[256]; + int flows; /* Maximal number of guaranteed flows */ + unsigned char R_log; /* Fixed point position for round number */ + unsigned char delta_log; /* Log of maximal managed time interval */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> CSZ band */ }; -struct cszinitctl +struct tc_csz_copt { - int flows; - unsigned cell_log; + struct tc_ratespec slice; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum +{ + TCA_CSZ_UNSPEC, + TCA_CSZ_PARMS, + TCA_CSZ_RTAB, + TCA_CSZ_PTAB, }; /* TBF section */ -struct tbfctl +struct tc_tbf_qopt { - unsigned cell_log; - unsigned long bytes; - unsigned long depth; - unsigned long L_tab[256]; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + 
__u32 mtu; }; +enum +{ + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, +}; + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + /* SFQ section */ -struct sfqctl +struct tc_sfq_qopt { - unsigned quantum; - unsigned depth; - unsigned divisor; - unsigned flows; + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ }; +/* + * NOTE: limit, divisor and flows are hardwired to code at the moment. + * + * limit=flows=128, divisor=1024; + * + * The only reason for this is efficiency, it is possible + * to change these parameters in compile time. + */ + /* RED section */ -struct redctl +enum +{ + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, +}; + +struct tc_red_qopt +{ + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ +}; + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt +{ + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt +{ + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl +{ + unsigned char strategy; 
+#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u32 penalty; +}; + +struct tc_cbq_police +{ + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt +{ + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats +{ + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { - unsigned qmaxbytes; /* HARD maximal queue length */ - unsigned qth_min; /* Min average length threshold: A scaled */ - unsigned qth_max; /* Max average length threshold: A scaled */ - char Alog; /* Point position in average lengths */ - char Wlog; /* log(W) */ - char Rlog; /* random number bits */ - char C1log; /* log(1/C1) */ - char Slog; - char Stab[256]; + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, }; +#define TCA_CBQ_MAX TCA_CBQ_POLICE #endif diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7b6b3af7ed27..b0efb81a923a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,12 +37,12 @@ #define RTM_GETRULE (RTM_BASE+18) #define RTM_NEWQDISC (RTM_BASE+20) -#define RTM_DELQDSIC (RTM_BASE+21) +#define RTM_DELQDISC (RTM_BASE+21) #define RTM_GETQDISC (RTM_BASE+22) -#define RTM_NEWTFLOW (RTM_BASE+24) -#define RTM_DELTFLOW (RTM_BASE+25) -#define RTM_GETTFLOW (RTM_BASE+26) +#define RTM_NEWTCLASS (RTM_BASE+24) +#define RTM_DELTCLASS (RTM_BASE+25) +#define RTM_GETTCLASS (RTM_BASE+26) #define RTM_NEWTFILTER (RTM_BASE+28) #define RTM_DELTFILTER (RTM_BASE+29) @@ -533,10 +533,11 @@ enum TCA_KIND, TCA_OPTIONS, TCA_STATS, - TCA_XSTATS + TCA_XSTATS, + TCA_RATE, }; -#define TCA_MAX TCA_XSTATS +#define TCA_MAX TCA_RATE #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) 
NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) @@ -551,6 +552,7 @@ enum #define RTMGRP_LINK 1 #define RTMGRP_NOTIFY 2 #define RTMGRP_NEIGH 4 +#define RTMGRP_TC 8 #define RTMGRP_IPV4_IFADDR 0x10 #define RTMGRP_IPV4_MROUTE 0x20 @@ -567,6 +569,14 @@ enum extern atomic_t rtnl_rlockct; extern struct wait_queue *rtnl_wait; +extern __inline__ int rtattr_strcmp(struct rtattr *rta, char *str) +{ + int len = strlen(str) + 1; + return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); +} + +extern int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len); + #ifdef CONFIG_RTNETLINK extern struct sock *rtnl; @@ -578,12 +588,12 @@ struct rtnetlink_link extern struct rtnetlink_link * rtnetlink_links[NPROTO]; extern int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb); - +extern int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo); extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); #define RTA_PUT(skb, attrtype, attrlen, data) \ -({ if (skb_tailroom(skb) < RTA_SPACE(attrlen)) goto rtattr_failure; \ +({ if (skb_tailroom(skb) < (int)RTA_SPACE(attrlen)) goto rtattr_failure; \ __rta_fill(skb, attrtype, attrlen, data); }) extern unsigned long rtnl_wlockct; diff --git a/include/linux/socket.h b/include/linux/socket.h index 1c688de702f2..bbf9ecc57ed9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -60,7 +60,7 @@ struct cmsghdr { #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) -#define CMSG_DATA(cmsg) ((void *)(cmsg) + CMSG_ALIGN(sizeof(struct cmsghdr))) +#define CMSG_DATA(cmsg) ((void *)((char *)(cmsg) + CMSG_ALIGN(sizeof(struct cmsghdr)))) #define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len)) @@ -75,6 +75,10 @@ struct cmsghdr { #ifdef __KERNEL__ #define __KINLINE extern __inline__ +#elif defined(__GNUC__) +#define __KINLINE static __inline__ +#elif 
defined(__cplusplus) +#define __KINLINE static inline #else #define __KINLINE static #endif @@ -138,7 +142,7 @@ struct ucred { #define AF_APPLETALK 5 /* Appletalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ -#define AF_AAL5 8 /* Reserved for Werner's ATM */ +#define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ @@ -151,6 +155,7 @@ struct ucred { #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ +#define AF_ATMSVC 20 /* ATM SVCs */ #define AF_MAX 32 /* For now.. */ /* Protocol families, same as address families. */ @@ -163,7 +168,7 @@ struct ucred { #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE -#define PF_AAL5 AF_AAL5 +#define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE @@ -175,6 +180,7 @@ struct ucred { #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH +#define PF_ATMSVC AF_ATMSVC #define PF_MAX AF_MAX @@ -223,6 +229,8 @@ struct ucred { #define SOL_DECNET 261 #define SOL_X25 262 #define SOL_PACKET 263 +#define SOL_ATM 264 /* ATM layer (cell level) */ +#define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ /* IPX options */ #define IPX_TYPE 1 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 11552a24d417..69d785df1d4f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -165,7 +165,6 @@ enum NET_IPV4_TCP_KEEPALIVE_PROBES, NET_IPV4_TCP_RETRIES1, NET_IPV4_TCP_RETRIES2, - NET_IPV4_TCP_MAX_DELAY_ACKS, NET_IPV4_TCP_FIN_TIMEOUT, NET_IPV4_IP_MASQ_DEBUG, NET_TCP_SYNCOOKIES, diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h index 8bdba9cad64b..ed38830ceeca 100644 --- a/include/linux/wanrouter.h +++ b/include/linux/wanrouter.h @@ -359,10 +359,10 @@ typedef struct wan_device } 
wan_device_t; /* Public functions available for device drivers */ -extern int register_wandev (wan_device_t* wandev); -extern int unregister_wandev (char* name); -unsigned short wan_type_trans (struct sk_buff* skb, struct device* dev); -int wan_encapsulate (struct sk_buff* skb, struct device* dev); +extern int register_wan_device(wan_device_t* wandev); +extern int unregister_wan_device(char* name); +unsigned short wanrouter_type_trans(struct sk_buff* skb, struct device* dev); +int wanrouter_encapsulate(struct sk_buff* skb, struct device* dev); /* Proc interface functions. These must not be called by the drivers! */ extern int wanrouter_proc_init (void); diff --git a/include/net/dst.h b/include/net/dst.h index 0d18f60d2021..802eae48739b 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,6 +51,10 @@ struct dst_entry int (*input)(struct sk_buff*); int (*output)(struct sk_buff*); +#ifdef CONFIG_NET_CLS_ROUTE + __u32 tclassid; +#endif + struct dst_ops *ops; char info[0]; diff --git a/include/net/ip.h b/include/net/ip.h index 05c5b3b52b84..add85700b4a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -100,7 +100,6 @@ extern int ip_acct_output(struct sk_buff *skb); #define ip_acct_output dev_queue_xmit #endif extern void ip_fragment(struct sk_buff *skb, int (*out)(struct sk_buff*)); -extern struct sk_buff * ip_reply(struct sk_buff *skb, int payload); extern int ip_do_nat(struct sk_buff *skb); extern void ip_send_check(struct iphdr *ip); extern int ip_id_count; @@ -117,6 +116,18 @@ extern int ip_build_xmit(struct sock *sk, struct rtable *rt, int flags); + +struct ip_reply_arg { + struct iovec iov[2]; + int n_iov; /* redundant */ + u32 csum; + int csumoffset; /* u16 offset of csum in iov[0].iov_base */ + /* -1 if not needed */ +}; + +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len); + extern int __ip_finish_output(struct sk_buff *skb); struct ipv4_config diff --git a/include/net/ip_fib.h 
b/include/net/ip_fib.h index d725e78d49b7..f96fa618cc1a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -49,6 +49,9 @@ struct fib_nh #ifdef CONFIG_IP_ROUTE_MULTIPATH int nh_weight; int nh_power; +#endif +#ifdef CONFIG_NET_CLS_ROUTE + __u32 nh_tclassid; #endif int nh_oif; u32 nh_gw; @@ -229,9 +232,11 @@ extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); extern u32 fib_rules_map_destination(u32 daddr, struct fib_result *res); +#ifdef CONFIG_NET_CLS_ROUTE +extern u32 fib_rules_tclass(struct fib_result *res); +#endif extern u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags); extern void fib_rules_init(void); #endif - #endif _NET_FIB_H diff --git a/include/net/ip_masq.h b/include/net/ip_masq.h index 0faa88336a39..65282bfcbec9 100644 --- a/include/net/ip_masq.h +++ b/include/net/ip_masq.h @@ -24,11 +24,6 @@ * I used an extra 4K port-space */ -/* - * Linux ports don't normally get allocated above 32K. - * I used an extra 4K port-space - */ - #define PORT_MASQ_BEGIN 61000 #define PORT_MASQ_END (PORT_MASQ_BEGIN+4096) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h new file mode 100644 index 000000000000..0d3c25e25722 --- /dev/null +++ b/include/net/pkt_cls.h @@ -0,0 +1,83 @@ +#ifndef __NET_PKT_CLS_H +#define __NET_PKT_CLS_H + + +#include + +struct rtattr; +struct tcmsg; + +/* Basic packet classifier frontend definitions. 
*/ + +struct tcf_result +{ + unsigned long class; + u32 classid; +}; + +struct tcf_proto +{ + /* Fast access part */ + struct tcf_proto *next; + void *root; + int (*classify)(struct sk_buff*, struct tcf_proto*, struct tcf_result *); + u32 protocol; + + /* All the rest */ + u32 prio; + u32 classid; + struct Qdisc *q; + void *data; + struct tcf_proto_ops *ops; +}; + +struct tcf_walker +{ + int stop; + int skip; + int count; + int (*fn)(struct tcf_proto *, unsigned long node, struct tcf_walker *); +}; + +struct tcf_proto_ops +{ + struct tcf_proto_ops *next; + char kind[IFNAMSIZ]; + + int (*classify)(struct sk_buff*, struct tcf_proto*, struct tcf_result *); + int (*init)(struct tcf_proto*); + void (*destroy)(struct tcf_proto*); + + unsigned long (*get)(struct tcf_proto*, u32 handle); + void (*put)(struct tcf_proto*, unsigned long); + int (*change)(struct tcf_proto*, u32 handle, struct rtattr **, unsigned long *); + int (*delete)(struct tcf_proto*, unsigned long); + void (*walk)(struct tcf_proto*, struct tcf_walker *arg); + + /* rtnetlink specific */ + int (*dump)(struct tcf_proto*, unsigned long, struct sk_buff *skb, struct tcmsg*); +}; + +/* Main classifier routine: scans classifier chain attached + to this qdisc, (optionally) tests for protocol and asks + specific classifiers. 
+ */ + +extern __inline__ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + int err = 0; + u32 protocol = skb->protocol; + + for ( ; tp; tp = tp->next) { + if ((tp->protocol == protocol || + tp->protocol == __constant_htons(ETH_P_ALL)) && + (err = tp->classify(skb, tp, res)) >= 0) + return err; + } + return -1; +} + +extern int register_tcf_proto_ops(struct tcf_proto_ops *ops); +extern int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); + +#endif diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 5faad9ad4caa..de7c7691a2ab 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -1,21 +1,64 @@ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H +#define PSCHED_GETTIMEOFDAY 1 +#define PSCHED_JIFFIES 2 +#define PSCHED_CPU 3 + +#define PSCHED_CLOCK_SOURCE PSCHED_GETTIMEOFDAY + #include +#include + +struct rtattr; +struct Qdisc; + +struct qdisc_walker +{ + int stop; + int skip; + int count; + int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); +}; + +struct Qdisc_class_ops +{ + /* Child qdisc manipulation */ + int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **); + + /* Class manipulation routines */ + unsigned long (*get)(struct Qdisc *, u32 classid); + void (*put)(struct Qdisc *, unsigned long); + int (*change)(struct Qdisc *, u32, u32, struct rtattr **, unsigned long *); + int (*delete)(struct Qdisc *, unsigned long); + void (*walk)(struct Qdisc *, struct qdisc_walker * arg); + + /* Filter manipulation */ + struct tcf_proto ** (*tcf_chain)(struct Qdisc *, unsigned long); + unsigned long (*bind_tcf)(struct Qdisc *, u32 classid); + void (*unbind_tcf)(struct Qdisc *, unsigned long); + + /* rtnetlink specific */ + int (*dump)(struct Qdisc *, unsigned long, struct sk_buff *skb, struct tcmsg*); +}; struct Qdisc_ops { struct Qdisc_ops *next; + struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; - int refcnt; int priv_size; - int (*enqueue)(struct sk_buff 
*skb, struct Qdisc *); + + int (*enqueue)(struct sk_buff *, struct Qdisc *); struct sk_buff * (*dequeue)(struct Qdisc *); + int (*requeue)(struct sk_buff *, struct Qdisc *); + int (*drop)(struct Qdisc *); + + int (*init)(struct Qdisc *, struct rtattr *arg); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); - int (*init)(struct Qdisc *, void *arg); - int (*control)(struct Qdisc *, void *); - int (*requeue)(struct sk_buff *skb, struct Qdisc *); + + int (*dump)(struct Qdisc *, struct sk_buff *); }; struct Qdisc_head @@ -30,23 +73,35 @@ struct Qdisc struct Qdisc_head h; int (*enqueue)(struct sk_buff *skb, struct Qdisc *dev); struct sk_buff * (*dequeue)(struct Qdisc *dev); + unsigned flags; +#define TCQ_F_DEFAULT 1 +#define TCQ_F_BUILTIN 2 struct Qdisc_ops *ops; - int handle; + struct Qdisc *next; + u32 handle; + u32 classid; struct Qdisc *parent; struct sk_buff_head q; struct device *dev; - unsigned long dropped; - unsigned long tx_last; + + struct tc_stats stats; unsigned long tx_timeo; + unsigned long tx_last; + int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q); char data[0]; }; +struct qdisc_rate_table +{ + struct tc_ratespec rate; + u32 data[256]; + struct qdisc_rate_table *next; + int refcnt; +}; -/* Yes, it is slow for [34]86, but we have no choice. - 10 msec resolution is appropriate only for bandwidth < 32Kbit/sec. - RULE: +/* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: @@ -57,22 +112,96 @@ struct Qdisc 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( - */ + The things are not so bad, because we may use artifical + clock evaluated by integration of network data flow + in the most critical places. -typedef struct timeval psched_time_t; + Note: we do not use fastgettimeofday. + The reason is that, when it is not the same thing as + gettimeofday, it returns invalid timestamp, which is + not updated, when net_bh is active. 
-/* On 64bit architecures it would be clever to define: -typedef u64 psched_time_t; - and make all this boring arithmetics directly + So, use PSCHED_CLOCK_SOURCE = PSCHED_CPU on alpha and pentiums + with rtdsc. And PSCHED_JIFFIES on all other architectures, including [34]86 + and pentiums without rtdsc. + You can use PSCHED_GETTIMEOFDAY on another architectures, + which have fast and precise clock source, but it is too expensive. */ -#ifndef SCHEDULE_ONLY_LOW_BANDWIDTH + +#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY + +typedef struct timeval psched_time_t; +typedef long psched_tdiff_t; + #define PSCHED_GET_TIME(stamp) do_gettimeofday(&(stamp)) +#define PSCHED_US2JIFFIE(usecs) (((usecs)+(1000000/HZ-1))/(1000000/HZ)) + +#else /* PSCHED_CLOCK_SOURCE != PSCHED_GETTIMEOFDAY */ + +typedef u64 psched_time_t; +typedef long psched_tdiff_t; + +extern psched_time_t psched_time_base; + +#if PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES + +#define PSCHED_WATCHER + +extern unsigned long psched_time_mark; + +#if HZ == 100 +#define PSCHED_JSCALE 7 +#elif HZ == 1024 +#define PSCHED_JSCALE 10 #else -#define PSCHED_GET_TIME(stamp) ((stamp) = xtime) +#define PSCHED_JSCALE 0 #endif +#define PSCHED_GET_TIME(stamp) ((stamp) = psched_time_base + (((unsigned long)(jiffies-psched_time_mark))<>PSCHED_JSCALE) + +#elif PSCHED_CLOCK_SOURCE == PSCHED_CPU + +extern psched_tdiff_t psched_clock_per_hz; +extern int psched_clock_scale; + +#define PSCHED_US2JIFFIE(delay) (((delay)+psched_clock_per_hz-1)/psched_clock_per_hz) + +#if CPU == 586 || CPU == 686 + +#define PSCHED_GET_TIME(stamp) \ +({ u32 hi, lo; \ + __asm__ __volatile__ (".byte 0x0f,0x31" :"=a" (lo), "=d" (hi)); \ + (stamp) = ((((u64)hi)<<32) + lo)>>psched_clock_scale; \ +}) + +#elif defined (__alpha__) + +#define PSCHED_WATCHER + +extern u32 psched_time_mark; + +#define PSCHED_GET_TIME(stamp) \ +({ u32 __res; \ + __asm__ __volatile__ ("rpcc %0" : "r="(__res)); \ + if (__res <= psched_time_mark) psched_time_base += 0x100000000UL; \ + 
psched_time_mark = __res; \ + (stamp) = (psched_time_base + __res)>>psched_clock_scale; \ +}) + +#else + +#error PSCHED_CLOCK_SOURCE=PSCHED_CPU is not supported on this arch. + +#endif /* ARCH */ + +#endif /* PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES */ + +#endif /* PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY */ + +#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY #define PSCHED_TDIFF(tv1, tv2) \ ({ \ int __delta_sec = (tv1).tv_sec - (tv2).tv_sec; \ @@ -106,8 +235,6 @@ typedef u64 psched_time_t; __delta; \ }) -#define PSCHED_US2JIFFIE(usecs) (((usecs)+(1000000/HZ-1))/(1000000/HZ)) - #define PSCHED_TLESS(tv1, tv2) (((tv1).tv_usec < (tv2).tv_usec && \ (tv1).tv_sec <= (tv2).tv_sec) || \ (tv1).tv_sec < (tv2).tv_sec) @@ -127,24 +254,86 @@ typedef u64 psched_time_t; (tv).tv_usec -= 1000000; } \ }) -/* Set/check that undertime is in the "past perfect"; +/* Set/check that time is in the "past perfect"; it depends on concrete representation of system time */ #define PSCHED_SET_PASTPERFECT(t) ((t).tv_sec = 0) #define PSCHED_IS_PASTPERFECT(t) ((t).tv_sec == 0) +#define PSCHED_AUDIT_TDIFF(t) ({ if ((t) > 2000000) (t) = 2000000; }) + +#else + +#define PSCHED_TDIFF(tv1, tv2) (long)((tv1) - (tv2)) +#define PSCHED_TDIFF_SAFE(tv1, tv2, bound, guard) \ +({ \ + long __delta = (tv1) - (tv2); \ + if ( __delta > (bound)) { __delta = (bound); guard; } \ + __delta; \ +}) + + +#define PSCHED_TLESS(tv1, tv2) ((tv1) < (tv2)) +#define PSCHED_TADD2(tv, delta, tv_res) ((tv_res) = (tv) + (delta)) +#define PSCHED_TADD(tv, delta) ((tv) += (delta)) +#define PSCHED_SET_PASTPERFECT(t) ((t) = 0) +#define PSCHED_IS_PASTPERFECT(t) ((t) == 0) +#define PSCHED_AUDIT_TDIFF(t) + +#endif + +struct tcf_police +{ + struct tcf_police *next; + int refcnt; + u32 index; + + int action; + u32 burst; + u32 mtu; + + u32 toks; + u32 ptoks; + psched_time_t t_c; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; +}; + +extern void tcf_police_destroy(struct tcf_police *p); +extern struct tcf_police * 
tcf_police_locate(struct rtattr *rta); +extern int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p); +extern int tcf_police(struct sk_buff *skb, struct tcf_police *p); + +extern __inline__ void tcf_police_release(struct tcf_police *p) +{ + if (p && --p->refcnt == 0) + tcf_police_destroy(p); +} extern struct Qdisc noop_qdisc; +extern struct Qdisc_ops noop_qdisc_ops; +extern struct Qdisc_ops pfifo_qdisc_ops; +extern struct Qdisc_ops bfifo_qdisc_ops; int register_qdisc(struct Qdisc_ops *qops); int unregister_qdisc(struct Qdisc_ops *qops); +struct Qdisc *qdisc_lookup(struct device *dev, u32 handle); +struct Qdisc *qdisc_lookup_class(struct device *dev, u32 handle); void dev_init_scheduler(struct device *dev); void dev_shutdown(struct device *dev); void dev_activate(struct device *dev); void dev_deactivate(struct device *dev); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); +struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops); +struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc); +int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt); +void qdisc_kill_estimator(struct tc_stats *stats); +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab); +void qdisc_put_rtab(struct qdisc_rate_table *tab); +int teql_init(void); +int tc_filter_init(void); int pktsched_init(void); void qdisc_run_queues(void); @@ -161,4 +350,10 @@ extern __inline__ void qdisc_wakeup(struct device *dev) } } +extern __inline__ unsigned psched_mtu(struct device *dev) +{ + unsigned mtu = dev->mtu; + return dev->hard_header ? 
mtu + dev->hard_header_len : mtu; +} + #endif diff --git a/include/net/snmp.h b/include/net/snmp.h index 9cd239bf8e6a..eeeeb6aa2ff7 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -130,6 +130,7 @@ struct linux_mib unsigned long SyncookiesSent; unsigned long SyncookiesRecv; unsigned long SyncookiesFailed; + unsigned long EmbryonicRsts; }; #endif diff --git a/include/net/sock.h b/include/net/sock.h index afd00efff8a1..29d7985bef0c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -323,7 +323,7 @@ struct tcp_opt { /* Define this to get the sk->debug debugging facility. */ #define SOCK_DEBUGGING #ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) if((sk) && ((sk)->debug)) printk(KERN_DEBUG ## msg) +#define SOCK_DEBUG(sk, msg...) do { if((sk) && ((sk)->debug)) printk(KERN_DEBUG ## msg); } while (0) #else #define SOCK_DEBUG(sk, msg...) do { } while (0) #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 7d4b7008215f..52853f44a5c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -172,7 +172,8 @@ struct tcp_tw_bucket { __u32 rcv_nxt; struct tcp_func *af_specific; struct tcp_bind_bucket *tb; - struct timer_list timer; + struct tcp_tw_bucket *next_death; + int death_slot; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct in6_addr v6_daddr; struct in6_addr v6_rcv_saddr; @@ -248,9 +249,11 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #define MAX_RESET_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15) #define MAX_TCPHEADER_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15) -#define MAX_WINDOW 32767 /* Never offer a window over 32767 without using - window scaling (not yet supported). Some poor - stacks do signed 16bit maths! */ +/* + * Never offer a window over 32767 without using window scaling. Some + * poor stacks do signed 16bit maths! 
+ */ +#define MAX_WINDOW 32767 #define MIN_WINDOW 2048 #define MAX_ACK_BACKLOG 2 #define MAX_DELAY_ACK 2 @@ -293,13 +296,17 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_PERIOD ((75*HZ)>>2) /* period of keepalive check */ -#define TCP_SYNACK_PERIOD (HZ/2) +#define TCP_SYNACK_PERIOD (HZ/2) /* How often to run the synack slow timer */ #define TCP_QUICK_TRIES 8 /* How often we try to retransmit, until - * we tell the LL layer that it is something + * we tell the link layer that it is something * wrong (e.g. that it can expire redirects) */ #define TCP_BUCKETGC_PERIOD (HZ) +/* TIME_WAIT reaping mechanism. */ +#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ +#define TCP_TWKILL_PERIOD ((HZ*60)/TCP_TWKILL_SLOTS) + /* * TCP option */ @@ -564,6 +571,8 @@ extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss); +/* tcp_output.c */ + extern void tcp_read_wakeup(struct sock *); extern void tcp_write_xmit(struct sock *); extern void tcp_time_wait(struct sock *); @@ -572,8 +581,6 @@ extern void tcp_fack_retransmit(struct sock *); extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); -/* tcp_output.c */ - extern void tcp_send_probe0(struct sock *); extern void tcp_send_partial(struct sock *); extern void tcp_write_wakeup(struct sock *); @@ -615,11 +622,38 @@ struct tcp_sl_timer { #define TCP_SLT_SYNACK 0 #define TCP_SLT_KEEPALIVE 1 -#define TCP_SLT_BUCKETGC 2 -#define TCP_SLT_MAX 3 +#define TCP_SLT_TWKILL 2 +#define TCP_SLT_BUCKETGC 3 +#define TCP_SLT_MAX 4 extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX]; +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. 
+ */ +static __inline__ unsigned int tcp_current_mss(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct dst_entry *dst = sk->dst_cache; + unsigned int mss_now = sk->mss; + + if(dst && (sk->mtu < dst->pmtu)) { + unsigned int mss_distance = (sk->mtu - sk->mss); + + /* PMTU discovery event has occurred. */ + sk->mtu = dst->pmtu; + sk->mss = sk->mtu - mss_distance; + } + + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt) + mss_now -= sk->opt->optlen; + + return mss_now; +} + /* Compute the actual receive window we are currently advertising. */ static __inline__ u32 tcp_receive_window(struct tcp_opt *tp) { @@ -919,7 +953,7 @@ extern __inline__ void tcp_select_initial_window(__u32 space, __u16 mss, * our initial window offering to 32k. There should also * be a sysctl option to stop being nice. */ - (*rcv_wnd) = min(space,32767); + (*rcv_wnd) = min(space, MAX_WINDOW); (*rcv_wscale) = 0; if (wscale_ok) { /* See RFC1323 for an explanation of the limit to 14 */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2423d0d8bde4..fc815b4874b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -108,17 +108,6 @@ static spinlock_t page_alloc_lock; * but this had better return false if any reasonable "get_free_page()" * allocation could currently fail.. * - * Currently we approve of the following situations: - * - the highest memory order has two entries - * - the highest memory order has one free entry and: - * - the next-highest memory order has two free entries - * - the highest memory order has one free entry and: - * - the next-highest memory order has one free entry - * - the next-next-highest memory order has two free entries - * - * [previously, there had to be two entries of the highest memory - * order, but this lead to problems on large-memory machines.] - * * This will return zero if no list was found, non-zero * if there was memory (the bigger, the better). 
*/ @@ -129,13 +118,14 @@ int free_memory_available(int nr) struct free_area_struct * list; /* - * If we have more than about 6% of all memory free, + * If we have more than about 3% to 5% of all memory free, * consider it to be good enough for anything. * It may not be, due to fragmentation, but we * don't want to keep on forever trying to find * free unfragmented memory. + * Added low/high water marks to avoid thrashing -- Rik. */ - if (nr_free_pages > num_physpages >> 4) + if (nr_free_pages > (num_physpages >> 5) + (nr ? 0 : num_physpages >> 6)) return nr+1; list = free_area + NR_MEM_LISTS; diff --git a/mm/vmscan.c b/mm/vmscan.c index 0b970c06ae6b..8eaeb23d59e5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -441,9 +441,6 @@ static inline int do_try_to_free_page(int gfp_mask) int i=6; int stop; - /* Let the dcache know we're looking for memory ... */ - shrink_dcache(); - /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); @@ -458,17 +455,17 @@ static inline int do_try_to_free_page(int gfp_mask) switch (state) { do { case 0: + state = 1; if (shrink_mmap(i, gfp_mask)) return 1; - state = 1; case 1: + state = 2; if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask)) return 1; - state = 2; default: + state = 0; if (swap_out(i, gfp_mask)) return 1; - state = 0; i--; } while ((i - stop) >= 0); } @@ -556,23 +553,17 @@ int kswapd(void *unused) * more aggressive if we're really * low on free memory. * - * Normally this is called 4 times - * a second if we need more memory, - * so this has a normal rate of - * X*4 pages of memory free'd per - * second. That rate goes up when - * - * - we're really low on memory (we get woken - * up a lot more) - * - other processes fail to allocate memory, - * at which time they try to do their own - * freeing. - * - * A "tries" value of 50 means up to 200 pages - * per second (1.6MB/s). This should be a /proc - * thing. + * The number of tries is 512 divided by an + * 'urgency factor'. 
In practice this will mean + * a value of 512 / 8 = 64 pages at a time, + * giving 64 * 4 (times/sec) * 4k (pagesize) = + * 1 MB/s in lowest-priority background + * paging. This number rises to 8 MB/s when the + * priority is highest (but then we'll be woken + * up more often and the rate will be even higher). + * -- Should make this sysctl tunable... */ - tries = (50 << 2) >> free_memory_available(3); + tries = (512) >> free_memory_available(3); while (tries--) { int gfp_mask; @@ -625,7 +616,7 @@ void swap_tick(void) if ((long) (now - want) >= 0) { if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100 - || (num_physpages * page_cache.max_percent < page_cache_size)) { + || (num_physpages * page_cache.max_percent < page_cache_size * 100)) { /* Set the next wake-up time */ next_swap_jiffies = now + swapout_interval; wake_up(&kswapd_wait); diff --git a/net/Config.in b/net/Config.in index b4547e569c49..62dfd430f57a 100644 --- a/net/Config.in +++ b/net/Config.in @@ -42,6 +42,11 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # if [ "$CONFIG_LLC" = "y" ]; then # bool 'Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI # fi + tristate 'Acorn Econet/AUN protocols (EXPERIMENTAL)' CONFIG_ECONET + if [ "$CONFIG_ECONET" != "n" ]; then + bool ' AUN over UDP' CONFIG_ECONET_AUNUDP + bool ' Native Econet' CONFIG_ECONET_NATIVE + fi tristate 'WAN router' CONFIG_WAN_ROUTER bool 'Fast switching (read help!)' CONFIG_NET_FASTROUTE bool 'Forwarding between high speed interfaces' CONFIG_NET_HW_FLOWCONTROL diff --git a/net/bridge/Makefile b/net/bridge/Makefile index bc432f31608f..bcccefb75ac9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux Bridge layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. 
DON'T put your own dependencies here diff --git a/net/core/Makefile b/net/core/Makefile index fc9dc31c4f72..ecbe9d99a7cc 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,8 +9,7 @@ O_TARGET := core.o -O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o rtnetlink.o utils.o +O_OBJS := sock.o skbuff.o iovec.o datagram.o scm.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o @@ -22,7 +21,7 @@ endif ifdef CONFIG_NET -O_OBJS += dev.o dev_mcast.o +O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o ifdef CONFIG_FIREWALL OX_OBJS += firewall.o diff --git a/net/core/dev.c b/net/core/dev.c index f2c62fab6a23..85312b12c0e6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1789,7 +1789,9 @@ __initfunc(int net_dev_init(void)) { struct device *dev, **dp; +#ifdef CONFIG_NET_SCHED pktsched_init(); +#endif /* * Initialise the packet receive queue. diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cf7fe8ff891c..4bbe84cac8ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -63,6 +63,19 @@ void rtnl_unlock() rtnl_shunlock(); } +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + #ifdef CONFIG_RTNETLINK struct sock *rtnl; @@ -109,6 +122,19 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data memcpy(RTA_DATA(rta), data, attrlen); } +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + #ifdef CONFIG_RTNL_OLD_IFINFO static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct 
device *dev, int type, pid_t pid, u32 seq) @@ -132,7 +158,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, strncpy(r->ifi_name, dev->name, IFNAMSIZ-1); r->ifi_qdiscname[0] = 0; r->ifi_qdisc = dev->qdisc_sleeping->handle; - if (dev->qdisc_sleeping->ops) + if (dev->qdisc_sleeping) strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id); if (dev->get_stats) { struct net_device_stats *stats = dev->get_stats(dev); @@ -175,7 +201,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, } if (dev->ifindex != dev->iflink) RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); - if (dev->qdisc_sleeping->ops) + if (dev->qdisc_sleeping) RTA_PUT(skb, IFLA_QDISC, strlen(dev->qdisc_sleeping->ops->id) + 1, dev->qdisc_sleeping->ops->id); diff --git a/net/core/sock.c b/net/core/sock.c index b9ab1d2a853f..30e5d3e77d09 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -290,6 +290,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; +#ifdef CONFIG_NETDEVICES case SO_BINDTODEVICE: /* Bind this socket to a particular device like "eth0", * as specified in an ifreq structure. If the device @@ -316,6 +317,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } } return 0; +#endif #ifdef CONFIG_FILTER diff --git a/net/econet/Makefile b/net/econet/Makefile new file mode 100644 index 000000000000..3675848735aa --- /dev/null +++ b/net/econet/Makefile @@ -0,0 +1,23 @@ +# +# Makefile for Econet support code. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... 
+ +MOD_LIST_NAME := NET_MISC_MODULES + +O_OBJS := +M_OBJS := + +ifeq ($(CONFIG_ECONET),y) + O_OBJS += econet.o +else + ifeq ($(CONFIG_ECONET), m) + M_OBJS += econet.o + endif +endif + +include $(TOPDIR)/Rules.make diff --git a/net/econet/econet.c b/net/econet/econet.c new file mode 100644 index 000000000000..9bfbfd92123e --- /dev/null +++ b/net/econet/econet.c @@ -0,0 +1,1108 @@ +/* + * An implementation of the Acorn Econet and AUN protocols. + * Philip Blundell + * + * Fixes: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct proto_ops econet_ops; +static struct sock *econet_sklist; + +#ifdef CONFIG_ECONET_AUNUDP +static struct socket *udpsock; +#define AUN_PORT 0x8000 + +struct aunhdr +{ + unsigned char code; /* AUN magic protocol byte */ + unsigned char port; + unsigned char cb; + unsigned char pad; + unsigned long handle; +}; + +static unsigned long aun_seq = 0; + +/* Queue of packets waiting to be transmitted. */ +static struct sk_buff_head aun_queue; +static struct timer_list ab_cleanup_timer; + +#endif /* CONFIG_ECONET_AUNUDP */ + +/* Per-packet information */ +struct ec_cb +{ + struct sockaddr_ec sec; + unsigned long cookie; /* Supplied by user. 
*/ +#ifdef CONFIG_ECONET_AUNUDP + int done; + unsigned long seq; /* Sequencing */ + unsigned long timeout; /* Timeout */ + unsigned long start; /* jiffies */ +#endif +#ifdef CONFIG_ECONET_NATIVE + void (*sent)(struct sk_buff *, int result); +#endif +}; + +struct ec_device +{ + struct device *dev; /* Real device structure */ + unsigned char station, net; /* Econet protocol address */ + struct ec_device *prev, *next; /* Linked list */ +}; + +static struct ec_device *edevlist = NULL; + +static spinlock_t edevlist_lock; + +/* + * Faster version of edev_get - call with IRQs off + */ + +static __inline__ struct ec_device *__edev_get(struct device *dev) +{ + struct ec_device *edev; + for (edev = edevlist; edev; edev = edev->next) + { + if (edev->dev == dev) + break; + } + return edev; +} + +/* + * Find an Econet device given its `dev' pointer. This is IRQ safe. + */ + +static struct ec_device *edev_get(struct device *dev) +{ + struct ec_device *edev; + unsigned long flags; + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + spin_unlock_irqrestore(&edevlist_lock, flags); + return edev; +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int econet_recvmsg(struct socket *sock, struct msghdr *msg, int len, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + msg->msg_namelen = sizeof(struct sockaddr_ec); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. 
+ */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free; + sk->stamp=skb->stamp; + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +/* + * Bind an Econet socket. + */ + +static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + struct sock *sk=sock->sk; + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ec)) + return -EINVAL; + if (sec->sec_family != AF_ECONET) + return -EINVAL; + + sk->protinfo.af_econet->cb = sec->cb; + sk->protinfo.af_econet->port = sec->port; + sk->protinfo.af_econet->station = sec->addr.station; + sk->protinfo.af_econet->net = sec->addr.net; + + return 0; +} + +/* + * Queue a transmit result for the user to be told about. 
+ */ + +static void tx_result(struct sock *sk, unsigned long cookie, int result) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (skb == NULL) + { + printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n"); + return; + } + + eb = (struct ec_cb *)&skb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->cookie = cookie; + sec->type = ECTYPE_TRANSMIT_STATUS | result; + sec->sec_family = AF_ECONET; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +#ifdef CONFIG_ECONET_NATIVE +/* + * Called by the Econet hardware driver when a packet transmit + * has completed. Tell the user. + */ + +static void ec_tx_done(struct sk_buff *skb, int result) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + tx_result(skb->sk, eb->cookie, result); +} +#endif + +/* + * Send a packet. We have to work out which device it's going out on + * and hence whether to use real Econet or the UDP emulation. + */ + +static int econet_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name; + struct device *dev; + struct ec_addr addr; + struct ec_device *edev; + int err; + unsigned char port, cb; + struct sk_buff *skb; + struct ec_cb *eb; +#ifdef CONFIG_ECONET_NATIVE + unsigned short proto = 0; +#endif +#ifdef CONFIG_ECONET_AUNUDP + struct msghdr udpmsg; + struct iovec iov[msg->msg_iovlen+1]; + struct aunhdr ah; + struct sockaddr_in udpdest; + __kernel_size_t size; + int i; + mm_segment_t oldfs; +#endif + + /* + * Check the flags. + */ + + if (msg->msg_flags&~MSG_DONTWAIT) + return(-EINVAL); + + /* + * Get and verify the address. 
+ */ + + if (saddr == NULL) { + addr.station = sk->protinfo.af_econet->station; + addr.net = sk->protinfo.af_econet->net; + port = sk->protinfo.af_econet->port; + cb = sk->protinfo.af_econet->cb; + } else { + if (msg->msg_namelen < sizeof(struct sockaddr_ec)) + return -EINVAL; + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; + } + + /* Look for a device with the right network number. */ + for (edev = edevlist; edev && (edev->net != addr.net); + edev = edev->next); + + /* Bridge? What's that? */ + if (edev == NULL) + return -ENETUNREACH; + + dev = edev->dev; + + if (dev->type == ARPHRD_ECONET) + { + /* Real hardware Econet. We're not worthy etc. */ +#ifdef CONFIG_ECONET_NATIVE + unsigned char *p; + + dev_lock_list(); + + skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->nh.raw = skb->data; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->sec = *saddr; + eb->sent = ec_tx_done; + + if (dev->hard_header) { + int res; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), &addr, NULL, len); + if (sock->type != SOCK_DGRAM) { + skb->tail = skb->data; + skb->len = 0; + } else if (res < 0) + goto out_free; + } + + /* Copy the data. Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->priority; + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + dev_unlock_list(); + dev_queue_xmit(skb); + return(len); + + out_free: + kfree_skb(skb); + out_unlock: + dev_unlock_list(); +#else + err = -EPROTOTYPE; +#endif + return err; + } + +#ifdef CONFIG_ECONET_AUNUDP + /* AUN virtual Econet. 
*/ + + if (udpsock == NULL) + return -ENETDOWN; /* No socket - can't send */ + + /* Make up a UDP datagram and hand it off to some higher intellect. */ + + memset(&udpdest, 0, sizeof(udpdest)); + udpdest.sin_family = AF_INET; + udpdest.sin_port = htons(AUN_PORT); + + /* At the moment we use the stupid Acorn scheme of Econet address + y.x maps to IP a.b.c.x. This should be replaced with something + more flexible and more aware of subnet masks. */ + { + struct in_device *idev = (struct in_device *)dev->ip_ptr; + unsigned long network = ntohl(idev->ifa_list->ifa_address) & + 0xffffff00; /* !!! */ + udpdest.sin_addr.s_addr = htonl(network | addr.station); + } + + ah.port = port; + ah.cb = cb & 0x7f; + ah.code = 2; /* magic */ + ah.pad = 0; + + /* tack our header on the front of the iovec */ + size = sizeof(struct aunhdr); + iov[0].iov_base = (void *)&ah; + iov[0].iov_len = size; + for (i = 0; i < msg->msg_iovlen; i++) { + void *base = msg->msg_iov[i].iov_base; + size_t len = msg->msg_iov[i].iov_len; + /* Check it now since we switch to KERNEL_DS later. 
*/ + if ((err = verify_area(VERIFY_READ, base, len)) < 0) + return err; + iov[i+1].iov_base = base; + iov[i+1].iov_len = len; + size += len; + } + + /* Get a skbuff (no data, just holds our cb information) */ + if ((skb = sock_alloc_send_skb(sk, 0, 0, + msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + return err; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->timeout = (5*HZ); + eb->start = jiffies; + ah.handle = aun_seq; + eb->seq = (aun_seq++); + eb->sec = *saddr; + + skb_queue_tail(&aun_queue, skb); + + udpmsg.msg_name = (void *)&udpdest; + udpmsg.msg_namelen = sizeof(udpdest); + udpmsg.msg_iov = &iov[0]; + udpmsg.msg_iovlen = msg->msg_iovlen + 1; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ + err = sock_sendmsg(udpsock, &udpmsg, size); + set_fs(oldfs); +#else + err = -EPROTOTYPE; +#endif + return err; +} + +/* + * Look up the address of a socket. + */ + +static int econet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + + if (peer) + return -EOPNOTSUPP; + + sec->sec_family = AF_ECONET; + sec->port = sk->protinfo.af_econet->port; + sec->addr.station = sk->protinfo.af_econet->station; + sec->addr.net = sk->protinfo.af_econet->net; + + *uaddr_len = sizeof(*sec); + return 0; +} + +static void econet_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!atomic_read(&sk->wmem_alloc) && !atomic_read(&sk->rmem_alloc)) { + sk_free(sk); + MOD_DEC_USE_COUNT; + return; + } + + sk->timer.expires=jiffies+10*HZ; + add_timer(&sk->timer); + printk(KERN_DEBUG "econet socket destroy delayed\n"); +} + +/* + * Close an econet socket. 
+ */ + +static int econet_release(struct socket *sock, struct socket *peersock) +{ + struct sk_buff *skb; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + sklist_remove_socket(&econet_sklist, sk); + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->state_change(sk); /* It is useless. Just for sanity. */ + + sock->sk = NULL; + sk->socket = NULL; + sk->dead = 1; + + /* Purge queues */ + + while ((skb=skb_dequeue(&sk->receive_queue))!=NULL) + kfree_skb(skb); + + if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) { + sk->timer.data=(unsigned long)sk; + sk->timer.expires=jiffies+HZ; + sk->timer.function=econet_destroy_timer; + add_timer(&sk->timer); + return 0; + } + + sk_free(sk); + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * Create an Econet socket + */ + +static int econet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + int err; + + /* Econet only provides datagram services. */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + MOD_INC_USE_COUNT; + + err = -ENOBUFS; + sk = sk_alloc(AF_ECONET, GFP_KERNEL, 1); + if (sk == NULL) + goto out; + + sk->reuse = 1; + sock->ops = &econet_ops; + sock_init_data(sock,sk); + + sk->protinfo.af_econet = kmalloc(sizeof(struct econet_opt), GFP_KERNEL); + if (sk->protinfo.af_econet == NULL) + goto out_free; + memset(sk->protinfo.af_econet, 0, sizeof(struct econet_opt)); + sk->zapped=0; + sk->family = AF_ECONET; + sk->num = protocol; + + sklist_insert_socket(&econet_sklist, sk); + return(0); + +out_free: + sk_free(sk); +out: + MOD_DEC_USE_COUNT; + return err; +} + +/* + * Handle Econet specific ioctls + */ + +static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void *arg) +{ + struct ifreq ifr; + struct ec_device *edev; + struct device *dev; + unsigned long flags; + struct sockaddr_ec *sec; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return 
-EFAULT; + + if ((dev = dev_get(ifr.ifr_name)) == NULL) + return -ENODEV; + + sec = (struct sockaddr_ec *)&ifr.ifr_addr; + + switch (cmd) + { + case SIOCSIFADDR: + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + if (edev == NULL) + { + /* Magic up a new one. */ + edev = kmalloc(sizeof(struct ec_device), GFP_KERNEL); + if (edev == NULL) { + printk("af_ec: memory squeeze.\n"); + spin_unlock_irqrestore(&edevlist_lock, flags); + return -ENOMEM; + } + memset(edev, 0, sizeof(struct ec_device)); + edev->dev = dev; + edev->next = edevlist; + edevlist = edev; + } + edev->station = sec->addr.station; + edev->net = sec->addr.net; + spin_unlock_irqrestore(&edevlist_lock, flags); + return 0; + + case SIOCGIFADDR: + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + if (edev == NULL) + { + spin_unlock_irqrestore(&edevlist_lock, flags); + return -ENODEV; + } + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->addr.station = edev->station; + sec->addr.net = edev->net; + sec->sec_family = AF_ECONET; + spin_unlock_irqrestore(&edevlist_lock, flags); + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; + } + + return -EINVAL; +} + +/* + * Handle generic ioctls + */ + +static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + int pid; + + switch(cmd) + { + case FIOSETOWN: + case SIOCSPGRP: + err = get_user(pid, (int *) arg); + if (err) + return err; + if (current->pid != pid && current->pgrp != -pid && !suser()) + return -EPERM; + sk->proc = pid; + return(0); + case FIOGETOWN: + case SIOCGPGRP: + return put_user(sk->proc, (int *)arg); + case SIOCGSTAMP: + if(sk->stamp.tv_sec==0) + return -ENOENT; + err = -EFAULT; + if (!copy_to_user((void *)arg, &sk->stamp, sizeof(struct timeval))) + err = 0; + return err; + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCGIFCONF: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMEM: + case 
SIOCSIFMEM: + case SIOCGIFMTU: + case SIOCSIFMTU: + case SIOCSIFLINK: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCSIFMAP: + case SIOCGIFMAP: + case SIOCSIFSLAVE: + case SIOCGIFSLAVE: + case SIOCGIFINDEX: + case SIOCGIFNAME: + case SIOCGIFCOUNT: + case SIOCSIFHWBROADCAST: + return(dev_ioctl(cmd,(void *) arg)); + + + case SIOCSIFADDR: + case SIOCGIFADDR: + return ec_dev_ioctl(sock, cmd, (void *)arg); + break; + + default: + if ((cmd >= SIOCDEVPRIVATE) && + (cmd <= (SIOCDEVPRIVATE + 15))) + return(dev_ioctl(cmd,(void *) arg)); + +#ifdef CONFIG_NET_RADIO + if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) + return(dev_ioctl(cmd,(void *) arg)); +#endif + return -EOPNOTSUPP; + } + /*NOTREACHED*/ + return 0; +} + +static struct net_proto_family econet_family_ops = { + AF_ECONET, + econet_create +}; + +static struct proto_ops econet_ops = { + AF_ECONET, + + sock_no_dup, + econet_release, + econet_bind, + sock_no_connect, + NULL, + NULL, + econet_getname, + datagram_poll, + econet_ioctl, + sock_no_listen, + sock_no_shutdown, + sock_no_setsockopt, + sock_no_getsockopt, + sock_no_fcntl, + econet_sendmsg, + econet_recvmsg +}; + +/* + * Find the listening socket, if any, for the given data. + */ + +static struct sock *ec_listening_socket(unsigned char port, unsigned char + station, unsigned char net) +{ + struct sock *sk = econet_sklist; + + while (sk) + { + struct econet_opt *opt = sk->protinfo.af_econet; + if ((opt->port == port || opt->port == 0) && + (opt->station == station || opt->station == 0) && + (opt->net == net || opt->net == 0)) + return sk; + sk = sk->sklist_next; + } + + return NULL; +} + +#ifdef CONFIG_ECONET_AUNUDP + +/* + * Send an AUN protocol response. 
+ */ + +static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb) +{ + struct sockaddr_in sin; + struct iovec iov; + struct aunhdr ah; + struct msghdr udpmsg; + int err; + mm_segment_t oldfs; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(AUN_PORT); + sin.sin_addr.s_addr = addr; + + ah.code = code; + ah.pad = 0; + ah.port = 0; + ah.cb = cb; + ah.handle = seq; + + iov.iov_base = (void *)&ah; + iov.iov_len = sizeof(ah); + + udpmsg.msg_name = (void *)&sin; + udpmsg.msg_namelen = sizeof(sin); + udpmsg.msg_iov = &iov; + udpmsg.msg_iovlen = 1; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(udpsock, &udpmsg, sizeof(ah)); + set_fs(oldfs); +} + +/* + * Handle incoming AUN packets. Work out if anybody wants them, + * and send positive or negative acknowledgements as appropriate. + */ + +static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) +{ + struct ec_device *edev = edev_get(skb->dev); + struct iphdr *ip = skb->nh.iph; + unsigned char stn = ntohl(ip->saddr) & 0xff; + struct sock *sk; + struct sk_buff *newskb; + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (edev == NULL) + return; /* Device not configured for AUN */ + + if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL) + goto bad; /* Nobody wants it */ + + newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15, + GFP_ATOMIC); + if (newskb == NULL) + { + printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n"); + /* Send nack and hope sender tries again */ + goto bad; + } + + eb = (struct ec_cb *)&newskb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->sec_family = AF_ECONET; + sec->type = ECTYPE_PACKET_RECEIVED; + sec->port = ah->port; + sec->cb = ah->cb; + sec->addr.net = edev->net; + sec->addr.station = stn; + + memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), 
(void *)(ah+1), + len - sizeof(struct aunhdr)); + + if (sock_queue_rcv_skb(sk, newskb) < 0) + { + /* Socket is bankrupt. */ + kfree_skb(newskb); + goto bad; + } + + aun_send_response(ip->saddr, ah->handle, 3, 0); + return; + +bad: + aun_send_response(ip->saddr, ah->handle, 4, 0); +} + +/* + * Handle incoming AUN transmit acknowledgements. If the sequence + * number matches something in our backlog then kill it and tell + * the user. If the remote took too long to reply then we may have + * dropped the packet already. + */ + +static void aun_tx_ack(unsigned long seq, int result) +{ + struct sk_buff *skb; + unsigned long flags; + struct ec_cb *eb; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + eb = (struct ec_cb *)&skb->cb; + if (eb->seq == seq) + goto foundit; + + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + printk(KERN_DEBUG "AUN: unknown sequence %ld\n", seq); + return; + +foundit: + tx_result(skb->sk, eb->cookie, result); + skb_unlink(skb); + spin_unlock_irqrestore(&aun_queue_lock, flags); +} + +/* + * Deal with received AUN frames - sort out what type of thing it is + * and hand it to the right function. 
+ */ + +static void aun_data_available(struct sock *sk, int slen) +{ + int err; + struct sk_buff *skb; + unsigned char *data; + struct aunhdr *ah; + struct iphdr *ip; + size_t len; + + while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + printk(KERN_ERR "AUN: no data available?!"); + return; + } + printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err); + } + + data = skb->h.raw + sizeof(struct udphdr); + ah = (struct aunhdr *)data; + len = skb->len - sizeof(struct udphdr); + ip = skb->nh.iph; + + switch (ah->code) + { + case 2: + aun_incoming(skb, ah, len); + break; + case 3: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK); + break; + case 4: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING); + break; +#if 0 + /* This isn't quite right yet. */ + case 5: + aun_send_response(ip->saddr, ah->handle, 6, ah->cb); + break; +#endif + default: + printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]); + } + + skb_free_datagram(sk, skb); +} + +/* + * Called by the timer to manage the AUN transmit queue. If a packet + * was sent to a dead or nonexistent host then we will never get an + * acknowledgement back. After a few seconds we need to spot this and + * drop the packet. 
+ */ + +static spinlock_t aun_queue_lock; + +static void ab_cleanup(unsigned long h) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + if ((jiffies - eb->start) > eb->timeout) + { + tx_result(skb->sk, eb->cookie, + ECTYPE_TRANSMIT_NOT_PRESENT); + skb_unlink(skb); + } + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + + mod_timer(&ab_cleanup_timer, jiffies + (HZ*2)); +} + +__initfunc(static int aun_udp_initialise(void)) +{ + int error; + struct sockaddr_in sin; + + skb_queue_head_init(&aun_queue); + spin_lock_init(&aun_queue_lock); + init_timer(&ab_cleanup_timer); + ab_cleanup_timer.expires = jiffies + (HZ*2); + ab_cleanup_timer.function = ab_cleanup; + add_timer(&ab_cleanup_timer); + + memset(&sin, 0, sizeof(sin)); + sin.sin_port = htons(AUN_PORT); + + /* We can count ourselves lucky Acorn machines are too dim to + speak IPv6. :-) */ + if ((error = sock_create(AF_INET, SOCK_DGRAM, 0, &udpsock)) < 0) + { + printk("AUN: socket error %d\n", -error); + return error; + } + + udpsock->sk->reuse = 1; + udpsock->sk->allocation = GFP_ATOMIC; /* we're going to call it + from interrupts */ + + error = udpsock->ops->bind(udpsock, (struct sockaddr *)&sin, + sizeof(sin)); + if (error < 0) + { + printk("AUN: bind error %d\n", -error); + goto release; + } + + udpsock->sk->data_ready = aun_data_available; + + return 0; + +release: + sock_release(udpsock); + udpsock = NULL; + return error; +} +#endif + +static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct device *dev = (struct device *)data; + struct ec_device *edev; + unsigned long flags; + + switch (msg) { + case NETDEV_UNREGISTER: + /* A device has gone down - kill any data we hold for it. 
*/ + spin_lock_irqsave(&edevlist_lock, flags); + for (edev = edevlist; edev; edev = edev->next) + { + if (edev->dev == dev) + { + if (edev->prev) + edev->prev->next = edev->next; + else + edevlist = edev->next; + if (edev->next) + edev->next->prev = edev->prev; + kfree(edev); + break; + } + } + spin_unlock_irqrestore(&edevlist_lock, flags); + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block econet_netdev_notifier={ + econet_notifier, + NULL, + 0 +}; + +#ifdef MODULE +void cleanup_module(void) +{ +#ifdef CONFIG_ECONET_AUNUDP + del_timer(&ab_cleanup_timer); + if (udpsock) + sock_release(udpsock); +#endif + unregister_netdevice_notifier(&econet_netdev_notifier); + sock_unregister(econet_family_ops.family); + return; +} + +int init_module(void) +#else +__initfunc(void econet_proto_init(struct net_proto *pro)) +#endif +{ + spin_lock_init(&edevlist_lock); + spin_lock_init(&aun_queue_lock); + /* Stop warnings from happening on UP systems. */ + (void)edevlist_lock; + (void)aun_queue_lock; + sock_register(&econet_family_ops); +#ifdef CONFIG_ECONET_AUNUDP + aun_udp_initialise(); +#endif + register_netdevice_notifier(&econet_netdev_notifier); +#ifdef MODULE + return 0; +#endif +} diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile index 8c9041b4c198..193d6af8b1a7 100644 --- a/net/ethernet/Makefile +++ b/net/ethernet/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux Ethernet layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2c5220e0fc49..bba2bb96555d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.69 1998/04/03 09:49:42 freitag Exp $ + * Version: $Id: af_inet.c,v 1.71 1998/04/16 05:38:16 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -1052,6 +1052,8 @@ static struct proc_dir_entry proc_net_udp = { #endif /* CONFIG_PROC_FS */ extern void tcp_init(void); +extern void tcp_v4_init(struct net_proto_family *); + /* * Called by socket.c on kernel startup. @@ -1101,9 +1103,12 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) ip_init(); + tcp_v4_init(&inet_family_ops); + /* Setup TCP slab cache for open requests. */ tcp_init(); + /* * Set the ICMP layer up */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index cd9b5ba2136e..592ff5ffbf04 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.4 1998/03/21 07:27:58 davem Exp $ + * Version: $Id: fib_rules.c,v 1.5 1998/04/28 06:21:57 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -65,6 +65,9 @@ struct fib_rule u8 r_flags; u8 r_tos; int r_ifindex; +#ifdef CONFIG_NET_CLS_ROUTE + __u32 r_tclassid; +#endif char r_ifname[IFNAMSIZ]; }; @@ -165,6 +168,10 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (dev) new_r->r_ifindex = dev->ifindex; } +#ifdef CONFIG_NET_CLS_ROUTE + if (rta[RTA_FLOW-1]) + memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); +#endif rp = &fib_rules; if (!new_r->r_preference) { @@ -213,6 +220,16 @@ u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags) return saddr; } +#ifdef CONFIG_NET_CLS_ROUTE +u32 fib_rules_tclass(struct fib_result *res) +{ + if (res->r) + return res->r->r_tclassid; + return 0; +} +#endif + + static void fib_rules_detach(struct device *dev) { struct fib_rule *r; @@ -246,7 +263,7 @@ FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || -#ifdef CONFIG_IP_TOS_ROUTING +#ifdef CONFIG_IP_ROUTE_TOS (r->r_tos && r->r_tos != key->tos) || #endif (r->r_ifindex && r->r_ifindex != key->iif)) @@ -339,6 +356,10 @@ extern 
__inline__ int inet_fill_rule(struct sk_buff *skb, RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); if (r->r_srcmap) RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); +#ifdef CONFIG_NET_CLS_ROUTE + if (r->r_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid); +#endif nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2d37e11e482..107f07791f1a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.8 1998/04/28 06:21:58 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -123,6 +123,9 @@ extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * nh->nh_scope != onh->nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || +#endif +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) return -1; @@ -217,8 +220,12 @@ fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; nh->nh_oif = nhp->rtnh_ifindex; nh->nh_weight = nhp->rtnh_hops + 1; - if (attrlen) + if (attrlen) { nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); +#endif + } nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); return 0; @@ -267,6 +274,11 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); if (gw && gw != nh->nh_gw) return 1; +#ifdef CONFIG_NET_CLS_ROUTE + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); + if (gw && gw != nh->nh_tclassid) + return 1; +#endif } nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); @@ -459,6 +471,10 @@ fib_create_info(const struct rtmsg 
*r, struct kern_rta *rta, goto err_inval; if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) goto err_inval; +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4)) + goto err_inval; +#endif #else goto err_inval; #endif @@ -468,6 +484,10 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, nh->nh_oif = *rta->rta_oif; if (rta->rta_gw) memcpy(&nh->nh_gw, rta->rta_gw, 4); +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow) + memcpy(&nh->nh_tclassid, rta->rta_flow, 4); +#endif nh->nh_flags = r->rtm_flags; #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; @@ -654,6 +674,10 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (fi->fib_rtt) RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt); #else +#ifdef CONFIG_NET_CLS_ROUTE + if (fi->fib_nh[0].nh_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); +#endif if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) { int i; struct rtattr *mx = (struct rtattr *)skb->tail; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 85a38242bc41..20c412b5302d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.51 1998/03/28 00:55:34 davem Exp $ + * Version: $Id: ip_output.c,v 1.56 1998/04/17 02:36:46 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -31,6 +31,10 @@ * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. */ #include @@ -70,7 +74,6 @@ #include #include #include -#include /* * Shall we try to damage output packets if routing dev changes? 
@@ -88,6 +91,9 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +/* + * Add an ip header to a skbuff and send it out. + */ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt) { @@ -303,16 +309,6 @@ void ip_queue_xmit(struct sk_buff *skb) if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; -#ifdef CONFIG_NET_SECURITY - /* Add an IP checksum (must do this before SECurity because - * of possible tunneling). - */ - ip_send_check(iph); - if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb) < FW_ACCEPT) - goto drop; - iph = skb->nh.iph; - /* Don't update tot_len, as the dev->mtu is already decreased. */ -#endif /* This can happen when the transport layer has segments queued * with a cached route, and by the time we get here things are * re-routed to a device with a different MTU than the original @@ -335,10 +331,9 @@ void ip_queue_xmit(struct sk_buff *skb) if (tot_len > rt->u.dst.pmtu) goto fragment; -#ifndef CONFIG_NET_SECURITY /* Add an IP checksum. */ ip_send_check(iph); -#endif + skb->priority = sk->priority; skb->dst->output(skb); return; @@ -382,7 +377,7 @@ drop: * length to be copied. 
*/ -int ip_build_xmit(struct sock *sk, +int ip_build_xmit_slow(struct sock *sk, int getfrag (const void *, char *, unsigned int, @@ -397,91 +392,16 @@ int ip_build_xmit(struct sock *sk, int err; int offset, mf; unsigned short id; - struct iphdr *iph; + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; int nfrags=0; struct ip_options *opt = ipc->opt; int df = htons(IP_DF); -#ifdef CONFIG_NET_SECURITY - int fw_res; -#endif if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (rt->u.dst.mxlock&(1<u.dst.mxlock&(1<ip_hdrincl) - length += sizeof(struct iphdr); - - if (length <= rt->u.dst.pmtu && opt == NULL) { - int error; - struct sk_buff *skb=sock_alloc_send_skb(sk, length+hh_len+15, - 0, flags&MSG_DONTWAIT, &error); - if(skb==NULL) { - ip_statistics.IpOutDiscards++; - return error; - } - - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, hh_len); - - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); - - dev_lock_list(); - - if(!sk->ip_hdrincl) { - iph->version=4; - iph->ihl=5; - iph->tos=sk->ip_tos; - iph->tot_len = htons(length); - iph->id=htons(ip_id_count++); - iph->frag_off = df; - iph->ttl=sk->ip_mc_ttl; - if (rt->rt_type != RTN_MULTICAST) - iph->ttl=sk->ip_ttl; - iph->protocol=sk->protocol; - iph->saddr=rt->rt_src; - iph->daddr=rt->rt_dst; - iph->check=0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); - } - else - err = getfrag(frag, (void *)iph, 0, length); - dev_unlock_list(); - - if (err) - err = -EFAULT; - - if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) - err = -EPERM; -#ifdef CONFIG_NET_SECURITY - if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 5, &skb))u.dst.output(skb); - } - + if (!sk->ip_hdrincl) length -= sizeof(struct iphdr); @@ -497,7 +417,7 @@ int ip_build_xmit(struct sock *sk, */ maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; - } + } if (length 
+ fragheaderlen > 0xFFFF) return -EMSGSIZE; @@ -551,9 +471,9 @@ int ip_build_xmit(struct sock *sk, */ do { - struct sk_buff * skb; int error; char *data; + struct sk_buff * skb; /* * Get the memory we require with some space left for alignment. @@ -581,13 +501,15 @@ int ip_build_xmit(struct sock *sk, */ data = skb_put(skb, fraglen); - skb->nh.iph = iph = (struct iphdr *)data; + skb->nh.iph = (struct iphdr *)data; /* * Only write IP header onto non-raw packets */ if(!sk->ip_hdrincl) { + struct iphdr *iph = (struct iphdr *)data; + iph->version = 4; iph->ihl = 5; if (opt) { @@ -622,49 +544,148 @@ int ip_build_xmit(struct sock *sk, * User data callback */ - err = getfrag(frag, data, offset, fraglen-fragheaderlen); - if (err) + err = 0; + if (getfrag(frag, data, offset, fraglen-fragheaderlen)) err = -EFAULT; /* * Account for the fragment. */ - - if(!err && !offset && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) + + if(!err && offset == 0 && + call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb) < FW_ACCEPT) err = -EPERM; -#ifdef CONFIG_NET_SECURITY - if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 6, &skb))u.dst.output(skb)) { - if (nfrags>1) - ip_statistics.IpFragCreates += nfrags; - dev_unlock_list(); - return -ENETDOWN; + err = -ENETDOWN; + ip_statistics.IpOutDiscards++; + break; } } while (offset >= 0); if (nfrags>1) ip_statistics.IpFragCreates += nfrags; + dev_unlock_list(); + return err; +} + + +/* + * Fast path for unfragmented packets. + */ +int ip_build_xmit(struct sock *sk, + int getfrag (const void *, + char *, + unsigned int, + unsigned int), + const void *frag, + unsigned length, + struct ipcm_cookie *ipc, + struct rtable *rt, + int flags) +{ + int err; + struct sk_buff *skb; + int df; + struct iphdr *iph; + + /* + * Try the simple case first. 
This leaves fragmented frames, and by + * choice RAW frames within 20 bytes of maximum size(rare) to the long path + */ + + if (!sk->ip_hdrincl) + length += sizeof(struct iphdr); + + /* + * Check for slow path. + */ + if (length > rt->u.dst.pmtu || ipc->opt != NULL) + return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); + + /* + * Do path mtu discovery if needed. + */ + df = htons(IP_DF); + if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || + (rt->u.dst.mxlock&(1<u.dst.dev->hard_header_len + 15)&~15; + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + 0, flags&MSG_DONTWAIT, &err); + if(skb==NULL) + goto error; + skb_reserve(skb, hh_len); + } + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + dev_lock_list(); + + if(!sk->ip_hdrincl) { + iph->version=4; + iph->ihl=5; + iph->tos=sk->ip_tos; + iph->tot_len = htons(length); + iph->id=htons(ip_id_count++); + iph->frag_off = df; + iph->ttl=sk->ip_mc_ttl; + if (rt->rt_type != RTN_MULTICAST) + iph->ttl=sk->ip_ttl; + iph->protocol=sk->protocol; + iph->saddr=rt->rt_src; + iph->daddr=rt->rt_dst; + iph->check=0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); + } + else + err = getfrag(frag, (void *)iph, 0, length); dev_unlock_list(); - return 0; + + if (err) + err = -EFAULT; + + if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) + err = -EPERM; + + if (err) { + kfree_skb(skb); + goto error; + } + + return rt->u.dst.output(skb); + +error: + ip_statistics.IpOutDiscards++; + return err; } + + /* * This IP datagram is too large to be sent in one piece. 
Break it up into @@ -682,7 +703,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned char *ptr; struct device *dev; struct sk_buff *skb2; - int left, mtu, hlen, len; + unsigned int mtu, hlen, left, len; int offset; int not_last_frag; u16 dont_fragment; @@ -712,11 +733,8 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * in this case we were fortunate it didn't happen */ - if (mtu<8) { - ip_statistics.IpFragFails++; - kfree_skb(skb); - return; - } + if (mtu<8) + goto fail; /* * Fragment the datagram. @@ -745,8 +763,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* IF: we are not sending upto and including the packet end then align the next start on an eight byte boundary */ if (len < left) { - len/=8; - len*=8; + len &= ~7; } /* * Allocate buffer. @@ -754,9 +771,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); - ip_statistics.IpFragFails++; - kfree_skb(skb); - return; + goto fail; } /* @@ -829,61 +844,96 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } kfree_skb(skb); ip_statistics.IpFragOKs++; + return; + +fail: + kfree_skb(skb); + ip_statistics.IpFragFails++; } -struct sk_buff * ip_reply(struct sk_buff *skb, int payload) +/* + * Fetch data from kernel space and fill in checksum if needed. 
+ */ +static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, + unsigned int fraglen) +{ + struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr; + u16 *pktp = (u16 *)to; + struct iovec *iov; + int len; + int hdrflag = 1; + +#if 0 + printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n", + offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len); +#endif + + iov = &dp->iov[0]; + if (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + hdrflag = 0; + } + len = iov->iov_len - offset; + if (fraglen > len) { /* overlapping. */ +#if 1 + if (iov > &dp->iov[0]) { + printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen); + return -1; + } +#endif + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, + dp->csum); + offset = 0; + fraglen -= len; + to += len; + iov++; + } + + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, + dp->csum); + + if (hdrflag && dp->csumoffset) + *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */ + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. 
+ */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len) { struct { struct ip_options opt; char data[40]; } replyopts; - - struct rtable *rt = (struct rtable*)skb->dst; - struct sk_buff *reply; - int iphlen; - struct iphdr *iph; - struct ipcm_cookie ipc; u32 daddr; - + struct rtable *rt = (struct rtable*)skb->dst; + if (ip_options_echo(&replyopts.opt, skb)) - return NULL; + return; + + sk->ip_tos = skb->nh.iph->tos; + sk->priority = skb->priority; + sk->protocol = skb->nh.iph->protocol; daddr = ipc.addr = rt->rt_src; ipc.opt = &replyopts.opt; + if (ipc.opt->srr) daddr = replyopts.opt.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) - return NULL; - - iphlen = sizeof(struct iphdr) + replyopts.opt.optlen; - reply = alloc_skb(rt->u.dst.dev->hard_header_len+15+iphlen+payload, GFP_ATOMIC); - if (reply == NULL) { - ip_rt_put(rt); - return NULL; - } - - reply->priority = skb->priority; - reply->dst = &rt->u.dst; - skb_reserve(reply, (rt->u.dst.dev->hard_header_len+15)&~15); - - /* Now build the IP header. */ - reply->nh.iph = iph = (struct iphdr *)skb_put(reply, iphlen); - - iph->version = 4; - iph->ihl = iphlen>>2; - iph->tos = skb->nh.iph->tos; - iph->frag_off = 0; - iph->ttl = MAXTTL; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = skb->nh.iph->protocol; - iph->id = htons(ip_id_count++); - - ip_options_build(reply, &replyopts.opt, daddr, rt, 0); + return; - return reply; + /* And let IP do all the hard work. */ + ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT); + ip_rt_put(rt); } /* diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index df8dc1896014..61687ce4c43f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * Version: $Id: ipmr.c,v 1.33 1998/03/08 20:52:37 davem Exp $ + * Version: $Id: ipmr.c,v 1.34 1998/04/28 06:21:59 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -703,8 +703,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) mrtsock_destruct(sk); return -EADDRINUSE; case MRT_DONE: - mrtsock_destruct(sk); - return 0; + return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: case MRT_DEL_VIF: if(optlen!=sizeof(vif)) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e930ad39aaec..b6e06242f2e1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.28 1998/04/03 09:49:45 freitag Exp $ + * Version: $Id: proc.c,v 1.30 1998/04/16 16:29:05 freitag Exp $ * * Authors: Fred N. van Kempen, * Gerald J. Heim, @@ -99,12 +99,19 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) destp = ntohs(destp); srcp = ntohs(srcp); if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + extern int tcp_tw_death_row_slot; struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp; + int slot_dist; tw_bucket = 1; timer_active1 = timer_active2 = 0; timer_active = 3; - timer_expires = tw->timer.expires; + slot_dist = tw->death_slot; + if(slot_dist > tcp_tw_death_row_slot) + slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; + else + slot_dist = tcp_tw_death_row_slot - slot_dist; + timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { timer_active1 = del_timer(&tp->retransmit_timer); timer_active2 = del_timer(&sp->timer); @@ -349,11 +356,13 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d int len; len = sprintf(buffer, - "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed\n" - "TcpExt: %lu %lu %lu\n", + "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" + "EmbryonicRsts\n" + "TcpExt: %lu %lu %lu %lu\n", 
net_statistics.SyncookiesSent, net_statistics.SyncookiesRecv, - net_statistics.SyncookiesFailed); + net_statistics.SyncookiesFailed, + net_statistics.EmbryonicRsts); if (offset >= len) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 810240ddc2ac..bae4fe5c76f6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.42 1998/03/20 09:12:09 davem Exp $ + * Version: $Id: route.c,v 1.47 1998/04/28 06:22:01 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -577,7 +577,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (rt != NULL) { if (dst->obsolete || rt->rt_flags&RTCF_REDIRECTED) { #if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); + printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif ip_rt_put(rt); rt_cache_flush(0); @@ -725,11 +725,11 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) mtu = guess_mtu(old_mtu); } - if (mtu < rth->u.dst.pmtu) { - /* New mtu received -> path was valid */ - dst_confirm(&rth->u.dst); - - rth->u.dst.pmtu = mtu; + if (mtu <= rth->u.dst.pmtu) { + if (mtu < rth->u.dst.pmtu) { + dst_confirm(&rth->u.dst); + rth->u.dst.pmtu = mtu; + } est_mtu = mtu; } } @@ -808,11 +808,18 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) #endif rt->u.dst.window= fi->fib_window ? : 0; rt->u.dst.rtt = fi->fib_rtt ? 
: TCP_TIMEOUT_INIT; +#ifdef CONFIG_NET_CLS_ROUTE + rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; rt->u.dst.window= 0; rt->u.dst.rtt = TCP_TIMEOUT_INIT; } +#ifdef CONFIG_NET_CLS_ROUTE + if (rt->u.dst.tclassid == 0) + rt->u.dst.tclassid = fib_rules_tclass(res); +#endif rt->rt_type = res->type; } @@ -1205,6 +1212,9 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int key.oif = oif; key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif if (saddr) { if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 373c7774192f..fd4284af992c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.111 1998/04/06 16:09:05 davem Exp $ + * Version: $Id: tcp.c,v 1.114 1998/04/26 01:11:33 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -702,7 +702,7 @@ static void wait_for_tcp_memory(struct sock * sk) int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now = sk->mss; + int mss_now; int err = 0; int copied = 0; @@ -715,14 +715,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) if((err = wait_for_tcp_connect(sk, flags)) != 0) return err; - /* The socket is locked, nothing can change the state of pending - * SACKs or IP options. - */ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= (sk->opt->optlen); + mss_now = tcp_current_mss(sk); /* Ok commence sending. 
*/ while(--iovlen >= 0) { @@ -842,6 +835,11 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) goto do_interrupted; } wait_for_tcp_memory(sk); + + /* If SACK's were formed or PMTU events happened, + * we must find out about it. + */ + mss_now = tcp_current_mss(sk); continue; } @@ -908,10 +906,8 @@ void tcp_read_wakeup(struct sock *sk) /* If we're closed, don't send an ack, or we'll get a RST * from the closed destination. */ - if ((1 << sk->state) & (TCPF_CLOSE|TCPF_TIME_WAIT)) - return; - - tcp_send_ack(sk); + if (sk->state != TCP_CLOSE) + tcp_send_ack(sk); } /* @@ -1402,7 +1398,12 @@ void tcp_close(struct sock *sk, unsigned long timeout) return; } - sk->keepopen = 1; + /* It is questionable, what the role of this is now. + * In any event either it should be removed, or + * increment of SLT_KEEPALIVE be done, this is causing + * big problems. For now I comment it out. -DaveM + */ + /* sk->keepopen = 1; */ sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e53204a13d5c..d5b0b15c60ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.106 1998/04/10 23:56:19 davem Exp $ + * Version: $Id: tcp_input.c,v 1.114 1998/04/28 06:42:22 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -47,6 +47,9 @@ * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. */ #include @@ -76,6 +79,8 @@ int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; +static int prune_queue(struct sock *sk); + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. 
When a * connection starts up, we want to ack as quickly as possible. The @@ -343,6 +348,13 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i if (!no_fancy && sysctl_tcp_window_scaling) { tp->wscale_ok = 1; tp->snd_wscale = *(__u8 *)ptr; + if(tp->snd_wscale > 14) { + if(net_ratelimit()) + printk("tcp_parse_options: Illegal window " + "scaling value %d >14 received.", + tp->snd_wscale); + tp->snd_wscale = 14; + } } break; case TCPOPT_TIMESTAMP: @@ -598,11 +610,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, int acked = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived at * the other end. */ - if (after(TCP_SKB_CB(skb)->end_seq, ack)) + if (after(scb->end_seq, ack)) break; /* Initial outgoing SYN's get put onto the write_queue @@ -612,8 +626,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + if(!(scb->flags & TCPCB_FLAG_SYN)) { + __u8 sacked = scb->sacked; acked |= FLAG_DATA_ACKED; if(sacked & TCPCB_SACKED_RETRANS) { @@ -634,8 +648,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, tp->retrans_head = NULL; } tp->packets_out--; - *seq = TCP_SKB_CB(skb)->seq; - *seq_rtt = now - TCP_SKB_CB(skb)->when; + *seq = scb->seq; + *seq_rtt = now - scb->when; __skb_unlink(skb, skb->list); kfree_skb(skb); } @@ -850,13 +864,12 @@ uninteresting_ack: } /* New-style handling of TIME_WAIT sockets. */ -static void tcp_timewait_kill(unsigned long __arg) -{ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg; - - /* Zap the timer. 
*/ - del_timer(&tw->timer); +extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); +void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; @@ -908,7 +921,8 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, isn = tw->rcv_nxt + 128000; if(isn == 0) isn++; - tcp_timewait_kill((unsigned long)tw); + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || !ipsec_sk_policy(sk,skb)) return 0; @@ -925,16 +939,16 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if(sysctl_tcp_rfc1337 == 0) - tcp_timewait_kill((unsigned long)tw); - + if(sysctl_tcp_rfc1337 == 0) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + } if(!th->rst) return 1; /* toss a reset back */ } else { - if(th->ack) { - /* In this case we must reset the TIMEWAIT timer. */ - mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); - } + /* In this case we must reset the TIMEWAIT timer. */ + if(th->ack) + tcp_tw_reschedule(tw); } return 0; /* Discard the frame. */ } @@ -1010,11 +1024,7 @@ void tcp_time_wait(struct sock *sk) tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - init_timer(&tw->timer); - tw->timer.function = tcp_timewait_kill; - tw->timer.data = (unsigned long) tw; - tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; - add_timer(&tw->timer); + tcp_tw_schedule(tw); /* CLOSE the SK. */ if(sk->state == TCP_ESTABLISHED) @@ -1440,6 +1450,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) if (skb->len == 0 && !th->fin) return(0); + /* + * If our receive queue has grown past its limits shrink it. 
+ * Make sure to do this before moving snd_nxt, otherwise + * data might be acked for that we don't have enough room. + */ + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + if (prune_queue(sk) < 0) { + /* Still not enough room. That can happen when + * skb->true_size differs significantly from skb->len. + */ + return 0; + } + } + /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); @@ -1497,7 +1521,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) */ /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || + if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ @@ -1599,7 +1623,7 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len * Clean first the out_of_order queue, then the receive queue until * the socket is in its memory limits again. 
*/ -static void prune_queue(struct sock *sk) +static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; @@ -1613,7 +1637,7 @@ static void prune_queue(struct sock *sk) while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) - return; + return 0; } /* Now continue with the receive queue if it wasn't enough */ @@ -1626,9 +1650,10 @@ static void prune_queue(struct sock *sk) /* Never remove packets that have been already acked */ if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) { - printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent); - break; + SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, + tp->last_ack_sent); + return -1; } __skb_unlink(skb, skb->list); tp->rcv_nxt = TCP_SKB_CB(skb)->seq; @@ -1639,6 +1664,7 @@ static void prune_queue(struct sock *sk) if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; } + return 0; } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, @@ -1763,13 +1789,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); - tcp_data_snd_check(sk); - - /* If our receive queue has grown past its limits shrink it */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) - prune_queue(sk); - - tcp_ack_snd_check(sk); + /* Be careful, tcp_data() may have put this into TIME_WAIT. */ + if(sk->state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } if (!queued) { discard: @@ -1779,42 +1803,44 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, return 0; } -/* Shared between IPv4 and IPv6 now. */ -struct sock * -tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) +/* + * Process an incoming SYN or SYN-ACK. 
+ */ + +struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 flg; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ + /* Check for syn retransmission */ + flg = *(((u32 *)skb->h.th) + 3); + + flg &= __constant_htonl(0x00170000); + /* Only SYN set? */ + if (flg == __constant_htonl(0x00020000)) { + if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) { + /* retransmited syn. + */ + req->class->rtx_syn_ack(sk, req); + return NULL; + } else { + return sk; /* Pass new SYN to the listen socket. */ + } + } + + /* We know it's an ACK here */ if (req->sk) { /* socket already created but not * yet accepted()... */ sk = req->sk; } else { - u32 flg; - - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); - - flg &= __constant_htonl(0x00170000); - /* Only SYN set? */ - if (flg == __constant_htonl(0x00020000)) { - if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) { - /* retransmited syn. - */ - req->class->rtx_syn_ack(sk, req); - return NULL; - } else { - return sk; /* New SYN */ - } - } - - /* We know it's an ACK here */ /* In theory the packet could be for a cookie, but * TIME_WAIT should guard us against this. * XXX: Nevertheless check for cookies? @@ -1901,6 +1927,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* We got an ack, but it's not a good ack. */ if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len)) { + sk->err = ECONNRESET; + sk->state_change(sk); tcp_statistics.TcpAttemptFails++; return 1; } @@ -1914,6 +1942,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* A valid ack from a different connection * start. Shouldn't happen but cover it. 
*/ + sk->err = ECONNRESET; + sk->state_change(sk); tcp_statistics.TcpAttemptFails++; return 1; } @@ -2112,8 +2142,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) + if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk); + goto discard; + } break; case TCP_LAST_ACK: @@ -2155,10 +2187,6 @@ step6: case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); - - /* This can only happen when MTU+skbheader > rcvbuf */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) - prune_queue(sk); break; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7220fad7cc03..f901aa2bdbf9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.133 1998/04/06 08:42:28 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.141 1998/04/24 19:38:19 freitag Exp $ * * IPv4 specific functions * @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -60,6 +61,9 @@ #include +/* That should be really in a standard kernel include file. */ +#define offsetof(t,m) ((unsigned int) (&((t *)0)->m)) + extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; @@ -69,6 +73,10 @@ extern int sysctl_ip_dynaddr; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 +/* Socket used for sending RSTs */ +struct inode tcp_inode; +struct socket *tcp_socket=&tcp_inode.u.socket_i; + static void tcp_v4_send_reset(struct sk_buff *skb); void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, @@ -160,6 +168,18 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) return tb; } +/* Ensure that the bound bucket for the port exists. + * Return 0 on success. 
+ */ +static __inline__ int tcp_bucket_check(unsigned short snum) +{ + if (tcp_bound_hash[tcp_bhashfn(snum)] == NULL && + tcp_bucket_create(snum) == NULL) + return 1; + else + return 0; +} + static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) { struct tcp_bind_bucket *tb; @@ -850,49 +870,42 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, static void tcp_v4_send_reset(struct sk_buff *skb) { struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; /* Never send a reset in response to a reset. */ - if (th->rst == 0) { - struct tcphdr *th = skb->h.th; - struct sk_buff *skb1 = ip_reply(skb, sizeof(struct tcphdr)); - struct tcphdr *th1; - - if (skb1 == NULL) - return; - - skb1->h.th = th1 = (struct tcphdr *) - skb_put(skb1, sizeof(struct tcphdr)); - - /* Swap the send and the receive. */ - memset(th1, 0, sizeof(*th1)); - th1->dest = th->source; - th1->source = th->dest; - th1->doff = sizeof(*th1)/4; - th1->rst = 1; - - if (th->ack) { - th1->seq = th->ack_seq; - } else { - th1->ack = 1; - if (!th->syn) - th1->ack_seq = th->seq; - else - th1->ack_seq = htonl(ntohl(th->seq)+1); - } - skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); - th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, - skb1->nh.iph->daddr, skb1->csum); - - /* Finish up some IP bits. */ - skb1->nh.iph->tot_len = htons(skb1->len); - ip_send_check(skb1->nh.iph); + if (th->rst) + return; - /* All the other work was done by ip_reply(). */ - skb1->dst->output(skb1); + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr)/4; + rth.rst = 1; - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = th->syn ? 
htonl(ntohl(th->seq)+1) : th->seq; } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_magic(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), + IPPROTO_TCP, + 0); + arg.n_iov = 1; + arg.csumoffset = offsetof(struct tcphdr, check) / sizeof(u16); + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -1277,12 +1290,19 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return NULL; dst = &rt->u.dst; } - - sk->tp_pinfo.af_tcp.syn_backlog--; - sk->ack_backlog++; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* The new socket created for transparent proxy may fall + * into a non-existed bind bucket because sk->num != newsk->num. + * Ensure existance of the bucket now. The placement of the check + * later will require to destroy just created newsk in the case of fail. + * 1998/04/22 Andrey V. Savochkin + */ + if (tcp_bucket_check(ntohs(skb->h.th->dest))) + goto exit; +#endif mtu = dst->pmtu; - if (mtu < 68) + if (mtu < 68) /* XXX: we should turn pmtu disc off when this happens. */ mtu = 68; snd_mss = mtu - sizeof(struct iphdr); @@ -1290,6 +1310,9 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (!newsk) goto exit; + sk->tp_pinfo.af_tcp.syn_backlog--; + sk->ack_backlog++; + newsk->dst_cache = dst; newtp = &(newsk->tp_pinfo.af_tcp); @@ -1329,6 +1352,8 @@ static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) (req->sk ? 
sk->ack_backlog : tp->syn_backlog)--; req->class->destructor(req); tcp_openreq_free(req); + + net_statistics.EmbryonicRsts++; } /* Check for embryonic sockets (open_requests) We check packets with @@ -1358,9 +1383,9 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) sk = tcp_check_req(sk, skb, req); } #ifdef CONFIG_SYN_COOKIES - else { + else { sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); - } + } #endif } return sk; @@ -1454,9 +1479,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " "len=%d/%d/%d\n", - NIPQUAD(ntohl(skb->nh.iph->saddr)), + NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), - NIPQUAD(ntohl(skb->nh.iph->daddr)), + NIPQUAD(skb->nh.iph->daddr), ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len)); @@ -1712,3 +1737,25 @@ struct proto tcp_prot = { 0, /* inuse */ 0 /* highestinuse */ }; + + + +__initfunc(void tcp_v4_init(struct net_proto_family *ops)) +{ + int err; + + tcp_inode.i_mode = S_IFSOCK; + tcp_inode.i_sock = 1; + tcp_inode.i_uid = 0; + tcp_inode.i_gid = 0; + + tcp_socket->inode = &tcp_inode; + tcp_socket->state = SS_UNCONNECTED; + tcp_socket->type=SOCK_RAW; + + if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->allocation=GFP_ATOMIC; + tcp_socket->sk->num = 256; /* Don't receive any data */ + tcp_socket->sk->ip_ttl = MAXTTL; +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d615e6c6e350..482ca262cf0b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.84 1998/04/06 08:48:29 davem Exp $ + * Version: $Id: tcp_output.c,v 1.87 1998/04/26 01:11:35 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -234,18 +234,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) void tcp_write_xmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now = sk->mss; + unsigned int mss_now; /* Account for SACKS, we may need to fragment due to this. * It is just like the real MSS changing on us midstream. * We also handle things correctly when the user adds some * IP options mid-stream. Silly to do, but cover it. */ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= sk->opt->optlen; + mss_now = tcp_current_mss(sk); /* If we are zapped, the bytes will have to remain here. * In time closedown will empty the write queue and all @@ -439,14 +435,14 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m } /* Do a simple retransmit without using the backoff mechanisms in - * tcp_timer. This is used to speed up path mtu recovery. Note that - * these simple retransmits aren't counted in the usual tcp retransmit - * backoff counters. + * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk); /* Don't muck with the congestion window here. */ tp->dup_acks = 0; @@ -457,7 +453,10 @@ void tcp_simple_retransmit(struct sock *sk) * and not use it for RTT calculation in the absence of * the timestamp option. 
*/ - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + for (skb = skb_peek(&sk->write_queue); skb != tp->send_head; + skb = skb->next) + if (skb->len > mss) + tcp_retransmit_skb(sk, skb); } static __inline__ void update_retrans_head(struct sock *sk) @@ -477,17 +476,10 @@ static __inline__ void update_retrans_head(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int current_mss = sk->mss; + unsigned int cur_mss = tcp_current_mss(sk); - /* Account for outgoing SACKS and IP options, if any. */ - if(tp->sack_ok && tp->num_sacks) - current_mss -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - current_mss -= sk->opt->optlen; - - if(skb->len > current_mss) { - if(tcp_fragment(sk, skb, current_mss)) + if(skb->len > cur_mss) { + if(tcp_fragment(sk, skb, cur_mss)) return 1; /* We'll try again later. */ /* New SKB created, account for it. */ @@ -496,11 +488,11 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. */ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (current_mss >> 1)) && + (skb->len < (cur_mss >> 1)) && (skb->next != tp->send_head) && (skb->next != (struct sk_buff *)&sk->write_queue) && (sysctl_tcp_retrans_collapse != 0)) - tcp_retrans_try_collapse(sk, skb, current_mss); + tcp_retrans_try_collapse(sk, skb, cur_mss); if(tp->af_specific->rebuild_header(sk)) return 1; /* Routing failure or similar. */ @@ -602,17 +594,14 @@ void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb = skb_peek_tail(&sk->write_queue); - int mss_now = sk->mss; + unsigned int mss_now; /* Optimization, tack on the FIN if we have a queue of * unsent frames. But be careful about outgoing SACKS * and IP options. 
*/ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= sk->opt->optlen; + mss_now = tcp_current_mss(sk); + if((tp->send_head != NULL) && (skb->len < mss_now)) { /* tcp_write_xmit() takes care of the rest. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -720,6 +709,9 @@ int tcp_send_synack(struct sock *sk) return 0; } +/* + * Prepare a SYN-ACK. + */ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req, int mss) { @@ -792,7 +784,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->csum = 0; th->doff = (tcp_header_size >> 2); - tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutSegs++; return skb; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8f478fac803e..9a0f9dfbb336 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.48 1998/04/06 08:42:30 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.50 1998/04/14 09:08:59 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -32,6 +32,7 @@ static void tcp_sltimer_handler(unsigned long); static void tcp_syn_recv_timer(unsigned long); static void tcp_keepalive(unsigned long data); static void tcp_bucketgc(unsigned long); +static void tcp_twkill(unsigned long); struct timer_list tcp_slow_timer = { NULL, NULL, @@ -43,6 +44,7 @@ struct timer_list tcp_slow_timer = { struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill}, /* TWKILL */ {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */ }; @@ -166,11 +168,10 @@ void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - if(sk->zapped) - return; - - if (sk->tp_pinfo.af_tcp.delayed_acks) - tcp_read_wakeup(sk); + if(!sk->zapped && + sk->tp_pinfo.af_tcp.delayed_acks && + sk->state != TCP_CLOSE) + tcp_send_ack(sk); } void tcp_probe_timer(unsigned long data) @@ -240,9 +241,9 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) } /* Garbage collect TCP bind buckets. */ -static void tcp_bucketgc(unsigned long __unused) +static void tcp_bucketgc(unsigned long data) { - int i; + int i, reaped = 0;; for(i = 0; i < TCP_BHTABLE_SIZE; i++) { struct tcp_bind_bucket *tb = tcp_bound_hash[i]; @@ -252,8 +253,7 @@ static void tcp_bucketgc(unsigned long __unused) if((tb->owners == NULL) && !(tb->flags & TCPB_FLAG_LOCKED)) { - /* Eat timer reference. */ - tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + reaped++; /* Unlink bucket. */ if(tb->next) @@ -266,6 +266,92 @@ static void tcp_bucketgc(unsigned long __unused) tb = next; } } + if(reaped != 0) { + struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; + + /* Eat timer references. */ + atomic_sub(reaped, &slt->count); + } +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. 
*/ +int tcp_tw_death_row_slot = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + +extern void tcp_timewait_kill(struct tcp_tw_bucket *tw); + +static void tcp_twkill(unsigned long data) +{ + struct tcp_tw_bucket *tw; + int killed = 0; + + tw = tcp_tw_death_row[tcp_tw_death_row_slot]; + tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; + while(tw != NULL) { + struct tcp_tw_bucket *next = tw->next_death; + + tcp_timewait_kill(tw); + killed++; + tw = next; + } + if(killed != 0) { + struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; + atomic_sub(killed, &slt->count); + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); +} + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ +void tcp_tw_schedule(struct tcp_tw_bucket *tw) +{ + int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + + tw->death_slot = slot; + tw->next_death = tcp_tw_death_row[slot]; + tcp_tw_death_row[slot] = tw; + tcp_inc_slow_timer(TCP_SLT_TWKILL); +} + +/* Happens rarely if at all, no care about scalability here. */ +void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +{ + struct tcp_tw_bucket *walk; + int slot = tw->death_slot; + + walk = tcp_tw_death_row[slot]; + if(walk == tw) { + tcp_tw_death_row[slot] = tw->next_death; + } else { + while(walk->next_death != tw) + walk = walk->next_death; + walk->next_death = tw->next_death; + } + slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + tw->death_slot = slot; + tw->next_death = tcp_tw_death_row[slot]; + tcp_tw_death_row[slot] = tw; + /* Timer was incremented when we first entered the table. */ +} + +/* This is for handling early-kills of TIME_WAIT sockets. 
*/ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + struct tcp_tw_bucket *walk; + int slot = tw->death_slot; + + walk = tcp_tw_death_row[slot]; + if(walk == tw) { + tcp_tw_death_row[slot] = tw->next_death; + } else { + while(walk->next_death != tw) + walk = walk->next_death; + walk->next_death = tw->next_death; + } + tcp_dec_slow_timer(TCP_SLT_TWKILL); } /* @@ -511,14 +597,14 @@ void tcp_sltimer_handler(unsigned long data) slt->last = now; trigger = slt->period; } - next = min(next, trigger); - } - } - if (next != ~0UL) { - tcp_slow_timer.expires = now + next; - add_timer(&tcp_slow_timer); + /* Only reschedule if some events remain. */ + if (atomic_read(&slt->count)) + next = min(next, trigger); + } } + if (next != ~0UL) + mod_timer(&tcp_slow_timer, (now + next)); } void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) @@ -531,9 +617,8 @@ void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) when = now + slt->period; if (tcp_slow_timer.prev) { - if ((long)(tcp_slow_timer.expires - when) >= 0) { + if ((long)(tcp_slow_timer.expires - when) >= 0) mod_timer(&tcp_slow_timer, when); - } } else { tcp_slow_timer.expires = when; add_timer(&tcp_slow_timer); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 735ceeb5f653..693caaf3bafd 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fib.c,v 1.12 1998/03/20 09:12:16 davem Exp $ + * $Id: ip6_fib.c,v 1.13 1998/04/28 06:22:03 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -694,8 +694,13 @@ static void fib6_del_2(struct fib6_node *fn) /* * We can't tidy a case of two children. 
*/ - - if (children > 1 || (fn->fn_flags & RTN_RTINFO)) + if (children > 1) { + if (fn->leaf == NULL) + goto split_repair; + break; + } + + if (fn->fn_flags & RTN_RTINFO) break; /* @@ -765,6 +770,8 @@ static void fib6_del_2(struct fib6_node *fn) stree_node: rt6_release(fn->leaf); + +split_repair: rt = fib6_find_prefix(fn); if (rt == NULL) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e92b4e878fb9..eb3984f5574b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_output.c,v 1.11 1998/03/28 08:29:39 davem Exp $ + * $Id: ip6_output.c,v 1.12 1998/04/11 22:11:06 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -75,7 +75,6 @@ int ip6_output(struct sk_buff *skb) } else if (dst->neighbour) return dst->neighbour->output(skb); - printk(KERN_DEBUG "khm\n"); kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index c010b0964410..9b24b4948f23 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.7 1998/03/18 07:52:13 davem Exp $ + * Version: $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $ * * Authors: David S. 
Miller (davem@caip.rutgers.edu) * @@ -71,9 +71,17 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta destp = ntohs(sp->dport); srcp = ntohs(sp->sport); if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + extern int tcp_tw_death_row_slot; + int slot_dist; + timer_active1 = timer_active2 = 0; timer_active = 3; - timer_expires = tw->timer.expires; + slot_dist = tw->death_slot; + if(slot_dist > tcp_tw_death_row_slot) + slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; + else + slot_dist = tcp_tw_death_row_slot - slot_dist; + timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { timer_active1 = del_timer(&tp->retransmit_timer); timer_active2 = del_timer(&sp->timer); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3015d254bc3f..a71c9c0e5833 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: route.c,v 1.27 1998/03/21 07:28:04 davem Exp $ + * $Id: route.c,v 1.28 1998/04/28 06:22:04 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -90,7 +90,11 @@ struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, -1, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, - ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, + ip6_pkt_discard, ip6_pkt_discard, +#ifdef CONFIG_NET_CLS_ROUTE + 0, +#endif + &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; @@ -751,7 +755,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } - grt = rt6_lookup(gw_addr, NULL, dev->ifindex, RTF_LINKRT); + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT); if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { *err = -EHOSTUNREACH; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 52e02ef2d44e..721677fa6350 100644 --- a/net/ipv6/tcp_ipv6.c +++ 
b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: tcp_ipv6.c,v 1.76 1998/04/06 08:42:34 davem Exp $ + * $Id: tcp_ipv6.c,v 1.78 1998/04/16 16:29:22 freitag Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -48,7 +48,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static void tcp_v6_xmit(struct sk_buff *skb); static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, struct ipv6hdr *ip6h, @@ -403,7 +403,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; - sk->backlog_rcv = tcp_v6_backlog_rcv; + sk->backlog_rcv = tcp_v6_do_rcv; } return err; @@ -654,9 +654,6 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, } -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? -- erics - */ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) { struct sk_buff * skb; @@ -1011,128 +1008,161 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); + net_statistics.EmbryonicRsts++; } -int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th; - struct sock *sk; + struct tcphdr *th = skb->h.th; + u32 flg = ((u32 *)th)[3]; - /* - * "redo" is 1 if we have already seen this skb but couldn't - * use it at that time (the socket was locked). 
In that case - * we have already done a lot of the work (looked up the socket - * etc). + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v6_rst_req(sk, skb); + return NULL; + } + + /* Check SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *dummy; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + req = tcp_v6_search_req(tp, skb->nh.ipv6h,th, &dummy); + if (req) { + sk = tcp_check_req(sk, skb, req); + } +#if 0 /*def CONFIG_SYN_COOKIES */ + else { + sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb); + } +#endif + } + return sk; +} + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ - th = skb->h.th; + if (skb->protocol == __constant_htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); - sk = skb->sk; + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ - if (!redo) { - if (skb->pkt_type != PACKET_HOST) - goto discard_it; + /* XXX We need to think more about socket locking + * XXX wrt. backlog queues, __release_sock(), etc. -DaveM + */ + lock_sock(sk); - /* - * Pull up the IP header. - */ + /* + * This doesn't check if the socket has enough room for the packet. + * Either process the packet _without_ queueing it and then free it, + * or do the check later. + */ + skb_set_owner_r(skb, sk); - __skb_pull(skb, skb->h.raw - skb->data); + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + release_sock(sk); + return 0; + } - /* - * Count it even if it's bad. 
- */ + if (sk->state == TCP_LISTEN) { + struct sock *nsk; + + nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + lock_sock(nsk); + release_sock(sk); + sk = nsk; + } - tcp_statistics.TcpInSegs++; + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len)) + goto reset; + release_sock(sk); + return 0; - /* - * Try to use the device checksum if provided. - */ +reset: + tcp_v6_send_reset(skb); +discard: + kfree_skb(skb); + release_sock(sk); + return 0; +} - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)th, len, 0); - case CHECKSUM_HW: - if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { - printk(KERN_DEBUG "tcp csum failed\n"); - tcp_statistics.TcpInErrs++; - goto discard_it; - } - default: - /* CHECKSUM_UNNECESSARY */ - }; +int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol) +{ + struct tcphdr *th; + struct sock *sk; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); + th = skb->h.th; - if (!sk) { - printk(KERN_DEBUG "socket not found\n"); - goto no_tcp_socket; - } + if (skb->pkt_type != PACKET_HOST) + goto discard_it; - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - len - th->doff*4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - skb->used = 0; - if(sk->state == TCP_TIME_WAIT) - goto do_time_wait; + /* + * Pull up the IP header. + */ - skb->sk = sk; - } + __skb_pull(skb, skb->h.raw - skb->data); /* - * We may need to add it to the backlog here. + * Count it even if it's bad. */ - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return(0); - } + tcp_statistics.TcpInSegs++; - skb_set_owner_r(skb, sk); + /* + * Try to use the device checksum if provided. 
+ */ - if (sk->state == TCP_ESTABLISHED) { - if (tcp_rcv_established(sk, skb, th, len)) - goto no_tcp_socket; - return 0; - } + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)th, len, 0); + case CHECKSUM_HW: + if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { + printk(KERN_DEBUG "tcp csum failed\n"); + tcp_statistics.TcpInErrs++; + goto discard_it; + } + default: + /* CHECKSUM_UNNECESSARY */ + }; - if (sk->state == TCP_LISTEN) { - __u32 flg = ((u32 *)th)[3]; + sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); - /* Check for RST */ - if (flg & __constant_htonl(0x00040000)) { - tcp_v6_rst_req(sk, skb); - } - - /* Check SYN|ACK */ - if (flg & __constant_htonl(0x00120000)) { - struct open_request *req, *prev; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - req = tcp_v6_search_req(tp, skb->nh.ipv6h,th,&prev); - if (req) { - sk = tcp_check_req(sk, skb, req); - } - /* else do syncookies (add them here) */ - if (sk == NULL) - goto discard_it; - } - } + if (!sk) + goto no_tcp_socket; - if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0) - return 0; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + skb->used = 0; + if(sk->state == TCP_TIME_WAIT) + goto do_time_wait; -no_tcp_socket: + if (!sk->sock_readers) + return tcp_v6_do_rcv(sk, skb); - /* - * No such TCB. 
If th->rst is 0 send a reset - * (checked in tcp_v6_send_reset) - */ + __skb_queue_tail(&sk->back_log, skb); + return(0); +no_tcp_socket: tcp_v6_send_reset(skb); discard_it: @@ -1182,18 +1212,6 @@ static int tcp_v6_rebuild_header(struct sock *sk) return dst->error; } -static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - int res; - - res = tcp_v6_rcv(skb, skb->dev, - &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, - (struct ipv6_options *) skb->cb, - skb->len, 1, - (struct inet6_protocol *) sk->pair); - return res; -} - static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) { struct in6_addr *saddr; @@ -1372,7 +1390,7 @@ struct proto tcpv6_prot = { tcp_v6_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ - tcp_v6_backlog_rcv, /* backlog_rcv */ + tcp_v6_do_rcv, /* backlog_rcv */ tcp_v6_hash, /* hash */ tcp_v6_unhash, /* unhash */ tcp_v6_rehash, /* rehash */ diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 32fadd36652f..f035e8c62110 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -48,6 +48,7 @@ * Revision 0.37: Began adding POSIXisms. * Revision 0.38: Asynchronous socket stuff made current. * Revision 0.39: SPX interfaces + * Revision 0.40: Tiny SIOCGSTAMP fix (chris@cybernet.co.nz) * * Protect the module by a MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT * pair. 
Also, now usage count is managed this way @@ -627,6 +628,14 @@ static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) if (ipx->ipx_source.net != intrfc->if_netnum) { + /* + * Unshare the buffer before modifying the count in + * case its a flood or tcpdump + */ + skb=skb_unshare(skb, GFP_ATOMIC); + if(!skb) + return 0; + ipx = skb->nh.ipxh; if (++(ipx->ipx_tctrl) > ipxcfg_max_hops) send_to_wire = 0; } @@ -725,7 +734,7 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) } } - if( ipx->ipx_type == IPX_TYPE_PPROP && ipx->ipx_tctrl < 8 && skb->pkt_type == PACKET_HOST ) + if( ipx->ipx_type == IPX_TYPE_PPROP && ipx->ipx_tctrl < 8 && skb->pkt_type != PACKET_OTHERHOST ) { int i; ipx_interface *ifcs; @@ -2169,6 +2178,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, int size, copied); if (err) goto out_free; + sk->stamp=skb->stamp; msg->msg_namelen = sizeof(*sipx); @@ -2429,7 +2439,7 @@ EXPORT_SYMBOL(ipx_unregister_spx); * sockets be closed from user space. 
*/ -__initfunc(static void ipx_proto_finito(void)) +static void ipx_proto_finito(void) { ipx_interface *ifc; diff --git a/net/netsyms.c b/net/netsyms.c index f4fbfdeffe9e..f61d4763f9b0 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -15,8 +15,8 @@ #include #include #include +#include -#ifdef CONFIG_INET #include #include #include @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_INET extern struct net_proto_family inet_family_ops; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -41,6 +42,8 @@ extern struct net_proto_family inet_family_ops; #include #include #include + +extern int tcp_tw_death_row_slot; #endif #endif @@ -66,16 +69,14 @@ extern void destroy_EII_client(struct datalink_proto *); extern void destroy_8023_client(struct datalink_proto *); #endif -#ifdef CONFIG_IPV6_MODULE -#ifdef CONFIG_SYSCTL -extern int sysctl_max_syn_backlog; -#endif -#endif - #ifdef CONFIG_ATALK_MODULE #include #endif +#ifdef CONFIG_SYSCTL +extern int sysctl_max_syn_backlog; +#endif + EXPORT_SYMBOL(dev_lockct); /* Skbuff symbols. 
*/ @@ -293,6 +294,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_transmit_skb); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); +EXPORT_SYMBOL(tcp_tw_death_row_slot); +EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(xrlim_allow); @@ -320,6 +323,7 @@ EXPORT_SYMBOL(netlink_post); #endif #ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(rtattr_parse); EXPORT_SYMBOL(rtnetlink_links); EXPORT_SYMBOL(__rta_fill); EXPORT_SYMBOL(rtnetlink_dump_ifinfo); @@ -417,12 +421,14 @@ EXPORT_SYMBOL(ip_acct_output); EXPORT_SYMBOL(dev_base); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_mc_add); -EXPORT_SYMBOL(arp_find); EXPORT_SYMBOL(n_tty_ioctl); EXPORT_SYMBOL(tty_register_ldisc); EXPORT_SYMBOL(kill_fasync); +#ifdef CONFIG_INET +EXPORT_SYMBOL(arp_find); EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); +#endif EXPORT_SYMBOL(dev_mc_delete); EXPORT_SYMBOL(if_port_text); @@ -438,13 +444,35 @@ EXPORT_SYMBOL(dlci_ioctl_hook); #endif /* Packet scheduler modules want these. */ +#ifdef CONFIG_NET_SCHED EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); EXPORT_SYMBOL(qdisc_head); +EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(pfifo_qdisc_ops); +EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(register_qdisc); EXPORT_SYMBOL(unregister_qdisc); -EXPORT_SYMBOL(noop_qdisc); +EXPORT_SYMBOL(qdisc_get_rtab); +EXPORT_SYMBOL(qdisc_put_rtab); +#ifdef CONFIG_NET_ESTIMATOR +EXPORT_SYMBOL(qdisc_new_estimator); +EXPORT_SYMBOL(qdisc_kill_estimator); +#endif +#ifdef CONFIG_NET_POLICE +EXPORT_SYMBOL(tcf_police); +EXPORT_SYMBOL(tcf_police_locate); +EXPORT_SYMBOL(tcf_police_destroy); +#ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(tcf_police_dump); +#endif +#endif +#endif +#ifdef CONFIG_NET_CLS +EXPORT_SYMBOL(register_tcf_proto_ops); +EXPORT_SYMBOL(unregister_tcf_proto_ops); +#endif EXPORT_SYMBOL(register_gifconf); diff --git a/net/sched/Config.in b/net/sched/Config.in index d1287a781523..052b62281d4c 100644 --- a/net/sched/Config.in +++ b/net/sched/Config.in @@ -3,9 +3,28 @@ # 
tristate 'CBQ packet scheduler' CONFIG_NET_SCH_CBQ tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ -#tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ -tristate 'RED queueing discipline' CONFIG_NET_SCH_RED -tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ -tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF -tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO -tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO +#tristate 'H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ +#tristate 'H-FSC packet scheduler' CONFIG_NET_SCH_HFCS +tristate 'The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO +tristate 'RED queue' CONFIG_NET_SCH_RED +tristate 'SFQ queue' CONFIG_NET_SCH_SFQ +tristate 'TEQL queue' CONFIG_NET_SCH_TEQL +tristate 'TBF queue' CONFIG_NET_SCH_TBF +bool 'QoS support' CONFIG_NET_QOS +if [ "$CONFIG_NET_QOS" = "y" ]; then + bool 'Rate estimator' CONFIG_NET_ESTIMATOR +fi +if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'Packet classifier API' CONFIG_NET_CLS +fi +if [ "$CONFIG_NET_CLS" = "y" ]; then + bool 'Routing tables based classifier' CONFIG_NET_CLS_ROUTE +# bool 'Firewall based classifier' CONFIG_NET_CLS_FW + tristate 'U32 classifier' CONFIG_NET_CLS_U32 + if [ "$CONFIG_NET_QOS" = "y" ]; then + tristate 'Special RSVP classifier' CONFIG_NET_CLS_RSVP + tristate 'Special RSVP classifier for IPv6' CONFIG_NET_CLS_RSVP6 + bool 'Ingres traffic policing' CONFIG_NET_CLS_POLICE + fi +fi + diff --git a/net/sched/Makefile b/net/sched/Makefile index cbb6704c1669..21a1cf07ab1f 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -11,6 +11,23 @@ O_TARGET := sched.o O_OBJS := sch_generic.o +ifeq ($(CONFIG_NET_SCHED), y) + +O_OBJS += sch_api.o sch_fifo.o + +ifeq ($(CONFIG_NET_ESTIMATOR), y) +O_OBJS += estimator.o +endif + +ifeq ($(CONFIG_NET_CLS), y) +O_OBJS += cls_api.o + +ifeq ($(CONFIG_NET_CLS_POLICE), y) +O_OBJS += police.o +endif + +endif + ifeq ($(CONFIG_NET_SCH_CBQ), y) O_OBJS += sch_cbq.o else @@ -27,6 +44,23 @@ else endif endif +ifeq 
($(CONFIG_NET_SCH_HPFQ), y) +O_OBJS += sch_hpfq.o +else + ifeq ($(CONFIG_NET_SCH_HPFQ), m) + M_OBJS += sch_hpfq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_HFSC), y) +O_OBJS += sch_hfsc.o +else + ifeq ($(CONFIG_NET_SCH_HFSC), m) + M_OBJS += sch_hfsc.o + endif +endif + + ifeq ($(CONFIG_NET_SCH_SFQ), y) O_OBJS += sch_sfq.o else @@ -51,21 +85,54 @@ else endif endif +ifeq ($(CONFIG_NET_SCH_PRIO), y) +O_OBJS += sch_prio.o +else + ifeq ($(CONFIG_NET_SCH_PRIO), m) + M_OBJS += sch_prio.o + endif +endif -ifeq ($(CONFIG_NET_SCH_PFIFO), y) -O_OBJS += sch_fifo.o +ifeq ($(CONFIG_NET_SCH_TEQL), y) +O_OBJS += sch_teql.o else - ifeq ($(CONFIG_NET_SCH_PFIFO), m) - M_OBJS += sch_fifo.o + ifeq ($(CONFIG_NET_SCH_TEQL), m) + M_OBJS += sch_teql.o endif endif -ifeq ($(CONFIG_NET_SCH_PRIO), y) -O_OBJS += sch_prio.o +ifeq ($(CONFIG_NET_CLS_U32), y) +O_OBJS += cls_u32.o else - ifeq ($(CONFIG_NET_SCH_PRIO), m) - M_OBJS += sch_prio.o + ifeq ($(CONFIG_NET_CLS_U32), m) + M_OBJS += cls_u32.o + endif +endif + +ifeq ($(CONFIG_NET_CLS_RSVP), y) +O_OBJS += cls_rsvp.o +else + ifeq ($(CONFIG_NET_CLS_RSVP), m) + M_OBJS += cls_rsvp.o endif endif +ifeq ($(CONFIG_NET_CLS_RSVP6), y) +O_OBJS += cls_rsvp6.o +else + ifeq ($(CONFIG_NET_CLS_RSVP6), m) + M_OBJS += cls_rsvp6.o + endif +endif + +ifeq ($(CONFIG_NET_CLS_ROUTE), y) +O_OBJS += cls_route.o +endif + +ifeq ($(CONFIG_NET_CLS_FW), y) +O_OBJS += cls_fw.o +endif + +endif + include $(TOPDIR)/Rules.make diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c new file mode 100644 index 000000000000..6eae05d7b237 --- /dev/null +++ b/net/sched/cls_api.c @@ -0,0 +1,432 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base; + + +/* Find classifier type by string name */ + +struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +{ + struct tcf_proto_ops *t; + + if (kind) { + for (t = tcf_proto_base; t; t = t->next) { + if (rtattr_strcmp(kind, t->kind) == 0) + return t; + } + } + return NULL; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (strcmp(ops->kind, t->kind) == 0) + return -EEXIST; + + ops->next = NULL; + *tp = ops; + return 0; +} + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) + return -ENOENT; + *tp = t->next; + return 0; +} + +#ifdef CONFIG_RTNETLINK + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. 
*/ + +static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp, u32 prio) +{ + u32 first = TC_H_MAKE(0xC0000000U,0U); + + if (!tp || tp->next == NULL) + return first; + + if (prio == TC_H_MAKE(0xFFFF0000U,0U)) + first = tp->prio+1; + else + first = tp->prio-1; + + if (first == prio) + first = tp->prio; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca = arg; + struct tcmsg *t = NLMSG_DATA(n); + u32 protocol = TC_H_MIN(t->tcm_info); + u32 prio = TC_H_MAJ(t->tcm_info); + u32 nprio = prio; + struct device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp = NULL; + struct tcf_proto_ops *tp_ops; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long fh; + int err; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (n->nlmsg_flags&NLM_F_APPEND) + prio = TC_H_MAKE(0xFFFF0000U,0U); + else + prio = TC_H_MAKE(0x80000000U,0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + if ((dev = dev_get_by_index(t->tcm_ifindex)) == NULL) + return -ENODEV; + + /* Find qdisc */ + if (!t->tcm_parent) + q = dev->qdisc_sleeping; + else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) + return -EINVAL; + + /* Is it classful? */ + if ((cops = q->ops->cl_ops) == NULL) + return -EINVAL; + + /* Do we search for filter, attached to class? 
*/ + if (TC_H_MIN(t->tcm_parent)) { + cl = cops->get(q, t->tcm_parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp=*back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND-1] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + goto errout; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + if (tp_ops == NULL) { + err = -EINVAL; + kfree(tp); + goto errout; + } + memset(tp, 0, sizeof(*tp)); + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? 
: tcf_auto_prio(*back, prio); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = t->tcm_parent; + err = tp_ops->init(tp); + if (err) { + kfree(tp); + goto errout; + } + tp->next = *back; + *back = tp; + } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + *back = tp->next; + tp->ops->destroy(tp); + kfree(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto errout; + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, t->tcm_handle, tca, &fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + +errout: + if (cl) + cops->put(q, cl); + return err; +} + +static int +tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_handle = 0; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tfilter_notify(struct sk_buff *oskb, struct 
nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event) +{ + struct sk_buff *skb; + pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct tcf_dump_args +{ + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void*)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return skb->len; + if ((q = qdisc_lookup(dev, tcm->tcm_parent)) == NULL) + return skb->len; + cops = q->ops->cl_ops; + if (TC_H_MIN(tcm->tcm_parent)) { + if (cops) + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = cb->args[0]; + + for (tp=*chain, t=0; tp; tp = tp->next, t++) { + if (t < s_t) continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, 
RTM_NEWTFILTER) <= 0) { + break; + } + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]-1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count+1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: + if (cl) + cops->put(q, cl); + + return skb->len; +} + +#endif + + +__initfunc(int tc_filter_init(void)) +{ +#ifdef CONFIG_RTNETLINK + struct rtnetlink_link *link_p = rtnetlink_links[AF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; + } +#endif +#define INIT_TC_FILTER(name) { \ + extern struct tcf_proto_ops cls_##name##_ops; \ + register_tcf_proto_ops(&cls_##name##_ops); \ + } + +#ifdef CONFIG_NET_CLS_U32 + INIT_TC_FILTER(u32); +#endif +#ifdef CONFIG_NET_CLS_ROUTE + INIT_TC_FILTER(route); +#endif +#ifdef CONFIG_NET_CLS_FW + INIT_TC_FILTER(fw); +#endif +#ifdef CONFIG_NET_CLS_RSVP + INIT_TC_FILTER(rsvp); +#endif +#ifdef CONFIG_NET_CLS_RSVP6 + INIT_TC_FILTER(rsvp6); +#endif + return 0; +} diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 000000000000..c146bf4f0d6d --- /dev/null +++ b/net/sched/cls_fw.c @@ -0,0 +1,97 @@ +/* + * net/sched/cls_fw.c Routing table based packet classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ +#if 0 /* XXX skb->fwmark, where is it? -DaveM */ + u32 clid = skb->fwmark; + + if (clid && (TC_H_MAJ(clid) == 0 || + !(TC_H_MAJ(clid^tp->q->handle)))) { + res->classid = clid; + res->class = 0; + return 0; + } +#endif + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + return 0; +} + +static void fw_destroy(struct tcf_proto *tp) +{ +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EINVAL; +} + +static int fw_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + return handle ? -EINVAL : 0; +} + +struct tcf_proto_ops fw_cls_ops = { + NULL, + "fw", + fw_classify, + fw_init, + fw_destroy, + + fw_get, + fw_put, + fw_change, + fw_delete, + NULL, +}; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 000000000000..bbcef7b4312b --- /dev/null +++ b/net/sched/cls_route.c @@ -0,0 +1,99 @@ +/* + * net/sched/cls_route.c Routing table based packet classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int route_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct dst_entry *dst = skb->dst; + + if (dst) { + u32 clid = dst->tclassid; + + if (clid && (TC_H_MAJ(clid) == 0 || + !(TC_H_MAJ(clid^tp->q->handle)))) { + res->classid = clid; + res->class = 0; + return 0; + } + } + return -1; +} + +static unsigned long route_get(struct tcf_proto *tp, u32 handle) +{ + return 0; +} + +static void route_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route_init(struct tcf_proto *tp) +{ + return 0; +} + +static void route_destroy(struct tcf_proto *tp) +{ +} + +static int route_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EINVAL; +} + +static int route_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + return handle ? -EINVAL : 0; +} + +struct tcf_proto_ops cls_route_ops = { + NULL, + "route", + route_classify, + route_init, + route_destroy, + + route_get, + route_put, + route_change, + route_delete, + NULL, +}; diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 000000000000..0fe3bd4dea7a --- /dev/null +++ b/net/sched/cls_rsvp.c @@ -0,0 +1,42 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 000000000000..6e6b58f8fe0b --- /dev/null +++ b/net/sched/cls_rsvp.h @@ -0,0 +1,670 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +/* + Comparing to general packet classification problem, + RSVP needs only sevaral relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. + * src may be exact, and may be wildcard, so that + we can keep hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use two level hash table: top level is keyed by + destination address and protocol ID, every bucket contains list of + "rsvp sessions", identified by destination address, protocol + and DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has smaller hash table keyed by source address + (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. 
All the packets with IPv6 extension headers (but AH and ESP) + and all fragmented packets go to best-effort traffic class. + + + NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + From the first sight, this redundancy is just waste of CPU + resources. But, DPI and SPI add possibility to assign different + priorities to GPIs. Look also note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as the following: if the first lookup + matches special session with "tunnelhdr" value not zero, + flowid contains not true flow ID, but tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. Two GPIs make possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 8)8) + + + Well, as result, despite of simplicity, we get pretty + powerful clsssification engine. 
+ */ + +struct rsvp_head +{ + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session +{ + struct rsvp_session *next; + u32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16+1]; +}; + + +struct rsvp_filter +{ + struct rsvp_filter *next; + u32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + + u32 handle; + struct rsvp_session *sess; +}; + +static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned h = dst[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static __inline__ unsigned hash_src(u32 *src) +{ + unsigned h = src[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1, h2; + u32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr = skb->nh.ipv6h; +#else + struct iphdr *nhptr = skb->nh.iph; +#endif + +#ifndef __i386__ + if ((unsigned long)nhptr & 3) + return -1; +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; + protocol = nhptr->protocol; + xprt = ((u8*)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + 
protocol == s->protocol && + !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && + !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +#if RSVP_DST_LEN == 4 + && src[0] == f->src[0] + && src[1] == f->src[1] + && src[2] == f->src[2] +#endif + ) { +matched: + if (f->tunnelhdr == 0) { + *res = f->res; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) + return tcf_police(skb, f->police); +#endif + return 0; + } else { + tunnelid = f->res.classid; + nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + } + + /* And wildcard bucket... */ + if ((f = s->ht[16]) != NULL) + goto matched; + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1 = handle&0xFF; + unsigned h2 = (handle>>8)&0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + MOD_INC_USE_COUNT; + data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + memset(data, 0, sizeof(struct rsvp_head)); + tp->root = data; + return 0; + } + MOD_DEC_USE_COUNT; + return -ENOBUFS; +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + + sht[h1] = 
s->next; + + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + unsigned long cl; + + s->ht[h2] = f->next; + if ((cl = xchg(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + } + } + kfree(s); + } + } + kfree(data); + MOD_DEC_USE_COUNT; +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; + unsigned h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + unsigned long cl; + + *fp = f->next; + if ((cl = xchg(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + + kfree(f); + + /* Strip tree */ + + for (i=0; i<=16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + *sp = s->next; + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + if ((data->hgenerator += 0x10000) == 0) + data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator>>5; + u32 b = 1<<(data->tgenerator&0x1F); + + if (data->tmap[n]&b) + return 0; + data->tmap[n] |= b; + return 1; +} + +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + u32 tmap[256/32]; + int h1, h2; + + memset(tmap, 0, sizeof(tmap)); + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2=0; h2<=16; 
h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } + + memcpy(data->tmap, tmap, sizeof(tmap)); +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k=0; k<2; k++) { + for (i=255; i>0; i--) { + if (++data->tgenerator == 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static int rsvp_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct rsvp_head *data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_RSVP_MAX]; + unsigned h1, h2; + u32 *dst; + int err; + + if (opt == NULL) + return -EINVAL; + + if (rtattr_parse(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((f = (struct rsvp_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + return -EINVAL; + if (tb[TCA_RSVP_CLASSID-1]) { + unsigned long cl = xchg(&f->res.class, 0); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]); + + tcf_police_release(xchg(&f->police, police)); + } +#endif + return 0; + } + + /* Now more serious part... 
*/ + if (handle) + return -EINVAL; + if (tb[TCA_RSVP_DST-1] == NULL) + return -EINVAL; + + f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + + memset(f, 0, sizeof(*f)); + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) + goto errout; + memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) + goto errout; + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + } + + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) + goto errout; + dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? 
pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) + f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]); +#endif + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + *fp = f; + return 0; + } + } + + /* No session found. Create new one. 
*/ + + err = -ENOBUFS; + s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memset(s, 0, sizeof(*s)); + memcpy(s->dst, dst, sizeof(*dst)); + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + *sp = s; + goto insert; + +errout: + if (f) + kfree(f); + return err; +} + +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } + } + } +} + +#ifdef CONFIG_RTNETLINK +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_head *head = tp->root; + struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_session *s; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + if (((f->handle>>8)&0xFF) != 16) + RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + struct rtattr * p_rta = (struct 
rtattr*)skb->tail; + + RTA_PUT(skb, TCA_RSVP_POLICE, 0, NULL); + + if (tcf_police_dump(skb, f->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct tcf_proto_ops RSVP_OPS = { + NULL, + RSVP_ID, + rsvp_classify, + rsvp_init, + rsvp_destroy, + + rsvp_get, + rsvp_put, + rsvp_change, + rsvp_delete, + rsvp_walk, +#ifdef CONFIG_RTNETLINK + rsvp_dump +#else + NULL +#endif +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} +#endif diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 000000000000..fff952828be6 --- /dev/null +++ b/net/sched/cls_rsvp6.c @@ -0,0 +1,43 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 000000000000..10e355201a68 --- /dev/null +++ b/net/sched/cls_u32.c @@ -0,0 +1,704 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier + * I managed to invent; it is not super-fast, but it is not slow + * (provided you programmed it correctly), and enough general. + * And its relative speed grows, when number of rules becomes larger. + * + * Seems, it presents the best middle point between speed and + * managability both by human and by machine. + * + * It is especially useful for link sharing and link sharing, combined + * with QoS; pure RSVP need not such general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + + +struct tc_u_knode +{ + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + struct tcf_result res; + struct tc_u_hnode *ht_down; + struct tc_u32_sel sel; +}; + +struct tc_u_hnode +{ + struct tc_u_hnode *next; + u32 handle; + struct tc_u_common *tp_c; + int refcnt; + unsigned divisor; + u32 hgenerator; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common +{ + struct tc_u_common *next; + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static struct tc_u_common *u32_list; + +static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel) +{ + unsigned h = key & sel->hmask; + + h ^= h>>16; + h ^= h>>8; + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + u8 *ptr; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + u8 *ptr = skb->nh.raw; + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; + int i; + +#ifndef __i386__ + if ((unsigned long)ptr & 3) + return -1; +#endif + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + + for (i = n->sel.nkeys; i>0; i--, key++) { + if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + n = n->next; + goto next_knode; + } + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags&TC_U32_TERMINAL) { + *res = n->res; +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) + return tcf_police(skb, n->police); 
+#endif + return 0; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].ptr = ptr; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) + sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel); + + if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags&(TC_U32_EAT|TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags&TC_U32_VAROFFSET) + off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 &= ~3; + } + if (n->sel.flags&TC_U32_EAT) { + ptr += off2; + off2 = 0; + } + + if (ptr < skb->tail) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht = n->ht_up; + ptr = stack[sdepth].ptr; + goto check_terminal; + } + return -1; + +deadloop: + if (net_ratelimit()) + printk("cls_u32: dead loop\n"); + return -1; +} + +static __inline__ struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if (ht->handle == handle) + break; + + return ht; +} + +static __inline__ struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned sel; + struct tc_u_knode *n; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + return 0; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + return n; + + return NULL; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c = tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + 
int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + MOD_INC_USE_COUNT; + + for (tp_c = u32_list; tp_c; tp_c = tp_c->next) + if (tp_c->q == tp->q) + break; + + root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) { + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(root_ht, 0, sizeof(*root_ht)); + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + + if (tp_c == NULL) { + tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(tp_c, 0, sizeof(*tp_c)); + tp_c->q = tp->q; + tp_c->next = u32_list; + u32_list = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + unsigned long cl; + + if ((cl = xchg(&n->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(n->police); +#endif + if (n->ht_down) + n->ht_down->refcnt--; + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + *kp = key->next; + u32_destroy_key(tp, key); + return 0; + } + } + } + BUG_TRAP(0); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned h; + + for (h=0; h<=ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + 
u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + BUG_TRAP(!ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + BUG_TRAP(0); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + + BUG_TRAP(root_ht != NULL); + + if (root_ht && --root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + struct tc_u_hnode *ht; + struct tc_u_common **tp_cp; + + for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { + if (*tp_cp == tp_c) { + *tp_cp = tp_c->next; + break; + } + } + + for (ht=tp_c->hlist; ht; ht = ht->next) + u32_clear_hnode(tp, ht); + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + BUG_TRAP(ht->refcnt == 0); + + kfree(ht); + }; + + kfree(tp_c); + } + + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode*)ht); + + if (tp->root == ht) + return -EINVAL; + + if (--ht->refcnt == 0) + u32_destroy_hnode(tp, ht); + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned i = 0x7FF; + + for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle|(i>0xFFF ? 
0xFFF : i); +} + +static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht, + struct tc_u_knode *n, struct rtattr **tb) +{ + if (tb[TCA_U32_LINK-1]) { + u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + struct tc_u_hnode *ht_down = NULL; + + if (TC_U32_KEY(handle)) + return -EINVAL; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + return -EINVAL; + ht_down->refcnt++; + } + + ht_down = xchg(&n->ht_down, ht_down); + + if (ht_down) + ht_down->refcnt--; + } + if (tb[TCA_U32_CLASSID-1]) { + unsigned long cl = xchg(&n->res.class, 0); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); + n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + n->res.class = q->ops->cl_ops->bind_tcf(q, n->res.classid); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_U32_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1]); + + tcf_police_release(xchg(&n->police, police)); + } +#endif + return 0; +} + +static int u32_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_U32_MAX]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((n = (struct tc_u_knode*)*arg) != NULL) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp->q, n->ht_up, n, tb); + } + + if (tb[TCA_U32_DIVISOR-1]) { + unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); + ht->tp_c = tp_c; + ht->refcnt = 0; + ht->divisor = divisor; + ht->handle = handle; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH-1]) { + htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (TC_U32_HTID(handle) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + if (TC_U32_HASH(handle) && TC_U32_HASH(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL-1] == 0 || + RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + return -EINVAL; + + s = RTA_DATA(tb[TCA_U32_SEL-1]); + n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); + memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; + err = u32_set_parms(tp->q, ht, n, tb); + if (err == 0) { 
+ struct tc_u_knode **ins; + for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) >= TC_U32_NODE((*ins)->handle)) + break; + n->next = *ins; + *ins = n; + *arg = (unsigned long)n; + return 0; + } + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +#ifdef CONFIG_RTNETLINK +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; + u32 divisor = ht->divisor+1; + RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + } else { + RTA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if (n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + } + if (n->res.classid) + RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + if (n->ht_down) + RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_U32_POLICE, 0, NULL); + + if (tcf_police_dump(skb, n->police) < 0) + goto 
rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + } + + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct tcf_proto_ops cls_u32_ops = { + NULL, + "u32", + u32_classify, + u32_init, + u32_destroy, + + u32_get, + u32_put, + u32_change, + u32_delete, + u32_walk, +#ifdef CONFIG_RTNETLINK + u32_dump +#else + NULL +#endif +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_u32_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} +#endif diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 000000000000..8b0ca4e5ffac --- /dev/null +++ b/net/sched/estimator.c @@ -0,0 +1,184 @@ +/* + * net/sched/estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This text is NOT intended to be used for statistics collection, + its purpose is to provide base for statistical multiplexing + for controlled load service. + If you need only statistics, run user level daemon, which will + periodically read byte counters. + + Unfortunately, rate estimation is not very easy task. + F.e. I did not find a simple way to estimate current peak rate + and even failed to formulate the problem 8)8) + + So that I preferred not to built estimator in scheduler, + but run this task separately. 
+ Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on number of rated + flows, but has minimal overhead on small, but enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<next) { + u64 nbytes = e->stats->bytes; + u32 npackets = e->stats->packets; + u32 rate; + + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + e->stats->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->stats->pps = (e->avpps+0x1FF)>>10; + } + + elist[idx].timer.expires = jiffies + ((HZ/4)<interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->stats = stats; + est->ewma_log = parm->ewma_log; + est->last_bytes = stats->bytes; + est->avbps = stats->bps<<5; + est->last_packets = stats->packets; + est->avpps = stats->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ/4)<interval); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + elist[est->interval].list = est; + return 0; +} + +void qdisc_kill_estimator(struct tc_stats *stats) +{ + int idx; + struct qdisc_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + /* ATOMIC_SET */ + *pest = est->next; + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + diff 
--git a/net/sched/police.c b/net/sched/police.c new file mode 100644 index 000000000000..13599ac490f4 --- /dev/null +++ b/net/sched/police.c @@ -0,0 +1,196 @@ +/* + * net/sched/police.c Input police filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) +#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) + +static u32 idx_gen; +static struct tcf_police *tcf_police_ht[16]; + +static __inline__ unsigned tcf_police_hash(u32 index) +{ + return index&0xF; +} + +static __inline__ struct tcf_police * tcf_police_lookup(u32 index) +{ + struct tcf_police *p; + + for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { + if (p->index == index) + return p; + } + return NULL; +} + +static __inline__ u32 tcf_police_new_index(void) +{ + do { + if (++idx_gen == 0) + idx_gen = 1; + } while (tcf_police_lookup(idx_gen)); + + return idx_gen; +} + + +void tcf_police_destroy(struct tcf_police *p) +{ + unsigned h = tcf_police_hash(p->index); + struct tcf_police **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { + if (*p1p == p) { + *p1p = p->next; + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + if (p->P_tab) + qdisc_put_rtab(p->P_tab); + kfree(p); + return; + } + } + BUG_TRAP(0); +} + +struct tcf_police * tcf_police_locate(struct rtattr *rta) +{ + unsigned h; + struct tcf_police *p; + struct 
rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + + if (rtattr_parse(tb, TCA_POLICE_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta)) < 0) + return NULL; + + if (tb[TCA_POLICE_TBF-1] == NULL) + return NULL; + + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + p->refcnt++; + return p; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + memset(p, 0, sizeof(*p)); + p->refcnt = 1; + if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL) + goto failure; + if (parm->peakrate.rate && + (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL) + goto failure; + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) + p->mtu = 255<R_tab->rate.cell_log; + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? : tcf_police_new_index(); + p->action = parm->action; + h = tcf_police_hash(p->index); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + return p; + +failure: + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + kfree(p); + return NULL; +} + +int tcf_police(struct sk_buff *skb, struct tcf_police *p) +{ + psched_time_t now; + long toks; + long ptoks = 0; + + if (skb->len <= p->mtu) { + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + return TC_POLICE_OK; + } + } + + return p->action; +} + +#ifdef CONFIG_RTNETLINK +int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = 
p->burst; + opt.rate = p->R_tab->rate; + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 000000000000..6d36af30d464 --- /dev/null +++ b/net/sched/sch_api.c @@ -0,0 +1,994 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +#ifdef CONFIG_RTNETLINK +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event); +#endif + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. 
+ - "schedulers", which split all the packets to "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + information supplied by user in the form of handles + to more intelligible for kernel form, to make some sanity + checks and part of work, which is common to all qdiscs + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns number of enqueued packets i.e. this number is 1, + if packet was enqueued sucessfully and <1 if something (not + necessary THIS packet) was dropped. + + Auxiliary routines: + + ---requeue + + requeues once dequeued packet. It is used for non-standard or + just buggy devices, which can defer output even if dev->tbusy=0. + + ---reset + + returns qdisc to initial state: purge all buffers, clear all + timers, counters (except for statistics) etc. + + ---init + + initializes newly created qdisc. + + ---destroy + + destroys resources allocated by init and during lifetime of qdisc. + */ + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. 
*/ + +static struct Qdisc_ops *qdisc_base = NULL; + +/* Register/uregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (strcmp(qops->id, q->id) == 0) + return -EEXIST; + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->requeue == NULL) + qops->requeue = noop_qdisc_ops.requeue; + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + qops->next = NULL; + *qp = qops; + return 0; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (!q) + return -ENOENT; + *qp = q->next; + q->next = NULL; + return 0; +} + +/* We know handle. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) + */ + +struct Qdisc *qdisc_lookup(struct device *dev, u32 handle) +{ + struct Qdisc *q; + + for (q = dev->qdisc_list; q; q = q->next) { + if (q->handle == handle) + return q; + } + return NULL; +} + +/* We know classid. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) 
+ */ + +struct Qdisc *qdisc_lookup_class(struct device *dev, u32 classid) +{ + struct Qdisc *q; + + for (q = dev->qdisc_list; q; q = q->next) { + if (q->classid == classid) + return q; + } + return NULL; +} + + +/* Find queueing discipline by name */ + +struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +{ + struct Qdisc_ops *q; + + if (kind) { + for (q = qdisc_base; q; q = q->next) { + if (rtattr_strcmp(kind, q->id) == 0) + return q; + } + } + return NULL; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, RTA_DATA(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} + + +/* Allocate an unique handle from space managed by kernel */ + +u32 qdisc_alloc_handle(struct device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + + do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i>0 ? autohandle : 0; +} + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + to device "dev". + + Old qdisc is not destroyed but returned in *old. 
+ */ + +int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid, + struct Qdisc *new, struct Qdisc **old) +{ + int err = 0; + + if (parent == NULL) { + BUG_TRAP(classid == TC_H_ROOT); + if (new) { + new->parent = NULL; + new->classid = TC_H_ROOT; + } + *old = dev_set_scheduler(dev, new); + } else { + struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + BUG_TRAP(classid != TC_H_ROOT); + + err = -EINVAL; + + if (cops) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, old); + cops->put(parent, cl); + } + } + } + return err; +} + +#ifdef CONFIG_RTNETLINK + +/* + Allocate and initialize new qdisc. + + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle, + u32 parentid, struct rtattr **tca, int *errp) +{ + int err; + struct rtattr *kind = tca[TCA_KIND-1]; + struct Qdisc *sch = NULL; + int size; + int new = 0; + + if (ops == NULL) { + ops = qdisc_lookup_ops(kind); + err = -EINVAL; + if (ops == NULL) + goto err_out; + new = 1; + } + + size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!sch) + goto err_out; + + /* Grrr... 
Resolve race condition with module unload */ + + err = -EINVAL; + if (new) { + if (ops != qdisc_lookup_ops(kind)) + goto err_out; + } else if (kind) { + if (rtattr_strcmp(kind, ops->id)) + goto err_out; + } + + memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out; + } + sch->handle = handle; + sch->classid = parentid; + + if (ops->init && (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { + sch->next = dev->qdisc_list; + dev->qdisc_list = sch; +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); +#endif + return sch; + } + +err_out: + *errp = err; + if (sch) + kfree(sch); + return NULL; +} + + +/* + Create/delete/change/get qdisc. + */ + +static int tc_ctl_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *old_q; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + struct Qdisc *leaf = NULL; + struct Qdisc_ops *qops = NULL; + int err; + + /* Find device */ + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* If parent is specified, it must exist + and tcm_parent selects a class in parent which + new qdisc will be attached to. + + The place may be already busy by another qdisc, + remember this fact, if it was not auto-created discipline. + */ + if (clid) { + if (clid != TC_H_ROOT) { + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (p == NULL) + return -ENOENT; + leaf = qdisc_lookup_class(dev, clid); + } else + leaf = dev->qdisc_sleeping; + + if (leaf && leaf->flags&TCQ_F_DEFAULT && n->nlmsg_type == RTM_NEWQDISC) + leaf = NULL; + + /* + Also, leaf may be exactly that qdisc, which we want + to control. 
Remember this to avoid one more qdisc_lookup. + */ + + if (leaf && leaf->handle == tcm->tcm_handle) + q = leaf; + } + + /* Try to locate the discipline */ + if (tcm->tcm_handle && q == NULL) { + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* If discipline already exists, check that its real parent + matches to one selected by tcm_parent. + */ + + if (q) { + if (clid && p != q->parent) + return -EINVAL; + BUG_TRAP(!leaf || leaf == q); + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + clid = q->classid; + goto process_existing; + } + + /* The discipline is known not to exist. + If parent was not selected too, return error. + */ + if (clid == 0) + return tcm->tcm_handle ? -ENOENT : -EINVAL; + + /* Check for the case when leaf is exactly the thing, + that you want. + */ + + if (leaf && tcm->tcm_handle == 0) { + q = leaf; + if (!tca[TCA_KIND-1] || rtattr_strcmp(tca[TCA_KIND-1], q->ops->id) == 0) + goto process_existing; + } + + if (n->nlmsg_type != RTM_NEWQDISC || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (leaf && n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + +create_and_graft: + q = qdisc_create(dev, qops, tcm->tcm_handle, clid, tca, &err); + if (q == NULL) + return err; + +graft: + err = qdisc_graft(dev, p, clid, q, &old_q); + if (err) { + if (q) + qdisc_destroy(q); + return err; + } + qdisc_notify(skb, n, old_q, q); + if (old_q) + qdisc_destroy(old_q); + return 0; + +process_existing: + + switch (n->nlmsg_type) { + case RTM_NEWQDISC: + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + qops = q->ops; + goto create_and_graft; + case RTM_GETQDISC: + qdisc_notify(skb, n, NULL, q); + return 0; + case RTM_DELQDISC: + q = NULL; + goto graft; + default: + return -EINVAL; + } +} + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, + pid_t pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = 
skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; + tcm->tcm_parent = q->classid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto rtattr_failure; + q->stats.qlen = q->q.qlen; + RTA_PUT(skb, TCA_STATS, sizeof(q->stats), &q->stats); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && !(old->flags&TCQ_F_DEFAULT)) { + if (tc_fill_qdisc(skb, old, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new) { + if (tc_fill_qdisc(skb, new, pid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, q_idx; + int s_idx, s_q_idx; + struct device *dev; + struct Qdisc *q; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_q_idx = 0; + for (q = dev->qdisc_list, q_idx = 0; q; + q = q->next, q_idx++) { + if (q_idx < s_q_idx) + continue; + if (tc_fill_qdisc(skb, q, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + } + } + +done: + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + +/************************************************ + * Traffic classes manipulation. * + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct device *dev; + struct Qdisc *q = NULL; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. 
Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. */ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc_sleeping->handle; + + /* Now qid is genuine qdisc handle consistent + both with parent and child. + + TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc_sleeping->handle; + } + + /* OK. Locate qdisc */ + if ((q = qdisc_lookup(dev, qid)) == NULL) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + pid_t pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex 
= q->dev ? q->dev->ifindex : 0; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event) +{ + struct sk_buff *skb; + pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct qdisc_dump_args +{ + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct device *dev; + struct Qdisc *q; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct qdisc_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return 0; + + s_t = cb->args[0]; + + for (q=dev->qdisc_list, t=0; q; q = q->next, t++) { + if (t < s_t) continue; + if (!q->ops->cl_ops) continue; + if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle + && (tcm->tcm_parent != TC_H_ROOT || q->parent != NULL)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + arg.w.fn = qdisc_class_dump; + 
arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + + return skb->len; +} +#endif + +int psched_us_per_tick = 1; +int psched_tick_per_us = 1; + +#ifdef CONFIG_PROC_FS +static int psched_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x\n", + psched_tick_per_us, psched_us_per_tick); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} +#endif + +psched_time_t psched_time_base; + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +psched_tdiff_t psched_clock_per_hz; +int psched_clock_scale; +#endif + +#ifdef PSCHED_WATCHER +u32 psched_time_mark; + +static void psched_tick(unsigned long); + +static struct timer_list psched_timer = + { NULL, NULL, 0, 0L, psched_tick }; + +static void psched_tick(unsigned long dummy) +{ +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + psched_time_t dummy_stamp; + PSCHED_GET_TIME(dummy_stamp); + psched_timer.expires = jiffies + 4*HZ; +#else + unsigned long jiffies = now; + psched_time_base = ((u64)now)< delay) + return -1; + delay /= rdelay; + psched_tick_per_us = delay; + while ((delay>>=1) != 0) + psched_clock_scale++; + psched_us_per_tick = 1<>psched_clock_scale; + return 0; +} +#endif + +__initfunc(int pktsched_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; +#endif + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + if (psched_calibrate_clock() < 0) + return -1; +#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES + psched_tick_per_us = HZ<read_proc = psched_read_proc; +#endif + + return 0; +} diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 626afe555345..759ef4d5715c 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -10,6 +10,8 @@ * */ +#include +#include #include #include #include @@ 
-47,222 +49,279 @@ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 - Algorithm skeleton is taken from from NS simulator cbq.cc. + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. ----------------------------------------------------------------------- - Differences from NS version. - - --- WRR algorith is different. Our version looks more reasonable :-) - and fair when quanta are allowed to be less than MTU. - - --- cl->aveidle is REALLY limited from below by cl->minidle. - Seems, it was bug in NS. - - --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel". - When depth increases we expect, that the thing becomes lower, does not it? :-) - Besides that, "depth" word is semantically overloaded --- - "token bucket depth", "sfq depth"... Besides that, the algorithm - was called "top-LEVEL sharing". - - PROBLEM. - - --- Linux has no EOI event at the moment, so that we cannot - estimate true class idle time. Three workarounds are possible, - all of them have drawbacks: - - 1. (as now) Consider the next dequeue event as sign that - previous packet is finished. It is wrong because of ping-pong - buffers, but on permanently loaded link it is true. - 2. (NS approach) Use as link busy time estimate skb->leb/"physical - bandwidth". Even more wrong f.e. on ethernet real busy time much - higher because of collisions. - 3. (seems, the most clever) Split net bh to two parts: - NETRX_BH (for received packets) and preserve NET_BH for transmitter. - It will not require driver changes (NETRX_BH flag will be set - in netif_rx), but will allow to trace EOIs more precisely - and will save useless checks in net_bh. Besides that we will - have to eliminate random calling hard_start_xmit with dev->tbusy flag - (done) and to drop failure_q --- i.e. if !dev->tbusy hard_start_xmit - MUST succeed; failed packets will be dropped on the floor. 
+ Algorithm skeleton is taken from from NS simulator cbq.cc. + If someone wants to check this text against LBL version, + he should take into account that ONLY skeleton is borrowed, + implementation is different. Particularly: + + --- WRR algorithm is different. Our version looks + more reasonable (I hope) and works when quanta are allowed + to be less than MTU, which always is the case, when real time + classes have small rates. Note, that the statement of [3] is incomplete, + Actually delay may be estimated even if class per-round allotment + less than MTU. Namely, if per-round allotment is W*r_i, + and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- Seems, cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translation + from NS. Anyone is advertised to found these differences + and explain me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. It is wrong because of + internal device queueing, but on permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to ideal solution. */ -#define CBQ_TOPLEVEL_SHARING -/* #define CBQ_NO_TRICKERY */ +struct cbq_sched_data; -#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class) struct cbq_class { + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + /* Parameters */ - int priority; /* priority */ -#ifdef CBQ_TOPLEVEL_SHARING - int level; /* level of the class in hierarchy: - 0 for leaf classes, and maximal - level of childrens + 1 for nodes. 
- */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; #endif + u32 defmap; + + /* Link-sharing scheduler parameters */ long maxidle; /* Class paramters: see below. */ + long offtime; long minidle; - int filter_log; -#ifndef CBQ_NO_TRICKERY - long extradelay; -#endif + u32 avpkt; + struct qdisc_rate_table *R_tab; - long quantum; /* Allotment per WRR round */ - long rquantum; /* Relative allotment: see below */ + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; - int cell_log; - unsigned long L_tab[256]; + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ - struct Qdisc *qdisc; /* ptr to CBQ discipline */ - struct cbq_class *root; /* Ptr to root class; - root can be not unique. 
- */ - struct cbq_class *parent; /* Ptr to parent in the class tree */ + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ struct cbq_class *borrow; /* NULL if class is bandwidth limited; parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ struct Qdisc *q; /* Elementary queueing discipline */ - struct cbq_class *next; /* next class in this priority band */ - struct cbq_class *next_alive; /* next class with backlog in this priority band */ /* Variables */ - psched_time_t last; + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ psched_time_t undertime; long avgidle; long deficit; /* Saved deficit for WRR */ - char awake; /* Class is in alive list */ + unsigned long penalized; + struct tc_stats stats; + struct tc_cbq_xstats xstats; -#if 0 - void (*overlimit)(struct cbq_class *cl); -#endif -}; + struct tcf_proto *filter_list; -#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log]) + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX+1]; +}; struct cbq_sched_data { - struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */ - int nclasses[CBQ_MAXPRIO]; - unsigned quanta[CBQ_MAXPRIO]; - unsigned mtu; - int cell_log; - unsigned long L_tab[256]; - struct cbq_class *fallback_class; + struct cbq_class *classes[16]; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO+1]; + unsigned quanta[TC_CBQ_MAXPRIO+1]; + + struct cbq_class link; unsigned activemask; - struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes - with backlog */ - struct cbq_class *last_sent; - 
int last_sent_len; + struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + with backlog */ + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; psched_time_t now; /* Cached timestamp */ + unsigned pmask; + struct timer_list delay_timer; struct timer_list wd_timer; /* Wathchdog timer, that started when CBQ has backlog, but cannot transmit just now */ - unsigned long wd_expires; -#ifdef CBQ_TOPLEVEL_SHARING - struct cbq_class *borrowed; + long wd_expires; int toplevel; -#endif + u32 hgenerator; }; -/* - WRR quanta - ---------- - cl->quantum is number added to class allotment on every round. - cl->rquantum is "relative" quantum. +#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) - For real-time classes: +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } - cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio] - where q->quanta[prio] is sum of all rquanta for given priority. - cl->rquantum can be identified with absolute rate of the class - in arbitrary units (f.e. bytes/sec) +static __inline__ unsigned cbq_hash(u32 h) +{ + h ^= h>>8; + h ^= h>>4; + return h&0xF; +} - In this case, delay introduced by round-robin was estimated by - Sally Floyd [2] as: +static __inline__ struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct cbq_class *cl; - D = q->nclasses*q->mtu/(bandwidth/2) + for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) + if (cl->classid == classid) + return cl; + return NULL; +} - Note, that D does not depend on class rate (it is very bad), - but not much worse than Gallager-Parekh estimate for CSZ - C/R = q->mtu/rate, when real-time classes have close rates. +#ifdef CONFIG_NET_CLS_POLICE - For not real-time classes this folmula is not necessary, - so that cl->quantum can be set to any reasonable not zero value. 
- Apparently, it should be proportional to class rate, if the - rate is not zero. -*/ +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl, *new; -/* - maxidle, minidle, extradelay - ---------------------------- - - CBQ estimator calculates smoothed class idle time cl->aveidle, - considering class as virtual interface with corresponding bandwidth. - When cl->aveidle wants to be less than zero, class is overlimit. - When it is positive, class is underlimit. - - * maxidle bounds aveidle from above. - It controls maximal length of burst in this class after - long period of idle time. Burstness of active class - is controlled by filter constant cl->filter_log, - but this number is related to burst length only indirectly. - - * minidle is a negative number, normally set to zero. - Setting it to not zero value allows avgidle to drop - below zero, effectively penalizing class, when it is overlimit. - When the class load will decrease, it will take a time to - raise negative avgidle to put the class at limit. - It should be set to zero for leaf classes. - - * extradelay is penalty in delay, when a class goes overlimit. - I believe this parameter is useless and confusing. - Setting it to not zero forces class to accumulate - its "idleness" for extradelay and then send BURST of packets - until going to overlimit again. Non-sense. - - For details see [1] and [3]. - - Really, minidle and extradelay are irrelevant to real scheduling - task. As I understand, SF&VJ introduced them to experiment - with CBQ simulator in attempts to fix erratic behaviour - of ancestor-only (and, partially, top-level) algorithm. - - WARNING. - - User passes them measured in usecs, but cl->minidle, - cl->maxidle and cl->aveidle are scaled with cl->filter_log - in the text of the scheduler. 
-*/ + for (cl = this->tparent; cl; cl = cl->tparent) + if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) + return new; + + return NULL; +} + +#endif + +/* Classify packet. The procedure is pretty complicated, but + it allows us to combine link sharing and priority scheduling + transparently. + + Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + so that it resolves to split nodes. Then packeta are classified + by logical priority, or more specific classifier may be attached + to split node. + */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio^sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + for (;;) { + int result = 0; + + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + goto fallback; + + if ((cl = (void*)res.class) == NULL) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL) + goto fallback; + } + + if (cl->level == 0) { +#ifdef CONFIG_NET_CLS_POLICE + if (result) + return cbq_reclassify(skb, cl); +#endif + return cl; + } + + /* + * Step 3+n. If classifier selected link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... 
+ */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} /* A packet has just been enqueued on the empty class. - cbq_wakeup_class adds it to the tail of active class list + cbq_activate_class adds it to the tail of active class list of its priority band. */ -static __inline__ void cbq_wakeup_class(struct cbq_class *cl) +static __inline__ void cbq_activate_class(struct cbq_class *cl) { struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; - int prio = cl->priority; + int prio = cl->cpriority; struct cbq_class *cl_tail; - cl->awake = 1; - cl_tail = q->active[prio]; q->active[prio] = cl; if (cl_tail != NULL) { cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; cl->deficit = 0; } else { cl->next_alive = cl; @@ -271,58 +330,353 @@ static __inline__ void cbq_wakeup_class(struct cbq_class *cl) } } +/* + Unlink class from active chain. + Note, that the same procedure is made directly in cbq_dequeue* + during round-robin procedure. 
+ */ + +static void cbq_deactivate_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<next_alive; + cl->deficit += cl->quantum; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static __inline__ void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + if (q->toplevel > 0) { + psched_time_t now; + PSCHED_GET_TIME(now); + if (PSCHED_TLESS(now, q->now)) + now = q->now; + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = 0; + return; + } + while ((cl = cl->borrow) != NULL + && q->toplevel > cl->level) { + if (PSCHED_TLESS(cl->borrow->undertime, now)) { + q->toplevel = cl->level; + return; + } + } + } +} + static int cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; - struct cbq_class *cl = CBQ_CLASSIFIER(skb, q); + struct cbq_class *cl = cbq_classify(skb, sch); + int len = skb->len; - if (cl->q->enqueue(skb, cl->q) == 1) { + if (cl && cl->q->enqueue(skb, cl->q) == 1) { sch->q.qlen++; + sch->stats.packets++; + cl->stats.packets++; + sch->stats.bytes+=len; + cl->stats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return 1; + } -#ifdef CBQ_TOPLEVEL_SHARING - if (q->toplevel > 0) { - psched_time_t now; - PSCHED_GET_TIME(now); - if (PSCHED_TLESS(cl->undertime, now)) - q->toplevel = 0; - else if (q->toplevel > 1 && cl->borrow && - PSCHED_TLESS(cl->borrow->undertime, now)) - q->toplevel = 1; - } -#endif - if (!cl->awake) - cbq_wakeup_class(cl); + sch->stats.drops++; + if (cl == NULL) + kfree_skb(skb); + else + 
cl->stats.drops++; + return 0; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->stats.drops++; + return 0; + } + q->tx_class = NULL; + + if (cl->q->ops->requeue(skb, cl->q) == 1) { + sch->q.qlen++; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); return 1; } + sch->stats.drops++; + cl->stats.drops++; return 0; } -static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl) +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (!cl->delayed) { + psched_tdiff_t delay; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. 
+ */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay < 0) + delay = 0; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + cl->xstats.overactions++; + cl->delayed = 1; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + while (cl && cl->delayed) { + cl = cl->borrow; + if (cl->level > q->toplevel) + return; + } + + if (cl) + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (!cl->delayed) { + psched_tdiff_t delay; + unsigned long sched = jiffies; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (delay > 0) { + sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<delay_timer) && + (long)(q->delay_timer.expires - sched) > 0) + q->delay_timer.expires = sched; + add_timer(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + } + } +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) { - long delay; + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<cpriority); + cl->xstats.overactions++; + } + 
cbq_ovl_classic(cl); +} - delay = PSCHED_TDIFF(cl->undertime, q->now); - if (q->wd_expires == 0 || q->wd_expires - delay > 0) - q->wd_expires = delay; +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); } static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + qdisc_wakeup(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1< 0) { + q->pmask |= 1<delay_timer.expires = jiffies + delay; + add_timer(&q->delay_timer); + } - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->parent; + struct cbq_sched_data *q = (struct cbq_sched_data 
*)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, child->classid); + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + if (cl->q->enqueue(skb, cl->q) == 1) { + sch->q.qlen++; + sch->stats.packets++; + cl->stats.packets++; + sch->stats.bytes+=len; + cl->stats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->stats.drops++; + return 0; + } + + sch->stats.drops++; + return -1; +} +#endif + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + if (cl && q->toplevel >= cl->level) { + if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, cl->undertime)) + q->toplevel = TC_CBQ_MAXLEVEL; + else /* BUGGGG? if (cl != this) */ + q->toplevel = cl->level; + } +} + static __inline__ void cbq_update(struct cbq_sched_data *q) { - struct cbq_class *cl; + struct cbq_class *cl = q->tx_class; + int len = q->tx_len; + + q->tx_class = NULL; - for (cl = q->last_sent; cl; cl = cl->parent) { + for ( ; cl; cl = cl->share) { long avgidle = cl->avgidle; long idle; @@ -333,26 +687,17 @@ cbq_update(struct cbq_sched_data *q) idle = (now - last) - last_pktlen/rate */ - idle = PSCHED_TDIFF(q->now, cl->last) - - L2T(cl, q->last_sent_len); + idle = PSCHED_TDIFF(q->now, cl->last) - L2T(cl, len); /* true_avgidle := (1-W)*true_avgidle + W*idle, - where W=2^{-filter_log}. But cl->avgidle is scaled: + where W=2^{-ewma_log}. But cl->avgidle is scaled: cl->avgidle == true_avgidle/W, hence: */ - avgidle += idle - (avgidle>>cl->filter_log); + avgidle += idle - (avgidle>>cl->ewma_log); if (avgidle <= 0) { /* Overlimit or at-limit */ -#ifdef CBQ_NO_TRICKERY - avgidle = 0; -#else - if (avgidle < cl->minidle) - avgidle = cl->minidle; -#endif - - /* This line was missing in NS. 
*/ cl->avgidle = avgidle; /* Calculate expected time, when this class @@ -362,29 +707,24 @@ cbq_update(struct cbq_sched_data *q) idle = (1/W - 1)*(-true_avgidle) or idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + /* That is not all. - We want to set undertime to the moment, when - the class is allowed to start next transmission i.e. - (undertime + next_pktlen/phys_bandwidth) - - now - next_pktlen/rate = idle - or - undertime = now + idle + next_pktlen/rate - - next_pktlen/phys_bandwidth - - We do not know next packet length, but can - estimate it with average packet length - or current packet_length. + To maintain rate allocated to class, + we add to undertime virtual clock, + necassry to complete transmitted packet. + (len/phys_bandwidth has been already passed + to the moment of cbq_update) */ - idle = (-avgidle) - ((-avgidle) >> cl->filter_log); - idle += L2T(q, q->last_sent_len); - idle -= L2T(cl, q->last_sent_len); + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + PSCHED_TADD2(q->now, idle, cl->undertime); -#ifndef CBQ_NO_TRICKERY - /* Do not forget extra delay :-) */ - PSCHED_TADD(cl->undertime, cl->extradelay); -#endif } else { /* Underlimit */ @@ -393,60 +733,44 @@ cbq_update(struct cbq_sched_data *q) cl->avgidle = cl->maxidle; else cl->avgidle = avgidle; + } cl->last = q->now; } -#ifdef CBQ_TOPLEVEL_SHARING - cl = q->last_sent; - - if (q->borrowed && q->toplevel >= q->borrowed->level) { - if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime)) - q->toplevel = CBQ_MAXLEVEL; - else if (q->borrowed != cl) - q->toplevel = q->borrowed->level; - } -#endif - - q->last_sent = NULL; + cbq_update_toplevel(q, q->tx_borrowed); } -static __inline__ int +static __inline__ struct cbq_class * cbq_under_limit(struct cbq_class *cl) { struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; struct cbq_class *this_cl = cl; - if (PSCHED_IS_PASTPERFECT(cl->undertime) || 
cl->parent == NULL) - return 1; + if (cl->tparent == NULL) + return cl; - if (PSCHED_TLESS(cl->undertime, q->now)) { - q->borrowed = cl; - return 1; + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + PSCHED_TLESS(cl->undertime, q->now)) { + cl->delayed = 0; + return cl; } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && PSCHED_TLESS(q->now, cl->undertime)) { - cl = cl->borrow; - if (cl == NULL -#ifdef CBQ_TOPLEVEL_SHARING - || cl->level > q->toplevel -#endif - ) { -#if 0 + if ((cl = cl->borrow) == NULL || cl->level > q->toplevel) { + this_cl->stats.overlimits++; this_cl->overlimit(this_cl); -#else - cbq_delay(q, this_cl); -#endif - return 0; + return NULL; } } - q->borrowed = cl; - return 1; + this_cl->xstats.borrows++; + cl->xstats.borrows++; + return cl; } static __inline__ struct sk_buff * -cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) +cbq_dequeue_prio(struct Qdisc *sch, int prio) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct cbq_class *cl_tail, *cl_prev, *cl; @@ -461,23 +785,14 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) /* Start round */ do { + struct cbq_class *borrow; + /* Class is empty */ - if (cl->q->q.qlen == 0) + if (cl->q->q.qlen == 0) goto skip_class; - - if (fallback) { - /* Fallback pass: all classes are overlimit; - we send from the first class that is allowed - to borrow. 
- */ - if (cl->borrow == NULL) - goto skip_class; - } else { - /* Normal pass: check that class is under limit */ - if (!cbq_under_limit(cl)) - goto skip_class; - } + if ((borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; if (cl->deficit <= 0) { /* Class exhausted its allotment per this @@ -496,8 +811,9 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) goto skip_class; cl->deficit -= skb->len; - q->last_sent = cl; - q->last_sent_len = skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + q->tx_len = skb->len; if (cl->deficit <= 0) { q->active[prio] = cl; @@ -509,10 +825,12 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) skip_class: cl->deficit = 0; - if (cl->q->q.qlen == 0) { - /* Class is empty, declare it dead */ + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. + */ cl_prev->next_alive = cl->next_alive; - cl->awake = 0; + cl->next_alive = NULL; /* Did cl_tail point to it? */ if (cl == cl_tail) { @@ -524,9 +842,17 @@ skip_class: /* Kill the band! */ q->active[prio] = NULL; q->activemask &= ~(1<q->q.qlen) + cbq_activate_class(cl); return NULL; } + + q->active[prio] = cl_tail; } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; } next_class: @@ -537,22 +863,22 @@ next_class: } while (deficit); q->active[prio] = cl_prev; - + return NULL; } static __inline__ struct sk_buff * -cbq_dequeue_1(struct Qdisc *sch, int fallback) +cbq_dequeue_1(struct Qdisc *sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct sk_buff *skb; unsigned activemask; - activemask = q->activemask; + activemask = q->activemask&0xFF; while (activemask) { int prio = ffz(~activemask); activemask &= ~(1<data; + psched_time_t now; - PSCHED_GET_TIME(q->now); + PSCHED_GET_TIME(now); - if (q->last_sent) + if (q->tx_class) { + /* Time integrator. We calculate EOS time + by adding expected packet transmittion time. 
+ If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + PSCHED_TADD(q->now, L2T(&q->link, q->tx_len)); + if (PSCHED_TLESS(q->now, now)) + q->now = now; cbq_update(q); + } else if (PSCHED_TLESS(q->now, now)) + q->now = now; - q->wd_expires = 0; + for (;;) { + q->wd_expires = 0; - skb = cbq_dequeue_1(sch, 0); - if (skb) - return skb; + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + return skb; + } - /* All the classes are overlimit. - Search for overlimit class, which is allowed to borrow - and use it as fallback case. - */ + /* All the classes are overlimit. -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif + It is possible, if: + + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because requires + two passes, but it is inavoidable with top-level sharing. + */ - skb = cbq_dequeue_1(sch, 1); - if (skb) - return skb; + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } /* No packets in scheduler or nobody wants to give them to us :-( Sigh... start watchdog timer in the last case. 
*/ - if (sch->q.qlen && q->wd_expires) { - if (q->wd_timer.function) + if (sch->q.qlen) { + sch->stats.overlimits++; + if (q->wd_expires && !sch->dev->tbusy) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); del_timer(&q->wd_timer); - q->wd_timer.function = cbq_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); - add_timer(&q->wd_timer); + if (delay <= 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + } } return NULL; } @@ -606,234 +965,974 @@ cbq_dequeue(struct Qdisc *sch) static void cbq_adjust_levels(struct cbq_class *this) { - struct cbq_class *cl; + if (this == NULL) + return; - for (cl = this->parent; cl; cl = cl->parent) { - if (cl->level > this->level) - return; - cl->level = this->level + 1; - this = cl; - } + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); } static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; + unsigned h; if (q->quanta[prio] == 0) return; - for (cl = q->classes[prio]; cl; cl = cl->next) { - if (cl->rquantum) - cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/ - q->quanta[prio]; + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk("Damn! 
%08x cl->quantum==%ld\n", cl->classid, cl->quantum); + cl->quantum = 1; + } + } } } -static __inline__ int cbq_unlink_class(struct cbq_class *this) +static void cbq_sync_defmap(struct cbq_class *cl) { - struct cbq_class *cl, **clp; - struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *split = cl->split; + unsigned h; + int i; - for (clp = &q->classes[this->priority]; (cl = *clp) != NULL; - clp = &cl->next) { - if (cl == this) { - *clp = cl->next; - return 0; - } + if (split == NULL) + return; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<defaults[i] = NULL; } - return -ENOENT; -} -static int cbq_prune(struct cbq_class *this) -{ - struct cbq_class *cl; - int prio = this->priority; - struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; - qdisc_reset(this->q); + if (split->defaults[i]) + continue; - if (cbq_unlink_class(this)) - return -ENOENT; + for (h=0; h<16; h++) { + struct cbq_class *c; - if (this->awake) { - struct cbq_class *cl_prev = q->active[prio]; - do { - cl = cl_prev->next_alive; - if (cl == this) { - cl_prev->next_alive = cl->next_alive; - - if (cl == q->active[prio]) { - q->active[prio] = cl; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - q->activemask &= ~(1<classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<defaults[i] = c; + level = c->level; } - - cl = cl->next_alive; - cl->deficit += cl->quantum; - break; } - } while ((cl_prev = cl) != q->active[prio]); + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; } - --q->nclasses[prio]; - if (this->rquantum) { - q->quanta[prio] -= this->rquantum; - 
cbq_normalize_quanta(q, prio); + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; } - if (q->fallback_class == this) - q->fallback_class = NULL; + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); - this->parent = NULL; - this->borrow = NULL; - this->root = this; - this->qdisc = NULL; - return 0; + cbq_sync_defmap(cl); } -static int cbq_graft(struct cbq_class *this, struct cbq_class *parent) +static void cbq_unlink_class(struct cbq_class *this) { struct cbq_class *cl, **clp; - int prio = this->priority; struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; - qdisc_reset(this->q); + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + if (this->tparent) { + clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); - for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) { - if (cl == this) - return -EBUSY; + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; - cl->next = NULL; - *clp = cl; - - cl->parent = parent; - cl->borrow = parent; - cl->root = parent ? 
parent->root : cl; + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; - ++q->nclasses[prio]; - if (this->rquantum) { - q->quanta[prio] += this->rquantum; - cbq_normalize_quanta(q, prio); + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; } - - cbq_adjust_levels(this); +} +static int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int h; + + for (h = TC_CBQ_MAXPRIO; h >= 0; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + if (cl->q->ops->drop && cl->q->ops->drop(cl->q)) + return 1; + } + } return 0; } - static void cbq_reset(struct Qdisc* sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct cbq_class *cl; int prio; + unsigned h; q->activemask = 0; - q->last_sent = NULL; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; - } -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif - - for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; - - for (cl = q->classes[prio]; cl; cl = cl->next) { + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { qdisc_reset(cl->q); cl->next_alive = NULL; PSCHED_SET_PASTPERFECT(cl->undertime); cl->avgidle = 0; cl->deficit = 0; - cl->awake = 0; + cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? 
NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) + cl->maxidle = lss->maxidle; + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police 
*p) +{ + cl->police = p->police; + + if (!(cl->q->flags&TCQ_F_DEFAULT)) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + + MOD_INC_USE_COUNT; + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + + cbq_link_class(&q->link); + + if 
(tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +#ifdef CONFIG_RTNETLINK + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct 
tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + q->link.xstats.avgidle = q->link.avgidle; + RTA_PUT(skb, TCA_XSTATS, sizeof(q->link.xstats), &q->link.xstats); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + + rta = (struct rtattr*)b; + 
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + cl->stats.qlen = cl->q->q.qlen; + RTA_PUT(skb, TCA_STATS, sizeof(cl->stats), &cl->stats); + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + RTA_PUT(skb, TCA_XSTATS, sizeof(cl->xstats), &cl->xstats); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#endif + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif } + if ((*old = xchg(&cl->q, new)) != NULL) + qdisc_reset(*old); + + return 0; + } + return -ENOENT; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tp->ops->destroy(tp); } } +static void cbq_destroy_class(struct cbq_class *cl) +{ + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&cl->stats); +#endif +} + static void cbq_destroy(struct Qdisc* sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; - struct cbq_class *cl, **clp; - int prio; + struct cbq_class *cl; + unsigned h; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) + 
cbq_destroy_filters(cl); + } + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) + if (cl != &q->link) + cbq_destroy_class(cl); + } + + qdisc_put_rtab(q->link.R_tab); +} - for (prio = 0; prio < CBQ_MAXPRIO; prio++) { - struct cbq_class *cl_head = q->classes[prio]; - - for (clp = &cl_head; (cl=*clp) != NULL; clp = &cl->next) { - qdisc_destroy(cl->q); - kfree(cl); +static void cbq_put(struct Qdisc *q, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) + cbq_destroy_class(cl); + return; +} + +static int +cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || + rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + 
if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; } + + /* Change class parameters */ + start_bh_atomic(); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + end_bh_atomic(); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + qdisc_kill_estimator(&cl->stats); + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); + } +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + 
goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + + start_bh_atomic(); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + cbq_adjust_levels(parent); + end_bh_atomic(); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; } -static int cbq_control(struct Qdisc *sch, void *arg) +static int cbq_delete(struct Qdisc *sch, unsigned long arg) { - struct cbq_sched_data *q; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + start_bh_atomic(); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_class == cl) + q->tx_class = cl->borrow; + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + cbq_rmprio(q, cl); - 
q = (struct cbq_sched_data *)sch->data; + if (--cl->refcnt == 0) + cbq_destroy_class(cl); - /* Do attachment here. It is the last thing to do. */ + end_bh_atomic(); - return -EINVAL; + return 0; } -static int cbq_init(struct Qdisc *sch, void *arg) +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) { - struct cbq_sched_data *q; - struct cbqctl *ctl = (struct cbqctl*)arg; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class *)arg; - q = (struct cbq_sched_data *)sch->data; - init_timer(&q->wd_timer); - q->wd_timer.data = (unsigned long)sch; -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, classid); + if (cl) { + cl->filters++; + return (unsigned long)cl; + } return 0; } +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; -struct Qdisc_ops cbq_ops = + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops cbq_class_ops = +{ + cbq_graft, + cbq_get, + cbq_put, + cbq_change, + cbq_delete, + cbq_walk, + + cbq_find_tcf, + cbq_bind_filter, + cbq_unbind_filter, + +#ifdef CONFIG_RTNETLINK + cbq_dump_class, +#endif +}; + +struct Qdisc_ops cbq_qdisc_ops = { NULL, + &cbq_class_ops, "cbq", - 0, sizeof(struct cbq_sched_data), 
+ cbq_enqueue, cbq_dequeue, + cbq_requeue, + cbq_drop, + + cbq_init, cbq_reset, cbq_destroy, - cbq_init, - cbq_control, + +#ifdef CONFIG_RTNETLINK + cbq_dump, +#endif }; #ifdef MODULE int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&cbq_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&cbq_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&cbq_qdisc_ops); } #endif diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c index 5e10ac097bb2..c21d8ac43aca 100644 --- a/net/sched/sch_csz.c +++ b/net/sched/sch_csz.c @@ -10,6 +10,8 @@ * */ +#include +#include #include #include #include @@ -48,16 +50,16 @@ but it has pretty poor delay characteristics. Round-robin scheduling and link-sharing goals apparently contradict to minimization of network delay and jitter. - Moreover, correct handling of predicted flows seems to be + Moreover, correct handling of predictive flows seems to be impossible in CBQ. CSZ presents more precise but less flexible and less efficient approach. As I understand, the main idea is to create WFQ flows for each guaranteed service and to allocate the rest of bandwith to dummy flow-0. Flow-0 comprises - the predicted services and the best effort traffic; + the predictive services and the best effort traffic; it is handled by a priority scheduler with the highest - priority band allocated for predicted services, and the rest --- + priority band allocated for predictive services, and the rest --- to the best effort packets. Note, that in CSZ flows are NOT limited to their bandwidth. @@ -67,14 +69,16 @@ will introduce undesired delays and raise jitter. At the moment CSZ is the only scheduler that provides - real guaranteed service. Another schemes (including CBQ) + true guaranteed service. Another schemes (including CBQ) do not provide guaranteed delay and randomize jitter. 
There exists the statement (Sally Floyd), that delay can be estimated by a IntServ compliant formulae. This result is true formally, but it is wrong in principle. - At first, it ignores delays introduced by link sharing. - And the second (and main) it limits bandwidth, - it is fatal flaw. + It takes into account only round-robin delays, + ignoring delays introduced by link sharing i.e. overlimiting. + Note, that temporary overlimits are inevitable because + real links are not ideal, and true algorithm must take it + into account. ALGORITHM. @@ -204,9 +208,8 @@ /* This number is arbitrary */ -#define CSZ_MAX_GUARANTEED 16 - -#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED) +#define CSZ_GUARANTEED 16 +#define CSZ_FLOWS (CSZ_GUARANTEED+4) struct csz_head { @@ -224,12 +227,15 @@ struct csz_flow struct csz_head *fprev; /* Parameters */ - unsigned long rate; /* Flow rate. Fixed point is at rate_log */ - unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */ - unsigned long max_bytes; /* Maximal length of queue */ + struct tc_ratespec rate; + struct tc_ratespec slice; + u32 *L_tab; /* Lookup table for L/(B*r_a) values */ + unsigned long limit; /* Maximal length of queue */ #ifdef CSZ_PLUS_TBF - unsigned long depth; /* Depth of token bucket, normalized + struct tc_ratespec peakrate; + __u32 buffer; /* Depth of token bucket, normalized as L/(B*r_a) */ + __u32 mtu; #endif /* Variables */ @@ -246,12 +252,11 @@ struct csz_flow struct sk_buff_head q; /* FIFO queue */ }; -#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log]) +#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log]) struct csz_sched_data { /* Parameters */ - unsigned char cell_log; /* 1< 2.1sec is MAXIMAL value */ /* Variables */ + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; #ifdef CSZ_PLUS_TBF struct timer_list wd_timer; long wd_expires; @@ -270,8 +277,8 @@ struct csz_sched_data struct csz_head f; /* Flows sorted by "finish" */ struct sk_buff_head other[4];/* Predicted (0) and the best efforts 
- classes (1,2,3) */ - struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */ + classes (1,2,3) */ + struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */ }; /* These routines (csz_insert_finish and csz_insert_start) are @@ -353,7 +360,11 @@ extern __inline__ void csz_insert_start(struct csz_head *b, It is another time consuming part, but it is impossible to avoid it. - Fixed point arithmetic is not ... does not ... Well, it is just CRAP. + It costs O(N) that make all the algorithm useful only + to play with closest to ideal fluid model. + + There exist less academic, but more practical modifications, + which might have even better characteristics (WF2Q+, HPFQ, HFSC) */ static unsigned long csz_update(struct Qdisc *sch) @@ -430,9 +441,9 @@ do_reset: tmp = ((F-q->R_c)*q->rate)<R_log; R_c = F; - q->rate -= a->rate; + q->rate -= a->slice.rate; - if (delay - tmp >= 0) { + if ((long)(delay - tmp) >= 0) { delay -= tmp; continue; } @@ -443,35 +454,41 @@ do_reset: return tmp; } +unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q) +{ + return CSZ_GUARANTEED; +} + static int csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - unsigned flow_id = CSZ_FLOW_ID(skb); + unsigned flow_id = csz_classify(skb, q); unsigned long R; - int prio; + int prio = 0; struct csz_flow *this; - if (flow_id >= CSZ_MAX_GUARANTEED) { - prio = flow_id - CSZ_MAX_GUARANTEED; + if (flow_id >= CSZ_GUARANTEED) { + prio = flow_id - CSZ_GUARANTEED; flow_id = 0; } this = &q->flow[flow_id]; - if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) { + if (this->q.qlen >= this->limit || this->L_tab == NULL) { + sch->stats.drops++; kfree_skb(skb); return 0; } R = csz_update(sch); - if (this->finish - R >= 0) { + if ((long)(this->finish - R) >= 0) { /* It was active */ - this->finish += L2R(q,this,skb->len); + this->finish += L2R(this,skb->len); } else { /* It is inactive; activate it */ - this->finish = R + 
L2R(q,this,skb->len); - q->rate += this->rate; + this->finish = R + L2R(this,skb->len); + q->rate += this->slice.rate; csz_insert_finish(&q->f, this); } @@ -486,6 +503,8 @@ csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) else skb_queue_tail(&q->other[prio], skb); sch->q.qlen++; + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; } @@ -524,10 +543,6 @@ skb_peek_best(struct csz_sched_data * q) static void csz_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; - struct csz_sched_data *q = (struct csz_sched_data*)sch->data; - - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } @@ -568,7 +583,7 @@ static __inline__ int csz_enough_tokens(struct csz_sched_data *q, if (toks >= 0) { /* Now we have enough tokens to proceed */ - this->tokens = toks <= this->depth ? toks ? this->depth; + this->tokens = toks <= this->depth ? toks : this->depth; this->t_tbf = now; if (!this->throttled) @@ -601,7 +616,7 @@ static __inline__ int csz_enough_tokens(struct csz_sched_data *q, This apriory shift in R will be adjusted later to reflect real delay. We cannot avoid it because of: - throttled flow continues to be active from the viewpoint - of CSZ, so that it would acquire highest priority, + of CSZ, so that it would acquire the highest priority, if you not adjusted start numbers. - Eventually, finish number would become less than round number and flow were declared inactive. 
@@ -654,7 +669,7 @@ csz_dequeue(struct Qdisc* sch) #endif if (this->q.qlen) { struct sk_buff *nskb = skb_peek(&this->q); - this->start += L2R(q,this,nskb->len); + this->start += L2R(this,nskb->len); csz_insert_start(&q->s, this); } sch->q.qlen--; @@ -668,7 +683,7 @@ csz_dequeue(struct Qdisc* sch) if (--this->q.qlen) { struct sk_buff *nskb; - unsigned dequeued = L2R(q,this,skb->len); + unsigned dequeued = L2R(this,skb->len); /* We got not the same thing that peeked earlier; adjust start number @@ -677,7 +692,7 @@ csz_dequeue(struct Qdisc* sch) this->start += dequeued - peeked; nskb = skb_peek_best(q); - peeked = L2R(q,this,nskb->len); + peeked = L2R(this,nskb->len); this->start += peeked; this->peeked = peeked; csz_insert_start(&q->s, this); @@ -692,11 +707,13 @@ csz_dequeue(struct Qdisc* sch) Schedule watchdog timer, if it occured because of shaping. */ if (q->wd_expires) { - if (q->wd_timer.function) - del_timer(&q->wd_timer); - q->wd_timer.function = csz_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires); + del_timer(&q->wd_timer); + if (delay == 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; add_timer(&q->wd_timer); + sch->stats.overlimits++; } #endif return NULL; @@ -706,17 +723,14 @@ static void csz_reset(struct Qdisc* sch) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct sk_buff *skb; int i; for (i=0; i<4; i++) - while ((skb=skb_dequeue(&q->other[i])) != NULL) - kfree_skb(skb); + skb_queue_purge(&q->other[i]); - for (i=0; iflow + i; - while ((skb = skb_dequeue(&this->q)) != NULL) - kfree_skb(skb); + skb_queue_purge(&this->q); this->snext = this->sprev = this->fnext = this->fprev = (struct csz_head*)this; this->start = this->finish = 0; @@ -727,10 +741,7 @@ csz_reset(struct Qdisc* sch) #ifdef CSZ_PLUS_TBF PSCHED_GET_TIME(&q->t_tbf); q->tokens = q->depth; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.function = NULL; - } 
+ del_timer(&q->wd_timer); #endif sch->q.qlen = 0; } @@ -738,25 +749,34 @@ csz_reset(struct Qdisc* sch) static void csz_destroy(struct Qdisc* sch) { -/* - struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - int i; - - for (i=0; i<4; i++) - qdisc_destroy(q->other[i]); - */ + MOD_DEC_USE_COUNT; } -static int csz_init(struct Qdisc *sch, void *arg) +static int csz_init(struct Qdisc *sch, struct rtattr *opt) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct cszinitctl *ctl = (struct cszinitctl*)arg; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_qopt *qopt; int i; + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + q->R_log = qopt->R_log; + q->delta_log = qopt->delta_log; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= CSZ_FLOWS) + return -EINVAL; + q->prio2band[i] = qopt->priomap[i]; + } + for (i=0; i<4; i++) skb_queue_head_init(&q->other[i]); - for (i=0; iflow + i; skb_queue_head_init(&this->q); this->snext = this->sprev = @@ -769,64 +789,268 @@ static int csz_init(struct Qdisc *sch, void *arg) #ifdef CSZ_PLUS_TBF init_timer(&q->wd_timer); q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = csz_watchdog; +#endif + MOD_INC_USE_COUNT; + return 0; +} + +#ifdef CONFIG_RTNETLINK +static int csz_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.flows = CSZ_FLOWS; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} #endif - if (ctl) { - if (ctl->flows != 
CSZ_MAX_GUARANTEED) + + +static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + return -EINVAL; +} + +static unsigned long csz_get(struct Qdisc *sch, u32 classid) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid) - 1; + + if (band >= CSZ_FLOWS) + return 0; + + if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL) + return 0; + + return band+1; +} + +static void csz_put(struct Qdisc *sch, unsigned long cl) +{ + return; +} + +static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_copt *copt; + + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt)) + return -EINVAL; + copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + if (tb[TCA_CSZ_RTAB-1] && + RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024) + return -EINVAL; + + if (cl) { + struct csz_flow *a; + cl--; + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) return -EINVAL; - q->cell_log = ctl->cell_log; + + a = &q->flow[cl]; + + start_bh_atomic(); +#if 0 + a->rate_log = copt->rate_log; +#endif +#ifdef CSZ_PLUS_TBF + a->limit = copt->limit; + a->rate = copt->rate; + a->buffer = copt->buffer; + a->mtu = copt->mtu; +#endif + + if (tb[TCA_CSZ_RTAB-1]) + memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024); + + end_bh_atomic(); + return 0; } + /* NI */ return 0; } -static int csz_control(struct Qdisc *sch, struct pschedctl *gctl) +static int csz_delete(struct Qdisc *sch, unsigned long cl) { -/* struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct cszctl *ctl = (struct cszctl*)gctl->arg; - struct sk_buff *skb; - int i; + struct 
csz_flow *a; + + cl--; + + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) + return -EINVAL; + + a = &q->flow[cl]; + + start_bh_atomic(); + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + a->sprev->snext = a->snext; + a->snext->sprev = a->sprev; + a->start = a->finish = 0; + kfree(xchg(&q->flow[cl].L_tab, NULL)); + end_bh_atomic(); - if (op == PSCHED_TC_ATTACH) { - - } -*/ return 0; } +#ifdef CONFIG_RTNETLINK +static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_copt opt; + + tcm->tcm_handle = sch->handle|cl; + + cl--; + + if (cl > CSZ_FLOWS) + goto rtattr_failure; + + if (cl < CSZ_GUARANTEED) { + struct csz_flow *f = &q->flow[cl]; + + if (f->L_tab == NULL) + goto rtattr_failure; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = f->limit; + opt.rate = f->rate; + opt.slice = f->slice; + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); +#ifdef CSZ_PLUS_TBF + opt.buffer = f->buffer; + opt.mtu = f->mtu; +#else + opt.buffer = 0; + opt.mtu = 0; +#endif + + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + } + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int prio = 0; + + if (arg->stop) + return; + + for (prio = 0; prio < CSZ_FLOWS; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct 
csz_sched_data *q = (struct csz_sched_data *)sch->data; + + if (cl) + return NULL; + return &q->filter_list; +} +struct Qdisc_class_ops csz_class_ops = +{ + csz_graft, + csz_get, + csz_put, + csz_change, + csz_delete, + csz_walk, + + csz_find_tcf, + csz_get, + csz_put, + +#ifdef CONFIG_RTNETLINK + csz_dump_class, +#endif +}; -struct Qdisc_ops csz_ops = +struct Qdisc_ops csz_qdisc_ops = { NULL, + &csz_class_ops, "csz", - 0, sizeof(struct csz_sched_data), + csz_enqueue, csz_dequeue, + NULL, + NULL, + + csz_init, csz_reset, csz_destroy, - csz_init, - csz_control, + +#ifdef CONFIG_RTNETLINK + csz_dump, +#endif }; #ifdef MODULE -#include int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&csz_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&csz_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&csz_qdisc_ops); } #endif diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index af44d4e7517c..14bc8bb8bfd7 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -1,9 +1,15 @@ /* - * net/sched/sch_fifo.c Simple FIFO "scheduler" + * net/sched/sch_fifo.c The simplest FIFO queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
* * Authors: Alexey Kuznetsov, */ +#include #include #include #include @@ -32,9 +38,7 @@ struct fifo_sched_data { - int qmaxbytes; - int qmaxlen; - int qbytes; + unsigned limit; }; static int @@ -42,41 +46,62 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; - if (q->qbytes <= q->qmaxbytes) { - skb_queue_tail(&sch->q, skb); - q->qbytes += skb->len; - return 0; + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; } - kfree_skb(skb); + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; return 1; } static struct sk_buff * bfifo_dequeue(struct Qdisc* sch) { - struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; struct sk_buff *skb; - skb = skb_dequeue(&sch->q); + skb = __skb_dequeue(&sch->q); if (skb) - q->qbytes -= skb->len; + sch->stats.backlog -= skb->len; return skb; } -static void -bfifo_reset(struct Qdisc* sch) +static int +fifo_drop(struct Qdisc* sch) { - struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; struct sk_buff *skb; - while((skb=skb_dequeue(&sch->q)) != NULL) { - q->qbytes -= skb->len; + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; kfree_skb(skb); + return 1; } - if (q->qbytes) { - printk("fifo_reset: qbytes=%d\n", q->qbytes); - q->qbytes = 0; - } + return 0; +} + +static void +fifo_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb=__skb_dequeue(&sch->q)) != NULL) + kfree_skb(skb); + sch->stats.backlog = 0; } static int @@ -84,96 +109,106 @@ pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct 
fifo_sched_data *)sch->data; - if (sch->q.qlen <= q->qmaxlen) { - skb_queue_tail(&sch->q, skb); - return 0; + if (sch->q.qlen <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; } - kfree_skb(skb); + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); return 1; } + static struct sk_buff * pfifo_dequeue(struct Qdisc* sch) { - return skb_dequeue(&sch->q); + return __skb_dequeue(&sch->q); } -static void -pfifo_reset(struct Qdisc* sch) -{ - struct sk_buff *skb; - while((skb=skb_dequeue(&sch->q))!=NULL) - kfree_skb(skb); +static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = (void*)sch->data; + + if (opt == NULL) { + q->limit = sch->dev->tx_queue_len; + if (sch->ops == &bfifo_qdisc_ops) + q->limit *= sch->dev->mtu; + } else { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->limit = ctl->limit; + } + return 0; } - -static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */) +#ifdef CONFIG_RTNETLINK +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct fifo_sched_data *q; -/* - struct device *dev = sch->dev; - */ + struct fifo_sched_data *q = (void*)sch->data; + unsigned char *b = skb->tail; + struct tc_fifo_qopt opt; - q = (struct fifo_sched_data *)sch->data; -/* - if (pkts<0) - pkts = dev->tx_queue_len; - if (bytes<0) - bytes = pkts*dev->mtu; - q->qmaxbytes = bytes; - q->qmaxlen = pkts; - */ - return 0; + opt.limit = q->limit; + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; } +#endif -struct Qdisc_ops pfifo_ops = +struct Qdisc_ops pfifo_qdisc_ops = { + NULL, NULL, "pfifo", - 0, 
sizeof(struct fifo_sched_data), + pfifo_enqueue, pfifo_dequeue, - pfifo_reset, - NULL, + pfifo_requeue, + fifo_drop, + fifo_init, + fifo_reset, + NULL, +#ifdef CONFIG_RTNETLINK + fifo_dump, +#endif }; -struct Qdisc_ops bfifo_ops = +struct Qdisc_ops bfifo_qdisc_ops = { NULL, - "pfifo", - 0, + NULL, + "bfifo", sizeof(struct fifo_sched_data), + bfifo_enqueue, bfifo_dequeue, - bfifo_reset, - NULL, - fifo_init, -}; - -#ifdef MODULE -#include -int init_module(void) -{ - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; + bfifo_requeue, + fifo_drop, - err = register_qdisc(&pfifo_ops); - if (err == 0) { - err = register_qdisc(&bfifo_ops); - if (err) - unregister_qdisc(&pfifo_ops); - } - if (err) - MOD_DEC_USE_COUNT; - return err; -} - -void cleanup_module(void) -{ -} + fifo_init, + fifo_reset, + NULL, +#ifdef CONFIG_RTNETLINK + fifo_dump, #endif +}; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c3399f9c1edd..5e07bced81f3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -30,66 +30,116 @@ #include #include +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +/* Main transmission queue. */ + struct Qdisc_head qdisc_head = { &qdisc_head }; -static struct Qdisc_ops *qdisc_base = NULL; +/* Kick device. + Note, that this procedure can be called by watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called only from NET BH +*/ -static int default_requeue(struct sk_buff *skb, struct Qdisc* qdisc); +int qdisc_restart(struct device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + if ((skb = q->dequeue(q)) != NULL) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); -/* NOTES. 
+ if (dev->hard_start_xmit(skb, dev) == 0) { + q->tx_last = jiffies; + return -1; + } - Every discipline has two major routines: enqueue and dequeue. + /* Device kicked us out :( + It is possible in three cases: - ---dequeue + 1. fastroute is enabled + 2. device cannot determine busy state + before start of transmission (f.e. dialout) + 3. device is buggy (ppp) + */ - dequeue usually returns a skb to send. It is allowed to return NULL, - but it does not mean that queue is empty, it just means that - discipline does not want to send anything this time. - Queue is really empty if q->q.qlen == 0. - For complicated disciplines with multiple queues q->q is not - real packet queue, but however q->q.qlen must be valid. + q->ops->requeue(skb, q); + return -1; + } + return q->q.qlen; +} - ---enqueue +/* Scan transmission queue and kick devices. - enqueue returns number of enqueued packets i.e. this number is 1, - if packet was enqueued sucessfully and <1 if something (not - necessary THIS packet) was dropped. + Deficiency: slow devices (ppp) and fast ones (100Mb ethernet) + share one queue. It means, that if we have a lot of loaded ppp channels, + we will scan a long list on every 100Mb EOI. + I have no idea how to solve it using only "anonymous" Linux mark_bh(). + To change queue from device interrupt? Ough... only not this... 
*/ -int register_qdisc(struct Qdisc_ops *qops) +void qdisc_run_queues(void) { - struct Qdisc_ops *q, **qp; - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) - if (strcmp(qops->id, q->id) == 0) - return -EEXIST; - qops->next = NULL; - qops->refcnt = 0; - *qp = qops; - return 0; -} + struct Qdisc_head **hp, *h; -int unregister_qdisc(struct Qdisc_ops *qops) -{ - struct Qdisc_ops *q, **qp; - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) - if (q == qops) - break; - if (!q) - return -ENOENT; - if (q->requeue == NULL) - q->requeue = default_requeue; - *qp = q->next; - return 0; + hp = &qdisc_head.forw; + while ((h = *hp) != &qdisc_head) { + int res = -1; + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + + while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) + /* NOTHING */; + + /* The explanation is necessary here. + qdisc_restart called dev->hard_start_xmit, + if device is virtual, it could trigger one more + dev_queue_xmit and new device could appear + in active chain. In this case we cannot unlink + empty queue, because we lost back pointer. + No problem, we will unlink it during the next round. + */ + + if (res == 0 && *hp == h) { + *hp = h->forw; + h->forw = NULL; + continue; + } + hp = &h->forw; + } } -struct Qdisc *qdisc_lookup(int handle) +/* Periodic watchdoc timer to recover of hard/soft device bugs. */ + +static void dev_do_watchdog(unsigned long dummy); + +static struct timer_list dev_watchdog = + { NULL, NULL, 0L, 0L, &dev_do_watchdog }; + +static void dev_do_watchdog(unsigned long dummy) { - return NULL; + struct Qdisc_head *h; + + for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) + qdisc_restart(dev); + } + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); } + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces in all curcumstances. 
It is difficult to invent anything more fast or cheap. @@ -108,11 +158,48 @@ noop_dequeue(struct Qdisc * qdisc) return NULL; } +static int +noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return 0; +} + +struct Qdisc_ops noop_qdisc_ops = +{ + NULL, + NULL, + "noop", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, +}; + struct Qdisc noop_qdisc = { { NULL }, noop_enqueue, noop_dequeue, + TCQ_F_DEFAULT|TCQ_F_BUILTIN, + &noop_qdisc_ops, +}; + + +struct Qdisc_ops noqueue_qdisc_ops = +{ + NULL, + NULL, + "noqueue", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, + }; struct Qdisc noqueue_qdisc = @@ -120,25 +207,32 @@ struct Qdisc noqueue_qdisc = { NULL }, NULL, NULL, + TCQ_F_DEFAULT|TCQ_F_BUILTIN, + &noqueue_qdisc_ops, }; +static const u8 prio2band[TC_PRIO_MAX+1] = +{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; -/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */ +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. 
+ */ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; struct sk_buff_head *list; - list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; if (list->qlen <= skb->dev->tx_queue_len) { __skb_queue_tail(list, skb); + qdisc->q.qlen++; return 1; } - qdisc->dropped++; + qdisc->stats.drops++; kfree_skb(skb); return 0; } @@ -152,8 +246,10 @@ pfifo_fast_dequeue(struct Qdisc* qdisc) for (prio = 0; prio < 3; prio++, list++) { skb = __skb_dequeue(list); - if (skb) + if (skb) { + qdisc->q.qlen--; return skb; + } } return NULL; } @@ -161,12 +257,13 @@ pfifo_fast_dequeue(struct Qdisc* qdisc) static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; struct sk_buff_head *list; - list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; __skb_queue_head(list, skb); + qdisc->q.qlen++; return 1; } @@ -178,16 +275,17 @@ pfifo_fast_reset(struct Qdisc* qdisc) for (prio=0; prio < 3; prio++) skb_queue_purge(list+prio); + qdisc->q.qlen = 0; } -static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) +static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) { int i; struct sk_buff_head *list; list = ((struct sk_buff_head*)qdisc->data); - for(i=0; i<3; i++) + for (i=0; i<3; i++) skb_queue_head_init(list+i); return 0; @@ -195,30 +293,21 @@ static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) static struct Qdisc_ops pfifo_fast_ops = { + NULL, NULL, "pfifo_fast", - 1, 3 * sizeof(struct sk_buff_head), + pfifo_fast_enqueue, pfifo_fast_dequeue, - pfifo_fast_reset, + pfifo_fast_requeue, NULL, + pfifo_fast_init, - NULL, - pfifo_fast_requeue + pfifo_fast_reset, }; -static int -default_requeue(struct sk_buff *skb, struct 
Qdisc* qdisc) -{ - if (net_ratelimit()) - printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); - kfree_skb(skb); - return 0; -} - -static struct Qdisc * -qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) +struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops) { struct Qdisc *sch; int size = sizeof(*sch) + ops->priv_size; @@ -233,56 +322,48 @@ qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev = dev; - if (ops->init && ops->init(sch, arg)) - return NULL; - ops->refcnt++; - return sch; + sch->flags |= TCQ_F_DEFAULT; + if (ops->init && ops->init(sch, NULL) == 0) + return sch; + + kfree(sch); + return NULL; } void qdisc_reset(struct Qdisc *qdisc) { struct Qdisc_ops *ops = qdisc->ops; - if (ops) { - start_bh_atomic(); - if (ops->reset) - ops->reset(qdisc); - end_bh_atomic(); - } + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + end_bh_atomic(); } void qdisc_destroy(struct Qdisc *qdisc) { struct Qdisc_ops *ops = qdisc->ops; - if (ops) { - start_bh_atomic(); - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - ops->refcnt--; - end_bh_atomic(); - kfree(qdisc); - } -} - -static void dev_do_watchdog(unsigned long dummy); - -static struct timer_list dev_watchdog = - { NULL, NULL, 0L, 0L, &dev_do_watchdog }; - -static void dev_do_watchdog(unsigned long dummy) -{ - struct Qdisc_head *h; - - for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { - struct Qdisc *q = (struct Qdisc*)h; - struct device *dev = q->dev; - if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) { - qdisc_restart(dev); - } +#ifdef CONFIG_NET_SCHED + if (qdisc->dev) { + struct Qdisc *q, **qp; + for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) + if (q == qdisc) { + *qp = q->next; + q->next = NULL; + break; + } } - dev_watchdog.expires = jiffies + 5*HZ; - add_timer(&dev_watchdog); +#ifdef CONFIG_NET_ESTIMATOR 
+ qdisc_kill_estimator(&qdisc->stats); +#endif +#endif + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + end_bh_atomic(); + if (!(qdisc->flags&TCQ_F_BUILTIN)) + kfree(qdisc); } @@ -291,15 +372,17 @@ void dev_activate(struct device *dev) /* No queueing discipline is attached to device; create default one i.e. pfifo_fast for devices, which need queueing and noqueue_qdisc for - virtual intrfaces + virtual interfaces */ if (dev->qdisc_sleeping == &noop_qdisc) { if (dev->tx_queue_len) { struct Qdisc *qdisc; - qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL); - if (qdisc == NULL) + qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); + if (qdisc == NULL) { + printk(KERN_INFO "%s: activation failed\n", dev->name); return; + } dev->qdisc_sleeping = qdisc; } else dev->qdisc_sleeping = &noqueue_qdisc; @@ -309,10 +392,9 @@ void dev_activate(struct device *dev) if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { dev->qdisc->tx_timeo = 5*HZ; dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo; - if (!dev_watchdog.expires) { + if (!del_timer(&dev_watchdog)) dev_watchdog.expires = jiffies + 5*HZ; - add_timer(&dev_watchdog); - } + add_timer(&dev_watchdog); } end_bh_atomic(); } @@ -323,8 +405,7 @@ void dev_deactivate(struct device *dev) start_bh_atomic(); - qdisc = dev->qdisc; - dev->qdisc = &noop_qdisc; + qdisc = xchg(&dev->qdisc, &noop_qdisc); qdisc_reset(qdisc); @@ -346,6 +427,7 @@ void dev_init_scheduler(struct device *dev) { dev->qdisc = &noop_qdisc; dev->qdisc_sleeping = &noop_qdisc; + dev->qdisc_list = NULL; } void dev_shutdown(struct device *dev) @@ -354,12 +436,15 @@ void dev_shutdown(struct device *dev) start_bh_atomic(); qdisc = dev->qdisc_sleeping; + dev->qdisc = &noop_qdisc; dev->qdisc_sleeping = &noop_qdisc; - qdisc_destroy(qdisc); + qdisc_destroy(qdisc); + BUG_TRAP(dev->qdisc_list == NULL); + dev->qdisc_list = NULL; end_bh_atomic(); } -void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) +struct 
Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) { struct Qdisc *oqdisc; @@ -369,195 +454,20 @@ void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) start_bh_atomic(); oqdisc = dev->qdisc_sleeping; - /* Destroy old scheduler */ + /* Prune old scheduler */ if (oqdisc) - qdisc_destroy(oqdisc); + qdisc_reset(oqdisc); - /* ... and attach new one */ + /* ... and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; dev->qdisc_sleeping = qdisc; dev->qdisc = &noop_qdisc; end_bh_atomic(); if (dev->flags & IFF_UP) dev_activate(dev); -} - -/* Kick the queue "q". - Note, that this procedure is called by watchdog timer, so that - we do not check dev->tbusy flag here. - Returns: 0 - queue is empty. - >0 - queue is not empty, but throttled. - <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. - - NOTE: Called only from NET BH -*/ - - -int qdisc_restart(struct device *dev) -{ - struct Qdisc *q = dev->qdisc; - struct sk_buff *skb; - - if ((skb = q->dequeue(q)) != NULL) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - - if (dev->hard_start_xmit(skb, dev) == 0) { - q->tx_last = jiffies; - return -1; - } - - if (q->ops) { - q->ops->requeue(skb, q); - return -1; - } - - printk(KERN_DEBUG "%s: it is impossible!!!\n", dev->name); - kfree_skb(skb); - } - return q->q.qlen; + return oqdisc; } -void qdisc_run_queues(void) -{ - struct Qdisc_head **hp, *h; - - hp = &qdisc_head.forw; - while ((h = *hp) != &qdisc_head) { - int res = -1; - struct Qdisc *q = (struct Qdisc*)h; - struct device *dev = q->dev; - - while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) - /* NOTHING */; - - /* The explanation is necessary here. - qdisc_restart called dev->hard_start_xmit, - if device is virtual, it could trigger one more - dev_queue_xmit and new device could appear - in active chain. In this case we cannot unlink - empty queue, because we lost back pointer. - No problem, we will unlink it during the next round. 
- */ - - if (res == 0 && *hp == h) { - *hp = h->forw; - h->forw = NULL; - continue; - } - hp = &h->forw; - } -} - - -int tc_init(struct pschedctl *pctl) -{ - struct Qdisc *q; - struct Qdisc_ops *qops; - - if (pctl->handle) { - q = qdisc_lookup(pctl->handle); - if (q == NULL) - return -ENOENT; - qops = q->ops; - if (pctl->ifindex && q->dev->ifindex != pctl->ifindex) - return -EINVAL; - } - return -EINVAL; -} - -int tc_destroy(struct pschedctl *pctl) -{ - return -EINVAL; -} - -int tc_attach(struct pschedctl *pctl) -{ - return -EINVAL; -} - -int tc_detach(struct pschedctl *pctl) -{ - return -EINVAL; -} - - -int psched_ioctl(void *arg) -{ - struct pschedctl ctl; - struct pschedctl *pctl = &ctl; - int err; - - if (copy_from_user(&ctl, arg, sizeof(ctl))) - return -EFAULT; - - if (ctl.arglen > 0) { - pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL); - if (pctl == NULL) - return -ENOBUFS; - memcpy(pctl, &ctl, sizeof(ctl)); - if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) { - kfree(pctl); - return -EFAULT; - } - } - - rtnl_lock(); - - switch (ctl.command) { - case PSCHED_TC_INIT: - err = tc_init(pctl); - break; - case PSCHED_TC_DESTROY: - err = tc_destroy(pctl); - break; - case PSCHED_TC_ATTACH: - err = tc_attach(pctl); - break; - case PSCHED_TC_DETACH: - err = tc_detach(pctl); - break; - default: - err = -EINVAL; - } - - rtnl_unlock(); - - if (pctl != &ctl) - kfree(pctl); - return err; -} - -__initfunc(int pktsched_init(void)) -{ -#define INIT_QDISC(name) { \ - extern struct Qdisc_ops name##_ops; \ - register_qdisc(&##name##_ops); \ - } - - register_qdisc(&pfifo_fast_ops); -#ifdef CONFIG_NET_SCH_CBQ - INIT_QDISC(cbq); -#endif -#ifdef CONFIG_NET_SCH_CSZ - INIT_QDISC(csz); -#endif -#ifdef CONFIG_NET_SCH_RED - INIT_QDISC(red); -#endif -#ifdef CONFIG_NET_SCH_SFQ - INIT_QDISC(sfq); -#endif -#ifdef CONFIG_NET_SCH_TBF - INIT_QDISC(tbf); -#endif -#ifdef CONFIG_NET_SCH_PFIFO - INIT_QDISC(pfifo); - INIT_QDISC(bfifo); -#endif -#ifdef 
CONFIG_NET_SCH_PRIO - INIT_QDISC(prio); -#endif - return 0; -} diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a3806eda4352..5b7b39fea00d 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -1,9 +1,16 @@ /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * * Authors: Alexey Kuznetsov, */ +#include +#include #include #include #include @@ -28,32 +35,69 @@ #include #include -/* New N-band generic scheduler */ struct prio_sched_data { - int qbytes; int bands; - u8 prio2band[8]; - struct Qdisc *queues[8]; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; }; + +static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct tcf_result res; + + res.classid = skb->priority; + if (TC_H_MAJ(res.classid) != sch->handle) { + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { + if (TC_H_MAJ(res.classid)) + res.classid = 0; + res.classid = q->prio2band[res.classid&TC_PRIO_MAX] + 1; + } + } + + return res.classid - 1; +} + static int prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct prio_sched_data *q = (struct prio_sched_data *)sch->data; - int prio = q->prio2band[skb->priority&7]; struct Qdisc *qdisc; - qdisc = q->queues[prio]; - if (qdisc->enqueue(skb, qdisc) == 0) { - q->qbytes += skb->len; + qdisc = q->queues[prio_classify(skb, sch)]; + + if (qdisc->enqueue(skb, qdisc) == 1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; sch->q.qlen++; - return 0; + return 1; + } + sch->stats.drops++; + return 0; +} + + +static int +prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q 
= (struct prio_sched_data *)sch->data; + struct Qdisc *qdisc; + + qdisc = q->queues[prio_classify(skb, sch)]; + + if (qdisc->ops->requeue(skb, qdisc) == 1) { + sch->q.qlen++; + return 1; } - return 1; + sch->stats.drops++; + return 0; } + static struct sk_buff * prio_dequeue(struct Qdisc* sch) { @@ -66,7 +110,6 @@ prio_dequeue(struct Qdisc* sch) qdisc = q->queues[prio]; skb = qdisc->dequeue(qdisc); if (skb) { - q->qbytes -= skb->len; sch->q.qlen--; return skb; } @@ -75,6 +118,24 @@ prio_dequeue(struct Qdisc* sch) } +static int +prio_drop(struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if (qdisc->ops->drop(qdisc)) { + sch->q.qlen--; + return 1; + } + } + return 0; +} + + static void prio_reset(struct Qdisc* sch) { @@ -83,7 +144,7 @@ prio_reset(struct Qdisc* sch) for (prio=0; priobands; prio++) qdisc_reset(q->queues[prio]); - q->qbytes = 0; + sch->q.qlen = 0; } static void @@ -96,51 +157,205 @@ prio_destroy(struct Qdisc* sch) qdisc_destroy(q->queues[prio]); q->queues[prio] = &noop_qdisc; } + MOD_DEC_USE_COUNT; } -static int prio_init(struct Qdisc *sch, void *arg) +static int prio_init(struct Qdisc *sch, struct rtattr *opt) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; - struct prio_sched_data *q; + static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned mask = 0; int i; - q = (struct prio_sched_data *)sch->data; - q->bands = 3; - memcpy(q->prio2band, prio2band, sizeof(prio2band)); - for (i=0; ibands; i++) - q->queues[i] = &noop_qdisc; + if (opt == NULL) { + q->bands = 3; + memcpy(q->prio2band, prio2band, sizeof(prio2band)); + mask = 7; + } else { + struct tc_prio_qopt *qopt = RTA_DATA(opt); + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return -EINVAL; + if (qopt->bands > 
TCQ_PRIO_BANDS) + return -EINVAL; + q->bands = qopt->bands; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= q->bands) + return -EINVAL; + q->prio2band[i] = qopt->priomap[i]; + mask |= (1<priomap[i]); + } + } + for (i=0; iqueues[i] = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (q->queues[i] == NULL) + q->queues[i] = &noop_qdisc; + } + MOD_INC_USE_COUNT; + return 0; +} + +#ifdef CONFIG_RTNETLINK +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + *old = xchg(&q->queues[band], new); + return 0; } -struct Qdisc_ops prio_ops = +static unsigned long prio_get(struct Qdisc *sch, u32 classid) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int prio_delete(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +#ifdef 
CONFIG_RTNETLINK +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} +#endif + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops prio_class_ops = +{ + prio_graft, + prio_get, + prio_put, + prio_change, + prio_delete, + prio_walk, + + prio_find_tcf, + prio_get, + prio_put, + +#ifdef CONFIG_RTNETLINK + prio_dump_class, +#endif +}; + +struct Qdisc_ops prio_qdisc_ops = { NULL, + &prio_class_ops, "prio", - 0, sizeof(struct prio_sched_data), + prio_enqueue, prio_dequeue, + prio_requeue, + prio_drop, + + prio_init, prio_reset, prio_destroy, - prio_init, + +#ifdef CONFIG_RTNETLINK + prio_dump, +#endif }; #ifdef MODULE -#include + int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&prio_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&prio_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&prio_qdisc_ops); } + #endif diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 637288d99002..56d1651f369d 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_red.c Random Early Detection scheduler. + * net/sched/sch_red.c Random Early Detection queue. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,6 +9,8 @@ * Authors: Alexey Kuznetsov, */ +#include +#include #include #include #include @@ -62,32 +64,42 @@ Short description. and mark (drop) packet with this probability. Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). - max_P should be small (not 1!). - - NB. SF&VJ assumed that Pb[avg] is linear function. I think it - is wrong. I'd make: - P[th_min] = 0, P[th_max] = 1; - dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number. - - I choose max_P as a number between 0.01 and 0.1, so that - C1 = max_P/(th_max-th_min) is power of two: C1 = 2^(-C1log) - - Parameters, settable by user (with default values): - - qmaxbytes=256K - hard limit on queue length, should be chosen >qth_max - to allow packet bursts. This parameter does not - affect algorithm behaviour and can be chosen - arbitrarily high (well, less than ram size) - Really, this limit will never be achieved - if RED works correctly. - qth_min=32K - qth_max=128K - qth_max should be at least 2*qth_min - Wlog=8 - log(1/W). - Alog=Wlog - fixed point position in th_min and th_max. - Rlog=10 - C1log=24 - C1log = trueC1log+Alog-Rlog - so that trueC1log=22 and max_P~0.02 - + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is negative power of two in order arithmetics to contain + only shifts. + + + Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect algorithm behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be achieved + if RED works correctly. 
+ + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave). + NOTES: @@ -97,10 +109,10 @@ Upper bound on W. If you want to allow bursts of L packets of size S, you should choose W: - L + 1 -th_min/S < (1-(1-W)^L)/W - - For th_min/S = 32 + L + 1 - th_min/S < (1-(1-W)^L)/W + th_min/S = 32 th_min/S = 4 + log(W) L -1 33 -2 35 @@ -117,33 +129,24 @@ Upper bound on W. struct red_sched_data { /* Parameters */ - unsigned long qmaxbytes; /* HARD maximal queue length */ - unsigned long qth_min; /* Min average length threshold: A scaled */ - unsigned long qth_max; /* Max average length threshold: A scaled */ - char Alog; /* Point position in average lengths */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 Rmask; + u32 Scell_max; char Wlog; /* log(W) */ - char Rlog; /* random number bits */ - char C1log; /* log(1/C1) */ - char Slog; - char Stab[256]; + char Plog; /* random number bits */ + char Scell_log; + u8 Stab[256]; /* Variables */ - unsigned long qbytes; /* Queue length in bytes */ unsigned long qave; /* Average queue length: A scaled */ int qcount; /* Packets since last random number generation */ - unsigned qR; /* Cached random number [0..1qidlestart); PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<Slog)-1, 0); - -/* It is wrong, but I do not think that SF+VJ proposal is reasonable - and did not invented anything more clever 8) + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); +/* The problem: ideally, average length queue recalcultion should be done over constant clock intervals. 
It is too expensive, so that calculation is driven by outgoing packets. When queue is idle we have to model this clock by hands. - SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size + SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) dummy packets as burst after idle time, i.e. q->qave *= (1-W)^m @@ -175,129 +176,193 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) I believe, that a simpler model may be used here, but it is field for experiments. */ - q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF]; + q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; } - q->qave += ((q->qbytes<Alog) - q->qave) >> q->Wlog; + q->qave += sch->stats.backlog - (q->qave >> q->Wlog); if (q->qave < q->qth_min) { enqueue: q->qcount = -1; - if (q->qbytes <= q->qmaxbytes) { - skb_queue_tail(&sch->q, skb); - q->qbytes += skb->len; + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; } drop: kfree_skb(skb); + sch->stats.drops++; return 0; } if (q->qave >= q->qth_max) { q->qcount = -1; + sch->stats.overlimits++; goto drop; } - q->qcount++; - if (q->qcount++) { - if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR) + if (++q->qcount) { + if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) goto enqueue; q->qcount = 0; - q->qR = red_random(q->Rlog); + q->qR = net_random()&q->Rmask; + sch->stats.overlimits++; goto drop; } - q->qR = red_random(q->Rlog); + q->qR = net_random()&q->Rmask; goto enqueue; } +static int +red_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 1; +} + static struct sk_buff * red_dequeue(struct Qdisc* sch) { struct sk_buff *skb; struct red_sched_data *q = (struct red_sched_data *)sch->data; - skb = 
skb_dequeue(&sch->q); + skb = __skb_dequeue(&sch->q); if (skb) { - q->qbytes -= skb->len; + sch->stats.backlog -= skb->len; return skb; } PSCHED_GET_TIME(q->qidlestart); return NULL; } -static void -red_reset(struct Qdisc* sch) +static int +red_drop(struct Qdisc* sch) { - struct red_sched_data *q = (struct red_sched_data *)sch->data; struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; - while((skb=skb_dequeue(&sch->q))!=NULL) { - q->qbytes -= skb->len; + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; kfree_skb(skb); + return 1; } - if (q->qbytes) { - printk("red_reset: qbytes=%lu\n", q->qbytes); - q->qbytes = 0; - } + PSCHED_GET_TIME(q->qidlestart); + return 0; +} + +static void red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=__skb_dequeue(&sch->q))!=NULL) + kfree_skb(skb); + sch->stats.backlog = 0; PSCHED_SET_PASTPERFECT(q->qidlestart); q->qave = 0; q->qcount = -1; } -static int red_init(struct Qdisc *sch, struct pschedctl *pctl) +static int red_init(struct Qdisc *sch, struct rtattr *opt) { - struct red_sched_data *q; - struct redctl *ctl = (struct redctl*)pctl->args; - - q = (struct red_sched_data *)sch->data; - - if (pctl->arglen < sizeof(struct redctl)) + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct rtattr *tb[TCA_RED_STAB]; + struct tc_red_qopt *ctl; + + if (opt == NULL || + rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) return -EINVAL; + ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + q->Wlog = ctl->Wlog; - q->Alog = ctl->Alog; - q->Rlog = ctl->Rlog; - q->C1log = ctl->C1log; - q->Slog = ctl->Slog; - q->qth_min = ctl->qth_min; - q->qth_max = ctl->qth_max; - q->qmaxbytes = ctl->qmaxbytes; - 
memcpy(q->Stab, ctl->Stab, 256); + q->Plog = ctl->Plog; + q->Rmask = ctl->Plog < 32 ? ((1<Plog) - 1) : ~0UL; + q->Scell_log = ctl->Scell_log; + q->Scell_max = (256<Scell_log)-1; + q->qth_min = ctl->qth_min<Wlog; + q->qth_max = ctl->qth_max<Wlog; + q->limit = ctl->limit; + memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); q->qcount = -1; PSCHED_SET_PASTPERFECT(q->qidlestart); + MOD_INC_USE_COUNT; return 0; } -struct Qdisc_ops red_ops = +#ifdef CONFIG_RTNETLINK +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_red_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + opt.limit = q->limit; + opt.qth_min = q->qth_min>>q->Wlog; + opt.qth_max = q->qth_max>>q->Wlog; + opt.Wlog = q->Wlog; + opt.Plog = q->Plog; + opt.Scell_log = q->Scell_log; + RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static void red_destroy(struct Qdisc *sch) +{ + MOD_DEC_USE_COUNT; +} + +struct Qdisc_ops red_qdisc_ops = { + NULL, NULL, "red", - 0, sizeof(struct red_sched_data), + red_enqueue, red_dequeue, - red_reset, - NULL, + red_requeue, + red_drop, + red_init, - NULL + red_reset, + red_destroy, + +#ifdef CONFIG_RTNETLINK + red_dump, +#endif }; #ifdef MODULE -#include int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&red_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&red_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&red_qdisc_ops); } #endif diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 7a90df655dd0..7cc2b6e5fb3c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler. 
+ * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,6 +9,8 @@ * Authors: Alexey Kuznetsov, */ +#include +#include #include #include #include @@ -30,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -84,14 +87,12 @@ scattered over different locations. It is not good, but it allowed to put it into 4K. - It is easy to increase these values. + It is easy to increase these values, but not in flight. */ #define SFQ_DEPTH 128 #define SFQ_HASH_DIVISOR 1024 -#define SFQ_HASH(a) 0 - /* This type should contain at least SFQ_DEPTH*2 values */ typedef unsigned char sfq_index; @@ -104,9 +105,12 @@ struct sfq_head struct sfq_sched_data { /* Parameters */ + int perturb_period; unsigned quantum; /* Allotment per round: MUST BE >= MTU */ /* Variables */ + struct timer_list perturb_timer; + int perturbation; sfq_index tail; /* Index of current slot in round */ sfq_index max_depth; /* Maximal depth */ @@ -118,10 +122,59 @@ struct sfq_sched_data struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ }; +static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? 
*/ + h ^= (h1<>(0x1F - pert)); + h ^= h>>10; + return h & 0x3FF; +} + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 +#endif + +static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + h2 = iph->saddr^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst^skb->protocol; + h2 = (u32)(unsigned long)skb->sk; + } + return sfq_fold_hash(q, h, h2); +} + extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int d = q->qs[x].qlen; + int d = q->qs[x].qlen + SFQ_DEPTH; p = d; n = q->dep[d].next; @@ -161,47 +214,49 @@ extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_link(q, x); } -static __inline__ void sfq_drop(struct sfq_sched_data *q) +static int sfq_drop(struct Qdisc *sch) { - struct sk_buff *skb; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; sfq_index d = q->max_depth; + struct sk_buff *skb; /* Queue is full! Find the longest slot and drop a packet from it */ - if (d != 1) { - sfq_index x = q->dep[d].next; + if (d > 1) { + sfq_index x = q->dep[d+SFQ_DEPTH].next; skb = q->qs[x].prev; __skb_unlink(skb, &q->qs[x]); kfree_skb(skb); sfq_dec(q, x); -/* sch->q.qlen--; - */ - return; + sch->stats.drops++; + return 1; } - /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + sfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = SFQ_DEPTH; + sch->stats.drops++; + return 1; + } - d = q->next[q->tail]; - q->next[q->tail] = q->next[d]; - q->allot[q->next[d]] += q->quantum; - skb = q->qs[d].prev; - __skb_unlink(skb, &q->qs[d]); - kfree_skb(skb); - sfq_dec(q, d); -/* - sch->q.qlen--; - */ - q->ht[q->hash[d]] = SFQ_DEPTH; - return; + return 0; } static int sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; - unsigned hash = SFQ_HASH(skb); + unsigned hash = sfq_hash(q, skb); sfq_index x; x = q->ht[hash]; @@ -222,13 +277,52 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) q->tail = x; } } + if (++sch->q.qlen < SFQ_DEPTH-1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; + } + + sfq_drop(sch); + return 0; +} + +static int +sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } if (++sch->q.qlen < SFQ_DEPTH-1) return 1; - sfq_drop(q); + sch->stats.drops++; + sfq_drop(sch); return 0; } + + + static struct sk_buff * sfq_dequeue(struct Qdisc* sch) { @@ -273,13 +367,28 @@ sfq_reset(struct Qdisc* sch) kfree_skb(skb); } +static void sfq_perturbation(unsigned long arg) +{ + 
struct Qdisc *sch = (struct Qdisc*)arg; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + + q->perturbation = net_random()&0x1F; + q->perturb_timer.expires = jiffies + q->perturb_period; -static int sfq_open(struct Qdisc *sch, void *arg) + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +static int sfq_init(struct Qdisc *sch, struct rtattr *opt) { - struct sfq_sched_data *q; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; int i; - q = (struct sfq_sched_data *)sch->data; + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = sfq_perturbation; + init_timer(&q->perturb_timer); for (i=0; iht[i] = SFQ_DEPTH; @@ -290,43 +399,89 @@ static int sfq_open(struct Qdisc *sch, void *arg) } q->max_depth = 0; q->tail = SFQ_DEPTH; - q->quantum = sch->dev->mtu; - if (sch->dev->hard_header) - q->quantum += sch->dev->hard_header_len; + if (opt == NULL) { + q->quantum = sch->dev->mtu; + q->perturb_period = 0; + if (sch->dev->hard_header) + q->quantum += sch->dev->hard_header_len; + } else { + struct tc_sfq_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + /* The rest is compiled in */ + } for (i=0; iperturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + MOD_INC_USE_COUNT; return 0; } +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + del_timer(&q->perturb_timer); + MOD_DEC_USE_COUNT; +} + +#ifdef CONFIG_RTNETLINK +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period/HZ; + + opt.limit = SFQ_DEPTH; + opt.divisor = SFQ_HASH_DIVISOR; + opt.flows = SFQ_DEPTH; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif -struct Qdisc_ops sfq_ops = +struct Qdisc_ops sfq_qdisc_ops = { + NULL, NULL, "sfq", - 0, sizeof(struct sfq_sched_data), + sfq_enqueue, sfq_dequeue, + sfq_requeue, + sfq_drop, + + sfq_init, sfq_reset, - NULL, - sfq_open, + sfq_destroy, + +#ifdef CONFIG_RTNETLINK + sfq_dump, +#endif }; #ifdef MODULE int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&sfq_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&sfq_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&sfq_qdisc_ops); } #endif diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b4f1417615ee..109ae7bec441 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_tbf.c Token Bucket Filter. + * net/sched/sch_tbf.c Token Bucket Filter queue. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -10,6 +10,8 @@ * */ +#include +#include #include #include #include @@ -39,69 +41,91 @@ ======================================= SOURCE. + ------- None. - ALGORITHM. + Description. + ------------ + + Data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f number of transmitted bits + does not exceed B + R*(t_f-t_i). + + Packetized version of this definition: + sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. + ---------- + + Let N(t_i) be B/R initially and N(t) grows continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmited only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + - Sequence of packets satisfy token bucket filter with - rate $r$ and depth $b$, if all the numbers defined by: - \begin{eqnarray*} - n_0 &=& b, \\ - n_i &=& {\rm max} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ), - \end{eqnarray*} - where $t_i$ --- departure time of $i$-th packet and - $L_i$ -- its length, never less than zero. + Actually, QoS requires two TBF to be applied to data stream. + One of them controls steady state burst size, another + with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at smaller time scale. - It is convenient to rescale $n_i$ by factor $r$, so - that the sequence has "canonical" form: - \[ - n_0 = b/r, - n_i = max { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r }, - \] + Apparently, P>R, and B>M. If P is infinity, this double + TBF is equivalent to single one. + + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) - If a packet has n_i < 0, we throttle filter - by $-n_i$ usecs. NOTES. 
+ ------ If TBF throttles, it starts watchdog timer, which will wake up it - after 0...10 msec. + when it will be ready to transmit. + Note, that minimal timer resolution is 1/HZ. If no new packets will arrive during this period, or device will not be awaken by EOI for previous packet, - tbf could stop its activity for 10 msec. + tbf could stop its activity for 1/HZ. + - It means that tbf will sometimes introduce pathological - 10msec delays to flow corresponding to rate*10msec bytes. - For 10Mbit/sec flow it is about 12Kb, on 100Mbit/sec -- ~100Kb. - This number puts lower reasonbale bound on token bucket depth, - but even if depth is larger traffic is erratic at large rates. + It means, that with depth B, the maximal rate is - This problem is not specific for THIS implementation. Really, - there exists statement that any attempt to shape traffic - in transit will increase delays and jitter much more than - we expected naively. + R_crit = B*HZ - Particularily, it means that delay/jitter sensitive traffic - MUST NOT be shaped. Cf. CBQ (wrong) and CSZ (correct) approaches. + F.e. for 10Mbit ethernet and HZ=100 minimal allowed B is ~10Kbytes. + + Note, that peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. 
So that, if you need greater peak + rates, use alpha with HZ=1000 :-) */ struct tbf_sched_data { /* Parameters */ - int cell_log; /* 1<= MTU/B */ - unsigned long max_bytes; /* Maximal length of backlog: bytes */ + u32 limit; /* Maximal length of backlog: bytes */ + u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + u32 mtu; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; /* Variables */ - unsigned long bytes; /* Current length of backlog */ - unsigned long tokens; /* Current number of tokens */ + long tokens; /* Current number of B tokens */ + long ptokens; /* Current number of P tokens */ psched_time_t t_c; /* Time check-point */ struct timer_list wd_timer; /* Watchdog timer */ }; -#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log]) +#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) +#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) @@ -109,30 +133,56 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; __skb_queue_tail(&sch->q, skb); - if ((q->bytes += skb->len) <= q->max_bytes) + if ((sch->stats.backlog += skb->len) <= q->limit) { + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; + } /* Drop action: undo the things that we just made, * i.e. 
make tail drop */ __skb_unlink(skb, &sch->q); - q->bytes -= skb->len; - kfree_skb(skb); + sch->stats.backlog -= skb->len; + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 1; +} + +static int +tbf_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; + kfree_skb(skb); + return 1; + } return 0; } static void tbf_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; - struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } - static struct sk_buff * tbf_dequeue(struct Qdisc* sch) { @@ -144,19 +194,42 @@ tbf_dequeue(struct Qdisc* sch) if (skb) { psched_time_t now; long toks; + long ptoks = 0; PSCHED_GET_TIME(now); - toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0) - + q->tokens - L2T(q,skb->len); + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0); - if (toks >= 0) { + if (q->P_tab) { + ptoks = toks + q->ptokens; + if (ptoks > (long)q->mtu) + ptoks = q->mtu; + ptoks -= L2T_P(q, skb->len); + } + toks += q->tokens; + if (toks > (long)q->buffer) + toks = q->buffer; + toks -= L2T(q, skb->len); + + if ((toks|ptoks) >= 0) { q->t_c = now; - q->tokens = toks <= q->depth ? toks : q->depth; - q->bytes -= skb->len; + q->tokens = toks; + q->ptokens = ptoks; + sch->stats.backlog -= skb->len; return skb; } + if (!sch->dev->tbusy) { + long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks)); + + if (delay == 0) + delay = 1; + + del_timer(&q->wd_timer); + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + } + /* Maybe, we have in queue a shorter packet, which can be sent now. It sounds cool, but, however, wrong in principle. 
@@ -164,17 +237,12 @@ tbf_dequeue(struct Qdisc* sch) Really, if we splitted flow to independent subflows, it would be very good solution. - Look at sch_csz.c. + It is main idea of all FQ algorithms + (cf. CSZ, HPFQ, HFCS) */ __skb_queue_head(&sch->q, skb); - if (!sch->dev->tbusy) { - if (q->wd_timer.function) - del_timer(&q->wd_timer); - q->wd_timer.function = tbf_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks); - add_timer(&q->wd_timer); - } + sch->stats.overlimits++; } return NULL; } @@ -184,69 +252,135 @@ static void tbf_reset(struct Qdisc* sch) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - struct sk_buff *skb; - while ((skb = __skb_dequeue(&sch->q)) != NULL) - kfree_skb(skb); - q->bytes = 0; + skb_queue_purge(&sch->q); + sch->stats.backlog = 0; PSCHED_GET_TIME(q->t_c); - q->tokens = q->depth; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.function = NULL; - } + q->tokens = q->buffer; + q->ptokens = q->mtu; + del_timer(&q->wd_timer); } -static int tbf_init(struct Qdisc* sch, void *arg) +static int tbf_init(struct Qdisc* sch, struct rtattr *opt) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - struct tbfctl *ctl = (struct tbfctl*)arg; + struct rtattr *tb[TCA_TBF_PTAB]; + struct tc_tbf_qopt *qopt; + + MOD_INC_USE_COUNT; + + if (opt == NULL || + rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_TBF_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); + q->R_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); + if (q->R_tab == NULL) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + if (qopt->peakrate.rate) { + q->P_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_PTAB-1]); + if (q->P_tab == NULL) { + MOD_DEC_USE_COUNT; + qdisc_put_rtab(q->R_tab); + return -EINVAL; + } + } PSCHED_GET_TIME(q->t_c); init_timer(&q->wd_timer); - q->wd_timer.function = NULL; + 
q->wd_timer.function = tbf_watchdog; q->wd_timer.data = (unsigned long)sch; - if (ctl) { - q->max_bytes = ctl->bytes; - q->depth = ctl->depth; - q->tokens = q->tokens; - q->cell_log = ctl->cell_log; - memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long)); - } + q->limit = qopt->limit; + q->mtu = qopt->mtu; + if (q->mtu == 0) + q->mtu = psched_mtu(sch->dev); + q->buffer = qopt->buffer; + q->tokens = q->buffer; + q->ptokens = q->mtu; return 0; } -struct Qdisc_ops tbf_ops = +static void tbf_destroy(struct Qdisc *sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + del_timer(&q->wd_timer); + + if (q->P_tab) + qdisc_put_rtab(q->P_tab); + if (q->R_tab) + qdisc_put_rtab(q->R_tab); + + MOD_DEC_USE_COUNT; +} + +#ifdef CONFIG_RTNETLINK +static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_tbf_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = q->limit; + opt.rate = q->R_tab->rate; + if (q->P_tab) + opt.peakrate = q->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + opt.mtu = q->mtu; + opt.buffer = q->buffer; + RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct Qdisc_ops tbf_qdisc_ops = +{ + NULL, NULL, "tbf", - 0, sizeof(struct tbf_sched_data), + tbf_enqueue, tbf_dequeue, - tbf_reset, - NULL, + tbf_requeue, + tbf_drop, + tbf_init, - NULL, + tbf_reset, + tbf_destroy, + +#ifdef CONFIG_RTNETLINK + tbf_dump, +#endif }; #ifdef MODULE -#include int init_module(void) { - int err; - - /* Load once and never free it. 
*/ - MOD_INC_USE_COUNT; - - err = register_qdisc(&tbf_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&tbf_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&tbf_qdisc_ops); } #endif diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 000000000000..7f94c73003d9 --- /dev/null +++ b/net/sched/sch_teql.c @@ -0,0 +1,475 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + How to setup it. + ---------------- + + After loading this module you will find new device teqlN + and new qdisc with the same name. To join a slave to equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices i.e. must raise tbusy + signal and generate EOI event. If you want to equalize virtual devices + sort of tunnels, use normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make resulting eqalized + link unusable, because of huge packet reordering. I estimated upper + useful difference as ~10 times. + 3. 
If slave requires address resolution, only protocols using + neighbour cache (IPv4/IPv6) will work over equalized link. + Another protocols still are allowed to use slave device directly, + which will not break load balancing, though native slave + traffic will have the highest priority. + */ + +struct teql_master +{ + struct Qdisc_ops qops; + struct device dev; + struct Qdisc *slaves; + struct net_device_stats stats; + char name[IFNAMSIZ]; +}; + +struct teql_sched_data +{ + struct Qdisc *next; + struct teql_master *m; + struct neighbour *ncache; + struct sk_buff_head q; +}; + +#define NEXT_SLAVE(q) (((struct teql_sched_data*)((q)->data))->next) + +#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) + +/* "teql*" qdisc routines */ + +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct device *dev = sch->dev; + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + __skb_queue_tail(&q->q, skb); + if (q->q.qlen <= dev->tx_queue_len) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; + } + + __skb_unlink(skb, &q->q); + kfree_skb(skb); + sch->stats.drops++; + return 0; +} + +static int +teql_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + __skb_queue_head(&q->q, skb); + return 1; +} + +static struct sk_buff * +teql_dequeue(struct Qdisc* sch) +{ + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + struct sk_buff *skb; + + skb = __skb_dequeue(&dat->q); + if (skb == NULL) { + struct device *m = dat->m->dev.qdisc->dev; + if (m) { + m->tbusy = 0; + dat->m->slaves = sch; + qdisc_restart(m); + } + } + sch->q.qlen = dat->q.qlen + dat->m->dev.qdisc->q.qlen; + return skb; +} + +static __inline__ void +teql_neigh_release(struct neighbour *n) +{ + if (n) + neigh_release(n); +} + +static void +teql_reset(struct Qdisc* sch) +{ + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + + skb_queue_purge(&dat->q); + 
sch->q.qlen = 0; + teql_neigh_release(xchg(&dat->ncache, NULL)); +} + +static void +teql_destroy(struct Qdisc* sch) +{ + struct Qdisc *q, *prev; + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + struct teql_master *master = dat->m; + + if ((prev = master->slaves) != NULL) { + do { + q = NEXT_SLAVE(prev); + if (q == sch) { + NEXT_SLAVE(prev) = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NEXT_SLAVE(q); + if (q == master->slaves) + master->slaves = NULL; + } + skb_queue_purge(&dat->q); + teql_neigh_release(xchg(&dat->ncache, NULL)); + break; + } + + } while ((prev = q) != master->slaves); + } + + MOD_DEC_USE_COUNT; +} + +static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct device *dev = sch->dev; + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + if (dev->hard_header_len > m->dev.hard_header_len) + return -EINVAL; + + q->m = m; + + skb_queue_head_init(&q->q); + + if (m->slaves) { + if (m->dev.flags & IFF_UP) { + if ((m->dev.flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) + || (m->dev.flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) + || (m->dev.flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) + || dev->mtu < m->dev.mtu) + return -EINVAL; + } else { + if (!(dev->flags&IFF_POINTOPOINT)) + m->dev.flags &= ~IFF_POINTOPOINT; + if (!(dev->flags&IFF_BROADCAST)) + m->dev.flags &= ~IFF_BROADCAST; + if (!(dev->flags&IFF_MULTICAST)) + m->dev.flags &= ~IFF_MULTICAST; + if (dev->mtu < m->dev.mtu) + m->dev.mtu = dev->mtu; + } + q->next = NEXT_SLAVE(m->slaves); + NEXT_SLAVE(m->slaves) = sch; + } else { + q->next = sch; + m->slaves = sch; + m->dev.mtu = dev->mtu; + m->dev.flags = (m->dev.flags&~FMASK)|(dev->flags&FMASK); + } + + MOD_INC_USE_COUNT; + return 0; +} + +/* "teql*" netdevice routines */ + +static int +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct device *dev) +{ + struct teql_sched_data *q = 
(void*)dev->qdisc->data; + struct neighbour *mn = skb->dst->neighbour; + struct neighbour *n = q->ncache; + + if (mn->tbl == NULL) + return -EINVAL; + if (n && n->tbl == mn->tbl && + memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { + atomic_inc(&n->refcnt); + } else { + n = __neigh_lookup(mn->tbl, mn->primary_key, dev, 1); + if (n == NULL) + return -ENOBUFS; + } + if (neigh_event_send(n, skb_res) == 0) { + if (dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len) < 0) { + neigh_release(n); + return -EINVAL; + } + teql_neigh_release(xchg(&q->ncache, n)); + return 0; + } + neigh_release(n); + return (skb_res != NULL); +} + +static __inline__ int +teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct device *dev) +{ + if (dev->hard_header == NULL || + skb->dst == NULL || + skb->dst->neighbour == NULL) + return 0; + return __teql_resolve(skb, skb_res, dev); +} + +static int teql_master_xmit(struct sk_buff *skb, struct device *dev) +{ + struct teql_master *master = (void*)dev->priv; + struct Qdisc *start, *q; + int busy; + int nores; + struct sk_buff *skb_res = NULL; + + dev->tbusy = 1; + + start = master->slaves; + +restart: + nores = 0; + busy = 1; + + if ((q = start) == NULL) + goto drop; + + do { + struct device *slave = q->dev; + + if (!slave->tbusy && slave->qdisc_sleeping == q) { + busy = 0; + + if (q->h.forw == NULL) { + q->h.forw = qdisc_head.forw; + qdisc_head.forw = &q->h; + } + + switch (teql_resolve(skb, skb_res, slave)) { + case 0: + if (slave->hard_start_xmit(skb, slave) == 0) { + master->slaves = NEXT_SLAVE(q); + dev->tbusy = 0; + return 0; + } + break; + case 1: + nores = 1; + break; + default: + master->slaves = NEXT_SLAVE(q); + dev->tbusy = 0; + return 0; + } + __skb_pull(skb, skb->nh.raw - skb->data); + } + } while ((q = NEXT_SLAVE(q)) != start); + + if (nores && skb_res == NULL) { + skb_res = skb; + goto restart; + } + + dev->tbusy = busy; + if (busy) + return 1; + +drop: + dev_kfree_skb(skb); + return 
0; +} + +static int teql_master_open(struct device *dev) +{ + struct Qdisc * q; + struct teql_master *m = (void*)dev->priv; + int mtu = 0xFFFE; + unsigned flags = IFF_NOARP|IFF_MULTICAST; + + if (m->slaves == NULL) + return -EUNATCH; + + flags = FMASK; + + q = m->slaves; + do { + struct device *slave = q->dev; + + if (slave == NULL) + return -EUNATCH; + + if (slave->mtu < mtu) + mtu = slave->mtu; + if (slave->hard_header_len > LL_MAX_HEADER) + return -EINVAL; + + /* If all the slaves are BROADCAST, master is BROADCAST + If all the slaves are PtP, master is PtP + Otherwise, master is NBMA. + */ + if (!(slave->flags&IFF_POINTOPOINT)) + flags &= ~IFF_POINTOPOINT; + if (!(slave->flags&IFF_BROADCAST)) + flags &= ~IFF_BROADCAST; + if (!(slave->flags&IFF_MULTICAST)) + flags &= ~IFF_MULTICAST; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + + m->dev.mtu = mtu; + m->dev.flags = (m->dev.flags&~FMASK) | flags; + MOD_INC_USE_COUNT; + return 0; +} + +static int teql_master_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +static struct net_device_stats *teql_master_stats(struct device *dev) +{ + struct teql_master *m = (void*)dev->priv; + return &m->stats; +} + +static int teql_master_mtu(struct device *dev, int new_mtu) +{ + struct teql_master *m = (void*)dev->priv; + struct Qdisc *q; + + if (new_mtu < 68) + return -EINVAL; + + q = m->slaves; + if (q) { + do { + if (new_mtu > q->dev->mtu) + return -EINVAL; + } while ((q=NEXT_SLAVE(q)) != m->slaves); + } + + dev->mtu = new_mtu; + return 0; +} + +static int teql_master_init(struct device *dev) +{ + dev->open = teql_master_open; + dev->hard_start_xmit = teql_master_xmit; + dev->stop = teql_master_close; + dev->get_stats = teql_master_stats; + dev->change_mtu = teql_master_mtu; + dev->type = 0; + dev->mtu = 1500; + dev->tx_queue_len = 100; + dev->flags = IFF_NOARP; + dev->hard_header_len = LL_MAX_HEADER; + return 0; +} + +static struct teql_master the_master = { +{ + NULL, + NULL, + "", + sizeof(struct 
teql_sched_data), + + teql_enqueue, + teql_dequeue, + teql_requeue, + NULL, + + teql_qdisc_init, + teql_reset, + teql_destroy, +},}; + + +#ifdef MODULE +int init_module(void) +#else +__initfunc(int teql_init(void)) +#endif +{ + int err; + + rtnl_lock(); + + the_master.dev.priv = (void*)&the_master; + the_master.dev.name = (void*)&the_master.name; + err = dev_alloc_name(&the_master.dev, "teql%d"); + if (err < 0) + return err; + memcpy(the_master.qops.id, the_master.name, IFNAMSIZ); + the_master.dev.init = teql_master_init; + + err = register_netdevice(&the_master.dev); + if (err == 0) { + err = register_qdisc(&the_master.qops); + if (err) + unregister_netdevice(&the_master.dev); + } + rtnl_unlock(); + return err; +} + +#ifdef MODULE +void cleanup_module(void) +{ + rtnl_lock(); + unregister_qdisc(&the_master.qops); + unregister_netdevice(&the_master.dev); + rtnl_unlock(); +} +#endif diff --git a/net/unix/Makefile b/net/unix/Makefile index f0bebfae360b..a335b486daaa 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux unix domain socket layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile index 12afaee5d846..beafe5059b7b 100644 --- a/net/wanrouter/Makefile +++ b/net/wanrouter/Makefile @@ -8,7 +8,8 @@ # Note 2! The CFLAGS definition is now in the main makefile... O_TARGET := wanrouter.o -O_OBJS := wanmain.o wanproc.o +OX_OBJS := wanmain.o +O_OBJS := wanproc.o M_OBJS := $(O_TARGET) include $(TOPDIR)/Rules.make diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index f92ac29bb5b2..30e2c2034bda 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -18,11 +18,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* ============================================================================ -* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 -* Jun 27, 1997 Alan Cox realigned with vendor code +* Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) * Jan 16, 1997 Gene Kozin router_devlist made public * Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 -* Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) +* Jun 27, 1997 Alan Cox realigned with vendor code +* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 +* Apr 20, 1998 Alan Cox Fixed 2.1 symbols *****************************************************************************/ #include /* offsetof(), etc. */ @@ -165,6 +166,7 @@ __initfunc(void wanrouter_init(void)) * Context: process */ + int register_wan_device(wan_device_t* wandev) { int err, namelen; @@ -223,6 +225,7 @@ int register_wan_device(wan_device_t* wandev) * <0 error. * Context: process */ + int unregister_wan_device(char* name) { @@ -269,6 +272,7 @@ int unregister_wan_device(char* name) * 1. This function may be called on interrupt context. */ + int wanrouter_encapsulate (struct sk_buff* skb, struct device* dev) { int hdr_len = 0; @@ -310,6 +314,7 @@ int wanrouter_encapsulate (struct sk_buff* skb, struct device* dev) * 1. This function may be called on interrupt context. */ + unsigned short wanrouter_type_trans (struct sk_buff* skb, struct device* dev) { int cnt = skb->data[0] ? 
0 : 1; /* there may be a pad present */ @@ -679,6 +684,14 @@ static int delete_interface (wan_device_t* wandev, char* name, int force) return 0; } +#ifdef MODULE +EXPORT_SYMBOL(register_wan_device); +EXPORT_SYMBOL(unregister_wan_device); +EXPORT_SYMBOL(wanrouter_encapsulate); +EXPORT_SYMBOL(wanrouter_type_trans); +#endif + /* * End */ + diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 16396040913e..fc1de5c87259 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1118,14 +1118,13 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct x25_facilities facilities; if (copy_from_user(&facilities, (void *)arg, sizeof(facilities))) return -EFAULT; - if (sk->state != TCP_LISTEN && sk->state != TCP_CLOSE) + if (sk->state != TCP_LISTEN) return -EINVAL; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) return -EINVAL; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) return -EINVAL; - if (sk->state == TCP_CLOSE || sk->protinfo.x25->neighbour->extended) - { + if (sk->protinfo.x25->neighbour->extended) { if (facilities.winsize_in < 1 || facilities.winsize_in > 127) return -EINVAL; if (facilities.winsize_out < 1 || facilities.winsize_out > 127) -- 2.39.5