From 7f563ad63270d13b8412eb7c89d7b7b44f40d0e1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:14:08 -0500 Subject: [PATCH] Import 2.1.68 --- CREDITS | 9 + Documentation/Configure.help | 18 +- Documentation/ide.txt | 10 +- arch/i386/defconfig | 11 +- drivers/block/Config.in | 12 +- drivers/block/Makefile | 8 +- drivers/block/floppy.c | 2 +- drivers/block/ide-disk.c | 62 +- drivers/block/ide-dma.c | 626 +++++++ drivers/block/ide-floppy.c | 20 +- drivers/block/ide-probe.c | 27 +- drivers/block/ide-tape.c | 28 +- drivers/block/ide.c | 305 +--- drivers/block/ide.h | 41 +- drivers/block/opti621.c | 197 +-- drivers/block/{promise.c => pdc4030.c} | 31 +- drivers/block/{promise.h => pdc4030.h} | 12 +- drivers/block/rz1000.c | 58 +- drivers/block/triton.c | 631 ------- drivers/char/lp.c | 6 +- drivers/char/videodev.c | 4 +- drivers/net/Config.in | 13 +- drivers/net/net_init.c | 301 +--- drivers/net/plip.c | 3 - drivers/net/ppp.c | 13 +- drivers/net/slip.c | 4 + drivers/net/strip.c | 3 - drivers/pci/pci.c | 2 +- drivers/sound/Makefile | 2 +- drivers/sound/lowlevel/awe_wave.c | 4 +- include/linux/acct.h | 12 +- include/linux/etherdevice.h | 2 + include/linux/hdreg.h | 30 +- include/linux/if_tunnel.h | 29 + include/linux/igmp.h | 62 +- include/linux/in.h | 17 +- include/linux/in_route.h | 31 + include/linux/inetdevice.h | 118 ++ include/linux/mroute.h | 71 +- include/linux/net.h | 8 +- include/linux/net_alias.h | 187 --- include/linux/netdevice.h | 97 +- include/linux/netlink.h | 174 +- include/linux/pci.h | 2 +- include/linux/pkt_sched.h | 93 ++ include/linux/proc_fs.h | 5 - include/linux/route.h | 132 +- include/linux/rtnetlink.h | 555 +++++++ include/linux/skbuff.h | 37 +- include/linux/socket.h | 71 +- include/linux/sockios.h | 13 +- include/linux/sysctl.h | 17 +- include/linux/tcp.h | 14 + include/net/dst.h | 10 +- include/net/gc.h | 46 - include/net/icmp.h | 3 + include/net/inet_common.h | 2 +- include/net/ip.h | 31 +- include/net/ip_alias.h | 23 - 
include/net/ip_fib.h | 230 ++- include/net/ip_masq.h | 16 +- include/net/ipconfig.h | 19 + include/net/ipip.h | 31 +- include/net/netlink.h | 65 - include/net/pkt_sched.h | 164 ++ include/net/protocol.h | 2 +- include/net/raw.h | 3 +- include/net/rose.h | 7 +- include/net/route.h | 96 +- include/net/sit.h | 39 - include/net/slhc_vj.h | 4 +- include/net/sock.h | 53 +- include/net/tcp.h | 28 +- include/net/udp.h | 2 +- include/net/x25.h | 4 +- init/main.c | 4 + net/Config.in | 20 +- net/Makefile | 58 +- net/ax25/af_ax25.c | 1 - net/ax25/ax25_ds_subr.c | 1 - net/ax25/ax25_ip.c | 1 - net/ax25/ax25_out.c | 13 +- net/ax25/ax25_subr.c | 1 - net/core/Makefile | 8 +- net/core/dev.c | 1277 +++++++-------- net/core/dev_mcast.c | 21 +- net/core/iovec.c | 92 +- net/core/net_alias.c | 1464 ----------------- net/core/rtnetlink.c | 436 +++++ net/core/scm.c | 15 +- net/core/skbuff.c | 2 +- net/core/sock.c | 49 +- net/ethernet/eth.c | 7 + net/ipv4/Config.in | 43 +- net/ipv4/Makefile | 36 +- net/ipv4/af_inet.c | 110 +- net/ipv4/arp.c | 249 ++- net/ipv4/devinet.c | 958 ++++++++--- net/ipv4/fib.c | 2077 ------------------------ net/ipv4/fib_frontend.c | 572 +++++++ net/ipv4/fib_hash.c | 754 +++++++++ net/ipv4/fib_rules.c | 363 +++++ net/ipv4/fib_semantics.c | 908 +++++++++++ net/ipv4/icmp.c | 358 ++-- net/ipv4/igmp.c | 759 +++++---- net/ipv4/ip_alias.c | 170 -- net/ipv4/ip_forward.c | 107 +- net/ipv4/ip_fragment.c | 14 +- net/ipv4/ip_fw.c | 44 +- net/ipv4/ip_gre.c | 1191 ++++++++++++++ net/ipv4/ip_input.c | 100 +- net/ipv4/ip_masq.c | 24 +- net/ipv4/ip_masq_app.c | 8 +- net/ipv4/ip_masq_ftp.c | 4 +- net/ipv4/ip_masq_irc.c | 4 +- net/ipv4/ip_masq_quake.c | 6 +- net/ipv4/ip_masq_raudio.c | 6 +- net/ipv4/ip_nat_dumb.c | 2 + net/ipv4/ip_options.c | 48 +- net/ipv4/ip_output.c | 110 +- net/ipv4/ip_sockglue.c | 442 +++-- net/ipv4/ipconfig.c | 1160 +++++++++++++ net/ipv4/ipip.c | 796 ++++++++- net/ipv4/ipmr.c | 953 ++++++++--- net/ipv4/packet.c | 528 ------ net/ipv4/proc.c | 12 +- 
net/ipv4/protocol.c | 54 +- net/ipv4/rarp.c | 60 +- net/ipv4/raw.c | 172 +- net/ipv4/route.c | 1206 ++++++++------ net/ipv4/syncookies.c | 10 +- net/ipv4/sysctl_net_ipv4.c | 73 +- net/ipv4/tcp.c | 30 +- net/ipv4/tcp_input.c | 168 +- net/ipv4/tcp_ipv4.c | 444 +++-- net/ipv4/tcp_output.c | 26 +- net/ipv4/tcp_timer.c | 15 +- net/ipv4/timer.c | 2 +- net/ipv4/udp.c | 219 ++- net/ipv4/utils.c | 2 +- net/ipv6/Config.in | 7 + net/ipv6/addrconf.c | 685 ++++---- net/ipv6/af_inet6.c | 10 +- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_fib.c | 3 +- net/ipv6/ip6_fw.c | 7 +- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6_output.c | 7 +- net/ipv6/ipv6_sockglue.c | 4 +- net/ipv6/mcast.c | 7 +- net/ipv6/ndisc.c | 204 ++- net/ipv6/raw.c | 111 +- net/ipv6/route.c | 287 ++-- net/ipv6/sit.c | 970 ++++++----- net/ipv6/tcp_ipv6.c | 202 ++- net/ipv6/udp.c | 7 +- net/ipx/af_ipx.c | 1 - net/netlink.c | 475 ------ net/netlink/Makefile | 26 + net/netlink/af_netlink.c | 1025 ++++++++++++ net/netlink/netlink_dev.c | 213 +++ net/netrom/nr_dev.c | 8 - net/netsyms.c | 88 +- net/packet/Makefile | 24 + net/packet/af_packet.c | 1251 ++++++++++++++ net/protocols.c | 22 +- net/rose/af_rose.c | 19 +- net/rose/rose_dev.c | 8 - net/rose/rose_in.c | 57 +- net/rose/rose_link.c | 2 +- net/rose/rose_out.c | 84 +- net/rose/rose_route.c | 70 +- net/rose/rose_subr.c | 43 +- net/sched/Makefile | 71 + net/sched/sch_cbq.c | 839 ++++++++++ net/sched/sch_csz.c | 832 ++++++++++ net/sched/sch_fifo.c | 179 ++ net/sched/sch_generic.c | 541 ++++++ net/sched/sch_prio.c | 146 ++ net/sched/sch_red.c | 303 ++++ net/sched/sch_sfq.c | 333 ++++ net/sched/sch_tbf.c | 252 +++ net/socket.c | 48 +- net/sunrpc/sunrpc_syms.c | 2 +- net/sysctl_net.c | 8 +- net/unix/Makefile | 1 + net/unix/af_unix.c | 35 + net/unix/sysctl_net_unix.c | 29 +- net/x25/af_x25.c | 1 + net/x25/x25_dev.c | 3 - net/x25/x25_in.c | 16 +- net/x25/x25_out.c | 37 +- net/x25/x25_subr.c | 46 + 193 files changed, 21680 insertions(+), 11958 deletions(-) create mode 100644 
drivers/block/ide-dma.c rename drivers/block/{promise.c => pdc4030.c} (92%) rename drivers/block/{promise.h => pdc4030.h} (71%) delete mode 100644 drivers/block/triton.c create mode 100644 include/linux/if_tunnel.h create mode 100644 include/linux/in_route.h create mode 100644 include/linux/inetdevice.h delete mode 100644 include/linux/net_alias.h create mode 100644 include/linux/pkt_sched.h create mode 100644 include/linux/rtnetlink.h delete mode 100644 include/net/gc.h delete mode 100644 include/net/ip_alias.h create mode 100644 include/net/ipconfig.h delete mode 100644 include/net/netlink.h create mode 100644 include/net/pkt_sched.h delete mode 100644 include/net/sit.h delete mode 100644 net/core/net_alias.c create mode 100644 net/core/rtnetlink.c delete mode 100644 net/ipv4/fib.c create mode 100644 net/ipv4/fib_frontend.c create mode 100644 net/ipv4/fib_hash.c create mode 100644 net/ipv4/fib_rules.c create mode 100644 net/ipv4/fib_semantics.c delete mode 100644 net/ipv4/ip_alias.c create mode 100644 net/ipv4/ip_gre.c create mode 100644 net/ipv4/ipconfig.c delete mode 100644 net/ipv4/packet.c create mode 100644 net/ipv6/Config.in delete mode 100644 net/netlink.c create mode 100644 net/netlink/Makefile create mode 100644 net/netlink/af_netlink.c create mode 100644 net/netlink/netlink_dev.c create mode 100644 net/packet/Makefile create mode 100644 net/packet/af_packet.c create mode 100644 net/sched/Makefile create mode 100644 net/sched/sch_cbq.c create mode 100644 net/sched/sch_csz.c create mode 100644 net/sched/sch_fifo.c create mode 100644 net/sched/sch_generic.c create mode 100644 net/sched/sch_prio.c create mode 100644 net/sched/sch_red.c create mode 100644 net/sched/sch_sfq.c create mode 100644 net/sched/sch_tbf.c diff --git a/CREDITS b/CREDITS index aa2f178fa09c..d305dd2b1bfa 100644 --- a/CREDITS +++ b/CREDITS @@ -43,6 +43,15 @@ S: 4390 Albany Dr. 
#46 S: San Jose, California 95129 S: USA +N: Andrea Arcangeli +E: arcangeli@mbox.queen.it +W: http://www-linux.deis.unibo.it/~mirror/ +P: 1024/CB4660B9 CC A0 71 81 F4 A0 63 AC C0 4B 81 1D 8C 15 C8 E5 +D: parport sharing fix. Various other kernel hacks. +S: Via Ciaclini 26 +S: Imola 40026 +S: Italy + N: Derek Atkins E: warlord@MIT.EDU D: Linux-AFS Port, random kernel hacker, diff --git a/Documentation/Configure.help b/Documentation/Configure.help index 401ce09187b2..10171cb246da 100644 --- a/Documentation/Configure.help +++ b/Documentation/Configure.help @@ -304,13 +304,12 @@ CONFIG_BLK_DEV_RZ1000 things will operate 100% reliably. If unsure, say Y. Intel 82371 PIIX (Triton I/II), VIA VP-1 DMA support -CONFIG_BLK_DEV_TRITON - If your PCI system uses an IDE harddrive (as opposed to SCSI, say) - and includes the Intel Triton I/II IDE interface chipset (i82371FB, - i82371SB or i82371AB), or the VIA VP-1 IDE interface chipset - (VT82C586), you will want to enable this option to allow use of +CONFIG_BLK_DEV_IDEDMA + If your PCI system uses IDE drive(s) (as opposed to SCSI, say) + and is capable of bus-master DMA operation (most Pentium PCI + systems), you will want to enable this option to allow use of bus-mastering DMA data transfers. Read the comments at the - beginning of drivers/block/triton.c and Documentation/ide.txt. + beginning of drivers/block/idedma.c and Documentation/ide.txt. You can get the latest version of the hdparm utility via ftp (user: anonymous) from sunsite.unc.edu/pub/Linux/kernel/patches/diskdrives/; it is @@ -343,18 +342,19 @@ CONFIG_BLK_DEV_HT6560B See the Documentation/ide.txt and ht6560b.c files for more info. PROMISE DC4030 support (EXPERIMENTAL) -CONFIG_BLK_DEV_PROMISE +CONFIG_BLK_DEV_PDC4030 This driver provides support for the secondary IDE interface and cache of Promise IDE chipsets, e.g. DC4030 and DC5030. This driver is known to incur timeouts/retries during heavy I/O to drives attached to the secondary interface. 
CDROM and TAPE devices are not supported yet. This driver is enabled at runtime using the "ide0=dc4030" kernel boot parameter. See the Documentation/ide.txt - and drivers/block/promise.c files for more info. + and drivers/block/pdc4030.c files for more info. OPTi 82C621 support (EXPERIMENTAL) CONFIG_BLK_DEV_OPTI621 - This is a driver for the OPTi 82C621 EIDE controller. + This driver allows use of hdparm to change the PIO timings + for drives attached to an OPTi MIDE controller. Please read the comments at the top of drivers/block/opti621.c. QDI QD6580 support diff --git a/Documentation/ide.txt b/Documentation/ide.txt index f9ececdd525c..a6e7e551e585 100644 --- a/Documentation/ide.txt +++ b/Documentation/ide.txt @@ -1,4 +1,4 @@ -ide.txt -- Information regarding the Enhanced IDE drive in Linux 2.1.xx +ide.txt -- Information regarding the Enhanced IDE drive in Linux 2.1.68+ =============================================================================== Supported by: Mark Lord -- disks, interfaces, probing @@ -56,17 +56,19 @@ NEW! - support for IDE ATAPI *floppy* drives (courtesy of Juha Laiho ). - auto-detect of disk translations by examining partition table - ide-cd.c now compiles separate from ide.c - - Bus-Master DMA support for Intel PCI Triton chipset IDE interfaces - - for details, see comments at top of triton.c - ide-cd.c now supports door locking and auto-loading. - Also preliminary support for multisession and direct reads of audio data. - experimental support for Promise DC4030VL caching interface card - email thanks/problems to: peterd@pnd-pc.demon.co.uk - the hdparm-3.1 package can be used to set PIO modes for some chipsets. -NEW! - support for the OPTi 82C621 chipset, courtesy of Jaromir Koutek. +NEW! - support for setting PIO modes with the OPTi 82C621, courtesy of Jaromir Koutek. NEW! - support for loadable modules NEW! - optional SCSI host adapter emulation for ATAPI devices +NEW! - generic PCI Bus-Master DMA support +NEW! 
- works with most Pentium PCI systems, chipsets, add-on cards +NEW! - works with regular DMA as well as Ultra DMA +NEW! - automatically probes for all PCI IDE interfaces For work in progress, see the comments in ide.c, ide-cd.c, triton.c, ... diff --git a/arch/i386/defconfig b/arch/i386/defconfig index bd100e08d594..eb511e3c48da 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -58,7 +58,7 @@ CONFIG_BLK_DEV_IDECD=y CONFIG_BLK_DEV_CMD640=y # CONFIG_BLK_DEV_CMD640_ENHANCED is not set CONFIG_BLK_DEV_RZ1000=y -CONFIG_BLK_DEV_TRITON=y +CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDE_CHIPSETS is not set # @@ -74,22 +74,27 @@ CONFIG_BLK_DEV_TRITON=y # # Networking options # +# CONFIG_PACKET is not set # CONFIG_NETLINK is not set # CONFIG_FIREWALL is not set # CONFIG_NET_ALIAS is not set +CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set # CONFIG_IP_ACCT is not set +# CONFIG_IP_MASQUERADE is not set # CONFIG_IP_ROUTER is not set # CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_IP_ALIAS is not set # CONFIG_SYN_COOKIES is not set # # (it is safe to leave these untouched) # -# CONFIG_INET_PCTCP is not set # CONFIG_INET_RARP is not set -CONFIG_PATH_MTU_DISCOVERY=y CONFIG_IP_NOSR=y CONFIG_SKB_LARGE=y diff --git a/drivers/block/Config.in b/drivers/block/Config.in index 9b52e48c1088..277cbf22cb62 100644 --- a/drivers/block/Config.in +++ b/drivers/block/Config.in @@ -23,7 +23,12 @@ else fi if [ "$CONFIG_PCI" = "y" ]; then bool ' RZ1000 chipset bugfix/support' CONFIG_BLK_DEV_RZ1000 - bool ' Intel PIIX/PIIX3/PIIX4 (Triton 430FX/HX/VX/TX, 440FX) DMA support' CONFIG_BLK_DEV_TRITON + bool ' PCI bus-master DMA support' CONFIG_BLK_DEV_IDEDMA + if [ "$CONFIG_BLK_DEV_IDEDMA" = "y" ]; then + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool ' OPTi 82C621 enhanced support (EXPERIMENTAL)' CONFIG_BLK_DEV_OPTI621 + fi + fi fi bool ' Other IDE chipset support' CONFIG_IDE_CHIPSETS if [ 
"$CONFIG_IDE_CHIPSETS" = "y" ]; then @@ -32,10 +37,7 @@ else bool ' DTC-2278 support' CONFIG_BLK_DEV_DTC2278 bool ' Holtek HT6560B support' CONFIG_BLK_DEV_HT6560B if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - bool ' PROMISE DC4030 support (EXPERIMENTAL)' CONFIG_BLK_DEV_PROMISE - if [ "$CONFIG_PCI" = "y" ]; then - bool ' OPTi 82C621 support (EXPERIMENTAL)' CONFIG_BLK_DEV_OPTI621 - fi + bool ' PROMISE DC4030 support (EXPERIMENTAL)' CONFIG_BLK_DEV_PDC4030 fi bool ' QDI QD6580 support' CONFIG_BLK_DEV_QD6580 bool ' UMC 8672 support' CONFIG_BLK_DEV_UMC8672 diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 89ce8a00432a..b61cc9e23127 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -96,8 +96,8 @@ ifeq ($(CONFIG_BLK_DEV_CMD640),y) L_OBJS += cmd640.o endif -ifeq ($(CONFIG_BLK_DEV_TRITON),y) -L_OBJS += triton.o +ifeq ($(CONFIG_BLK_DEV_IDEDMA),y) +L_OBJS += ide-dma.o endif ifeq ($(CONFIG_BLK_DEV_PS2),y) @@ -125,8 +125,8 @@ ifeq ($(CONFIG_BLK_DEV_ALI14XX),y) L_OBJS += ali14xx.o endif -ifeq ($(CONFIG_BLK_DEV_PROMISE),y) -L_OBJS += promise.o +ifeq ($(CONFIG_BLK_DEV_PDC4030),y) +L_OBJS += pdc4030.o endif ifeq ($(CONFIG_BLK_DEV_OPTI621),y) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 3c9b085c8900..725543c41eb9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4065,7 +4065,7 @@ static int floppy_grab_irq_and_dma(void) if (FDCS->address != -1){ if (check_region(FDCS->address, 6) < 0 || check_region(FDCS->address+7, 1) < 0) { - DPRINT("Floppy io-port 0x%04x in use\n", FDCS->address); + DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address); fd_free_irq(); fd_free_dma(); while(--fdc >= 0) { diff --git a/drivers/block/ide-disk.c b/drivers/block/ide-disk.c index fbf8c3833090..3a3944072220 100644 --- a/drivers/block/ide-disk.c +++ b/drivers/block/ide-disk.c @@ -1,7 +1,7 @@ /* - * linux/drivers/block/ide-disk.c Version 1.01 Nov 25, 1996 + * linux/drivers/block/ide-disk.c Version 1.02 Nov 29, 1997 * - * Copyright 
(C) 1994-1996 Linus Torvalds & authors (see below) + * Copyright (C) 1994-1998 Linus Torvalds & authors (see below) */ /* @@ -39,6 +39,7 @@ * Version 1.00 move disk only code from ide.c to ide-disk.c * support optional byte-swapping of all data * Version 1.01 fix previous byte-swapping code + * Verions 1.02 remove ", LBA" from drive identification msgs */ #undef REALLY_SLOW_IO /* most systems can safely undef this */ @@ -308,23 +309,23 @@ static void recal_intr (ide_drive_t *drive) */ static void do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { -#ifdef CONFIG_BLK_DEV_PROMISE +#ifdef CONFIG_BLK_DEV_PDC4030 ide_hwif_t *hwif = HWIF(drive); - int use_promise_io = 0; -#endif /* CONFIG_BLK_DEV_PROMISE */ + int use_pdc4030_io = 0; +#endif /* CONFIG_BLK_DEV_PDC4030 */ OUT_BYTE(drive->ctl,IDE_CONTROL_REG); OUT_BYTE(rq->nr_sectors,IDE_NSECTOR_REG); -#ifdef CONFIG_BLK_DEV_PROMISE - if (IS_PROMISE_DRIVE) { - if (hwif->is_promise2 || rq->cmd == READ) { - use_promise_io = 1; +#ifdef CONFIG_BLK_DEV_PDC4030 + if (IS_PDC4030_DRIVE) { + if (hwif->is_pdc4030_2 || rq->cmd == READ) { + use_pdc4030_io = 1; } } - if (drive->select.b.lba || use_promise_io) { -#else /* !CONFIG_BLK_DEV_PROMISE */ + if (drive->select.b.lba || use_pdc4030_io) { +#else /* !CONFIG_BLK_DEV_PDC4030 */ if (drive->select.b.lba) { -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ #ifdef DEBUG printk("%s: %sing: LBAsect=%ld, sectors=%ld, buffer=0x%08lx\n", drive->name, (rq->cmd==READ)?"read":"writ", @@ -350,26 +351,27 @@ static void do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long bl head, sect, rq->nr_sectors, (unsigned long) rq->buffer); #endif } -#ifdef CONFIG_BLK_DEV_PROMISE - if (use_promise_io) { - do_promise_io (drive, rq); +#ifdef CONFIG_BLK_DEV_PDC4030 + if (use_pdc4030_io) { + extern void do_pdc4030_io(ide_drive_t *, struct request *); + do_pdc4030_io (drive, rq); return; } -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* 
CONFIG_BLK_DEV_PDC4030 */ if (rq->cmd == READ) { -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (drive->using_dma && !(HWIF(drive)->dmaproc(ide_dma_read, drive))) return; -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ ide_set_handler(drive, &read_intr, WAIT_CMD); OUT_BYTE(drive->mult_count ? WIN_MULTREAD : WIN_READ, IDE_COMMAND_REG); return; } if (rq->cmd == WRITE) { -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (drive->using_dma && !(HWIF(drive)->dmaproc(ide_dma_write, drive))) return; -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ OUT_BYTE(drive->mult_count ? WIN_MULTWRITE : WIN_WRITE, IDE_COMMAND_REG); if (ide_wait_stat(drive, DATA_READY, drive->bad_wstat, WAIT_DRQ)) { printk(KERN_ERR "%s: no DRQ after issuing %s\n", drive->name, @@ -459,17 +461,17 @@ static void idedisk_special (ide_drive_t *drive) OUT_BYTE(drive->cyl,IDE_LCYL_REG); OUT_BYTE(drive->cyl>>8,IDE_HCYL_REG); OUT_BYTE(((drive->head-1)|drive->select.all)&0xBF,IDE_SELECT_REG); - if (!IS_PROMISE_DRIVE) + if (!IS_PDC4030_DRIVE) ide_cmd(drive, WIN_SPECIFY, drive->sect, &set_geometry_intr); } else if (s->b.recalibrate) { s->b.recalibrate = 0; - if (!IS_PROMISE_DRIVE) + if (!IS_PDC4030_DRIVE) ide_cmd(drive, WIN_RESTORE, drive->sect, &recal_intr); } else if (s->b.set_multmode) { s->b.set_multmode = 0; if (drive->id && drive->mult_req > drive->id->max_multsect) drive->mult_req = drive->id->max_multsect; - if (!IS_PROMISE_DRIVE) + if (!IS_PDC4030_DRIVE) ide_cmd(drive, WIN_SETMULT, drive->mult_req, &set_multmode_intr); } else if (s->all) { int special = s->all; @@ -602,12 +604,16 @@ static void idedisk_setup (ide_drive_t *drive) (void) idedisk_capacity (drive); /* initialize LBA selection */ - printk (KERN_INFO "%s: %.40s, %ldMB w/%dkB Cache, %sCHS=%d/%d/%d%s\n", + printk (KERN_INFO "%s: %.40s, %ldMB w/%dkB Cache, CHS=%d/%d/%d", drive->name, id->model, idedisk_capacity(drive)/2048L, id->buf_size/2, - drive->select.b.lba ? 
"LBA, " : "", - drive->bios_cyl, drive->bios_head, drive->bios_sect, - drive->using_dma ? ", DMA" : ""); - + drive->bios_cyl, drive->bios_head, drive->bios_sect); + if (drive->using_dma) { + if ((id->field_valid & 4) && (id->dma_ultra & (id->dma_ultra >> 8) & 7)) + printk(", UDMA"); + else + printk(", DMA"); + } + printk("\n"); drive->mult_count = 0; if (id->max_multsect) { drive->mult_req = INITIAL_MULT_COUNT; diff --git a/drivers/block/ide-dma.c b/drivers/block/ide-dma.c new file mode 100644 index 000000000000..9ccb325db357 --- /dev/null +++ b/drivers/block/ide-dma.c @@ -0,0 +1,626 @@ +/* + * linux/drivers/block/ide-dma.c Version 4.01 November 30, 1997 + * + * Copyright (c) 1995-1998 Mark Lord + * May be copied or modified under the terms of the GNU General Public License + */ + +/* + * This module provides support for the bus-master IDE DMA functions + * of various PCI chipsets, including the Intel PIIX (i82371FB for + * the 430 FX chipset), the PIIX3 (i82371SB for the 430 HX/VX and + * 440 chipsets), and the PIIX4 (i82371AB for the 430 TX chipset) + * ("PIIX" stands for "PCI ISA IDE Xcellerator"). + * + * Pretty much the same code works for other IDE PCI bus-mastering chipsets. + * + * DMA is supported for all IDE devices (disk drives, cdroms, tapes, floppies). + * + * By default, DMA support is prepared for use, but is currently enabled only + * for drives which already have DMA enabled (UltraDMA or mode 2 multi/single), + * or which are recognized as "good" (see table below). Drives with only mode0 + * or mode1 (multi/single) DMA should also work with this chipset/driver + * (eg. MC2112A) but are not enabled by default. + * + * Use "hdparm -i" to view modes supported by a given drive. + * + * The hdparm-2.4 (or later) utility can be used for manually enabling/disabling + * DMA support, but must be (re-)compiled against this kernel version or later. + * + * To enable DMA, use "hdparm -d1 /dev/hd?" on a per-drive basis after booting. 
+ * If problems arise, ide.c will disable DMA operation after a few retries. + * This error recovery mechanism works and has been extremely well exercised. + * + * IDE drives, depending on their vintage, may support several different modes + * of DMA operation. The boot-time modes are indicated with a "*" in + * the "hdparm -i" listing, and can be changed with *knowledgeable* use of + * the "hdparm -X" feature. There is seldom a need to do this, as drives + * normally power-up with their "best" PIO/DMA modes enabled. + * + * Testing has been done with a rather extensive number of drives, + * with Quantum & Western Digital models generally outperforming the pack, + * and Fujitsu & Conner (and some Seagate which are really Conner) drives + * showing more lackluster throughput. + * + * Keep an eye on /var/adm/messages for "DMA disabled" messages. + * + * Some people have reported trouble with Intel Zappa motherboards. + * This can be fixed by upgrading the AMI BIOS to version 1.00.04.BS0, + * available from ftp://ftp.intel.com/pub/bios/10004bs0.exe + * (thanks to Glen Morrell for researching this). + * + * Thanks to "Christopher J. Reimer" for fixing the + * problem with some (all?) ACER motherboards/BIOSs. Hopefully the fix + * still works here (?). + * + * Thanks to "Benoit Poulot-Cazajous" for testing + * "TX" chipset compatibility and for providing patches for the "TX" chipset. + * + * Thanks to Christian Brunner for taking a good first crack + * at generic DMA -- his patches were referred to when preparing this code. + * + * Most importantly, thanks to Robert Bringman + * for supplying a Promise UDMA board & WD UDMA drive for this work! + * + * And, yes, Intel Zappa boards really *do* use both PIIX IDE ports. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ide.h" + +/* + * good_dma_drives() lists the model names (from "hdparm -i") + * of drives which do not support mode2 DMA but which are + * known to work fine with this interface under Linux. + */ +const char *good_dma_drives[] = {"Micropolis 2112A", + "CONNER CTMA 4000", + NULL}; + +/* + * Our Physical Region Descriptor (PRD) table should be large enough + * to handle the biggest I/O request we are likely to see. Since requests + * can have no more than 256 sectors, and since the typical blocksize is + * two or more sectors, we could get by with a limit of 128 entries here for + * the usual worst case. Most requests seem to include some contiguous blocks, + * further reducing the number of table entries required. + * + * The driver reverts to PIO mode for individual requests that exceed + * this limit (possible with 512 byte blocksizes, eg. MSDOS f/s), so handling + * 100% of all crazy scenarios here is not necessary. + * + * As it turns out though, we must allocate a full 4KB page for this, + * so the two PRD tables (ide0 & ide1) will each get half of that, + * allowing each to have about 256 entries (8 bytes each) from this. 
+ */ +#define PRD_BYTES 8 +#define PRD_ENTRIES (PAGE_SIZE / (2 * PRD_BYTES)) + +static int config_drive_for_dma (ide_drive_t *); + +/* + * dma_intr() is the handler for disk read/write DMA interrupts + */ +static void dma_intr (ide_drive_t *drive) +{ + byte stat, dma_stat; + int i; + struct request *rq = HWGROUP(drive)->rq; + unsigned short dma_base = HWIF(drive)->dma_base; + + dma_stat = inb(dma_base+2); /* get DMA status */ + outb(inb(dma_base)&~1, dma_base); /* stop DMA operation */ + stat = GET_STAT(); /* get drive status */ + if (OK_STAT(stat,DRIVE_READY,drive->bad_wstat|DRQ_STAT)) { + if ((dma_stat & 7) == 4) { /* verify good DMA status */ + rq = HWGROUP(drive)->rq; + for (i = rq->nr_sectors; i > 0;) { + i -= rq->current_nr_sectors; + ide_end_request(1, HWGROUP(drive)); + } + return; + } + printk("%s: bad DMA status: 0x%02x\n", drive->name, dma_stat); + } + sti(); + ide_error(drive, "dma_intr", stat); +} + +/* + * build_dmatable() prepares a dma request. + * Returns 0 if all went okay, returns 1 otherwise. + */ +static int build_dmatable (ide_drive_t *drive) +{ + struct request *rq = HWGROUP(drive)->rq; + struct buffer_head *bh = rq->bh; + unsigned long size, addr, *table = HWIF(drive)->dmatable; + unsigned int count = 0; + + do { + /* + * Determine addr and size of next buffer area. We assume that + * individual virtual buffers are always composed linearly in + * physical memory. For example, we assume that any 8kB buffer + * is always composed of two adjacent physical 4kB pages rather + * than two possibly non-adjacent physical 4kB pages. 
+ */ + if (bh == NULL) { /* paging requests have (rq->bh == NULL) */ + addr = virt_to_bus (rq->buffer); + size = rq->nr_sectors << 9; + } else { + /* group sequential buffers into one large buffer */ + addr = virt_to_bus (bh->b_data); + size = bh->b_size; + while ((bh = bh->b_reqnext) != NULL) { + if ((addr + size) != virt_to_bus (bh->b_data)) + break; + size += bh->b_size; + } + } + + /* + * Fill in the dma table, without crossing any 64kB boundaries. + * We assume 16-bit alignment of all blocks. + */ + while (size) { + if (++count >= PRD_ENTRIES) { + printk("%s: DMA table too small\n", drive->name); + return 1; /* revert to PIO for this request */ + } else { + unsigned long bcount = 0x10000 - (addr & 0xffff); + if (bcount > size) + bcount = size; + *table++ = addr; + *table++ = bcount & 0xffff; + addr += bcount; + size -= bcount; + } + } + } while (bh != NULL); + if (count) { + *--table |= 0x80000000; /* set End-Of-Table (EOT) bit */ + return 0; + } + printk("%s: empty DMA table?\n", drive->name); + return 1; /* let the PIO routines handle this weirdness */ +} + +/* + * ide_dmaproc() initiates/aborts DMA read/write operations on a drive. + * + * The caller is assumed to have selected the drive and programmed the drive's + * sector address using CHS or LBA. All that remains is to prepare for DMA + * and then issue the actual read/write DMA/PIO command to the drive. + * + * For ATAPI devices, we just prepare for DMA and return. The caller should + * then issue the packet command to the drive and call us again with + * ide_dma_begin afterwards. + * + * Returns 0 if all went well. + * Returns 1 if DMA read/write could not be started, in which case + * the caller should revert to PIO for the current request. 
+ */ +static int ide_dmaproc (ide_dma_action_t func, ide_drive_t *drive) +{ + unsigned long dma_base = HWIF(drive)->dma_base; + unsigned int reading = 0; + + switch (func) { + case ide_dma_off: + printk("%s: DMA disabled\n", drive->name); + case ide_dma_off_quietly: + case ide_dma_on: + drive->using_dma = (func == ide_dma_on); + return 0; + case ide_dma_abort: + outb(inb(dma_base)&~1, dma_base); /* stop DMA */ + return 0; + case ide_dma_check: + return config_drive_for_dma (drive); + case ide_dma_status_bad: + return ((inb(dma_base+2) & 7) != 4); /* verify good DMA status */ + case ide_dma_transferred: + return 0; /* NOT IMPLEMENTED: number of bytes actually transferred */ + case ide_dma_begin: + outb(inb(dma_base)|1, dma_base); /* begin DMA */ + return 0; + default: + printk("ide_dmaproc: unsupported func: %d\n", func); + return 1; + case ide_dma_read: + reading = (1 << 3); + case ide_dma_write: + if (build_dmatable (drive)) + return 1; + outl(virt_to_bus (HWIF(drive)->dmatable), dma_base + 4); /* PRD table */ + outb(reading, dma_base); /* specify r/w */ + outb(inb(dma_base+2)|0x06, dma_base+2); /* clear status bits */ + if (drive->media != ide_disk) + return 0; + ide_set_handler(drive, &dma_intr, WAIT_CMD); /* issue cmd to drive */ + OUT_BYTE(reading ? 
WIN_READDMA : WIN_WRITEDMA, IDE_COMMAND_REG); + outb(inb(dma_base)|1, dma_base); /* begin DMA */ + return 0; + } +} + +static int config_drive_for_dma (ide_drive_t *drive) +{ + const char **list; + + struct hd_driveid *id = drive->id; + if (id && (id->capability & 1)) { + /* Enable DMA on any drive that has UltraDMA (mode 0/1/2) enabled */ + if (id->field_valid & 4) /* UltraDMA */ + if ((id->dma_ultra & (id->dma_ultra >> 8) & 7)) + return ide_dmaproc(ide_dma_on, drive); + /* Enable DMA on any drive that has mode2 DMA (multi or single) enabled */ + if (id->field_valid & 2) /* regular DMA */ + if ((id->dma_mword & 0x404) == 0x404 || (id->dma_1word & 0x404) == 0x404) + return ide_dmaproc(ide_dma_on, drive); + /* Consult the list of known "good" drives */ + list = good_dma_drives; + while (*list) { + if (!strcmp(*list++,id->model)) + return ide_dmaproc(ide_dma_on, drive); + } + } + return ide_dmaproc(ide_dma_off_quietly, drive); +} + +#define DEVID_PIIX (PCI_VENDOR_ID_INTEL |(PCI_DEVICE_ID_INTEL_82371_1 <<16)) +#define DEVID_PIIX3 (PCI_VENDOR_ID_INTEL |(PCI_DEVICE_ID_INTEL_82371SB_1 <<16)) +#define DEVID_PIIX4 (PCI_VENDOR_ID_INTEL |(PCI_DEVICE_ID_INTEL_82371AB <<16)) +#define DEVID_VP_IDE (PCI_VENDOR_ID_VIA |(PCI_DEVICE_ID_VIA_82C586_1 <<16)) +#define DEVID_PDC2046 (PCI_VENDOR_ID_PROMISE|(PCI_DEVICE_ID_PROMISE_20246 <<16)) +#define DEVID_RZ1000 (PCI_VENDOR_ID_PCTECH |(PCI_DEVICE_ID_PCTECH_RZ1000 <<16)) +#define DEVID_RZ1001 (PCI_VENDOR_ID_PCTECH |(PCI_DEVICE_ID_PCTECH_RZ1001 <<16)) +#define DEVID_CMD640 (PCI_VENDOR_ID_CMD |(PCI_DEVICE_ID_CMD_640 <<16)) +#define DEVID_CMD646 (PCI_VENDOR_ID_CMD |(PCI_DEVICE_ID_CMD_646 <<16)) +#define DEVID_SIS5513 (PCI_VENDOR_ID_SI |(PCI_DEVICE_ID_SI_5513 <<16)) +#define DEVID_OPTI (PCI_VENDOR_ID_OPTI |(PCI_DEVICE_ID_OPTI_82C621 <<16)) +#define DEVID_OPTI2 (PCI_VENDOR_ID_OPTI |(0xd568 /* from datasheets */ <<16)) + +#ifdef CONFIG_BLK_DEV_OPTI621 +extern void ide_init_opti621(byte, byte, ide_hwif_t *); +#define INIT_OPTI 
(&ide_init_opti621) +#else +#define INIT_OPTI (NULL) +#endif + +typedef struct ide_pci_enablebit_s { + byte reg; /* byte pci reg holding the enable-bit */ + byte mask; /* mask to isolate the enable-bit */ + byte val; /* value of masked reg when "enabled" */ +} ide_pci_enablebit_t; + +typedef struct ide_pci_device_s { + unsigned int id; + const char *name; + void (*init_hwif)(byte bus, byte fn, ide_hwif_t *hwif); + ide_pci_enablebit_t enablebits[2]; +} ide_pci_device_t; + +static ide_pci_device_t ide_pci_chipsets[] = { + {DEVID_PIIX, "PIIX", NULL, {{0x41,0x80,0x80}, {0x43,0x80,0x80}} }, + {DEVID_PIIX3, "PIIX3", NULL, {{0x41,0x80,0x80}, {0x43,0x80,0x80}} }, + {DEVID_PIIX4, "PIIX4", NULL, {{0x41,0x80,0x80}, {0x43,0x80,0x80}} }, + {DEVID_VP_IDE, "VP_IDE", NULL, {{0x40,0x02,0x02}, {0x40,0x01,0x01}} }, + {DEVID_PDC2046, "PDC2046", NULL, {{0x50,0x02,0x02}, {0x50,0x04,0x04}} }, + {DEVID_RZ1000, NULL, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}} }, + {DEVID_RZ1001, NULL, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}} }, + {DEVID_CMD640, NULL, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}} }, + {DEVID_OPTI, "OPTI", INIT_OPTI, {{0x45,0x80,0x00}, {0x40,0x08,0x00}} }, + {DEVID_OPTI2, "OPTI2", INIT_OPTI, {{0x45,0x80,0x00}, {0x40,0x08,0x00}} }, + {DEVID_SIS5513, "SIS5513", NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}} }, + {DEVID_CMD646, "CMD646", NULL, {{0x00,0x00,0x00}, {0x51,0x80,0x80}} }, + {0, "PCI_IDE", NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}} }}; + +__initfunc(static ide_pci_device_t *lookup_devid(unsigned int devid)) +{ + ide_pci_device_t *d = ide_pci_chipsets; + while (d->id && d->id != devid) + ++d; + return d; +} + +__initfunc(static void ide_setup_dma (ide_hwif_t *hwif, unsigned short dmabase)) +{ + static unsigned long dmatable = 0; + static unsigned leftover = 0; + + printk(" %s: BM-DMA at 0x%04x-0x%04x", hwif->name, dmabase, dmabase+7); + if (check_region(dmabase, 8)) { + printk(" -- ERROR, PORTS ALREADY IN USE"); + } else { + request_region(dmabase, 8, hwif->name); + 
hwif->dma_base = dmabase; + if (leftover < (PRD_ENTRIES * PRD_BYTES)) { + /* + * The BM-DMA uses full 32bit addr, so we can + * safely use __get_free_page() here instead + * of __get_dma_pages() -- no ISA limitations. + */ + dmatable = __get_free_pages(GFP_KERNEL,1,0); + leftover = dmatable ? PAGE_SIZE : 0; + } + if (dmatable) { + printk(", PRD table at %08lx", dmatable); + hwif->dmatable = (unsigned long *) dmatable; + dmatable += (PRD_ENTRIES * PRD_BYTES); + leftover -= (PRD_ENTRIES * PRD_BYTES); + outl(virt_to_bus(hwif->dmatable), dmabase + 4); + hwif->dmaproc = &ide_dmaproc; + } + } + printk("\n"); +} + +/* The next two functions were stolen from cmd640.c, with + a few modifications */ + +__initfunc(static void write_pcicfg_dword (byte fn, unsigned short reg, long val)) +{ + unsigned long flags; + + save_flags(flags); + cli(); + outl_p((reg & 0xfc) | ((fn * 0x100) + 0x80000000), 0xcf8); + outl_p(val, (reg & 3) | 0xcfc); + restore_flags(flags); +} + +__initfunc(static long read_pcicfg_dword (byte fn, unsigned short reg)) +{ + long b; + unsigned long flags; + + save_flags(flags); + cli(); + outl_p((reg & 0xfc) | ((fn * 0x100) + 0x80000000), 0xcf8); + b = inl_p((reg & 3) | 0xcfc); + restore_flags(flags); + return b; +} + +/* + * Search for an (apparently) unused block of I/O space + * of "size" bytes in length. 
+ */ +__initfunc(static short find_free_region (unsigned short size)) +{ + unsigned short i, base = 0xe800; + for (base = 0xe800; base > 0; base -= 0x800) { + if (!check_region(base,size)) { + for (i = 0; i < size; i++) { + if (inb(base+i) != 0xff) + goto next; + } + return base; /* success */ + } + next: ; + } + return 0; /* failure */ +} + +/* + * Fetch the Bus-Master I/O Base-Address (BMIBA) from PCI space: + */ +__initfunc(static unsigned int ide_get_or_set_bmiba (byte bus, byte fn, const char *name)) +{ + unsigned int bmiba = 0; + unsigned short base; + int rc; + + if ((rc = pcibios_read_config_dword(bus, fn, 0x20, &bmiba))) { + printk("%s: failed to read BMIBA\n", name); + } else if ((bmiba &= 0xfff0) == 0) { + printk("%s: BMIBA is invalid (0x%04x, BIOS problem)\n", name, bmiba); + base = find_free_region(16); + if (base) { + printk("%s: setting BMIBA to 0x%04x\n", name, base); + pcibios_write_config_dword(bus, fn, 0x20, base | 1); + pcibios_read_config_dword(bus, fn, 0x20, &bmiba); + bmiba &= 0xfff0; + if (bmiba != base) { + if (bus == 0) { + printk("%s: operation failed, bypassing BIOS to try again\n", name); + write_pcicfg_dword(fn, 0x20, base | 1); + bmiba = read_pcicfg_dword(fn, 0x20) & 0xfff0; + } + if (bmiba != base) { + printk("%s: operation failed, DMA disabled\n", name); + bmiba = 0; + } + } + } + } + return bmiba; +} + +/* + * Match a PCI IDE port against an entry in ide_hwifs[], + * based on io_base port if possible. + */ +__initfunc(ide_hwif_t *ide_match_hwif (unsigned int io_base)) +{ + int h; + ide_hwif_t *hwif; + + /* + * Look for a hwif with matching io_base specified using + * parameters to ide_setup(). + */ + for (h = 0; h < MAX_HWIFS; ++h) { + hwif = &ide_hwifs[h]; + if (hwif->io_ports[IDE_DATA_OFFSET] == io_base) { + if (hwif->chipset == ide_generic) + return hwif; /* a perfect match */ + } + } + /* + * Look for a hwif with matching io_base default value. + * If chipset is "ide_unknown", then claim that hwif slot.
+ * Otherwise, some other chipset has already claimed it.. :( + */ + for (h = 0; h < MAX_HWIFS; ++h) { + hwif = &ide_hwifs[h]; + if (hwif->io_ports[IDE_DATA_OFFSET] == io_base) { + if (hwif->chipset == ide_unknown) + return hwif; /* match */ + return NULL; /* already claimed */ + } + } + /* + * Okay, there is no hwif matching our io_base, + * so we'll just claim an unassigned slot. + * Give preference to claiming ide2/ide3 before ide0/ide1, + * just in case there's another interface yet-to-be-scanned + * which uses ports 1f0/170 (the ide0/ide1 defaults). + */ + for (h = 0; h < MAX_HWIFS; ++h) { + int hwifs[] = {2,3,1,0}; /* assign 3rd/4th before 1st/2nd */ + hwif = &ide_hwifs[hwifs[h]]; + if (hwif->chipset == ide_unknown) + return hwif; /* pick an unused entry */ + } + return NULL; +} + +/* + * ide_setup_pci_device() looks at the primary/secondary interfaces + * on a PCI IDE device and, if they are enabled, prepares the IDE driver + * for use with them. This generic code works for most PCI chipsets. + * + * One thing that is not standardized is the location of the + * primary/secondary interface "enable/disable" bits. For chipsets that + * we "know" about, this information is in the ide_pci_device_t struct; + * for all other chipsets, we just assume both interfaces are enabled. 
+ */ +__initfunc(static void ide_setup_pci_device (byte bus, byte fn, unsigned int bmiba, ide_pci_device_t *d)) +{ + unsigned int port, at_least_one_hwif_enabled = 0; + unsigned short base = 0, ctl = 0; + byte tmp = 0, pciirq = 0; + ide_hwif_t *hwif; + + if (pcibios_read_config_byte(bus, fn, 0x3c, &pciirq)) + pciirq = 0; /* probe later if not set */ + for (port = 0; port <= 1; ++port) { + ide_pci_enablebit_t *e = &(d->enablebits[port]); + if (e->reg) { + if (pcibios_read_config_byte(bus, fn, e->reg, &tmp)) { + printk("%s: unable to read pci reg 0x%x\n", d->name, e->reg); + } else if ((tmp & e->mask) != e->val) + continue; /* port not enabled */ + } + if (pcibios_read_config_word(bus, fn, 0x14+(port*8), &ctl)) + ctl = 0; + if ((ctl &= 0xfffc) == 0) + ctl = 0x3f4 ^ (port << 7); + if (pcibios_read_config_word(bus, fn, 0x10+(port*8), &base)) + base = 0; + if ((base &= 0xfff8) == 0) + base = 0x1F0 ^ (port << 7); + if ((hwif = ide_match_hwif(base)) == NULL) { + printk("%s: no room in hwif table for port %d\n", d->name, port); + continue; + } + hwif->chipset = ide_pci; + if (hwif->io_ports[IDE_DATA_OFFSET] != base) { + ide_init_hwif_ports(hwif->io_ports, base, NULL); + hwif->io_ports[IDE_CONTROL_OFFSET] = ctl + 2; + } + if (!hwif->irq) + hwif->irq = port ? 0 : pciirq; /* always probe for secondary irq */ + if (bmiba) { + if ((inb(bmiba+2) & 0x80)) { /* simplex DMA only? */ + printk("%s: simplex device: DMA disabled\n", d->name); + } else { /* supports simultaneous DMA on both channels */ + ide_setup_dma(hwif, bmiba + (8 * port)); + } + } + if (d->init_hwif) /* Call chipset-specific routine for each enabled hwif */ + d->init_hwif(bus, fn, hwif); + at_least_one_hwif_enabled = 1; + } + if (!at_least_one_hwif_enabled) + printk("%s: neither IDE port is enabled\n", d->name); +} + +/* + * ide_scan_pci_device() examines all functions of a PCI device, + * looking for IDE interfaces and/or devices in ide_pci_chipsets[]. 
+ */ +__initfunc(static inline void ide_scan_pci_device (unsigned int bus, unsigned int fn)) +{ + unsigned int devid, ccode; + unsigned short pcicmd; + ide_pci_device_t *d; + byte hedt; + + if (pcibios_read_config_byte(bus, fn, 0x0e, &hedt)) + hedt = 0; + do { + if (pcibios_read_config_dword(bus, fn, 0x00, &devid) + || devid == 0xffffffff + || pcibios_read_config_dword(bus, fn, 0x08, &ccode)) + return; + d = lookup_devid(devid); + if (d->name == NULL) /* some chips (cmd640 & rz1000) are handled elsewhere */ + continue; + if (d->id || (ccode >> 16) == PCI_CLASS_STORAGE_IDE) { + printk("%s: %sIDE device on PCI bus %d function %d\n", d->name, d->id ? "" : "unknown ", bus, fn); + /* + * See if IDE ports are enabled + */ + if (pcibios_read_config_word(bus, fn, 0x04, &pcicmd)) { + printk("%s: error accessing PCICMD\n", d->name); + } else if ((pcicmd & 1) == 0) { + printk("%s: device is disabled (BIOS)\n", d->name); + } else { + unsigned int bmiba = 0; + /* + * Check for Bus-Master DMA capability + */ + if (!(pcicmd & 4) || !(bmiba = ide_get_or_set_bmiba(bus, fn, d->name))) { + if ((ccode >> 16) == PCI_CLASS_STORAGE_RAID || (ccode & 0x8000)) + printk("%s: Bus-Master DMA is disabled (BIOS)\n", d->name); + } + ide_setup_pci_device(bus, fn, bmiba, d); + } + } + } while (hedt == 0x80 && (++fn & 7)); +} + +/* + * ide_scan_pcibus() gets invoked at boot time from ide.c + */ +__initfunc(void ide_scan_pcibus (void)) +{ + unsigned int bus, dev; + + if (!pcibios_present()) + return; + for (bus = 0; bus <= 255; ++bus) { + for (dev = 0; dev <= 31; ++dev) { + ide_scan_pci_device(bus, dev << 3); + } + } +} + diff --git a/drivers/block/ide-floppy.c b/drivers/block/ide-floppy.c index 5a0a603b5f28..2856f330f435 100644 --- a/drivers/block/ide-floppy.c +++ b/drivers/block/ide-floppy.c @@ -532,7 +532,7 @@ static void idefloppy_output_buffers (ide_drive_t *drive, idefloppy_pc_t *pc, un } } -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA static void idefloppy_update_buffers
(ide_drive_t *drive, idefloppy_pc_t *pc) { struct request *rq = pc->rq; @@ -541,7 +541,7 @@ static void idefloppy_update_buffers (ide_drive_t *drive, idefloppy_pc_t *pc) while ((bh = rq->bh) != NULL) idefloppy_end_request (1, HWGROUP(drive)); } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ /* * idefloppy_queue_pc_head generates a new packet command request in front @@ -681,7 +681,7 @@ static void idefloppy_pc_intr (ide_drive_t *drive) printk (KERN_INFO "ide-floppy: Reached idefloppy_pc_intr interrupt handler\n"); #endif /* IDEFLOPPY_DEBUG_LOG */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_bit (PC_DMA_IN_PROGRESS, &pc->flags)) { if (HWIF(drive)->dmaproc(ide_dma_status_bad, drive)) { set_bit (PC_DMA_ERROR, &pc->flags); @@ -694,7 +694,7 @@ static void idefloppy_pc_intr (ide_drive_t *drive) printk (KERN_INFO "ide-floppy: DMA finished\n"); #endif /* IDEFLOPPY_DEBUG_LOG */ } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ status.all = GET_STAT(); /* Clear the interrupt */ @@ -725,7 +725,7 @@ static void idefloppy_pc_intr (ide_drive_t *drive) pc->callback(drive); /* Command finished - Call the callback function */ return; } -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_and_clear_bit (PC_DMA_IN_PROGRESS, &pc->flags)) { printk (KERN_ERR "ide-floppy: The floppy wants to issue more interrupts in DMA mode\n"); printk (KERN_ERR "ide-floppy: DMA disabled, reverting to PIO\n"); @@ -733,7 +733,7 @@ static void idefloppy_pc_intr (ide_drive_t *drive) ide_do_reset (drive); return; } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ bcount.b.high=IN_BYTE (IDE_BCOUNTH_REG); /* Get the number of bytes to transfer */ bcount.b.low=IN_BYTE (IDE_BCOUNTL_REG); /* on this interrupt */ ireason.all=IN_BYTE (IDE_IREASON_REG); @@ -841,14 +841,14 @@ static void idefloppy_issue_pc (ide_drive_t *drive, idefloppy_pc_t *pc) pc->current_position=pc->buffer; 
bcount.all=pc->request_transfer; /* Request to transfer the entire buffer at once */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_and_clear_bit (PC_DMA_ERROR, &pc->flags)) { printk (KERN_WARNING "ide-floppy: DMA disabled, reverting to PIO\n"); HWIF(drive)->dmaproc(ide_dma_off, drive); } if (test_bit (PC_DMA_RECOMMENDED, &pc->flags) && drive->using_dma) dma_ok=!HWIF(drive)->dmaproc(test_bit (PC_WRITING, &pc->flags) ? ide_dma_write : ide_dma_read, drive); -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ OUT_BYTE (drive->ctl,IDE_CONTROL_REG); OUT_BYTE (dma_ok ? 1:0,IDE_FEATURE_REG); /* Use PIO/DMA */ @@ -856,12 +856,12 @@ static void idefloppy_issue_pc (ide_drive_t *drive, idefloppy_pc_t *pc) OUT_BYTE (bcount.b.low,IDE_BCOUNTL_REG); OUT_BYTE (drive->select.all,IDE_SELECT_REG); -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (dma_ok) { /* Begin DMA, if necessary */ set_bit (PC_DMA_IN_PROGRESS, &pc->flags); (void) (HWIF(drive)->dmaproc(ide_dma_begin, drive)); } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ if (test_bit (IDEFLOPPY_DRQ_INTERRUPT, &floppy->flags)) { ide_set_handler (drive, &idefloppy_transfer_pc, WAIT_CMD); diff --git a/drivers/block/ide-probe.c b/drivers/block/ide-probe.c index 3ce3e882a628..fdc13bef9a6f 100644 --- a/drivers/block/ide-probe.c +++ b/drivers/block/ide-probe.c @@ -114,13 +114,13 @@ static inline void do_identify (ide_drive_t *drive, byte cmd) if (cmd == WIN_PIDENTIFY) { byte type = (id->config >> 8) & 0x1f; printk("ATAPI "); -#ifdef CONFIG_BLK_DEV_PROMISE - if (HWIF(drive)->is_promise2) { +#ifdef CONFIG_BLK_DEV_PDC4030 + if (HWIF(drive)->is_pdc4030_2) { printk(" -- not supported on 2nd Promise port\n"); drive->present = 0; return; } -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ switch (type) { case ide_floppy: if (!strstr(id->model, "oppy") && !strstr(id->model, "poyp") && !strstr(id->model, "ZIP")) @@ -192,15 +192,16 @@ static 
int try_to_identify (ide_drive_t *drive, byte cmd) } else hd_status = IDE_ALTSTATUS_REG; /* use non-intrusive polling */ -#if CONFIG_BLK_DEV_PROMISE - if (IS_PROMISE_DRIVE) { - if (promise_cmd(drive,PROMISE_IDENTIFY)) { +#if CONFIG_BLK_DEV_PDC4030 + if (IS_PDC4030_DRIVE) { + extern int pdc4030_cmd(ide_drive_t *, byte); + if (pdc4030_cmd(drive,PROMISE_IDENTIFY)) { if (irqs) (void) probe_irq_off(irqs); return 1; } } else -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ OUT_BYTE(cmd,IDE_COMMAND_REG); /* ask drive for ID */ timeout = ((cmd == WIN_IDENTIFY) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; timeout += jiffies; @@ -363,10 +364,10 @@ static void probe_cmos_for_drives (ide_hwif_t *hwif) byte cmos_disks, *BIOS = (byte *) &drive_info; int unit; -#ifdef CONFIG_BLK_DEV_PROMISE - if (hwif->is_promise2) +#ifdef CONFIG_BLK_DEV_PDC4030 + if (hwif->is_pdc4030_2) return; -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ outb_p(0x12,0x70); /* specify CMOS address 0x12 */ cmos_disks = inb_p(0x71); /* read the data from 0x12 */ /* Extract drive geometry from CMOS+BIOS if not already setup */ @@ -397,12 +398,12 @@ static void probe_hwif (ide_hwif_t *hwif) return; if (hwif->io_ports[IDE_DATA_OFFSET] == HD_DATA) probe_cmos_for_drives (hwif); -#if CONFIG_BLK_DEV_PROMISE - if (!hwif->is_promise2 && +#if CONFIG_BLK_DEV_PDC4030 + if (!hwif->is_pdc4030_2 && (ide_check_region(hwif->io_ports[IDE_DATA_OFFSET],8) || ide_check_region(hwif->io_ports[IDE_CONTROL_OFFSET],1))) { #else if (ide_check_region(hwif->io_ports[IDE_DATA_OFFSET],8) || ide_check_region(hwif->io_ports[IDE_CONTROL_OFFSET],1)) { -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ int msgout = 0; for (unit = 0; unit < MAX_DRIVES; ++unit) { ide_drive_t *drive = &hwif->drives[unit]; diff --git a/drivers/block/ide-tape.c b/drivers/block/ide-tape.c index 4df355141caa..e9d7f1433e5e 100644 --- a/drivers/block/ide-tape.c +++ b/drivers/block/ide-tape.c @@ -170,13 
+170,13 @@ * unit, making performance almost independent of the * chosen user block size. * Some improvements in error recovery. - * By cooperating with triton.c, bus mastering DMA can + * By cooperating with ide-dma.c, bus mastering DMA can * now sometimes be used with IDE tape drives as well. * Bus mastering DMA has the potential to dramatically * reduce the CPU's overhead when accessing the device, * and can be enabled by using hdparm -d1 on the tape's * block device interface. For more info, read the - * comments in triton.c. + * comments in ide-dma.c. * Ver 1.4 Mar 13 96 Fixed serialize support. * Ver 1.5 Apr 12 96 Fixed shared interface operation, broken in 1.3.85. * Fixed pipelined read mode inefficiency. @@ -1093,7 +1093,7 @@ static void idetape_output_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsign } } -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA static void idetape_update_buffers (idetape_pc_t *pc) { struct buffer_head *bh = pc->bh; @@ -1116,7 +1116,7 @@ static void idetape_update_buffers (idetape_pc_t *pc) } pc->bh = bh; } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ /* * idetape_postpone_request postpones the current request so that @@ -1610,7 +1610,7 @@ static void idetape_analyze_error (ide_drive_t *drive,idetape_request_sense_resu printk (KERN_INFO "ide-tape: pc = %x, sense key = %x, asc = %x, ascq = %x\n",pc->c[0],result->sense_key,result->asc,result->ascq); #endif /* IDETAPE_DEBUG_LOG */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA /* * Correct pc->actually_transferred by asking the tape. 
@@ -1619,7 +1619,7 @@ static void idetape_analyze_error (ide_drive_t *drive,idetape_request_sense_resu pc->actually_transferred = pc->request_transfer - tape->tape_block_size * ntohl (get_unaligned (&result->information)); idetape_update_buffers (pc); } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ if (pc->c[0] == IDETAPE_READ_CMD && result->filemark) { pc->error = IDETAPE_ERROR_FILEMARK; set_bit (PC_ABORT, &pc->flags); @@ -1721,7 +1721,7 @@ static void idetape_pc_intr (ide_drive_t *drive) printk (KERN_INFO "ide-tape: Reached idetape_pc_intr interrupt handler\n"); #endif /* IDETAPE_DEBUG_LOG */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_bit (PC_DMA_IN_PROGRESS, &pc->flags)) { if (HWIF(drive)->dmaproc(ide_dma_status_bad, drive)) { set_bit (PC_DMA_ERROR, &pc->flags); @@ -1739,7 +1739,7 @@ static void idetape_pc_intr (ide_drive_t *drive) printk (KERN_INFO "ide-tape: DMA finished\n"); #endif /* IDETAPE_DEBUG_LOG */ } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ status.all = GET_STAT(); /* Clear the interrupt */ @@ -1776,7 +1776,7 @@ static void idetape_pc_intr (ide_drive_t *drive) pc->callback(drive); /* Command finished - Call the callback function */ return; } -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_and_clear_bit (PC_DMA_IN_PROGRESS, &pc->flags)) { printk (KERN_ERR "ide-tape: The tape wants to issue more interrupts in DMA mode\n"); printk (KERN_ERR "ide-tape: DMA disabled, reverting to PIO\n"); @@ -1784,7 +1784,7 @@ static void idetape_pc_intr (ide_drive_t *drive) ide_do_reset (drive); return; } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ bcount.b.high=IN_BYTE (IDE_BCOUNTH_REG); /* Get the number of bytes to transfer */ bcount.b.low=IN_BYTE (IDE_BCOUNTL_REG); /* on this interrupt */ ireason.all=IN_BYTE (IDE_IREASON_REG); @@ -1915,14 +1915,14 @@ static void idetape_issue_packet_command (ide_drive_t *drive, idetape_pc_t *pc) 
pc->current_position=pc->buffer; bcount.all=pc->request_transfer; /* Request to transfer the entire buffer at once */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (test_and_clear_bit (PC_DMA_ERROR, &pc->flags)) { printk (KERN_WARNING "ide-tape: DMA disabled, reverting to PIO\n"); HWIF(drive)->dmaproc(ide_dma_off, drive); } if (test_bit (PC_DMA_RECOMMENDED, &pc->flags) && drive->using_dma) dma_ok=!HWIF(drive)->dmaproc(test_bit (PC_WRITING, &pc->flags) ? ide_dma_write : ide_dma_read, drive); -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ OUT_BYTE (drive->ctl,IDE_CONTROL_REG); OUT_BYTE (dma_ok ? 1:0,IDE_FEATURE_REG); /* Use PIO/DMA */ @@ -1952,12 +1952,12 @@ static void idetape_issue_packet_command (ide_drive_t *drive, idetape_pc_t *pc) return; } atapi_output_bytes (drive,pc->c,12); /* Send the actual packet */ -#ifdef CONFIG_BLK_DEV_TRITON +#ifdef CONFIG_BLK_DEV_IDEDMA if (dma_ok) { /* Begin DMA, if necessary */ set_bit (PC_DMA_IN_PROGRESS, &pc->flags); (void) (HWIF(drive)->dmaproc(ide_dma_begin, drive)); } -#endif /* CONFIG_BLK_DEV_TRITON */ +#endif /* CONFIG_BLK_DEV_IDEDMA */ } static void idetape_media_access_finished (ide_drive_t *drive) diff --git a/drivers/block/ide.c b/drivers/block/ide.c index 1a5fdc5a8b05..9c2bde0d7094 100644 --- a/drivers/block/ide.c +++ b/drivers/block/ide.c @@ -1,7 +1,7 @@ /* - * linux/drivers/block/ide.c Version 6.03 June 4, 1997 + * linux/drivers/block/ide.c Version 6.05 November 30, 1997 * - * Copyright (C) 1994-1997 Linus Torvalds & authors (see below) + * Copyright (C) 1994-1998 Linus Torvalds & authors (see below) */ #define _IDE_C /* needed by */ @@ -65,197 +65,6 @@ * Version 1.4 BETA added auto probing for irq(s) * Version 1.5 BETA added ALPHA (untested) support for IDE cd-roms, * ... 
- * Version 3.5 correct the bios_cyl field if it's too small - * (linux 1.1.76) (to help fdisk with brain-dead BIOSs) - * Version 3.6 cosmetic corrections to comments and stuff - * (linux 1.1.77) reorganise probing code to make it understandable - * added halfway retry to probing for drive identification - * added "hdx=noprobe" command line option - * allow setting multmode even when identification fails - * Version 3.7 move set_geometry=1 from do_identify() to ide_init() - * increase DRQ_WAIT to eliminate nuisance messages - * wait for DRQ_STAT instead of DATA_READY during probing - * (courtesy of Gary Thomas gary@efland.UU.NET) - * Version 3.8 fixed byte-swapping for confused Mitsumi cdrom drives - * update of ide-cd.c from Scott, allows blocksize=1024 - * cdrom probe fixes, inspired by jprang@uni-duisburg.de - * Version 3.9 don't use LBA if lba_capacity looks funny - * correct the drive capacity calculations - * fix probing for old Seagates without IDE_ALTSTATUS_REG - * fix byte-ordering for some NEC cdrom drives - * Version 3.10 disable multiple mode by default; was causing trouble - * Version 3.11 fix mis-identification of old WD disks as cdroms - * Version 3,12 simplify logic for selecting initial mult_count - * (fixes problems with buggy WD drives) - * Version 3.13 remove excess "multiple mode disabled" messages - * Version 3.14 fix ide_error() handling of BUSY_STAT - * fix byte-swapped cdrom strings (again.. arghh!) - * ignore INDEX bit when checking the ALTSTATUS reg - * Version 3.15 add SINGLE_THREADED flag for use with dual-CMD i/f - * ignore WRERR_STAT for non-write operations - * added vlb_sync support for DC-2000A & others, - * (incl. 
some Promise chips), courtesy of Frank Gockel - * Version 3.16 convert vlb_32bit and vlb_sync into runtime flags - * add ioctls to get/set VLB flags (HDIO_[SG]ET_CHIPSET) - * rename SINGLE_THREADED to SUPPORT_SERIALIZE, - * add boot flag to "serialize" operation for CMD i/f - * add optional support for DTC2278 interfaces, - * courtesy of andy@cercle.cts.com (Dyan Wile). - * add boot flag to enable "dtc2278" probe - * add probe to avoid EATA (SCSI) interfaces, - * courtesy of neuffer@goofy.zdv.uni-mainz.de. - * Version 4.00 tidy up verify_area() calls - heiko@colossus.escape.de - * add flag to ignore WRERR_STAT for some drives - * courtesy of David.H.West@um.cc.umich.edu - * assembly syntax tweak to vlb_sync - * removable drive support from scuba@cs.tu-berlin.de - * add transparent support for DiskManager-6.0x "Dynamic - * Disk Overlay" (DDO), most of this is in genhd.c - * eliminate "multiple mode turned off" message at boot - * Version 4.10 fix bug in ioctl for "hdparm -c3" - * fix DM6:DDO support -- now works with LILO, fdisk, ... - * don't treat some naughty WD drives as removable - * Version 4.11 updated DM6 support using info provided by OnTrack - * Version 5.00 major overhaul, multmode setting fixed, vlb_sync fixed - * added support for 3rd/4th/alternative IDE ports - * created ide.h; ide-cd.c now compiles separate from ide.c - * hopefully fixed infinite "unexpected_intr" from cdroms - * zillions of other changes and restructuring - * somehow reduced overall memory usage by several kB - * probably slowed things down slightly, but worth it - * Version 5.01 AT LAST!! Finally understood why "unexpected_intr" - * was happening at various times/places: whenever the - * ide-interface's ctl_port was used to "mask" the irq, - * it also would trigger an edge in the process of masking - * which would result in a self-inflicted interrupt!! - * (such a stupid way to build a hardware interrupt mask). - * This is now fixed (after a year of head-scratching). 
- * Version 5.02 got rid of need for {enable,disable}_irq_list() - * Version 5.03 tune-ups, comments, remove "busy wait" from drive resets - * removed PROBE_FOR_IRQS option -- no longer needed - * OOOPS! fixed "bad access" bug for 2nd drive on an i/f - * Version 5.04 changed "ira %d" to "irq %d" in DEBUG message - * added more comments, cleaned up unexpected_intr() - * OOOPS! fixed null pointer problem in ide reset code - * added autodetect for Triton chipset -- no effect yet - * Version 5.05 OOOPS! fixed bug in revalidate_disk() - * OOOPS! fixed bug in ide_do_request() - * added ATAPI reset sequence for cdroms - * Version 5.10 added Bus-Mastered DMA support for Triton Chipset - * some (mostly) cosmetic changes - * Version 5.11 added ht6560b support by malafoss@snakemail.hut.fi - * reworked PCI scanning code - * added automatic RZ1000 detection/support - * added automatic PCI CMD640 detection/support - * added option for VLB CMD640 support - * tweaked probe to find cdrom on hdb with disks on hda,hdc - * Version 5.12 some performance tuning - * added message to alert user to bad /dev/hd[cd] entries - * OOOPS! fixed bug in atapi reset - * driver now forces "serialize" again for all cmd640 chips - * noticed REALLY_SLOW_IO had no effect, moved it to ide.c - * made do_drive_cmd() into public ide_do_drive_cmd() - * Version 5.13 fixed typo ('B'), thanks to houston@boyd.geog.mcgill.ca - * fixed ht6560b support - * Version 5.13b (sss) fix problem in calling ide_cdrom_setup() - * don't bother invalidating nonexistent partitions - * Version 5.14 fixes to cmd640 support.. maybe it works now(?) - * added & tested full EZ-DRIVE support -- don't use LILO! - * don't enable 2nd CMD640 PCI port during init - conflict - * Version 5.15 bug fix in init_cmd640_vlb() - * bug fix in interrupt sharing code - * Version 5.16 ugh.. fix "serialize" support, broken in 5.15 - * remove "Huh?" 
from cmd640 code - * added qd6580 interface speed select from Colten Edwards - * Version 5.17 kludge around bug in BIOS32 on Intel triton motherboards - * Version 5.18 new CMD640 code, moved to cmd640.c, #include'd for now - * new UMC8672 code, moved to umc8672.c, #include'd for now - * disallow turning on DMA when h/w not capable of DMA - * Version 5.19 fix potential infinite timeout on resets - * extend reset poll into a general purpose polling scheme - * add atapi tape drive support from Gadi Oxman - * simplify exit from _intr routines -- no IDE_DO_REQUEST - * Version 5.20 leave current rq on blkdev request list during I/O - * generalized ide_do_drive_cmd() for tape/cdrom driver use - * Version 5.21 fix nasty cdrom/tape bug (ide_preempt was messed up) - * Version 5.22 fix ide_xlate_1024() to work with/without drive->id - * Version 5.23 miscellaneous touch-ups - * Version 5.24 fix #if's for SUPPORT_CMD640 - * Version 5.25 more touch-ups, fix cdrom resets, ... - * cmd640.c now configs/compiles separate from ide.c - * Version 5.26 keep_settings now maintains the using_dma flag - * fix [EZD] remap message to only output at boot time - * fix "bad /dev/ entry" message to say hdc, not hdc0 - * fix ide_xlate_1024() to respect user specified CHS - * use CHS from partn table if it looks translated - * re-merged flags chipset,vlb_32bit,vlb_sync into io_32bit - * keep track of interface chipset type, when known - * add generic PIO mode "tuneproc" mechanism - * fix cmd640_vlb option - * fix ht6560b support (was completely broken) - * umc8672.c now configures/compiles separate from ide.c - * move dtc2278 support to dtc2278.c - * move ht6560b support to ht6560b.c - * move qd6580 support to qd6580.c - * add ali14xx support in ali14xx.c - * Version 5.27 add [no]autotune parameters to help cmd640 - * move rz1000 support to rz1000.c - * Version 5.28 #include "ide_modes.h" - * fix disallow_unmask: now per-interface "no_unmask" bit - * force io_32bit to be the same on drive pairs of 
dtc2278 - * improved IDE tape error handling, and tape DMA support - * bugfix in ide_do_drive_cmd() for cdroms + serialize - * Version 5.29 fixed non-IDE check for too many physical heads - * don't use LBA if capacity is smaller than CHS - * Version 5.30 remove real_devices kludge, formerly used by genhd.c - * Version 5.32 change "KB" to "kB" - * fix serialize (was broken in kernel 1.3.72) - * add support for "hdparm -I" - * use common code for disk/tape/cdrom IDE_DRIVE_CMDs - * add support for Promise DC4030VL caching card - * improved serialize support - * put partition check back into alphabetical order - * add config option for PCMCIA baggage - * try to make PCMCIA support safer to use - * improve security on ioctls(): all are suser() only - * Version 5.33 improve handling of HDIO_DRIVE_CMDs that read data - * Version 5.34 fix irq-sharing problem from 5.33 - * fix cdrom ioctl problem from 5.33 - * Version 5.35 cosmetic changes - * fix cli() problem in try_to_identify() - * Version 5.36 fixes to optional PCMCIA support - * Version 5.37 don't use DMA when "noautotune" is specified - * Version 5.37a (go) fix shared irq probing (was broken in kernel 1.3.72) - * call unplug_device() from ide_do_drive_cmd() - * Version 5.38 add "hdx=none" option, courtesy of Joel Maslak - * mask drive irq after use, if sharing with another hwif - * add code to help debug weird cmd640 problems - * Version 5.39 fix horrible error in earlier irq sharing "fix" - * Version 5.40 fix serialization -- was broken in 5.39 - * help sharing by masking device irq after probing - * Version 5.41 more fixes to irq sharing/serialize detection - * disable io_32bit by default on drive reset - * Version 5.42 simplify irq-masking after probe - * fix NULL pointer deref in save_match() - * Version 5.43 Ugh.. 
unexpected_intr is back: try to exterminate it - * Version 5.44 Fix for "irq probe failed" on cmd640 - * change path on message regarding MAKEDEV.ide - * add a throttle to the unexpected_intr() messages - * Version 5.45 fix ugly parameter parsing bugs (thanks Derek) - * include Gadi's magic fix for cmd640 unexpected_intr - * include mc68000 patches from Geert Uytterhoeven - * add Gadi's fix for PCMCIA cdroms - * Version 5.46 remove the mc68000 #ifdefs for 2.0.x - * Version 5.47 fix set_tune race condition - * fix bug in earlier PCMCIA cdrom update - * Version 5.48 if def'd, invoke CMD640_DUMP_REGS when irq probe fails - * lengthen the do_reset1() pulse, for laptops - * add idebus=xx parameter for cmd640 and ali chipsets - * no_unmask flag now per-drive instead of per-hwif - * fix tune_req so that it gets done immediately - * fix missing restore_flags() in ide_ioctl - * prevent use of io_32bit on cmd640 with no prefetch - * Version 5.49 fix minor quirks in probing routines * Version 5.50 allow values as small as 20 for idebus= * Version 5.51 force non io_32bit in drive_cmd_intr() * change delay_10ms() to delay_50ms() to fix problems @@ -281,6 +90,11 @@ * Version 6.02 fix ide_ack_intr() call * check partition table on floppies * Version 6.03 handle bad status bit sequencing in ide_wait_stat() + * Version 6.10 deleted old entries from this list of updates + * replaced triton.c with ide-dma.c generic PCI DMA + * added support for BIOS-enabled UltraDMA + * rename all "promise" things to "pdc4030" + * fix EZ-DRIVE handling on small disks * * Some additional driver compile-time options are in ide.h * @@ -867,14 +681,14 @@ byte ide_dump_status (ide_drive_t *drive, const char *msg, byte stat) #if FANCY_STATUS_DUMPS if (drive->media == ide_disk) { printk(" { "); - if (err & BBD_ERR) printk("BadSector "); + if (err & ABRT_ERR) printk("DriveStatusError "); + if (err & ICRC_ERR) printk((err & ABRT_ERR) ? 
"BadCRC " : "BadSector "); if (err & ECC_ERR) printk("UncorrectableError "); if (err & ID_ERR) printk("SectorIdNotFound "); - if (err & ABRT_ERR) printk("DriveStatusError "); if (err & TRK0_ERR) printk("TrackZeroNotFound "); if (err & MARK_ERR) printk("AddrMarkNotFound "); printk("}"); - if (err & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) { + if ((err & (BBD_ERR | ABRT_ERR)) == BBD_ERR || (err & (ECC_ERR|ID_ERR|MARK_ERR))) { byte cur = IN_BYTE(IDE_SELECT_REG); if (cur & 0x40) { /* using LBA? */ printk(", LBAsect=%ld", (unsigned long) @@ -922,7 +736,7 @@ static void try_to_flush_leftover_data (ide_drive_t *drive) } /* - * ide_error() takes action based on the error returned by the controller. + * ide_error() takes action based on the error returned by the drive. */ void ide_error (ide_drive_t *drive, const char *msg, byte stat) { @@ -943,7 +757,12 @@ void ide_error (ide_drive_t *drive, const char *msg, byte stat) } else { if (drive->media == ide_disk && (stat & ERR_STAT)) { /* err has different meaning on cdrom and tape */ - if (err & (BBD_ERR | ECC_ERR)) /* retries won't help these */ + if (err == ABRT_ERR) { + if (drive->select.b.lba && IN_BYTE(IDE_COMMAND_REG) == WIN_SPECIFY) + return; /* some newer drives don't support WIN_SPECIFY */ + } else if ((err & (ABRT_ERR | ICRC_ERR)) == (ABRT_ERR | ICRC_ERR)) + ; /* UDMA crc error -- just retry the operation */ + else if (err & (BBD_ERR | ECC_ERR)) /* retries won't help these */ rq->errors = ERROR_MAX; else if (err & TRK0_ERR) /* help it find track zero */ rq->errors |= ERROR_RECAL; @@ -1545,7 +1364,7 @@ int ide_do_drive_cmd (ide_drive_t *drive, struct request *rq, ide_action_t actio struct request *cur_rq; struct semaphore sem = MUTEX_LOCKED; - if (IS_PROMISE_DRIVE && rq->buffer != NULL) + if (IS_PDC4030_DRIVE && rq->buffer != NULL) return -ENOSYS; /* special drive cmds not supported */ rq->errors = 0; rq->rq_status = RQ_ACTIVE; @@ -1931,8 +1750,13 @@ static int ide_ioctl (struct inode *inode, struct file *file, return 
-EINVAL; if (drive->id == NULL) return -ENOMSG; +#if 0 if (copy_to_user((char *)arg, (char *)drive->id, sizeof(*drive->id))) return -EFAULT; +#else + if (copy_to_user((char *)arg, (char *)drive->id, 142)) + return -EFAULT; +#endif return 0; case HDIO_GET_NOWERR: @@ -2379,13 +2203,14 @@ __initfunc(void ide_setup (char *s)) case -12: /* "reset" */ hwif->reset = 1; goto done; -#ifdef CONFIG_BLK_DEV_PROMISE +#ifdef CONFIG_BLK_DEV_PDC4030 case -11: /* "dc4030" */ { - setup_dc4030(hwif); + extern void setup_pdc4030(ide_hwif_t *); + setup_pdc4030(hwif); goto done; } -#endif /* CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ #ifdef CONFIG_BLK_DEV_ALI14XX case -10: /* "ali14xx" */ { @@ -2510,6 +2335,9 @@ int ide_xlate_1024 (kdev_t i_rdev, int xparm, const char *msg) printk("%s ", msg); + if (xparm == -1 && drive->bios_cyl < 1024) + return 0; /* small disk: no translation needed */ + if (drive->id) { drive->cyl = drive->id->cyls; drive->head = drive->id->heads; @@ -2550,32 +2378,6 @@ int ide_xlate_1024 (kdev_t i_rdev, int xparm, const char *msg) return 1; } -#ifdef CONFIG_PCI -#if defined(CONFIG_BLK_DEV_RZ1000) || defined(CONFIG_BLK_DEV_TRITON) || defined(CONFIG_BLK_DEV_OPTI621) - -typedef void (ide_pci_init_proc_t)(byte, byte); - -/* - * ide_probe_pci() scans PCI for a specific vendor/device function, - * and invokes the supplied init routine for each instance detected. 
- */ -__initfunc(static void ide_probe_pci (unsigned short vendor, unsigned short device, ide_pci_init_proc_t *init, int func_adj)) -{ - unsigned long flags; - unsigned index; - byte fn, bus; - - save_flags(flags); - cli(); - for (index = 0; !pcibios_find_device (vendor, device, index, &bus, &fn); ++index) { - init (bus, fn + func_adj); - } - restore_flags(flags); -} - -#endif /* defined(CONFIG_BLK_DEV_RZ1000) || defined(CONFIG_BLK_DEV_TRITON) || defined(CONFIG_BLK_DEV_OPTI621) */ -#endif /* CONFIG_PCI */ - /* * probe_for_hwifs() finds/initializes "known" IDE interfaces * @@ -2588,50 +2390,43 @@ __initfunc(static void probe_for_hwifs (void)) /* * Find/initialize PCI IDE interfaces */ - if (pcibios_present()) { + if (pcibios_present()) + { +#ifdef CONFIG_BLK_DEV_IDEDMA + { + extern void ide_scan_pcibus(void); + ide_scan_pcibus(); + } +#endif #ifdef CONFIG_BLK_DEV_RZ1000 - ide_pci_init_proc_t init_rz1000; - ide_probe_pci (PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1000, &init_rz1000, 0); - ide_probe_pci (PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1001, &init_rz1000, 0); -#endif /* CONFIG_BLK_DEV_RZ1000 */ -#ifdef CONFIG_BLK_DEV_TRITON - /* - * Apparently the BIOS32 services on Intel motherboards are - * buggy and won't find the PCI_DEVICE_ID_INTEL_82371_1 for us. - * So instead, we search for PCI_DEVICE_ID_INTEL_82371_0, - * and then add 1. 
- */ - ide_probe_pci (PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371_0, &ide_init_triton, 1); - ide_probe_pci (PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371SB_1, &ide_init_triton, 0); - ide_probe_pci (PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB, &ide_init_triton, 0); - ide_probe_pci (PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_1, &ide_init_triton, 0); -#endif /* CONFIG_BLK_DEV_TRITON */ -#ifdef CONFIG_BLK_DEV_OPTI621 - ide_probe_pci (PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C621, &ide_init_opti621, 0); -#endif /* CONFIG_BLK_DEV_OPTI621 */ + { + extern void ide_probe_for_rz100x(void); + ide_probe_for_rz100x(); + } +#endif } #endif /* CONFIG_PCI */ #ifdef CONFIG_BLK_DEV_CMD640 { - extern void ide_probe_for_cmd640x (void); + extern void ide_probe_for_cmd640x(void); ide_probe_for_cmd640x(); } #endif -#ifdef CONFIG_BLK_DEV_PROMISE - init_dc4030(); +#ifdef CONFIG_BLK_DEV_PDC4030 + { + extern int init_pdc4030(void); + (void) init_pdc4030(); + } #endif } __initfunc(void ide_init_builtin_drivers (void)) { /* - * Probe for special "known" interface chipsets + * Probe for special PCI and other "known" interface chipsets */ probe_for_hwifs (); - /* - * Probe for devices - */ #ifdef CONFIG_BLK_DEV_IDE #ifdef __mc68000__ if (ide_hwifs[0].io_ports[IDE_DATA_OFFSET]) { diff --git a/drivers/block/ide.h b/drivers/block/ide.h index 5c69eb353774..f40bc7e7f226 100644 --- a/drivers/block/ide.h +++ b/drivers/block/ide.h @@ -3,7 +3,7 @@ /* * linux/drivers/block/ide.h * - * Copyright (C) 1994-1996 Linus Torvalds & authors + * Copyright (C) 1994-1998 Linus Torvalds & authors */ #include @@ -164,7 +164,7 @@ typedef unsigned char byte; /* used everywhere */ #define WAIT_CMD (10*HZ) /* 10sec - maximum wait for an IRQ to happen */ #define WAIT_MIN_SLEEP (2*HZ/100) /* 20msec - minimum sleep time */ -#if defined(CONFIG_BLK_DEV_HT6560B) || defined(CONFIG_BLK_DEV_PROMISE) +#if defined(CONFIG_BLK_DEV_HT6560B) || defined(CONFIG_BLK_DEV_PDC4030) #define SELECT_DRIVE(hwif,drive) \ { \ if 
(hwif->selectproc) \ @@ -174,7 +174,7 @@ typedef unsigned char byte; /* used everywhere */ } #else #define SELECT_DRIVE(hwif,drive) OUT_BYTE((drive)->select.all, hwif->io_ports[IDE_SELECT_OFFSET]); -#endif /* CONFIG_BLK_DEV_HT6560B || CONFIG_BLK_DEV_PROMISE */ +#endif /* CONFIG_BLK_DEV_HT6560B || CONFIG_BLK_DEV_PDC4030 */ /* * Now for the data we need to maintain per-drive: ide_drive_t @@ -244,6 +244,7 @@ typedef struct ide_drive_s { byte bios_sect; /* BIOS/fdisk/LILO sectors per track */ unsigned short bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned short cyl; /* "real" number of cyls */ + unsigned int timing_data; /* for use by tuneproc()'s */ void *hwif; /* actually (ide_hwif_t *) */ struct wait_queue *wqueue; /* used to wait for drive in open() */ struct hd_driveid *id; /* drive model identification info */ @@ -268,7 +269,7 @@ typedef enum { ide_dma_read = 0, ide_dma_write = 1, ide_dma_abort = 2, ide_dma_check = 3, ide_dma_status_bad = 4, ide_dma_transferred = 5, ide_dma_begin = 6, ide_dma_on = 7, - ide_dma_off = 8 } + ide_dma_off = 8, ide_dma_off_quietly = 9 } ide_dma_action_t; typedef int (ide_dmaproc_t)(ide_dma_action_t, ide_drive_t *); @@ -288,7 +289,7 @@ typedef int (ide_dmaproc_t)(ide_dma_action_t, ide_drive_t *); typedef void (ide_tuneproc_t)(ide_drive_t *, byte); /* - * This is used to provide HT6560B & PROMISE interface support. + * This is used to provide HT6560B & PDC4030 interface support. */ typedef void (ide_selectproc_t) (ide_drive_t *); @@ -296,10 +297,10 @@ typedef void (ide_selectproc_t) (ide_drive_t *); * hwif_chipset_t is used to keep track of the specific hardware * chipset used by each IDE interface, if known. 
*/ -typedef enum { ide_unknown, ide_generic, ide_triton, +typedef enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd6580, ide_umc8672, ide_ht6560b, - ide_promise, ide_via } + ide_pdc4030, ide_rz1000 } hwif_chipset_t; typedef struct hwif_s { @@ -309,7 +310,7 @@ typedef struct hwif_s { ide_drive_t drives[MAX_DRIVES]; /* drive info */ struct gendisk *gd; /* gendisk structure */ ide_tuneproc_t *tuneproc; /* routine to tune PIO mode for drives */ -#if defined(CONFIG_BLK_DEV_HT6560B) || defined(CONFIG_BLK_DEV_PROMISE) +#if defined(CONFIG_BLK_DEV_HT6560B) || defined(CONFIG_BLK_DEV_PDC4030) ide_selectproc_t *selectproc; /* tweaks hardware to select drive */ #endif ide_dmaproc_t *dmaproc; /* dma read/write/abort routine */ @@ -324,9 +325,9 @@ typedef struct hwif_s { unsigned present : 1; /* this interface exists */ unsigned serialized : 1; /* serialized operation with mate hwif */ unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ -#ifdef CONFIG_BLK_DEV_PROMISE - unsigned is_promise2: 1; /* 2nd i/f on promise DC4030 */ -#endif /* CONFIG_BLK_DEV_PROMISE */ +#ifdef CONFIG_BLK_DEV_PDC4030 + unsigned is_pdc4030_2: 1;/* 2nd i/f on pdc4030 */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ unsigned reset : 1; /* reset after probe */ #if (DISK_RECOVERY_TIME > 0) unsigned long last_time; /* time when previous rq was done */ @@ -609,23 +610,15 @@ ide_drive_t *ide_scan_devices (byte media, ide_driver_t *driver, int n); int ide_register_subdriver (ide_drive_t *drive, ide_driver_t *driver, int version); int ide_unregister_subdriver (ide_drive_t *drive); -#ifdef CONFIG_BLK_DEV_TRITON -void ide_init_triton (byte, byte); -#endif /* CONFIG_BLK_DEV_TRITON */ - -#ifdef CONFIG_BLK_DEV_OPTI621 -void ide_init_opti621 (byte, byte); -#endif /* CONFIG_BLK_DEV_OPTI621 */ - #ifdef CONFIG_BLK_DEV_IDE int ideprobe_init (void); #endif /* CONFIG_BLK_DEV_IDE */ -#ifdef CONFIG_BLK_DEV_PROMISE -#include "promise.h" -#define IS_PROMISE_DRIVE (HWIF(drive)->chipset == 
ide_promise) +#ifdef CONFIG_BLK_DEV_PDC4030 +#include "pdc4030.h" +#define IS_PDC4030_DRIVE (HWIF(drive)->chipset == ide_pdc4030) #else -#define IS_PROMISE_DRIVE (0) /* auto-NULLs out Promise code */ -#endif /* CONFIG_BLK_DEV_PROMISE */ +#define IS_PDC4030_DRIVE (0) /* auto-NULLs out pdc4030 code */ +#endif /* CONFIG_BLK_DEV_PDC4030 */ #endif /* _IDE_H */ diff --git a/drivers/block/opti621.c b/drivers/block/opti621.c index 2e7bd733cf41..87997e05fb96 100644 --- a/drivers/block/opti621.c +++ b/drivers/block/opti621.c @@ -1,7 +1,7 @@ /* - * linux/drivers/block/opti621.c Version 0.1 Oct 26, 1996 + * linux/drivers/block/opti621.c Version 0.3 Nov 29, 1997 * - * Copyright (C) 1996 Linus Torvalds & author (see below) + * Copyright (C) 1996-1998 Linus Torvalds & author (see below) */ /* @@ -33,8 +33,8 @@ * PIO 3 and slave PIO 0, driver have to set some timings of * master for PIO 0. Second problem is that opti621_tune_drive * got only one drive to set, but have to set both drives. - * This is solved in opti621_compute_pios. If you don't set - * the second drive, opti621_compute_pios use ide_get_best_pio_mode + * This is solved in compute_pios. If you don't set + * the second drive, compute_pios use ide_get_best_pio_mode * for autoselect mode (you can change it to PIO 0, if you want). * If you then set the second drive to another PIO, the old value * (automatically selected) will be overrided by yours. @@ -48,7 +48,7 @@ * settings of jumpers on the card and I have to boot Linux with * Loadlin except LILO, cause I have to run the setupvic.exe program * already or I get disk errors (my test: rpm -Vf - * /usr/X11R6/bin/XF86_SVGA - or any big file). + * /usr/X11R6/bin/XF86_SVGA - or any big file). 
* Some numbers from hdparm -t /dev/hda: * Timing buffer-cache reads: 32 MB in 3.02 seconds =10.60 MB/sec * Timing buffered disk reads: 16 MB in 5.52 seconds = 2.90 MB/sec @@ -84,7 +84,7 @@ * address: 25 ns, data: 25 ns, recovery: 50 ns; * on 20MHz PCI bus (pulse 50 ns): * address: 50 ns, data: 50 ns, recovery: 100 ns. - */ + */ /* #define READ_PREFETCH 0 */ /* Uncommnent for disable read prefetch. @@ -103,58 +103,35 @@ #define MISC_REG 6 /* index of Miscellaneous register */ #define CNTRL_REG 3 /* index of Control register */ int reg_base; -int opti621_primary_base, opti621_secondary_base; #define PIO_NOT_EXIST 254 #define PIO_DONT_KNOW 255 -int opti621_drive_pio_modes[4]; + /* there are stored pio numbers from other calls of opti621_tune_drive */ -void opti621_compute_pios(ide_hwif_t *drv, int second_contr, int slave_drive, byte pio) -/* Store values into opti621_drive_pio_modes: +static void compute_pios(ide_drive_t *drive, byte pio) +/* Store values into drive->timing_data * second_contr - 0 for primary controller, 1 for secondary * slave_drive - 0 -> pio is for master, 1 -> pio is for slave - * pio - PIO mode for selected drive (for other we don't know) - */ + * pio - PIO mode for selected drive (for other we don't know) + */ { - ide_drive_t *p1, *p2, *drive; - int i; - - i = 2*second_contr; - p1 = &drv->drives[0]; - p2 = &drv->drives[1]; - drive = &drv->drives[slave_drive]; - pio = ide_get_best_pio_mode(drive, pio, OPTI621_MAX_PIO, NULL); - opti621_drive_pio_modes[i+slave_drive]=pio; + int d; + ide_hwif_t *hwif = HWIF(drive); - if (p1->present) { - if (opti621_drive_pio_modes[i]==PIO_DONT_KNOW) - opti621_drive_pio_modes[i]=ide_get_best_pio_mode(p1, - 255, OPTI621_MAX_PIO, NULL); - /* we don't know the selected PIO mode, so we have to autoselect */ - } else - opti621_drive_pio_modes[i]=PIO_NOT_EXIST; - if (p2->present) { - if (opti621_drive_pio_modes[i+1]==PIO_DONT_KNOW) - opti621_drive_pio_modes[i+1]=ide_get_best_pio_mode(p2, - 255, OPTI621_MAX_PIO, NULL); - 
/* we don't know the selected PIO mode, so we have to autoselect */ - } else - opti621_drive_pio_modes[i+1]=PIO_NOT_EXIST; - /* in opti621_drive_pio_modes[i] and [i+1] are valid PIO modes (or PIO_NOT_EXIST, - if drive is not connected), we can continue */ + drive->timing_data = ide_get_best_pio_mode(drive, pio, OPTI621_MAX_PIO, NULL); + for (d = 0; d < 2; ++d) { + drive = &hwif->drives[d]; + if (drive->present) { + if (drive->timing_data == PIO_DONT_KNOW) + drive->timing_data = ide_get_best_pio_mode(drive, 255, OPTI621_MAX_PIO, NULL); #ifdef OPTI621_DEBUG - printk("%s: (master): ", p1->name); - if (p1->present) - printk("PIO mode %d\n", opti621_drive_pio_modes[i]); - else - printk("not present\n"); - printk("%s: (slave): ", p2->name); - if (p2->present) - printk("PIO mode %d\n", opti621_drive_pio_modes[i+1]); - else - printk("not present\n"); + printk("%s: Selected PIO mode %d\n", drive->name, drive->timing_data); #endif + } else { + drive->timing_data = PIO_NOT_EXIST; + } + } } int cmpt_clk(int time, int bus_speed) @@ -169,7 +146,7 @@ int cmpt_clk(int time, int bus_speed) return ((time*bus_speed+999)/1000); } -void write_reg(byte value, int reg) +static void write_reg(byte value, int reg) /* Write value to register reg, base of register * is at reg_base (0x1f0 primary, 0x170 secondary, * if not changed by PCI configuration). @@ -180,12 +157,12 @@ void write_reg(byte value, int reg) inw(reg_base+1); outb(3, reg_base+2); outb(value, reg_base+reg); - outb(0x83, reg_base+2); + outb(0x83, reg_base+2); } -byte read_reg(int reg) +static byte read_reg(int reg) /* Read value from register reg, base of register - * is at reg_base (0x1f0 primary, 0x170 secondary, + * is at reg_base (0x1f0 primary, 0x170 secondary, * if not changed by PCI configuration). * This is from setupvic.exe program. 
*/ @@ -195,7 +172,7 @@ byte read_reg(int reg) inw(reg_base+1); outb(3, reg_base+2); ret=inb(reg_base+reg); - outb(0x83, reg_base+2); + outb(0x83, reg_base+2); return ret; } @@ -205,9 +182,9 @@ typedef struct pio_clocks_s { int recovery_time; /* Recovery time (clocks) */ } pio_clocks_t; -void compute_clocks(int pio, pio_clocks_t *clks) +static void compute_clocks(int pio, pio_clocks_t *clks) { - if (pio!=PIO_NOT_EXIST) { + if (pio != PIO_NOT_EXIST) { int adr_setup, data_pls, bus_speed; bus_speed = ide_system_bus_speed(); adr_setup = ide_pio_timings[pio].setup_time; @@ -230,108 +207,80 @@ void compute_clocks(int pio, pio_clocks_t *clks) } } -static void opti621_tune_drive (ide_drive_t *drive, byte pio) /* Main tune procedure, hooked by tuneproc. */ +static void opti621_tune_drive (ide_drive_t *drive, byte pio) { - /* primary and secondary drives share some (but not same) registers, - so we have to program both drives */ + /* primary and secondary drives share some registers, + * so we have to program both drives + */ unsigned long flags; byte pio1, pio2; - int second_contr, slave_drive; pio_clocks_t first, second; int ax, drdy; byte cycle1, cycle2, misc; - - second_contr=HWIF(drive)->index; - if ((second_contr!=0) && (second_contr!=1)) - return; /* invalid controller number */ - if (((second_contr==0) && (opti621_primary_base==0)) || - ((second_contr==1) && (opti621_secondary_base==0))) - return; /* controller is unaccessible/not exist */ - slave_drive = drive->select.b.unit; - /* set opti621_drive_pio_modes[] */ - opti621_compute_pios(HWIF(drive), second_contr, slave_drive, pio); - - reg_base = second_contr ? 
opti621_primary_base : opti621_secondary_base; + ide_hwif_t *hwif = HWIF(drive); + + /* set drive->timing_data for both drives */ + compute_pios(drive, pio); + pio1 = hwif->drives[0].timing_data; + pio2 = hwif->drives[1].timing_data; - pio1 = opti621_drive_pio_modes[second_contr*2]; - pio2 = opti621_drive_pio_modes[second_contr*2+1]; - compute_clocks(pio1, &first); compute_clocks(pio2, &second); - - ax = (first.address_timename, ax, first.data_time, first.recovery_time, drdy); + hwif->name, ax, first.data_time, first.recovery_time, drdy); printk("%s: slave: address: %d, data: %d, recovery: %d, drdy: %d [clk]\n", - HWIF(drive)->name, ax, second.data_time, second.recovery_time, drdy); + hwif->name, ax, second.data_time, second.recovery_time, drdy); #endif save_flags(flags); cli(); - + + reg_base = hwif->io_ports[IDE_DATA_OFFSET]; outb(0xc0, reg_base+CNTRL_REG); /* allow Register-B */ outb(0xff, reg_base+5); /* hmm, setupvic.exe does this ;-) */ inb(reg_base+CNTRL_REG); /* if reads 0xff, adapter not exist? */ read_reg(CNTRL_REG); /* if reads 0xc0, no interface exist? 
*/ read_reg(5); /* read version, probably 0 */ - - /* programming primary drive - 0 or 2 */ - write_reg(0, MISC_REG); /* select Index-0 for Register-A */ + + /* program primary drive */ + write_reg(0, MISC_REG); /* select Index-0 for Register-A */ write_reg(cycle1, READ_REG); /* set read cycle timings */ write_reg(cycle1, WRITE_REG); /* set write cycle timings */ - /* programming secondary drive - 1 or 3 */ - write_reg(1, MISC_REG); /* select Index-1 for Register-B */ - write_reg(cycle2, READ_REG); /* set read cycle timings */ - write_reg(cycle2, WRITE_REG); /* set write cycle timings */ - - write_reg(0x85, CNTRL_REG); /* use Register-A for drive 0 (or 2) and - Register-B for drive 1 (or 3) */ - - write_reg(misc, MISC_REG); /* set address setup, DRDY timings - and read prefetch for both drives */ - + /* program secondary drive */ + write_reg(1, MISC_REG); /* select Index-1 for Register-B */ + write_reg(cycle2, READ_REG); /* set read cycle timings */ + write_reg(cycle2, WRITE_REG); /* set write cycle timings */ + + write_reg(0x85, CNTRL_REG); /* use Register-A for drive 0 */ + /* use Register-B for drive 1 */ + + write_reg(misc, MISC_REG); /* set address setup, DRDY timings, */ + /* and read prefetch for both drives */ + restore_flags(flags); } -void ide_init_opti621 (byte bus, byte fn) -/* Init controller. Called on kernel boot. */ +/* + * ide_init_opti621() is Called from idedma.c once for each hwif found at boot. + */ +void ide_init_opti621 (byte bus, byte fn, ide_hwif_t *hwifs) { - int rc, i; - unsigned char sreg; - unsigned short reg; - unsigned int dreg; - unsigned char revision; - for (i=0; i<4; i++) - opti621_drive_pio_modes[i] = PIO_DONT_KNOW; - printk("ide: OPTi 82C621 on PCI bus %d function %d\n", bus, fn); - if ((rc = pcibios_read_config_byte (bus, fn, 0x08, &sreg))) - goto quit; - revision = sreg; - if ((rc = pcibios_read_config_dword (bus, fn, 0x10, &dreg))) - goto quit; - opti621_primary_base = ((dreg==0) || (dreg>0xffff)) ? 
0 : dreg-1; - if ((rc = pcibios_read_config_dword (bus, fn, 0x18, &dreg))) - goto quit; - opti621_secondary_base = ((dreg==0) || (dreg>0xffff)) ? 0 : dreg-1; - printk("ide: revision %d, primary: 0x%04x, secondary: 0x%04x\n", - revision, opti621_primary_base, opti621_secondary_base); - if ((rc = pcibios_read_config_word (bus, fn, PCI_COMMAND, ®))) - goto quit; - if (!(reg & 1)) { - printk("ide: ports are not enabled (BIOS)\n"); - } else { - ide_hwifs[0].tuneproc = &opti621_tune_drive; - ide_hwifs[1].tuneproc = &opti621_tune_drive; - } - quit: if (rc) printk("ide: pcibios access failed - %s\n", pcibios_strerror(rc)); + if (hwif->io_ports[IDE_DATA_OFFSET]) { + hwif->drives[0].timing_data = PIO_DONT_KNOW; + hwif->drives[1].timing_data = PIO_DONT_KNOW; + hwif->tuneproc = &opti621_tune_drive; + } } diff --git a/drivers/block/promise.c b/drivers/block/pdc4030.c similarity index 92% rename from drivers/block/promise.c rename to drivers/block/pdc4030.c index f3a8d5529394..55bef9ca6d8f 100644 --- a/drivers/block/promise.c +++ b/drivers/block/pdc4030.c @@ -1,7 +1,7 @@ /* -*- linux-c -*- - * linux/drivers/block/promise.c Version 0.07 Mar 26, 1996 + * linux/drivers/block/pdc4030.c Version 0.08 Nov 30, 1997 * - * Copyright (C) 1995-1996 Linus Torvalds & authors (see below) + * Copyright (C) 1995-1998 Linus Torvalds & authors (see below) */ /* @@ -28,6 +28,7 @@ * Version 0.06 Ooops. Add hwgroup to direct call of ide_intr() -ml * Version 0.07 Added support for DC4030 variants * Secondary interface autodetection + * Version 0.08 Renamed to pdc4030.c */ /* @@ -56,7 +57,7 @@ #include #include #include "ide.h" -#include "promise.h" +#include "pdc4030.h" /* This is needed as the controller may not interrupt if the required data is available in the cache. We have to simulate an interrupt. Ugh! 
*/ @@ -73,15 +74,15 @@ static void promise_selectproc (ide_drive_t *drive) OUT_BYTE(drive->select.all,IDE_SELECT_REG); udelay(1); /* paranoia */ - number = ((HWIF(drive)->is_promise2)<<1) + drive->select.b.unit; + number = ((HWIF(drive)->is_pdc4030_2)<<1) + drive->select.b.unit; OUT_BYTE(number,IDE_FEATURE_REG); } /* - * promise_cmd handles the set of vendor specific commands that are initiated + * pdc4030_cmd handles the set of vendor specific commands that are initiated * by command F0. They all have the same success/failure notification. */ -int promise_cmd(ide_drive_t *drive, byte cmd) +int pdc4030_cmd(ide_drive_t *drive, byte cmd) { unsigned long timeout, timer; byte status_val; @@ -111,17 +112,17 @@ int promise_cmd(ide_drive_t *drive, byte cmd) ide_hwif_t *hwif_required = NULL; -void setup_dc4030 (ide_hwif_t *hwif) +void setup_pdc4030 (ide_hwif_t *hwif) { hwif_required = hwif; } /* -init_dc4030: Test for presence of a Promise caching controller card. +init_pdc4030: Test for presence of a Promise caching controller card. Returns: 0 if no Promise card present at this io_base 1 if Promise card found */ -int init_dc4030 (void) +int init_pdc4030 (void) { ide_hwif_t *hwif = hwif_required; ide_drive_t *drive; @@ -133,7 +134,7 @@ int init_dc4030 (void) drive = &hwif->drives[0]; second_hwif = &ide_hwifs[hwif->index+1]; - if(hwif->is_promise2) /* we've already been found ! */ + if(hwif->is_pdc4030_2) /* we've already been found ! 
*/ return 1; if(IN_BYTE(IDE_NSECTOR_REG) == 0xFF || IN_BYTE(IDE_SECTOR_REG) == 0xFF) @@ -141,7 +142,7 @@ int init_dc4030 (void) return 0; } OUT_BYTE(0x08,IDE_CONTROL_REG); - if(promise_cmd(drive,PROMISE_GET_CONFIG)) { + if(pdc4030_cmd(drive,PROMISE_GET_CONFIG)) { return 0; } if(ide_wait_stat(drive,DATA_READY,BAD_W_STAT,WAIT_DRQ)) { @@ -168,7 +169,7 @@ int init_dc4030 (void) default: hwif->irq = 15; break; } printk("on IRQ %d\n",hwif->irq); - hwif->chipset = second_hwif->chipset = ide_promise; + hwif->chipset = second_hwif->chipset = ide_pdc4030; hwif->selectproc = second_hwif->selectproc = &promise_selectproc; /* Shift the remaining interfaces down by one */ for (i=MAX_HWIFS-1 ; i > hwif->index+1 ; i--) { @@ -179,7 +180,7 @@ int init_dc4030 (void) h->io_ports[IDE_CONTROL_OFFSET] = (h-1)->io_ports[IDE_CONTROL_OFFSET]; h->noprobe = (h-1)->noprobe; } - second_hwif->is_promise2 = 1; + second_hwif->is_pdc4030_2 = 1; ide_init_hwif_ports(second_hwif->io_ports, hwif->io_ports[IDE_DATA_OFFSET], NULL); second_hwif->io_ports[IDE_CONTROL_OFFSET] = hwif->io_ports[IDE_CONTROL_OFFSET]; second_hwif->irq = hwif->irq; @@ -304,11 +305,11 @@ static void promise_write (ide_drive_t *drive) } /* - * do_promise_io() is called from do_rw_disk, having had the block number + * do_pdc4030_io() is called from do_rw_disk, having had the block number * already set up. It issues a READ or WRITE command to the Promise * controller, assuming LBA has been used to set up the block number. 
*/ -void do_promise_io (ide_drive_t *drive, struct request *rq) +void do_pdc4030_io (ide_drive_t *drive, struct request *rq) { unsigned long timeout; byte stat; diff --git a/drivers/block/promise.h b/drivers/block/pdc4030.h similarity index 71% rename from drivers/block/promise.h rename to drivers/block/pdc4030.h index e82541d2afd6..9f08da5aae22 100644 --- a/drivers/block/promise.h +++ b/drivers/block/pdc4030.h @@ -1,7 +1,7 @@ /* - * linux/drivers/block/promise.h + * linux/drivers/block/pdc4030.h * - * Copyright (C) 1995-6 Linus Torvalds & authors + * Copyright (C) 1995-1998 Linus Torvalds & authors */ /* @@ -41,12 +41,4 @@ struct dc_ident { u8 pad[SECTOR_WORDS*4 - 32]; }; -/* - * Routines exported to ide.c: - */ -void do_promise_io (ide_drive_t *, struct request *); -int promise_cmd(ide_drive_t *, byte); -void setup_dc4030 (ide_hwif_t *); -int init_dc4030 (void); - #endif IDE_PROMISE_H diff --git a/drivers/block/rz1000.c b/drivers/block/rz1000.c index 41b26f2777e7..b7270c559ff0 100644 --- a/drivers/block/rz1000.c +++ b/drivers/block/rz1000.c @@ -26,34 +26,42 @@ #include #include "ide.h" -static void ide_pci_access_error (int rc) +static void init_rz1000 (byte bus, byte fn, const char *name) { - printk("ide: pcibios access failed - %s\n", pcibios_strerror(rc)); + unsigned short reg, h; + + printk("%s: buggy IDE controller: ", name); + if (!pcibios_read_config_word (bus, fn, PCI_COMMAND, ®) && !(reg & 1)) { + printk("disabled (BIOS)\n"); + return; + } + if (!pcibios_read_config_word (bus, fn, 0x40, ®) + && !pcibios_write_config_word(bus, fn, 0x40, reg & 0xdfff)) + { + printk("disabled read-ahead\n"); + } else { + printk("\n"); + for (h = 0; h < MAX_HWIFS; ++h) { + ide_hwif_t *hwif = &ide_hwifs[h]; + if ((hwif->io_ports[IDE_DATA_OFFSET] == 0x1f0 || hwif->io_ports[IDE_DATA_OFFSET] == 0x170) + && (hwif->chipset == ide_unknown || hwif->chipset == ide_generic)) + { + hwif->chipset = ide_rz1000; + hwif->serialized = 1; + hwif->drives[0].no_unmask = 1; + 
hwif->drives[1].no_unmask = 1; + printk(" %s: serialized, disabled unmasking\n", hwif->name); + } + } + } } -void init_rz1000 (byte bus, byte fn) +void ide_probe_for_rz100x (void) { - int rc; - unsigned short reg; + byte index, bus, fn; - printk("ide0: buggy RZ1000 interface: "); - if ((rc = pcibios_read_config_word (bus, fn, PCI_COMMAND, ®))) { - ide_pci_access_error (rc); - } else if (!(reg & 1)) { - printk("not enabled\n"); - } else { - if ((rc = pcibios_read_config_word(bus, fn, 0x40, ®)) - || (rc = pcibios_write_config_word(bus, fn, 0x40, reg & 0xdfff))) - { - ide_hwifs[0].drives[0].no_unmask = 1; - ide_hwifs[0].drives[1].no_unmask = 1; - ide_hwifs[1].drives[0].no_unmask = 1; - ide_hwifs[1].drives[1].no_unmask = 1; - ide_hwifs[0].serialized = 1; - ide_hwifs[1].serialized = 1; - ide_pci_access_error (rc); - printk("serialized, disabled unmasking\n"); - } else - printk("disabled read-ahead\n"); - } + for (index = 0; !pcibios_find_device (PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1000, index, &bus, &fn); ++index) + init_rz1000 (bus, fn, "RZ1000"); + for (index = 0; !pcibios_find_device (PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1001, index, &bus, &fn); ++index) + init_rz1000 (bus, fn, "RZ1001"); } diff --git a/drivers/block/triton.c b/drivers/block/triton.c deleted file mode 100644 index 006051af656b..000000000000 --- a/drivers/block/triton.c +++ /dev/null @@ -1,631 +0,0 @@ -/* - * linux/drivers/block/triton.c Version 2.10 April 22, 1997 - * - * Copyright (c) 1995-1997 Mark Lord - * May be copied or modified under the terms of the GNU General Public License - */ - -/* - * This module provides support for the bus-master IDE DMA function - * of the Intel PCI Triton chipset families, which use the PIIX (i82371FB, - * for the 430 FX chipset), the PIIX3 (i82371SB for the 430 HX/VX and - * 440 chipsets), and the PIIX4 (i82371AB for the 430 TX chipset). - * - * "PIIX" stands for "PCI ISA IDE Xcellerator". 
- * - * Pretty much the same code could work for other IDE PCI bus-mastering chipsets. - * Look for DMA support for this someday in the not too distant future. - * - * DMA is supported for all IDE devices (disk drives, cdroms, tapes, floppies). - * - * Up to four drives may be enabled for DMA, and the PIIX* chips - * will arbitrate the PCI bus among them. Note that the PIIX/PIIX3 - * provides a single "line buffer" for the BM IDE function, so performance of - * multiple (two) drives doing DMA simultaneously will suffer somewhat, - * as they contest for that resource bottleneck. This is handled transparently - * inside the PIIX/PIIX3. The PIIX4 does not have this problem. - * - * By default, DMA support is prepared for use, but is currently enabled only - * for drives which support DMA mode2 (multi/single word), or which are - * recognized as "good" (see table below). Drives with only mode0 or mode1 - * (multi/single word) DMA should also work with this chipset/driver (eg. MC2112A) - * but are not enabled by default. Use "hdparm -i" to view modes supported - * by a given drive. - * - * The hdparm-2.4 (or later) utility can be used for manually enabling/disabling - * DMA support, but must be (re-)compiled against this kernel version or later. - * - * To enable DMA, use "hdparm -d1 /dev/hd?" on a per-drive basis after booting. - * If problems arise, ide.c will disable DMA operation after a few retries. - * This error recovery mechanism works and has been extremely well exercised. - * - * IDE drives, depending on their vintage, may support several different modes - * of DMA operation. The boot-time modes are indicated with a "*" in - * the "hdparm -i" listing, and can be changed with *knowledgeable* use of - * the "hdparm -X" feature. There is seldom a need to do this, as drives - * normally power-up with their "best" PIO/DMA modes enabled. 
- * - * Testing has been done with a rather extensive number of drives, - * with Quantum & Western Digital models generally outperforming the pack, - * and Fujitsu & Conner (and some Seagate which are really Conner) drives - * showing more lackluster throughput. - * - * Keep an eye on /var/adm/messages for "DMA disabled" messages. - * - * Some people have reported trouble with Intel Zappa motherboards. - * This can be fixed by upgrading the AMI BIOS to version 1.00.04.BS0, - * available from ftp://ftp.intel.com/pub/bios/10004bs0.exe - * (thanks to Glen Morrell for researching this). - * - * Thanks to "Christopher J. Reimer" for fixing the - * problem with some (all?) ACER motherboards/BIOSs. - * - * Thanks to "Benoit Poulot-Cazajous" for testing - * "TX" chipset compatibility and for providing patches for the "TX" chipset. - * - * And, yes, Intel Zappa boards really *do* use both PIIX IDE ports. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "ide.h" -#include "ide_modes.h" - -#define DISPLAY_PIIX_TIMINGS /* define this to display timings */ - -/* - * good_dma_drives() lists the model names (from "hdparm -i") - * of drives which do not support mode2 DMA but which are - * known to work fine with this interface under Linux. - */ -const char *good_dma_drives[] = {"Micropolis 2112A", - "CONNER CTMA 4000", - NULL}; - -/* - * Our Physical Region Descriptor (PRD) table should be large enough - * to handle the biggest I/O request we are likely to see. Since requests - * can have no more than 256 sectors, and since the typical blocksize is - * two sectors, we could get by with a limit of 128 entries here for the - * usual worst case. Most requests seem to include some contiguous blocks, - * further reducing the number of table entries required. - * - * The driver reverts to PIO mode for individual requests that exceed - * this limit (possible with 512 byte blocksizes, eg. 
MSDOS f/s), so handling - * 100% of all crazy scenarios here is not necessary. - * - * As it turns out though, we must allocate a full 4KB page for this, - * so the two PRD tables (ide0 & ide1) will each get half of that, - * allowing each to have about 256 entries (8 bytes each) from this. - */ -#define PRD_BYTES 8 -#define PRD_ENTRIES (PAGE_SIZE / (2 * PRD_BYTES)) - -/* - * Interface to access piix registers - */ -static unsigned int piix_key; - -#define PIIX_FLAGS_FAST_PIO 1 -#define PIIX_FLAGS_USE_IORDY 2 -#define PIIX_FLAGS_PREFETCH 4 -#define PIIX_FLAGS_FAST_DMA 8 - - -union chip_en_reg_u { - struct { - unsigned d0_flags :4; - unsigned d1_flags :4; - unsigned recovery :2; - unsigned reserved :2; - unsigned sample :2; - unsigned sidetim_enabled:1; - unsigned ports_enabled :1; - } piix_s; - struct { - unsigned sec_en :1; - unsigned pri_en :1; - unsigned reserved :14; - } via_s; -}; - -typedef union chip_en_reg_u piix_timing_t; - -typedef struct { - unsigned pri_recovery :2; - unsigned pri_sample :2; - unsigned sec_recovery :2; - unsigned sec_sample :2; -} piix_sidetim_t; - - -/* - * We currently can handle only one PIIX chip here - */ -static piix_pci_bus = 0; -static piix_pci_fn = 0; - -static int config_drive_for_dma (ide_drive_t *); - -/* - * dma_intr() is the handler for disk read/write DMA interrupts - */ -static void dma_intr (ide_drive_t *drive) -{ - byte stat, dma_stat; - int i; - struct request *rq = HWGROUP(drive)->rq; - unsigned short dma_base = HWIF(drive)->dma_base; - - dma_stat = inb(dma_base+2); /* get DMA status */ - outb(inb(dma_base)&~1, dma_base); /* stop DMA operation */ - stat = GET_STAT(); /* get drive status */ - if (OK_STAT(stat,DRIVE_READY,drive->bad_wstat|DRQ_STAT)) { - if ((dma_stat & 7) == 4) { /* verify good DMA status */ - rq = HWGROUP(drive)->rq; - for (i = rq->nr_sectors; i > 0;) { - i -= rq->current_nr_sectors; - ide_end_request(1, HWGROUP(drive)); - } - return; - } - printk("%s: bad DMA status: 0x%02x\n", drive->name, 
dma_stat); - } - sti(); - ide_error(drive, "dma_intr", stat); -} - -/* - * build_dmatable() prepares a dma request. - * Returns 0 if all went okay, returns 1 otherwise. - */ -static int build_dmatable (ide_drive_t *drive) -{ - struct request *rq = HWGROUP(drive)->rq; - struct buffer_head *bh = rq->bh; - unsigned long size, addr, *table = HWIF(drive)->dmatable; - unsigned int count = 0; - - do { - /* - * Determine addr and size of next buffer area. We assume that - * individual virtual buffers are always composed linearly in - * physical memory. For example, we assume that any 8kB buffer - * is always composed of two adjacent physical 4kB pages rather - * than two possibly non-adjacent physical 4kB pages. - */ - if (bh == NULL) { /* paging requests have (rq->bh == NULL) */ - addr = virt_to_bus (rq->buffer); - size = rq->nr_sectors << 9; - } else { - /* group sequential buffers into one large buffer */ - addr = virt_to_bus (bh->b_data); - size = bh->b_size; - while ((bh = bh->b_reqnext) != NULL) { - if ((addr + size) != virt_to_bus (bh->b_data)) - break; - size += bh->b_size; - } - } - - /* - * Fill in the dma table, without crossing any 64kB boundaries. - * We assume 16-bit alignment of all blocks. - */ - while (size) { - if (++count >= PRD_ENTRIES) { - printk("%s: DMA table too small\n", drive->name); - return 1; /* revert to PIO for this request */ - } else { - unsigned long bcount = 0x10000 - (addr & 0xffff); - if (bcount > size) - bcount = size; - *table++ = addr; - *table++ = bcount & 0xffff; - addr += bcount; - size -= bcount; - } - } - } while (bh != NULL); - if (count) { - *--table |= 0x80000000; /* set End-Of-Table (EOT) bit */ - return 0; - } - printk("%s: empty DMA table?\n", drive->name); - return 1; /* let the PIO routines handle this weirdness */ -} - -/* - * piix_dmaproc() initiates/aborts DMA read/write operations on a drive. - * - * The caller is assumed to have selected the drive and programmed the drive's - * sector address using CHS or LBA. 
All that remains is to prepare for DMA - * and then issue the actual read/write DMA/PIO command to the drive. - * - * For ATAPI devices, we just prepare for DMA and return. The caller should - * then issue the packet command to the drive and call us again with - * ide_dma_begin afterwards. - * - * Returns 0 if all went well. - * Returns 1 if DMA read/write could not be started, in which case - * the caller should revert to PIO for the current request. - */ -static int piix_dmaproc (ide_dma_action_t func, ide_drive_t *drive) -{ - unsigned long dma_base = HWIF(drive)->dma_base; - unsigned int reading = (1 << 3); - piix_timing_t timing; - unsigned short reg; - byte dflags; - - switch (func) { - case ide_dma_off: - printk("%s: DMA disabled\n", drive->name); - case ide_dma_on: - drive->using_dma = (func == ide_dma_on); - reg = (HWIF(drive)->io_ports[IDE_DATA_OFFSET] == 0x170) ? 0x42 : 0x40; - if (pcibios_read_config_word(piix_pci_bus, piix_pci_fn, reg, (short *)&timing)) { - printk("%s: pcibios read failed\n", HWIF(drive)->name); - return 1; - } - dflags = drive->select.b.unit ? 
timing.piix_s.d1_flags : timing.piix_s.d0_flags; - if (dflags & PIIX_FLAGS_FAST_PIO) { - if (func == ide_dma_on && drive->media == ide_disk) - dflags |= PIIX_FLAGS_FAST_DMA; - else - dflags &= ~PIIX_FLAGS_FAST_DMA; - if (drive->select.b.unit == 0) - timing.piix_s.d0_flags = dflags; - else - timing.piix_s.d1_flags = dflags; - if (pcibios_write_config_word(piix_pci_bus, piix_pci_fn, reg, *(short *)&timing)) { - printk("%s: pcibios write failed\n", HWIF(drive)->name); - return 1; - } - } - return 0; - case ide_dma_abort: - outb(inb(dma_base)&~1, dma_base); /* stop DMA */ - return 0; - case ide_dma_check: - return config_drive_for_dma (drive); - case ide_dma_write: - reading = 0; - case ide_dma_read: - break; - case ide_dma_status_bad: - return ((inb(dma_base+2) & 7) != 4); /* verify good DMA status */ - case ide_dma_transferred: -#if 0 - return (number of bytes actually transferred); -#else - return (0); -#endif - case ide_dma_begin: - outb(inb(dma_base)|1, dma_base); /* begin DMA */ - return 0; - default: - printk("piix_dmaproc: unsupported func: %d\n", func); - return 1; - } - if (build_dmatable (drive)) - return 1; - outl(virt_to_bus (HWIF(drive)->dmatable), dma_base + 4); /* PRD table */ - outb(reading, dma_base); /* specify r/w */ - outb(inb(dma_base+2)|0x06, dma_base+2); /* clear status bits */ - if (drive->media != ide_disk) - return 0; - ide_set_handler(drive, &dma_intr, WAIT_CMD); /* issue cmd to drive */ - OUT_BYTE(reading ? 
WIN_READDMA : WIN_WRITEDMA, IDE_COMMAND_REG); - outb(inb(dma_base)|1, dma_base); /* begin DMA */ - return 0; -} - -static int config_drive_for_dma (ide_drive_t *drive) -{ - const char **list; - - struct hd_driveid *id = drive->id; - if (id && (id->capability & 1)) { - /* Enable DMA on any drive that supports mode2 (multi/single word) DMA */ - if (id->field_valid & 2) - if ((id->dma_mword & 0x404) == 0x404 || (id->dma_1word & 0x404) == 0x404) - return piix_dmaproc(ide_dma_on, drive); - /* Consult the list of known "good" drives */ - list = good_dma_drives; - while (*list) { - if (!strcmp(*list++,id->model)) - return piix_dmaproc(ide_dma_on, drive); - } - } - return piix_dmaproc(ide_dma_off, drive); -} - -#ifdef DISPLAY_PIIX_TIMINGS -/* - * print_piix_drive_flags() displays the currently programmed options - * in the PIIX/PIIX3/PIIX4 for a given drive. - */ -static void print_piix_drive_flags (const char *unit, byte dflags) -{ - printk(" %s ", unit); - printk( "fastDMA=%s", (dflags & PIIX_FLAGS_FAST_PIO) ? "yes" : "no "); - printk(" PreFetch=%s", (dflags & PIIX_FLAGS_PREFETCH) ? "on " : "off"); - printk(" IORDY=%s", (dflags & PIIX_FLAGS_USE_IORDY) ? "on " : "off"); - printk(" fastPIO=%s\n", ((dflags & (PIIX_FLAGS_FAST_PIO|PIIX_FLAGS_FAST_DMA)) == PIIX_FLAGS_FAST_PIO) ? "on " : "off"); -} -#endif /* DISPLAY_PIIX_TIMINGS */ - -static void init_piix_dma (ide_hwif_t *hwif, unsigned short base) -{ - static unsigned long dmatable = 0; - - printk(" %s: BM-DMA at 0x%04x-0x%04x", hwif->name, base, base+7); - if (check_region(base, 8)) { - printk(" -- ERROR, PORTS ALREADY IN USE"); - } else { - request_region(base, 8, "IDE DMA"); - hwif->dma_base = base; - if (!dmatable) { - /* - * The BM-DMA uses a full 32-bits, so we can - * safely use __get_free_page() here instead - * of __get_dma_pages() -- no ISA limitations. 
- */ - dmatable = __get_free_page(GFP_KERNEL); - } - if (dmatable) { - hwif->dmatable = (unsigned long *) dmatable; - dmatable += (PRD_ENTRIES * PRD_BYTES); - outl(virt_to_bus(hwif->dmatable), base + 4); - hwif->dmaproc = &piix_dmaproc; - } - } - printk("\n"); -} - -/* The next two functions were stolen from cmd640.c, with - a few modifications */ - -static void put_piix_reg (unsigned short reg, long val) -{ - unsigned long flags; - - save_flags(flags); - cli(); - outl_p((reg & 0xfc) | piix_key, 0xcf8); - outl_p(val, (reg & 3) | 0xcfc); - restore_flags(flags); -} - -static long get_piix_reg (unsigned short reg) -{ - long b; - unsigned long flags; - - save_flags(flags); - cli(); - outl_p((reg & 0xfc) | piix_key, 0xcf8); - b = inl_p((reg & 3) | 0xcfc); - restore_flags(flags); - return b; -} - -/* - * Search for an (apparently) unused block of I/O space - * of "size" bytes in length. - */ -static short find_free_region (unsigned short size) -{ - unsigned short i, base = 0xe800; - for (base = 0xe800; base > 0; base -= 0x800) { - if (!check_region(base,size)) { - for (i = 0; i < size; i++) { - if (inb(base+i) != 0xff) - goto next; - } - return base; /* success */ - } - next: - } - return 0; /* failure */ -} - -/* - * ide_init_triton() prepares the IDE driver for DMA operation. - * This routine is called once, from ide.c during driver initialization, - * for each triton chipset which is found (unlikely to be more than one). 
- */ -void ide_init_triton (byte bus, byte fn) -{ - int rc = 0, h; - int dma_enabled = 0; - unsigned short pcicmd, devid; - unsigned int bmiba; - const char *chipset = "ide"; - piix_timing_t timings[2]; - - piix_pci_bus = bus; - piix_pci_fn = fn; - - if (pcibios_read_config_word(bus, fn, 0x02, &devid)) - goto quit; - - if (devid == PCI_DEVICE_ID_INTEL_82371AB) - chipset = "PIIX4"; - else if (devid == PCI_DEVICE_ID_INTEL_82371SB_1) - chipset = "PIIX3"; - else if (devid == PCI_DEVICE_ID_INTEL_82371_1) - chipset = "PIIX"; - else if (devid == PCI_DEVICE_ID_VIA_82C586_1) - chipset = "VP1"; - else { - printk("Unknown PCI IDE interface 0x%x\n", devid); - goto quit; - } - - printk("%s: bus-master IDE device on PCI bus %d function %d\n", chipset, bus, fn); - - /* - * See if IDE ports are enabled - */ - if ((rc = pcibios_read_config_word(bus, fn, 0x04, &pcicmd))) - goto quit; - if ((pcicmd & 1) == 0) { - printk("%s: IDE ports are not enabled (BIOS)\n", chipset); - goto quit; - } - if (devid == PCI_DEVICE_ID_VIA_82C586_1) { - /* pri and sec channel enables are in port 0x40 */ - if ((rc = pcibios_read_config_word(bus, fn, 0x40, (short *)&timings[0]))) - goto quit; - if ((!timings[0].via_s.pri_en && (!timings[0].via_s.sec_en))) { - printk("%s: neither IDE port is enabled\n", chipset); - goto quit; - } - } - else { /* INTEL piix */ - if ((rc = pcibios_read_config_word(bus, fn, 0x40, (short *)&timings[0]))) - goto quit; - if ((rc = pcibios_read_config_word(bus, fn, 0x42, (short *)&timings[1]))) - goto quit; - if ((!timings[0].piix_s.ports_enabled) && (!timings[1].piix_s.ports_enabled)) { - printk("%s: neither IDE port is enabled\n", chipset); - goto quit; - } - } - - /* - * See if Bus-Mastered DMA is enabled - */ - if ((pcicmd & 4) == 0) { - printk("%s: bus-master DMA feature is not enabled (BIOS)\n", chipset); - } else { - /* - * Get the bmiba base address - */ - if ((rc = pcibios_read_config_dword(bus, fn, 0x20, &bmiba))) - goto quit; - bmiba &= 0xfff0; /* extract port base 
address */ - if (bmiba) { - dma_enabled = 1; - } else { - unsigned short base; - printk("%s: bus-master base address is invalid (0x%04x, BIOS problem)\n", chipset, bmiba); - base = find_free_region(16); - if (base) { - printk("%s: bypassing BIOS; setting bus-master base address to 0x%04x\n", chipset, base); - piix_key = 0x80000000 + (fn * 0x100); - put_piix_reg(0x04,get_piix_reg(0x04)&~5); - put_piix_reg(0x20,(get_piix_reg(0x20)&0xFFFF000F)|base|1); - put_piix_reg(0x04,get_piix_reg(0x04)|5); - bmiba = get_piix_reg(0x20)&0x0000FFF0; - if (bmiba == base && (get_piix_reg(0x04) & 5) == 5) - dma_enabled = 1; - else - printk("%s: operation failed\n", chipset); - } - if (!dma_enabled) - printk("%s: DMA is disabled (BIOS)\n", chipset); - } - } - - /* - * Save the dma_base port addr for each interface - */ - for (h = 0; h < MAX_HWIFS; ++h) { - unsigned int pri_sec; - piix_timing_t timing; - ide_hwif_t *hwif = &ide_hwifs[h]; - switch (hwif->io_ports[IDE_DATA_OFFSET]) { - case 0x1f0: pri_sec = 0; break; - case 0x170: pri_sec = 1; break; - default: continue; - } - - if (devid == PCI_DEVICE_ID_VIA_82C586_1) { - timing = timings[0]; - switch (h) { - case 0: - if (!timing.piix_s.ports_enabled) { - printk("port 0 DMA not enabled\n"); - continue; - } - case 1: - if (!timing.piix_s.sidetim_enabled) { - printk("port 1 DMA not enabled\n"); - continue; - } - } - hwif->chipset = ide_via; - } - else { /* PIIX */ - - timing = timings[pri_sec]; - if (!timing.piix_s.ports_enabled) /* interface disabled? */ - continue; - hwif->chipset = ide_triton; - } - if (dma_enabled) - init_piix_dma(hwif, bmiba + (pri_sec ? 
8 : 0)); -#ifdef DISPLAY_PIIX_TIMINGS - /* - * Display drive timings/modes - */ - { - const char *slave; - piix_sidetim_t sidetim; - byte sample = 5 - timing.piix_s.sample; - byte recovery = 4 - timing.piix_s.recovery; - unsigned int drvtim; - - if (devid == PCI_DEVICE_ID_VIA_82C586_1) { - pcibios_read_config_dword(bus, fn, 0x48, &drvtim); - if (pri_sec == 0) { - printk(" %s master: active_pulse_CLKs=%d, recovery_CLKs=%d\n", hwif->name, 1+(drvtim>>28), 1+((drvtim & 0x0f000000)>>24)); - printk(" %s slave: active_pulse_CLKs=%d, recovery_CLKs=%d\n", hwif->name, 1+((drvtim & 0xf00000)>>20), 1+((drvtim & 0x0f0000)>>16)); - continue; - } else { - printk(" %s master: active_pulse_CLKs=%d, recovery_CLKs=%d\n", hwif->name, 1+((drvtim & 0xf000)>>12), 1+((drvtim & 0x0f00)>>8)); - printk(" %s slave: active_pulse_CLKs=%d, recovery_CLKs=%d\n", hwif->name, 1+((drvtim & 0xf0)>>4), 1+(drvtim & 0x0f)); - continue; - } - } - - if ((devid == PCI_DEVICE_ID_INTEL_82371SB_1 - || devid == PCI_DEVICE_ID_INTEL_82371AB) - && timing.piix_s.sidetim_enabled - && !pcibios_read_config_byte(bus, fn, 0x44, (byte *) &sidetim)) - slave = ""; /* PIIX3 and later */ - else - slave = "/slave"; /* PIIX, or PIIX3 in compatibility mode */ - printk(" %s master%s: sample_CLKs=%d, recovery_CLKs=%d\n", hwif->name, slave, sample, recovery); - print_piix_drive_flags ("master:", timing.piix_s.d0_flags); - if (!*slave) { - if (pri_sec == 0) { - sample = 5 - sidetim.pri_sample; - recovery = 4 - sidetim.pri_recovery; - } else { - sample = 5 - sidetim.sec_sample; - recovery = 4 - sidetim.sec_recovery; - } - printk(" slave : sample_CLKs=%d, recovery_CLKs=%d\n", sample, recovery); - } - print_piix_drive_flags ("slave :", timing.piix_s.d1_flags); - } -#endif /* DISPLAY_PIIX_TIMINGS */ - } - -quit: if (rc) printk("%s: pcibios access failed - %s\n", chipset, pcibios_strerror(rc)); -} diff --git a/drivers/char/lp.c b/drivers/char/lp.c index aec3e3212027..fb30c3e0c9ba 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ 
-128,7 +128,7 @@ static inline int lp_char(char lpchar, int minor, int use_polling) do { status = r_str(minor); count++; - if (resched_needed()) + if (need_resched) lp_schedule (minor); } while (((use_polling && !LP_READY(minor, status)) || (!use_polling && !(status & LP_PBUSY))) && @@ -356,7 +356,7 @@ static ssize_t lp_read(struct file * file, char * buf, status=(r_str(minor) & 0x40); udelay(50); counter++; - if (resched_needed()) + if (need_resched) schedule (); } while ( (status == 0x40) && (counter < 20) ); if ( counter == 20 ) { /* Timeout */ @@ -375,7 +375,7 @@ static ssize_t lp_read(struct file * file, char * buf, status=(r_str(minor) & 0x40); udelay(20); counter++; - if (resched_needed()) + if (need_resched) schedule (); } while ( (status == 0) && (counter < 20) ); if (counter == 20) { /* Timeout */ diff --git a/drivers/char/videodev.c b/drivers/char/videodev.c index 71e88f04c40b..38b9a67438d5 100644 --- a/drivers/char/videodev.c +++ b/drivers/char/videodev.c @@ -249,7 +249,7 @@ int videodev_init(void) return 0; } - +#ifdef MODULE int init_module(void) { return videodev_init(); @@ -260,5 +260,7 @@ void cleanup_module(void) unregister_chrdev(VIDEO_MAJOR, "video_capture"); } +#endif + EXPORT_SYMBOL(video_register_device); EXPORT_SYMBOL(video_unregister_device); diff --git a/drivers/net/Config.in b/drivers/net/Config.in index 322ab2b6966d..f26a649d128e 100644 --- a/drivers/net/Config.in +++ b/drivers/net/Config.in @@ -140,9 +140,9 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then fi fi -#if [ ! "$CONFIG_PARPORT" = "n" ]; then -# dep_tristate 'PLIP (parallel port) support' CONFIG_PLIP $CONFIG_PARPORT -#fi +if [ ! "$CONFIG_PARPORT" = "n" ]; then + dep_tristate 'PLIP (parallel port) support' CONFIG_PLIP $CONFIG_PARPORT +fi tristate 'PPP (point-to-point) support' CONFIG_PPP if [ ! 
"$CONFIG_PPP" = "n" ]; then @@ -181,13 +181,8 @@ if [ "$CONFIG_NET_RADIO" != "n" ]; then bool 'HFmodem support for Soundblaster and compatible cards' CONFIG_HFMODEM_SBC bool 'HFmodem support for WSS and Crystal cards' CONFIG_HFMODEM_WSS fi - tristate 'Shortwave radio modem driver' CONFIG_HFMODEM - if [ "$CONFIG_HFMODEM" != "n" ]; then - bool 'HFmodem support for Soundblaster and compatible cards' CONFIG_HFMODEM_SBC - bool 'HFmodem support for WSS and Crystal cards' CONFIG_HFMODEM_WSS - fi fi -# tristate 'STRIP (Metricom starmode radio IP)' CONFIG_STRIP + tristate 'STRIP (Metricom starmode radio IP)' CONFIG_STRIP tristate 'AT&T WaveLAN & DEC RoamAbout DS support' CONFIG_WAVELAN fi diff --git a/drivers/net/net_init.c b/drivers/net/net_init.c index ae57553a3b6a..5a9c1cc3b27e 100644 --- a/drivers/net/net_init.c +++ b/drivers/net/net_init.c @@ -38,8 +38,8 @@ #include #include #include -#include #include +#include /* The network devices currently exist only in the socket namespace, so these entries are unused. The only ones that make sense are @@ -112,7 +112,7 @@ init_etherdev(struct device *dev, int sizeof_priv) new_device = 1; } - found: /* From the double loop above. */ +found: /* From the double loop above. */ if (dev->name && ((dev->name[0] == '\0') || (dev->name[0] == ' '))) { @@ -126,14 +126,9 @@ init_etherdev(struct device *dev, int sizeof_priv) ether_setup(dev); /* Hmmm, should this be called here? */ - if (new_device) { - /* Append the device to the device queue. */ - struct device **old_devp = &dev_base; - while ((*old_devp)->next) - old_devp = & (*old_devp)->next; - (*old_devp)->next = dev; - dev->next = 0; - } + if (new_device) + register_netdevice(dev); + return dev; } @@ -173,8 +168,6 @@ void ether_setup(struct device *dev) int i; /* Fill in the fields of the device structure with ethernet-generic values. This should be in a common file instead of per-driver. 
*/ - - dev_init_buffers(dev); /* register boot-defined "eth" devices */ if (dev->name && (strncmp(dev->name, "eth", 3) == 0)) { @@ -195,6 +188,7 @@ void ether_setup(struct device *dev) dev->set_mac_address = eth_mac_addr; dev->hard_header_cache = eth_header_cache; dev->header_cache_update= eth_header_cache_update; + dev->hard_header_parse = eth_header_parse; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; @@ -206,11 +200,8 @@ void ether_setup(struct device *dev) /* New-style flags. */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; - dev->family = AF_INET; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 4; + + dev_init_buffers(dev); } #ifdef CONFIG_FDDI @@ -222,8 +213,6 @@ void fddi_setup(struct device *dev) * This should be in a common file instead of per-driver. */ - dev_init_buffers(dev); - dev->change_mtu = fddi_change_mtu; dev->hard_header = fddi_header; dev->rebuild_header = fddi_rebuild_header; @@ -238,11 +227,9 @@ void fddi_setup(struct device *dev) /* New-style flags */ dev->flags = IFF_BROADCAST | IFF_MULTICAST; - dev->family = AF_INET; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 4; + + dev_init_buffers(dev); + return; } @@ -264,8 +251,6 @@ static int ltalk_mac_addr(struct device *dev, void *addr) void ltalk_setup(struct device *dev) { /* Fill in the fields of the device structure with localtalk-generic values. 
*/ - - dev_init_buffers(dev); dev->change_mtu = ltalk_change_mtu; dev->hard_header = NULL; @@ -283,11 +268,8 @@ void ltalk_setup(struct device *dev) dev->broadcast[0] = 0xFF; dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP; - dev->family = AF_APPLETALK; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 1; + + dev_init_buffers(dev); } #endif @@ -309,134 +291,61 @@ int ether_config(struct device *dev, struct ifmap *map) return 0; } -int register_netdev(struct device *dev) +static int etherdev_get_index(struct device *dev) { - struct device *d = dev_base; - unsigned long flags; int i=MAX_ETH_CARDS; - save_flags(flags); - cli(); - - if (dev) { - if (dev->name && - ((dev->name[0] == '\0') || (dev->name[0] == ' '))) { - for (i = 0; i < MAX_ETH_CARDS; ++i) - if (ethdev_index[i] == NULL) { - sprintf(dev->name, "eth%d", i); - printk("loading device '%s'...\n", dev->name); - ethdev_index[i] = dev; - break; - } - } - - if (dev->init) { - sti(); /* device probes assume interrupts enabled */ - if (dev->init(dev) != 0) { - if (i < MAX_ETH_CARDS) ethdev_index[i] = NULL; - restore_flags(flags); - return -EIO; - } - cli(); - } - - /* Add device to end of chain */ - if (dev_base) { - while (d->next) - d = d->next; - d->next = dev; + for (i = 0; i < MAX_ETH_CARDS; ++i) { + if (ethdev_index[i] == NULL) { + sprintf(dev->name, "eth%d", i); + printk("loading device '%s'...\n", dev->name); + ethdev_index[i] = dev; + return i; } - else - dev_base = dev; - dev->next = NULL; - dev->ifindex = dev_new_index(); } - restore_flags(flags); - return 0; + return -1; } -void unregister_netdev(struct device *dev) +static void etherdev_put_index(struct device *dev) { - struct device *d = dev_base; - unsigned long flags; int i; - - save_flags(flags); - cli(); - - if (dev == NULL) - { - printk("was NULL\n"); - restore_flags(flags); - return; - } - /* else */ - if (dev->start) - printk("ERROR '%s' busy and not MOD_IN_USE.\n", dev->name); - - /* - * must jump over 
main_device+aliases - * avoid alias devices unregistration so that only - * net_alias module manages them - */ -#ifdef CONFIG_NET_ALIAS - if (dev_base == dev) - dev_base = net_alias_nextdev(dev); - else - { - while(d && (net_alias_nextdev(d) != dev)) /* skip aliases */ - d = net_alias_nextdev(d); - - if (d && (net_alias_nextdev(d) == dev)) - { - /* - * Critical: Bypass by consider devices as blocks (maindev+aliases) - */ - net_alias_nextdev_set(d, net_alias_nextdev(dev)); - } -#else - if (dev_base == dev) - dev_base = dev->next; - else - { - while (d && (d->next != dev)) - d = d->next; - - if (d && (d->next == dev)) - { - d->next = dev->next; - } -#endif - else - { - printk("unregister_netdev: '%s' not found\n", dev->name); - restore_flags(flags); - return; - } - } - for (i = 0; i < MAX_ETH_CARDS; ++i) - { - if (ethdev_index[i] == dev) - { + for (i = 0; i < MAX_ETH_CARDS; ++i) { + if (ethdev_index[i] == dev) { ethdev_index[i] = NULL; break; } } +} + +int register_netdev(struct device *dev) +{ + int i=-1; - restore_flags(flags); + rtnl_lock(); - /* - * You can i.e use a interfaces in a route though it is not up. - * We call close_dev (which is changed: it will down a device even if - * dev->flags==0 (but it will not call dev->stop if IFF_UP - * is not set). - * This will call notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev), - * dev_mc_discard(dev), .... - */ - - dev_close(dev); + if (dev->name && + (dev->name[0] == '\0' || dev->name[0] == ' ')) + i = etherdev_get_index(dev); + + if (register_netdevice(dev)) { + if (i >= 0) + etherdev_put_index(dev); + rtnl_unlock(); + return -EIO; + } + rtnl_unlock(); + return 0; +} + +void unregister_netdev(struct device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + etherdev_put_index(dev); + rtnl_unlock(); } + #ifdef CONFIG_TR /* The list of used and available "tr" slots */ #define MAX_TR_CARDS 16 @@ -488,15 +397,6 @@ trfound: /* From the double loop above. 
*/ break; } - if (new_device) { - /* Append the device to the device queue. */ - struct device **old_devp = &dev_base; - - while ((*old_devp)->next) - old_devp = & (*old_devp)->next; - (*old_devp)->next = dev; - dev->next = 0; - } dev->hard_header = tr_header; dev->rebuild_header = tr_rebuild_header; @@ -511,11 +411,9 @@ trfound: /* From the double loop above. */ /* New-style flags. */ dev->flags = IFF_BROADCAST; - dev->family = AF_INET; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 4; + + if (new_device) + register_netdevice(dev); return dev; } @@ -553,99 +451,21 @@ void tr_freedev(struct device *dev) int register_trdev(struct device *dev) { - unsigned long flags; - dev_init_buffers(dev); - save_flags(flags); - - if (dev && dev->init) { - sti(); /* device probes assume interrupts enabled */ - if (dev->init(dev) != 0) { - unregister_trdev(dev); - restore_flags(flags); - return -EIO; - } - cli(); - + if (dev->init && dev->init(dev) != 0) { + unregister_trdev(dev); + return -EIO; } - restore_flags(flags); return 0; } void unregister_trdev(struct device *dev) { - struct device *d = dev_base; - unsigned long flags; - - save_flags(flags); - cli(); - - if (dev == NULL) - { - printk("was NULL\n"); - restore_flags(flags); - return; - } - /* else */ - if (dev->start) - printk("ERROR '%s' busy and not MOD_IN_USE.\n", dev->name); - - /* - * must jump over main_device+aliases - * avoid alias devices unregistration so that only - * net_alias module manages them - */ -#ifdef CONFIG_NET_ALIAS - if (dev_base == dev) - dev_base = net_alias_nextdev(dev); - else - { - while(d && (net_alias_nextdev(d) != dev)) /* skip aliases */ - d = net_alias_nextdev(d); - - if (d && (net_alias_nextdev(d) == dev)) - { - /* - * Critical: Bypass by consider devices as blocks (maindev+aliases) - */ - net_alias_nextdev_set(d, net_alias_nextdev(dev)); - } -#else - if (dev_base == dev) - dev_base = dev->next; - else - { - while (d && (d->next != dev)) - d = d->next; - - 
if (d && (d->next == dev)) - { - d->next = dev->next; - } -#endif - else - { - printk("unregister_trdev: '%s' not found\n", dev->name); - restore_flags(flags); - return; - } - } - + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); tr_freedev(dev); - - restore_flags(flags); - - /* - * You can i.e use a interfaces in a route though it is not up. - * We call close_dev (which is changed: it will down a device even if - * dev->flags==0 (but it will not call dev->stop if IFF_UP - * is not set). - * This will call notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev), - * dev_mc_discard(dev), .... - */ - - dev_close(dev); } #endif @@ -655,6 +475,5 @@ void unregister_trdev(struct device *dev) * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/net/inet -Wall -Wstrict-prototypes -O6 -m486 -c net_init.c" * version-control: t * kept-new-versions: 5 - * tab-width: 4 * End: */ diff --git a/drivers/net/plip.c b/drivers/net/plip.c index 17c85a9ba687..1774de0bb33b 100644 --- a/drivers/net/plip.c +++ b/drivers/net/plip.c @@ -1,5 +1,3 @@ -#warning This wont work until we merge the networking changes -#if 0 /* $Id: plip.c,v 1.3.6.2 1997/04/16 15:07:56 phil Exp $ */ /* PLIP: A parallel port "network" driver for Linux. */ /* This driver is for parallel port with 5-bit cable (LapLink (R) cable). */ @@ -1238,4 +1236,3 @@ plip_init(void)) * compile-command: "gcc -DMODULE -DMODVERSIONS -D__KERNEL__ -Wall -Wstrict-prototypes -O2 -g -fomit-frame-pointer -pipe -m486 -c plip.c" * End: */ -#endif diff --git a/drivers/net/ppp.c b/drivers/net/ppp.c index 3c86a779fa64..cb977abde895 100644 --- a/drivers/net/ppp.c +++ b/drivers/net/ppp.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -682,10 +683,12 @@ ppp_release (struct ppp *ppp) if (tty != NULL && tty->disc_data == ppp) tty->disc_data = NULL; /* Break the tty->ppp link */ + rtnl_lock(); /* Strong layering violation. 
*/ - if (dev && dev->flags & IFF_UP) { - dev_close (dev); /* close the device properly */ - } + if (dev && dev->flags & IFF_UP) { + dev_close (dev); /* close the device properly */ + } + rtnl_unlock(); ppp_free_buf (ppp->rbuf); ppp_free_buf (ppp->wbuf); @@ -3017,8 +3020,8 @@ ppp_dev_xmit (sk_buff *skb, struct device *dev) */ if (!ppp->inuse) { dev_kfree_skb (skb, FREE_WRITE); - printk("I am dying to know, are you still alive?\n"); -#ifdef main_got_it_is_something + printk(KERN_WARNING "ppp: I am dying to know, are you still alive?\n"); +#if 0 dev_close (dev); #endif return 0; diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 76b0aec81dc8..9493fe371570 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -733,6 +734,7 @@ slip_close(struct tty_struct *tty) return; } + rtnl_lock(); if (sl->dev->flags & IFF_UP) { /* STRONG layering violation! --ANK */ @@ -749,6 +751,8 @@ slip_close(struct tty_struct *tty) (void)del_timer (&sl->outfill_timer); #endif sl_free(sl); + unregister_netdevice(sl->dev); + rtnl_unlock(); MOD_DEC_USE_COUNT; } diff --git a/drivers/net/strip.c b/drivers/net/strip.c index 7b6d9fc8f872..fa1733dc1a1a 100644 --- a/drivers/net/strip.c +++ b/drivers/net/strip.c @@ -1,5 +1,3 @@ -#warning "will not compile until the networking is merged" -#if 0 /* * Copyright 1996 The Board of Trustees of The Leland Stanford * Junior University. All Rights Reserved. 
@@ -2782,4 +2780,3 @@ void cleanup_module(void) printk(KERN_INFO "STRIP: Module Unloaded\n"); } #endif /* MODULE */ -#endif diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0a4cdecd1be1..2e444bc48db2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -162,7 +162,7 @@ struct pci_dev_info dev_info[] = { DEVICE( MOTOROLA, MOTOROLA_MPC105,"MPC105 Eagle"), DEVICE( MOTOROLA, MOTOROLA_MPC106,"MPC106 Grackle"), DEVICE( MOTOROLA, MOTOROLA_RAVEN, "Raven"), - DEVICE( PROMISE, PROMISE_IDE_UDMA,"IDE Ultra DMA/33"), + DEVICE( PROMISE, PROMISE_20246, "IDE UltraDMA/33"), DEVICE( PROMISE, PROMISE_5300, "DC5030"), DEVICE( N9, N9_I128, "Imagine 128"), DEVICE( N9, N9_I128_2, "Imagine 128v2"), diff --git a/drivers/sound/Makefile b/drivers/sound/Makefile index 037f6ad228c5..edd69e438c4c 100644 --- a/drivers/sound/Makefile +++ b/drivers/sound/Makefile @@ -34,7 +34,7 @@ kernelconfig: else ifeq (.defines,$(wildcard .defines)) -#include .defines +include .defines include .objects endif diff --git a/drivers/sound/lowlevel/awe_wave.c b/drivers/sound/lowlevel/awe_wave.c index 196882423be5..1f87670957b9 100644 --- a/drivers/sound/lowlevel/awe_wave.c +++ b/drivers/sound/lowlevel/awe_wave.c @@ -551,7 +551,7 @@ int attach_awe(void) INIT_TABLE(samples, max_samples, AWE_MAX_SAMPLES, awe_sample_list); INIT_TABLE(infos, max_infos, AWE_MAX_INFOS, awe_voice_list); - if (my_dev=sound_alloc_synthdev()) + if ((my_dev=sound_alloc_synthdev())!=-1) printk(KERN_WARNING "AWE32 Error: too many synthesizers\n"); else { voice_alloc = &awe_operations.alloc; @@ -560,7 +560,7 @@ int attach_awe(void) } #ifdef CONFIG_AWE32_MIXER - if (my_mixerdev=sound_alloc_mixerdev()) { + if ((my_mixerdev=sound_alloc_mixerdev())!=-1) { mixer_devs[my_mixerdev] = &awe_mixer_operations; } #endif diff --git a/include/linux/acct.h b/include/linux/acct.h index 615322863f90..e1c96a4c1285 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -23,7 +23,7 @@ * specific encoding system used. 
*/ -typedef u16 comp_t; +typedef __u16 comp_t; /* * accounting file record @@ -41,10 +41,10 @@ struct acct * No binary format break with 2.0 - but when we hit 32bit uid we'll * have to bite one */ - u16 ac_uid; /* Accounting Real User ID */ - u16 ac_gid; /* Accounting Real Group ID */ - u16 ac_tty; /* Accounting Control Terminal */ - u32 ac_btime; /* Accounting Process Creation Time */ + __u16 ac_uid; /* Accounting Real User ID */ + __u16 ac_gid; /* Accounting Real Group ID */ + __u16 ac_tty; /* Accounting Control Terminal */ + __u32 ac_btime; /* Accounting Process Creation Time */ comp_t ac_utime; /* Accounting User Time */ comp_t ac_stime; /* Accounting System Time */ comp_t ac_etime; /* Accounting Elapsed Time */ @@ -54,7 +54,7 @@ struct acct comp_t ac_minflt; /* Accounting Minor Pagefaults */ comp_t ac_majflt; /* Accounting Major Pagefaults */ comp_t ac_swaps; /* Accounting Number of Swaps */ - u32 ac_exitcode; /* Accounting Exitcode */ + __u32 ac_exitcode; /* Accounting Exitcode */ char ac_comm[ACCT_COMM + 1]; /* Accounting Command Name */ char ac_pad[10]; /* Accounting Padding Bytes */ }; diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e0a3ab3c5779..c0384391dcf0 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -38,6 +38,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct device *dev, extern int eth_header_cache(struct dst_entry *dst, struct neighbour *neigh, struct hh_cache *hh); +extern int eth_header_parse(struct sk_buff *skb, + unsigned char *haddr); extern struct device * init_etherdev(struct device *, int); #ifdef CONFIG_IP_ROUTER diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 80fbfc79ba22..cedbd1e2c552 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -73,7 +73,8 @@ #define ABRT_ERR 0x04 /* Command aborted */ #define ID_ERR 0x10 /* ID field not found */ #define ECC_ERR 0x40 /* Uncorrectable ECC error */ -#define BBD_ERR 0x80 /* block marked bad 
*/ +#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */ +#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */ struct hd_geometry { unsigned char heads; @@ -149,11 +150,28 @@ struct hd_driveid { unsigned short eide_dma_time; /* recommended mword dma cycle time (ns) */ unsigned short eide_pio; /* min cycle time (ns), no IORDY */ unsigned short eide_pio_iordy; /* min cycle time (ns), with IORDY */ - unsigned short reserved69; /* reserved (word 69) */ - unsigned short reserved70; /* reserved (word 70) */ - /* unsigned short reservedxx[57];*/ /* reserved (words 71-127) */ - /* unsigned short vendor7 [32];*/ /* vendor unique (words 128-159) */ - /* unsigned short reservedyy[96];*/ /* reserved (words 160-255) */ + unsigned short word69; + unsigned short word70; + /* HDIO_GET_IDENTITY currently returns only words 0 through 70 */ + unsigned short word71; + unsigned short word72; + unsigned short word73; + unsigned short word74; + unsigned short word75; + unsigned short word76; + unsigned short word77; + unsigned short word78; + unsigned short word79; + unsigned short word80; + unsigned short word81; + unsigned short word82; + unsigned short word83; + unsigned short word84; + unsigned short word85; + unsigned short word86; + unsigned short word87; + unsigned short dma_ultra; + unsigned short reserved[167]; }; /* diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h new file mode 100644 index 000000000000..bef9f8fd93b3 --- /dev/null +++ b/include/linux/if_tunnel.h @@ -0,0 +1,29 @@ +#ifndef _IF_TUNNEL_H_ +#define _IF_TUNNEL_H_ + +#define SIOCGETTUNNEL (SIOCDEVPRIVATE + 0) +#define SIOCADDTUNNEL (SIOCDEVPRIVATE + 1) +#define SIOCDELTUNNEL (SIOCDEVPRIVATE + 2) +#define SIOCCHGTUNNEL (SIOCDEVPRIVATE + 3) + +#define GRE_CSUM __constant_htons(0x8000) +#define GRE_ROUTING __constant_htons(0x4000) +#define GRE_KEY __constant_htons(0x2000) +#define GRE_SEQ __constant_htons(0x1000) +#define GRE_STRICT __constant_htons(0x0800) +#define GRE_REC 
__constant_htons(0x0700) +#define GRE_FLAGS __constant_htons(0x00F8) +#define GRE_VERSION __constant_htons(0x0007) + +struct ip_tunnel_parm +{ + char name[IFNAMSIZ]; + int link; + __u16 i_flags; + __u16 o_flags; + __u32 i_key; + __u32 o_key; + struct iphdr iph; +}; + +#endif /* _IF_TUNNEL_H_ */ diff --git a/include/linux/igmp.h b/include/linux/igmp.h index b4a16b5655b1..8be2d1b87f26 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -38,7 +38,7 @@ struct igmphdr #define IGMP_PIM 0x14 /* PIM routing */ #define IGMP_TRACE 0x15 #define IGMP_HOST_NEW_MEMBERSHIP_REPORT 0x16 /* New version of 0x11 */ -#define IGMP_HOST_LEAVE_MESSAGE 0x17 /* An extra BSD seems to send */ +#define IGMP_HOST_LEAVE_MESSAGE 0x17 #define IGMP_MTRACE_RESP 0x1e #define IGMP_MTRACE 0x1f @@ -54,9 +54,6 @@ struct igmphdr #define IGMP_SLEEPING_MEMBER 0x04 #define IGMP_AWAKENING_MEMBER 0x05 -#define IGMP_OLD_ROUTER 0x00 -#define IGMP_NEW_ROUTER 0x01 - #define IGMP_MINLEN 8 #define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ @@ -65,7 +62,7 @@ struct igmphdr #define IGMP_TIMER_SCALE 10 /* denotes that the igmphdr->timer field */ /* specifies time in 10th of seconds */ -#define IGMP_AGE_THRESHOLD 540 /* If this host don't hear any IGMP V1 */ +#define IGMP_AGE_THRESHOLD 400 /* If this host don't hear any IGMP V1 */ /* message in this period of time, */ /* revert to IGMP v2 router. */ @@ -79,40 +76,53 @@ struct igmphdr */ #ifdef __KERNEL__ + +/* ip_mc_socklist is real list now. 
Speed is not argument; + this list never used in fast path code + */ + struct ip_mc_socklist { - unsigned long multiaddr[IP_MAX_MEMBERSHIPS]; /* This is a speed trade off */ - struct device *multidev[IP_MAX_MEMBERSHIPS]; + struct ip_mc_socklist *next; + int count; + struct ip_mreqn multi; }; struct ip_mc_list { - struct device *interface; - unsigned long multiaddr; - struct ip_mc_list *next; - struct timer_list timer; - int users; - char tm_running; - char reporter; + struct in_device *interface; + unsigned long multiaddr; + struct ip_mc_list *next; + struct timer_list timer; + int users; + char tm_running; + char reporter; + char unsolicit_count; }; -struct ip_router_info +extern __inline__ int ip_check_mc(struct device *dev, u32 mc_addr) { - struct device *dev; - int type; /* type of router which is querier on this interface */ - int time; /* # of slow timeouts since last old query */ - struct timer_list timer; - struct ip_router_info *next; -}; - -extern struct ip_mc_list *ip_mc_head; + struct in_device *in_dev = dev->ip_ptr; + struct ip_mc_list *im; + if (in_dev) { + for (im=in_dev->mc_list; im; im=im->next) + if (im->multiaddr == mc_addr) + return 1; + } + return 0; +} extern int igmp_rcv(struct sk_buff *, unsigned short); -extern void ip_mc_drop_device(struct device *dev); -extern int ip_mc_join_group(struct sock *sk, struct device *dev, unsigned long addr); -extern int ip_mc_leave_group(struct sock *sk, struct device *dev,unsigned long addr); +extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); +extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern void ip_mr_init(void); +extern void ip_mc_init_dev(struct in_device *); +extern void ip_mc_destroy_dev(struct in_device *); +extern void ip_mc_up(struct in_device *); +extern void ip_mc_down(struct in_device *); +extern int ip_mc_dec_group(struct in_device *in_dev, u32 addr); +extern void ip_mc_inc_group(struct in_device *in_dev, 
u32 addr); #endif #endif diff --git a/include/linux/in.h b/include/linux/in.h index 0b77670e1323..2cc007a434f8 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -31,9 +31,13 @@ enum { IPPROTO_PUP = 12, /* PUP protocol */ IPPROTO_UDP = 17, /* User Datagram Protocol */ IPPROTO_IDP = 22, /* XNS IDP protocol */ + IPPROTO_RSVP = 46, /* RSVP protocol */ + IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ + IPPROTO_PIM = 103, /* Protocol Independent Multicast */ + IPPROTO_RAW = 255, /* Raw IP packets */ IPPROTO_MAX }; @@ -48,15 +52,15 @@ struct in_addr { #define IP_TTL 2 #define IP_HDRINCL 3 #define IP_OPTIONS 4 -#define IP_LOCALADDR 5 /* Cannot remove; a lot of apps still use it. ANK */ +#define IP_ROUTER_ALERT 5 #define IP_RECVOPTS 6 #define IP_RETOPTS 7 -#define IP_RXINFO 8 -#define IP_TXINFO IP_RXINFO -/* Gated uses it. Remove later or preserve for 4.4BSD compatibility??? */ -#define IP_RECVDSTADDR 9 +#define IP_PKTINFO 8 +#define IP_PKTOPTIONS 9 #define IP_PMTUDISC 10 #define IP_RECVERR 11 +#define IP_RECVTTL 12 +#define IP_RECVTOS 13 /* BSD compatibility */ #define IP_RECVRETOPTS IP_RETOPTS @@ -71,9 +75,6 @@ struct in_addr { #define IP_MULTICAST_LOOP 34 #define IP_ADD_MEMBERSHIP 35 #define IP_DROP_MEMBERSHIP 36 -#define IP_MULTICAST_IFN 37 -#define IP_ADD_MEMBERSHIPN 38 -#define IP_DROP_MEMBERSHIPN 39 /* These need to appear somewhere around here */ #define IP_DEFAULT_MULTICAST_TTL 1 diff --git a/include/linux/in_route.h b/include/linux/in_route.h new file mode 100644 index 000000000000..6eaa7992a73b --- /dev/null +++ b/include/linux/in_route.h @@ -0,0 +1,31 @@ +#ifndef _LINUX_IN_ROUTE_H +#define _LINUX_IN_ROUTE_H + +/* IPv4 routing cache flags */ + +#define RTCF_DEAD RTNH_F_DEAD +#define RTCF_ONLINK RTNH_F_ONLINK + +#define RTCF_NOPMTUDISC RTM_F_NOPMTUDISC + +#define RTCF_NOTIFY 0x00010000 +#define RTCF_DIRECTDST 0x00020000 +#define RTCF_REDIRECTED 0x00040000 + +#define RTCF_VALVE 0x00200000 
+#define RTCF_MASQ 0x00400000 +#define RTCF_SNAT 0x00800000 +#define RTCF_DOREDIRECT 0x01000000 +#define RTCF_LOG 0x02000000 +#define RTCF_DIRECTSRC 0x04000000 +#define RTCF_DNAT 0x08000000 +#define RTCF_BROADCAST 0x10000000 +#define RTCF_MULTICAST 0x20000000 +#define RTCF_REJECT 0x40000000 +#define RTCF_LOCAL 0x80000000 + +#define RTCF_NAT (RTCF_DNAT|RTCF_SNAT) + +#define RT_TOS(tos) ((tos)&IPTOS_TOS_MASK) + +#endif /* _LINUX_IN_ROUTE_H */ diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h new file mode 100644 index 000000000000..5f84c38de1c4 --- /dev/null +++ b/include/linux/inetdevice.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_INETDEVICE_H +#define _LINUX_INETDEVICE_H + +/* IPv4 specific flags. They are initialized from global sysctl variables, + when IPv4 is initialized. + */ + +#define IFF_IP_FORWARD 1 +#define IFF_IP_PROXYARP 2 +#define IFF_IP_RXREDIRECTS 4 +#define IFF_IP_TXREDIRECTS 8 +#define IFF_IP_SHAREDMEDIA 0x10 +#define IFF_IP_MFORWARD 0x20 +#define IFF_IP_RPFILTER 0x40 + +#ifdef __KERNEL__ + +struct in_device +{ + struct device *dev; + struct in_ifaddr *ifa_list; /* IP ifaddr chain */ + struct ip_mc_list *mc_list; /* IP multicast filter chain */ + unsigned long mr_v1_seen; + unsigned flags; +}; + + +#define IN_DEV_RPFILTER(in_dev) (ipv4_config.rfc1812_filter && ((in_dev)->flags&IFF_IP_RPFILTER)) +#define IN_DEV_MFORWARD(in_dev) (ipv4_config.multicast_route && ((in_dev)->flags&IFF_IP_MFORWARD)) +#define IN_DEV_PROXY_ARP(in_dev) ((in_dev)->flags&IFF_IP_PROXYARP) + +#if 1 +#define IN_DEV_FORWARD(in_dev) (IS_ROUTER) +#define IN_DEV_RX_REDIRECTS(in_dev) (ipv4_config.accept_redirects) +#define IN_DEV_TX_REDIRECTS(in_dev) (1) +#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_config.rfc1620_redirects) +#else +#define IN_DEV_FORWARD(in_dev) (ipv4_config.ip_forwarding==1 && ((in_dev)->flags&IFF_IP_FORWARD)) +#define IN_DEV_RX_REDIRECTS(in_dev) ((in_dev)->flags&IFF_IP_RXREDIRECTS) +#define IN_DEV_TX_REDIRECTS(in_dev) ((in_dev)->flags&IFF_IP_TXREDIRECTS) 
+#define IN_DEV_SHARED_MEDIA(in_dev) ((in_dev)->flags&IFF_IP_SHAREDMEDIA) +#endif + +struct in_ifaddr +{ + struct in_ifaddr *ifa_next; + struct in_device *ifa_dev; + u32 ifa_local; + u32 ifa_address; + u32 ifa_mask; + u32 ifa_broadcast; + u32 ifa_anycast; + unsigned char ifa_scope; + unsigned char ifa_flags; + unsigned char ifa_prefixlen; + char ifa_label[IFNAMSIZ]; +}; + +extern int register_inetaddr_notifier(struct notifier_block *nb); +extern int unregister_inetaddr_notifier(struct notifier_block *nb); + +extern struct device *ip_dev_find(u32 addr); +extern struct in_ifaddr *inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b); +extern int devinet_ioctl(unsigned int cmd, void *); +extern void devinet_init(void); +extern struct in_device *inetdev_init(struct device *dev); +extern struct in_device *inetdev_by_index(int); +extern u32 inet_select_addr(struct device *dev, u32 dst, int scope); +extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask); +extern int inet_add_bootp_addr(struct device *dev); +extern void inet_del_bootp_addr(struct device *dev); + +extern __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa) +{ + return !((addr^ifa->ifa_address)&ifa->ifa_mask); +} + +/* + * Check if a mask is acceptable. 
+ */ + +extern __inline__ int bad_mask(u32 mask, u32 addr) +{ + if (addr & (mask = ~mask)) + return 1; + mask = ntohl(mask); + if (mask & (mask+1)) + return 1; + return 0; +} + +#define for_primary_ifa(in_dev) { struct in_ifaddr *ifa; \ + for (ifa = (in_dev)->ifa_list; ifa && !(ifa->ifa_flags&IFA_F_SECONDARY); ifa = ifa->ifa_next) + +#define for_ifa(in_dev) { struct in_ifaddr *ifa; \ + for (ifa = (in_dev)->ifa_list; ifa; ifa = ifa->ifa_next) + + +#define endfor_ifa(in_dev) } + +#endif /* __KERNEL__ */ + +extern __inline__ __u32 inet_make_mask(int logmask) +{ + if (logmask) + return htonl(~((1<<(32-logmask))-1)); + return 0; +} + +extern __inline__ int inet_mask_len(__u32 mask) +{ + if (!(mask = ntohl(mask))) + return 0; + return 32 - ffz(~mask); +} + + +#endif /* _LINUX_INETDEVICE_H */ diff --git a/include/linux/mroute.h b/include/linux/mroute.h index c0dc052b2e8c..55193867d55e 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -10,6 +10,9 @@ * * See the mrouted code for the original history. 
* + * Protocol Independent Multicast (PIM) data structures included + * Carlos Picoto (cap@di.fc.ul.pt) + * */ #define MRT_BASE 200 @@ -57,16 +60,9 @@ struct vifctl { struct in_addr vifc_rmt_addr; /* IPIP tunnel addr */ }; -#define VIFF_TUNNEL 0x1 /* IPIP tunnel */ -#define VIFF_SRCRT 0x2 /* NI */ - - -/* PIM Vif Flags */ -#define VIFF_DR 0x0010 /* designated router */ -#define VIFF_NOMRT 0x0020 /* no neighbor on vif */ -#define VIFF_DOWN 0x0040 /* interface is down */ -#define VIFF_DISABLED 0x0080 /* disabled interafce */ -#define VIFF_REGISTER 0x00A0 /* MIssing cap@di.fc.ul.pt */ +#define VIFF_TUNNEL 0x1 /* IPIP tunnel */ +#define VIFF_SRCRT 0x2 /* NI */ +#define VIFF_REGISTER 0x4 /* register vif */ /* * Cache manipulation structures for mrouted and PIMd @@ -110,16 +106,6 @@ struct sioc_vif_req unsigned long obytes; /* Out bytes */ }; -/* - * To get RPF from unicast routing table (PIM: cap@di.fc.ul.pt) - */ -struct sioc_rpf_req -{ - unsigned long source; /* Source address */ - unsigned long rpfneighbor; /* RPF */ - vifi_t iif; /* Incoming Interface */ -}; - /* * This is the format the mroute daemon expects to see IGMP control * data. Magically happens to be like an IP packet as per the original @@ -127,7 +113,7 @@ struct sioc_rpf_req struct igmpmsg { - unsigned long unused1,unused2; + __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ unsigned char im_mbz; /* Must be zero */ unsigned char im_vif; /* Interface (this ought to be a vifi_t!) 
*/ @@ -147,22 +133,19 @@ extern int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg); extern void mroute_close(struct sock *sk); extern void ipmr_forward(struct sk_buff *skb, int is_frag); extern int ip_mr_find_tunnel(__u32, __u32); +extern void ip_mr_init(void); struct vif_device { - union - { - struct device *dev; /* Device we are using */ - struct rtable *rt; /* Route for tunnel */ - } u; + struct device *dev; /* Device we are using */ unsigned long bytes_in,bytes_out; unsigned long pkt_in,pkt_out; /* Statistics */ unsigned long rate_limit; /* Traffic shaping (NI) */ unsigned char threshold; /* TTL threshold */ unsigned short flags; /* Control flags */ - unsigned long local,remote; /* Addresses(remote for tunnels)*/ - unsigned long uptime; + __u32 local,remote; /* Addresses(remote for tunnels)*/ + int link; /* Physical interface index */ }; struct mfc_cache @@ -175,11 +158,9 @@ struct mfc_cache int mfc_flags; /* Flags on line */ struct sk_buff_head mfc_unresolved; /* Unresolved buffers */ int mfc_queuelen; /* Unresolved buffer counter */ - unsigned mfc_last_assert; + unsigned long mfc_last_assert; int mfc_minvif; int mfc_maxvif; - unsigned long uptime; - unsigned long expire; unsigned long mfc_bytes; unsigned long mfc_pkt; unsigned long mfc_wrong_if; @@ -188,6 +169,7 @@ struct mfc_cache #define MFC_QUEUED 1 #define MFC_RESOLVED 2 +#define MFC_NOTIFY 4 #define MFC_LINES 64 @@ -211,4 +193,31 @@ struct mfc_cache #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ +#ifdef __KERNEL__ + +#define PIM_V1_VERSION __constant_htonl(0x10000000) +#define PIM_V1_REGISTER 1 + +#define PIM_VERSION 2 +#define PIM_REGISTER 1 + +#define PIM_NULL_REGISTER __constant_htonl(0x40000000) + +/* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ + +struct pimreghdr +{ + __u8 type; + __u8 reserved; + __u16 csum; + __u32 flags; +}; + +extern int pim_rcv(struct sk_buff * , unsigned 
short); +extern int pim_rcv_v1(struct sk_buff * , unsigned short len); + +struct rtmsg; +extern int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm); +#endif + #endif diff --git a/include/linux/net.h b/include/linux/net.h index 82a4b7570bbf..015f2d8b23a6 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -18,11 +18,11 @@ #ifndef _LINUX_NET_H #define _LINUX_NET_H - -#include #include -#define NPROTO 16 /* should be enough for now.. */ +struct poll_table_struct; + +#define NPROTO 32 /* should be enough for now.. */ #define SYS_SOCKET 1 /* sys_socket(2) */ @@ -93,7 +93,7 @@ struct proto_ops { int flags); int (*getname) (struct socket *sock, struct sockaddr *uaddr, int *usockaddr_len, int peer); - unsigned int (*poll) (struct socket *sock, poll_table *wait); + unsigned int (*poll) (struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); int (*listen) (struct socket *sock, int len); diff --git a/include/linux/net_alias.h b/include/linux/net_alias.h deleted file mode 100644 index 54ba8525c974..000000000000 --- a/include/linux/net_alias.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * NET_ALIAS network device aliasing definitions. - * - * - * Version: @(#)net_alias.h 0.43 12/20/95 - * - * Author: Juan Jose Ciarlante, - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#ifndef _NET_ALIAS_H -#define _NET_ALIAS_H - -#include - -#ifdef CONFIG_NET_ALIAS -#include -#include -#include - -/* - * max. alias slot number allowed - */ - -#define NET_ALIAS_MAX_SLOT 256 - -struct net_alias; -struct net_alias_info; -struct net_alias_type; - - -/* - * Main alias structure - * Note that *defines* dev & devname. 
- */ - -struct net_alias -{ - struct device dev; /* alias device defn*/ - char name[IFNAMSIZ]; /* device name defn */ - unsigned hash; /* my hash value: for quick rehash */ - unsigned slot; /* slot number */ - void *data; /* private data */ - struct device *main_dev; /* pointer to main device */ - struct net_alias_type *nat; /* alias type object bound */ - struct net_alias *next; /* next alias (hashed linked list) */ -}; - - -/* - * alias structure pointed by main device - * it holds main device's alias hash table - */ - -struct net_alias_info -{ - int n_aliases; /* num aliases */ - struct device *taildev; /* my last (alias) device */ - struct net_alias *hash_tab[16]; /* hashed alias table */ -}; - -/* - * net_alias_type class - * Declares a generic (AF_ independent) structure that will - * manage generic to family-specific behavior. - */ - -struct net_alias_type -{ - int type; /* aliasing type: address family */ - int n_attach; /* number of aliases attached */ - char name[16]; /* af_name */ - __u32 (*get_addr32) /* get __u32 addr 'representation'*/ - (struct net_alias_type *this, struct sockaddr*); - int (*dev_addr_chk) /* address checking func: */ - (struct net_alias_type *this, struct device *, struct sockaddr *); - struct device * (*dev_select) /* closest alias selector*/ - (struct net_alias_type *this, struct device *, struct sockaddr *sa); - int (*alias_init_1) /* called after alias creation: */ - (struct net_alias_type *this,struct net_alias *alias, struct sockaddr *sa); - int (*alias_done_1) /* called before alias deletion */ - (struct net_alias_type *this, struct net_alias *alias); - int (*alias_print_1) - (struct net_alias_type *this, struct net_alias *alias, char *buf, int len); - struct net_alias_type *next; /* link */ -}; - - -/* - * is dev an alias? - */ - -#ifdef CONFIG_NET_ALIAS - -extern __inline__ int net_alias_is(struct device *dev) -{ - return (dev->my_alias != NULL); -} - -/* - * Does dev have aliases? 
- */ - -extern __inline__ int net_alias_has(struct device *dev) -{ - return (dev->alias_info != NULL); -} - -/* - * Returns MY 'true' main device - * intended for alias devices - */ - -extern __inline__ struct device *net_alias_main_dev(struct device *dev) -{ - return (net_alias_is(dev))? dev->my_alias->main_dev : dev; -} - - -/* - * Returns NEXT 'true' device - * intended for true devices - */ - -extern __inline__ struct device *net_alias_nextdev(struct device *dev) -{ - return (dev->alias_info)? dev->alias_info->taildev->next : dev->next; -} - -/* - * Sets NEXT 'true' device - * Intended for main devices (treat main device as block: dev+aliases). - */ - -extern __inline__ struct device *net_alias_nextdev_set(struct device *dev, struct device *nextdev) -{ - struct device *pdev = dev; - if (net_alias_has(dev)) - { - pdev = dev->alias_info->taildev; /* point to last dev alias */ - } - pdev->next = nextdev; - return nextdev; -} - -#else - -#define net_alias_has(dev) (0) -#define net_alias_is(dev) (0) -#define net_alias_main_dev(dev) (dev) -#endif - - -extern void net_alias_init(void); - -extern struct device * net_alias_dev_get(char *dev_name, int aliasing_ok, int *err, struct sockaddr *sa, void *data); -extern int net_alias_dev_rehash(struct device *dev, struct sockaddr *sa); - -extern int net_alias_getinfo(char *buf, char **, off_t , int , int ); -extern int net_alias_types_getinfo(char *buf, char **, off_t , int , int ); - -extern int register_net_alias_type(struct net_alias_type *nat, int type); -extern int unregister_net_alias_type(struct net_alias_type *nat); - -extern struct device * net_alias_dev_chk(struct device *main_dev, struct sockaddr *sa, int flags_on, int flags_off); -extern struct device * net_alias_dev_chk32(struct device *main_dev, int family, __u32 addr32, int flags_on, int flags_off); - -extern struct device * net_alias_dev_rcv_sel(struct device *main_dev, struct sockaddr *sa_src, struct sockaddr *sa_dst); -extern struct device * 
net_alias_dev_rcv_sel32(struct device *main_dev, int family, __u32 src, __u32 dst); - - - -#else - -#define net_alias_is(a) 0 -#define net_alias_main_dev(dev) (dev) -#define net_alias_has(dev) 0 - -#endif - -#endif /* _NET_ALIAS_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4a530fe8fedf..9d1f67cc0d7c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -34,7 +35,6 @@ * For future expansion when we will have different priorities. */ -#define DEV_NUMBUFFS 3 /* Number of queues per device */ #define MAX_ADDR_LEN 7 /* Largest hardware address length */ /* @@ -59,18 +59,6 @@ #define MAX_HEADER (LL_MAX_HEADER + 48) #endif -#define IS_MYADDR 1 /* address is (one of) our own */ -#define IS_LOOPBACK 2 /* address is for LOOPBACK */ -#define IS_BROADCAST 3 /* address is a valid broadcast */ -#define IS_INVBCAST 4 /* Wrong netmask bcast not for us (unused)*/ -#define IS_MULTICAST 5 /* Multicast IP address */ - -/* NOTE: move to ipv4_device.h */ - -#define IFF_IP_ADDR_OK 1 -#define IFF_IP_MASK_OK 2 -#define IFF_IP_BRD_OK 4 - struct neighbour; /* @@ -188,10 +176,11 @@ struct device /* The device initialization function. Called only once. */ int (*init)(struct device *dev); + void (*destructor)(struct device *dev); /* Interface index. 
Unique device identifier */ int ifindex; - struct device *next_up; + int iflink; /* * Some hardware also needs these fields, but they are not @@ -215,7 +204,7 @@ struct device unsigned long last_rx; /* Time of last Rx */ unsigned short flags; /* interface flags (a la BSD) */ - unsigned short family; /* address family ID (AF_INET) */ + unsigned short gflags; unsigned short metric; /* routing metric (not used) */ unsigned short mtu; /* interface MTU value */ unsigned short type; /* interface hardware type */ @@ -227,34 +216,25 @@ struct device unsigned char pad; /* make dev_addr aligned to 8 bytes */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */ unsigned char addr_len; /* hardware address length */ - unsigned long pa_addr; /* protocol address */ - - unsigned long pa_brdaddr; /* protocol broadcast addr */ - unsigned long pa_dstaddr; /* protocol P-P other side addr */ - unsigned long pa_mask; /* protocol netmask */ - unsigned short pa_alen; /* protocol address length */ struct dev_mc_list *mc_list; /* Multicast mac addresses */ int mc_count; /* Number of installed mcasts */ - - struct ip_mc_list *ip_mc_list; /* IP multicast filter chain */ - unsigned ip_flags; /* IP layer control flags */ - __u32 tx_queue_len; /* Max frames per queue allowed */ + int promiscuity; + int allmulti; /* For load balancing driver pair support */ unsigned long pkt_queue; /* Packets queued */ struct device *slave; /* Slave device */ - struct net_alias_info *alias_info; /* main dev alias info */ - struct net_alias *my_alias; /* alias devs */ /* Protocol specific pointers */ void *atalk_ptr; /* Appletalk link */ - void *ip_ptr; /* Not used yet */ + void *ip_ptr; /* IPv4 specific data */ - /* Pointer to the interface buffers. */ - struct sk_buff_head buffs[DEV_NUMBUFFS]; + struct Qdisc *qdisc; + struct Qdisc *qdisc_sleeping; + unsigned long tx_queue_len; /* Max frames per queue allowed */ /* Pointers to interface service routines. 
*/ int (*open)(struct device *dev); @@ -289,6 +269,8 @@ struct device #define HAVE_CHANGE_MTU int (*change_mtu)(struct device *dev, int new_mtu); + int (*hard_header_parse)(struct sk_buff *skb, + unsigned char *haddr); }; @@ -309,16 +291,8 @@ struct packet_type extern struct device loopback_dev; /* The loopback */ extern struct device *dev_base; /* All devices */ extern struct packet_type *ptype_base[16]; /* Hashed types */ - -/* NOTE: move to INET specific header; - __ip_chk_addr is deprecated, do not use if it's possible. - */ - -extern int __ip_chk_addr(unsigned long addr); -extern struct device *ip_dev_find(unsigned long addr, char *name); -/* This is the wrong place but it'll do for the moment */ -extern void ip_mc_allhost(struct device *dev); -extern int devinet_ioctl(unsigned int cmd, void *); +extern int netdev_dropping; +extern int net_cpu_congestion; extern struct device *dev_getbyhwaddr(unsigned short type, char *hwaddr); extern void dev_add_pack(struct packet_type *pt); @@ -330,16 +304,28 @@ extern int dev_open(struct device *dev); extern int dev_close(struct device *dev); extern int dev_queue_xmit(struct sk_buff *skb); extern void dev_loopback_xmit(struct sk_buff *skb); - +extern int register_netdevice(struct device *dev); +extern int unregister_netdevice(struct device *dev); +extern int register_netdevice_notifier(struct notifier_block *nb); +extern int unregister_netdevice_notifier(struct notifier_block *nb); +extern int dev_new_index(void); +extern struct device *dev_get_by_index(int ifindex); +extern int register_gifconf(int family, int (*func)(struct device *dev, char *bufptr, int len)); +extern int dev_restart(struct device *dev); + #define HAVE_NETIF_RX 1 extern void netif_rx(struct sk_buff *skb); extern void net_bh(void); extern void dev_tint(struct device *dev); extern int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy); extern int dev_ioctl(unsigned int cmd, void *); +extern int dev_change_flags(struct device *, 
unsigned); +extern void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev); extern void dev_init(void); +extern int netdev_nit; + /* Locking protection for page faults during outputs to devices unloaded during the fault */ extern atomic_t dev_lockct; @@ -365,30 +351,24 @@ extern __inline__ void dev_unlock_list(void) * * FIXME: What if this is being run as a real time process ?? * Linus: We need a way to force a yield here ? + * + * FIXME: Though dev_lockct is atomic varible, locking procedure + * is not atomic. */ - + extern __inline__ void dev_lock_wait(void) { - while(atomic_read(&dev_lockct)) + while (atomic_read(&dev_lockct)) { + current->counter = 0; schedule(); + } } -/* - * Buffer initialisation function. This used to appear in all the - * drivers but is now an inline in case we ever want to change the - * schemes used. - */ - extern __inline__ void dev_init_buffers(struct device *dev) { - int i; - for(i=0;ibuffs[i]); - } + /* DO NOTHING */ } - /* These functions live elsewhere (drivers/net/net_init.c, but related) */ extern void ether_setup(struct device *dev); @@ -399,8 +379,6 @@ extern int ether_config(struct device *dev, struct ifmap *map); /* Support for loadable net-drivers */ extern int register_netdev(struct device *dev); extern void unregister_netdev(struct device *dev); -extern int register_netdevice_notifier(struct notifier_block *nb); -extern int unregister_netdevice_notifier(struct notifier_block *nb); extern int register_trdev(struct device *dev); extern void unregister_trdev(struct device *dev); /* Functions used for multicast support */ @@ -408,10 +386,11 @@ extern void dev_mc_upload(struct device *dev); extern void dev_mc_delete(struct device *dev, void *addr, int alen, int all); extern void dev_mc_add(struct device *dev, void *addr, int alen, int newonly); extern void dev_mc_discard(struct device *dev); +extern void dev_set_promiscuity(struct device *dev, int inc); +extern void dev_set_allmulti(struct device *dev, int inc); /* Load 
a device via the kerneld */ extern void dev_load(const char *name); -extern int dev_new_index(void); -extern struct device * dev_get_by_index(int ifindex); + #endif /* __KERNEL__ */ diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 711687af8dc4..8766af2b4a42 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -1,20 +1,174 @@ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_SKIP 1 /* Reserved for ENskip */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Firewalling hook */ +#define NETLINK_ARPD 8 +#define NETLINK_ROUTE6 11 /* af_inet6 route comm channel */ +#define NETLINK_IP6_FW 13 +#define NETLINK_TAPBASE 16 /* 16 to 31 are ethertap */ + +#define MAX_LINKS 32 + +struct sockaddr_nl +{ + sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __kernel_pid_t nl_pid; /* process pid */ + unsigned nl_groups; /* multicast groups mask */ +}; + struct nlmsghdr { - unsigned long nlmsg_len; /* Length of message including header */ - unsigned long nlmsg_type; /* Message type */ - unsigned long nlmsg_seq; /* Sequence number */ - unsigned long nlmsg_pid; /* Sending process PID */ - unsigned char nlmsg_data[0]; + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __kernel_pid_t nlmsg_pid; /* Sending process PID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 1 /* It is request message. 
*/ +#define NLM_F_MULTI 2 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 4 /* If succeed, reply with ack */ +#define NLM_F_ECHO 8 /* Echo this request */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4 +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) + +#define NLMSG_NOOP 0x1 /* Nothing. 
*/ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +struct nlmsgerr +{ + int error; + struct nlmsghdr msg; +}; + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +#ifdef __KERNEL__ + +struct netlink_skb_parms +{ + struct ucred creds; /* Skb credentials */ + pid_t pid; + unsigned groups; + pid_t dst_pid; + unsigned dst_groups; }; -#define NLMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) +#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) +#define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -#define NLMSG_ACK 0x01 /* int - error code */ -#define NLMSG_OVERRUN 0x02 /* unsigned long[2] - start and end - * of lost message sequence numbers. - */ + +extern int netlink_attach(int unit, int (*function)(int,struct sk_buff *skb)); +extern void netlink_detach(int unit); +extern int netlink_post(int unit, struct sk_buff *skb); +extern int init_netlink(void); +extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)); +extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); +extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, pid_t pid, int nonblock); +extern void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid, + unsigned group, int allocation); +extern void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code); + +/* + * skb should fit one page. This choice is good for headerless malloc. + * + * FIXME: What is the best size for SLAB???? 
--ANK + */ +#define NLMSG_GOODSIZE (PAGE_SIZE - ((sizeof(struct sk_buff)+0xF)&~0xF)) + + +struct netlink_callback +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); + int (*done)(struct netlink_callback *cb); + long args[4]; +}; + +#if 0 + +void* nlmsg_broadcast(struct sock*, unsigned long type, int len, unsigned groups); +struct skb_buff *nlmsg_alloc(unsigned long type, int len, + unsigned long seq, unsigned long pid, int allocation); +void __nlmsg_transmit(struct sock*, int allocation); + +extern __inline__ void nlmsg_release(struct sk_buff *skb) +{ + atomic_dec(skb->users); +} + +extern __inline__ void nlmsg_transmit(struct sk_buff *sk, int allocation) +{ + if (sk->write_queue.qlen) + __nlmsg_transmit(sk, allocation); +} #endif + +extern __inline__ struct nlmsghdr * +__nlmsg_put(struct sk_buff *skb, pid_t pid, u32 seq, int type, int len) +{ + struct nlmsghdr *nlh; + int size = NLMSG_LENGTH(len); + + nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); + nlh->nlmsg_type = type; + nlh->nlmsg_len = size; + nlh->nlmsg_flags = 0; + nlh->nlmsg_pid = pid; + nlh->nlmsg_seq = seq; + return nlh; +} + +#define NLMSG_PUT(skb, pid, seq, type, len) \ +({ if (skb_tailroom(skb) < NLMSG_SPACE(len)) goto nlmsg_failure; \ + __nlmsg_put(skb, pid, seq, type, len); }) + +extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + struct nlmsghdr *nlh, + int (*dump)(struct sk_buff *skb, struct netlink_callback*), + int (*done)(struct netlink_callback*)); + + +extern void netlink_proto_init(struct net_proto *pro); + +#endif /* __KERNEL__ */ + +#endif /* __LINUX_NETLINK_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 9fe8b38ef364..08a53b7ae698 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -404,7 +404,7 @@ #define PCI_DEVICE_ID_MOTOROLA_RAVEN 0x4801 #define PCI_VENDOR_ID_PROMISE 0x105a -#define PCI_DEVICE_ID_PROMISE_IDE_UDMA 0x4d33 +#define PCI_DEVICE_ID_PROMISE_20246 0x4d33 
#define PCI_DEVICE_ID_PROMISE_5300 0x5300 #define PCI_VENDOR_ID_N9 0x105d diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h new file mode 100644 index 000000000000..b72ca41c1d96 --- /dev/null +++ b/include/linux/pkt_sched.h @@ -0,0 +1,93 @@ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#define PSCHED_TC_INIT 1 +#define PSCHED_TC_DESTROY 2 +#define PSCHED_TC_ATTACH 3 +#define PSCHED_TC_DETACH 4 + + +/* "Logical" priority bands, not depending of concrete packet scheduler. + Every scheduler will map them to real traffic classes, if it have + no more precise machanism. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + + +struct pschedctl +{ + int command; + int handle; + int child; + int ifindex; + char id[IFNAMSIZ]; + int arglen; + char args[0]; +}; + +/* CBQ section */ + +#define CBQ_MAXPRIO 8 +#define CBQ_MAXLEVEL 8 + +/* CSZ section */ + +struct cszctl +{ + int flow_id; + int handle; + unsigned long rate; + unsigned long max_bytes; + unsigned long depth; + unsigned long L_tab[256]; +}; + +struct cszinitctl +{ + int flows; + unsigned cell_log; +}; + +/* TBF section */ + +struct tbfctl +{ + unsigned cell_log; + unsigned long bytes; + unsigned long depth; + unsigned long L_tab[256]; +}; + +/* SFQ section */ + +struct sfqctl +{ + unsigned quantum; + unsigned depth; + unsigned divisor; + unsigned flows; +}; + +/* RED section */ + +struct redctl +{ + unsigned qmaxbytes; /* HARD maximal queue length */ + unsigned qth_min; /* Min average length threshold: A scaled */ + unsigned qth_max; /* Max average length threshold: A scaled */ + char Alog; /* Point position in average lengths */ + char Wlog; /* log(W) */ + char Rlog; /* random number bits */ + char C1log; /* log(1/C1) */ + char Slog; + char Stab[256]; +}; + + +#endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 
5cff15720ac0..eefe988abc8d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -81,9 +81,6 @@ enum net_directory_inos { PROC_NET_UNIX = 128, PROC_NET_ARP, PROC_NET_ROUTE, - PROC_NET_RTCLASSES, - PROC_NET_RTLOCAL, - PROC_NET_RTRULES, PROC_NET_DEV, PROC_NET_RAW, PROC_NET_RAW6, @@ -118,8 +115,6 @@ enum net_directory_inos { PROC_NET_SOCKSTAT6, PROC_NET_RTCACHE, PROC_NET_AX25_BPQETHER, - PROC_NET_ALIAS_TYPES, - PROC_NET_ALIASES, PROC_NET_IP_MASQ_APP, PROC_NET_RT6, PROC_NET_RT6_TREE, diff --git a/include/linux/route.h b/include/linux/route.h index 4432d1900525..2582f9a896b4 100644 --- a/include/linux/route.h +++ b/include/linux/route.h @@ -33,9 +33,7 @@ struct rtentry unsigned short rt_flags; short rt_pad2; unsigned long rt_pad3; - unsigned char rt_tos; - unsigned char rt_class; - short rt_pad4; + void *rt_pad4; short rt_metric; /* +1 for binary compatibility! */ char *rt_dev; /* forcing the device at add */ unsigned long rt_mtu; /* per route MTU/Window */ @@ -44,13 +42,11 @@ struct rtentry #endif unsigned long rt_window; /* Window clamping */ unsigned short rt_irtt; /* Initial RTT */ - }; #define RTF_UP 0x0001 /* route usable */ #define RTF_GATEWAY 0x0002 /* destination is a gateway */ - #define RTF_HOST 0x0004 /* host entry (net otherwise) */ #define RTF_REINSTATE 0x0008 /* reinstate route after tmout */ #define RTF_DYNAMIC 0x0010 /* created dyn. (by redirect) */ @@ -60,138 +56,12 @@ struct rtentry #define RTF_WINDOW 0x0080 /* per route window clamping */ #define RTF_IRTT 0x0100 /* Initial round trip time */ #define RTF_REJECT 0x0200 /* Reject route */ -#define RTF_STATIC 0x0400 /* Manually injected route */ -#define RTF_XRESOLVE 0x0800 /* External resolver */ -#define RTF_NOFORWARD 0x1000 /* Forwarding inhibited */ -#define RTF_THROW 0x2000 /* Go to next class */ -#define RTF_NOPMTUDISC 0x4000 /* Do not send packets with DF */ - -#define RTF_MAGIC 0x8000 /* Route added/deleted authomatically, - * when interface changes its state. 
- */ /* * uses RTF values >= 64k */ -#define RTCF_VALVE 0x00200000 -#define RTCF_MASQ 0x00400000 -#define RTCF_NAT 0x00800000 -#define RTCF_DOREDIRECT 0x01000000 -#define RTCF_LOG 0x02000000 -#define RTCF_DIRECTSRC 0x04000000 - -#define RTF_LOCAL 0x80000000 -#define RTF_INTERFACE 0x40000000 -#define RTF_MULTICAST 0x20000000 -#define RTF_BROADCAST 0x10000000 -#define RTF_NAT 0x08000000 - -#define RTF_ADDRCLASSMASK 0xF8000000 -#define RT_ADDRCLASS(flags) ((__u32)flags>>23) - -#define RT_TOS(tos) ((tos)&IPTOS_TOS_MASK) - -#define RT_LOCALADDR(flags) ((flags&RTF_ADDRCLASSMASK) == (RTF_LOCAL|RTF_INTERFACE)) - -#define RT_CLASS_UNSPEC 0 -#define RT_CLASS_DEFAULT 253 - -#define RT_CLASS_MAIN 254 -#define RT_CLASS_LOCAL 255 -#define RT_CLASS_MAX 255 - -#ifdef _LINUX_IN_H /* hack to check that in.h included */ -/* - * This structure is passed from the kernel to user space by netlink - * routing/device announcements - */ - -struct in_rtmsg -{ - struct in_addr rtmsg_prefix; - struct in_addr rtmsg_gateway; - unsigned rtmsg_flags; - unsigned long rtmsg_mtu; - unsigned long rtmsg_window; - unsigned short rtmsg_rtt; - short rtmsg_metric; - unsigned char rtmsg_tos; - unsigned char rtmsg_class; - unsigned char rtmsg_prefixlen; - unsigned char rtmsg_reserved; - int rtmsg_ifindex; -}; - - -struct in_ifmsg -{ - struct sockaddr ifmsg_lladdr; - struct in_addr ifmsg_prefix; - struct in_addr ifmsg_brd; - unsigned ifmsg_flags; - unsigned long ifmsg_mtu; - short ifmsg_metric; - unsigned char ifmsg_prefixlen; - unsigned char ifmsg_reserved; - int ifmsg_index; - char ifmsg_name[16]; -}; - -enum rtrule_actions -{ - RTP_GO, - RTP_NAT, - RTP_DROP, - RTP_UNREACHABLE, - RTP_PROHIBIT, - RTP_MASQUERADE -}; - -#define RTRF_LOG 1 /* Log route creations */ -#define RTRF_VALVE 2 /* One-way route */ - -struct in_rtrulemsg -{ - struct in_addr rtrmsg_src; - struct in_addr rtrmsg_dst; - struct in_addr rtrmsg_srcmap; - int rtrmsg_ifindex; - unsigned char rtrmsg_srclen; - unsigned char rtrmsg_dstlen; - 
unsigned char rtrmsg_tos; - unsigned char rtrmsg_class; - unsigned char rtrmsg_flags; - unsigned char rtrmsg_action; - unsigned char rtrmsg_preference; - unsigned char rtrmsg_rtmsgs; - struct in_rtmsg rtrmsg_rtmsg[1]; -}; - -struct in_rtctlmsg -{ - unsigned rtcmsg_flags; - int rtcmsg_delay; -}; - -#define RTCTL_ECHO 1 /* Echo route changes */ -#define RTCTL_FLUSH 2 /* Send flush updates */ -#define RTCTL_ACK 4 /* Send acks */ -#define RTCTL_DELAY 8 /* Set netlink delay */ -#define RTCTL_OWNER 0x10 /* Set netlink reader */ -#endif - -#define RTMSG_ACK NLMSG_ACK -#define RTMSG_OVERRUN NLMSG_OVERRUN -#define RTMSG_NEWDEVICE 0x11 -#define RTMSG_DELDEVICE 0x12 -#define RTMSG_NEWROUTE 0x21 -#define RTMSG_DELROUTE 0x22 -#define RTMSG_NEWRULE 0x31 -#define RTMSG_DELRULE 0x32 -#define RTMSG_CONTROL 0x40 -#define RTMSG_AR_FAILED 0x51 /* Address Resolution failed */ #endif /* _LINUX_ROUTE_H */ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h new file mode 100644 index 000000000000..01f60fb24e0a --- /dev/null +++ b/include/linux/rtnetlink.h @@ -0,0 +1,555 @@ +#ifndef __LINUX_RTNETLINK_H +#define __LINUX_RTNETLINK_H + +#include +#include + +#define RTNL_DEBUG 1 + + +/**** + * Routing/neighbour discovery messages. + ****/ + +/* Types of messages */ + +#define RTM_BASE 0x10 + +#define RTM_NEWLINK (RTM_BASE+0) +#define RTM_DELLINK (RTM_BASE+1) +#define RTM_GETLINK (RTM_BASE+2) + +#define RTM_NEWADDR (RTM_BASE+4) +#define RTM_DELADDR (RTM_BASE+5) +#define RTM_GETADDR (RTM_BASE+6) + +#define RTM_NEWROUTE (RTM_BASE+8) +#define RTM_DELROUTE (RTM_BASE+9) +#define RTM_GETROUTE (RTM_BASE+10) + +#define RTM_NEWNEIGH (RTM_BASE+12) +#define RTM_DELNEIGH (RTM_BASE+13) +#define RTM_GETNEIGH (RTM_BASE+14) + +#define RTM_NEWRULE (RTM_BASE+16) +#define RTM_DELRULE (RTM_BASE+17) +#define RTM_GETRULE (RTM_BASE+18) + +#define RTM_MAX (RTM_BASE+19) + + +/* Generic structure for encapsulation optional route + information. 
It is reminiscent of sockaddr, but with sa_family + replaced with attribute type. + It would be good, if constructions of sort: + struct something { + struct rtattr rta; + struct a_content a; + } + had correct alignment. It is true for x86, but I have no idea + how to make it on 64bit architectures. Please, teach me. --ANK + */ + +struct rtattr +{ + unsigned short rta_len; + unsigned short rta_type; +/* + unsigned char rta_data[0]; + */ +}; + +enum rtattr_type_t +{ + RTA_UNSPEC, + RTA_DST, + RTA_SRC, + RTA_IIF, + RTA_OIF, + RTA_GATEWAY, + RTA_PRIORITY, + RTA_PREFSRC, + RTA_WINDOW, + RTA_RTT, + RTA_MTU, + RTA_IFNAME +}; + +#define RTA_MAX RTA_IFNAME + +/* Macros to handle rtattributes */ + +#define RTA_ALIGNTO 4 +#define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) +#define RTA_OK(rta,len) ((rta)->rta_len > sizeof(struct rtattr) && \ + (rta)->rta_len <= (len)) +#define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ + (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) +#define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) +#define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) +#define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) + + +/* + * "struct rtnexthop" describres all necessary nexthop information, + * i.e. parameters of path to a destination via this nextop. + * + * At the moment it is impossible to set different prefsrc, mtu, window + * and rtt for different paths from multipath. 
+ */ + +struct rtnexthop +{ + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; + int rtnh_ifindex; +/* + struct rtattr rtnh_data[0]; + */ +}; + +/* rtnh_flags */ + +#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ +#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ +#define RTNH_F_ONLINK 4 /* Gateway is forced on link */ + +/* Macros to handle hexthops */ + +#define RTNH_ALIGNTO 4 +#define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) +#define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ + (rtnh)->rtnh_len <= (len)) +#define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) +#define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) +#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) +#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) + + +struct rtmsg +{ + unsigned char rtm_family; + unsigned char rtm_dst_len; + unsigned char rtm_src_len; + unsigned char rtm_tos; + unsigned char rtm_table; /* Routing table id */ + unsigned char rtm_protocol; /* Routing protocol; see below */ + unsigned char rtm_nhs; /* Number of nexthops */ + unsigned char rtm_type; /* See below */ + unsigned short rtm_optlen; /* Byte length of rtm_opt */ + unsigned char rtm_scope; /* See below */ + unsigned char rtm_whatsit; /* Unused byte */ + unsigned rtm_flags; +/* + struct rtattr rtm_opt[0]; + struct rtnexthop rtm_nh[0]; + */ +}; + +#define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) +#define RTM_RTNH(r) ((struct rtnexthop*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)) \ + + NLMSG_ALIGN((r)->rtm_optlen))) +#define RTM_NHLEN(nlh,r) ((nlh)->nlmsg_len - NLMSG_SPACE(sizeof(struct rtmsg)) - NLMSG_ALIGN((r)->rtm_optlen)) + +/* rtm_type */ + +enum +{ + RTN_UNSPEC, + RTN_UNICAST, /* Gateway or direct route */ + RTN_LOCAL, /* Accept locally */ + RTN_BROADCAST, /* Accept locally as broadcast, 
+ send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, + but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop */ + RTN_UNREACHABLE, /* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table */ + RTN_NAT, /* Translate this address */ + RTN_XRESOLVE, /* Use external resolver */ +}; + +#define RTN_MAX RTN_XRESOLVE + +/* rtm_protocol */ + +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +/* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; + they just passed from user and back as is. + It will be used by hypothetical multiple routing daemons. + Note that protocol values should be standardized in order to + avoid conflicts. + */ + +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC router advertisment */ + + +/* rtm_scope + + Really it is not scope, but sort of distance to the destination. + NOWHERE are reserved for not existing destinations, HOST is our + local addresses, LINK are destinations, locate on directly attached + link and UNIVERSE is everywhere in the Universe :-) + + Intermediate values are also possible f.e. interior routes + could be assigned a value between UNIVERSE and LINK. +*/ + +enum rt_scope_t +{ + RT_SCOPE_UNIVERSE=0, +/* User defined values f.e. 
"site" */ + RT_SCOPE_LINK=253, + RT_SCOPE_HOST=254, + RT_SCOPE_NOWHERE=255 +}; + +/* rtm_flags */ + +#define RTM_F_NOTIFY 0x100 /* Notify user of route change */ +#define RTM_F_CLONED 0x200 /* This route is cloned */ +#define RTM_F_NOPMTUDISC 0x400 /* Do not make PMTU discovery */ +#define RTM_F_EQUALIZE 0x800 /* Multipath equalizer: NI */ + +/* Reserved table identifiers */ + +enum rt_class_t +{ + RT_TABLE_UNSPEC=0, +/* User defined values */ + RT_TABLE_DEFAULT=253, + RT_TABLE_MAIN=254, + RT_TABLE_LOCAL=255 +}; +#define RT_TABLE_MAX RT_TABLE_LOCAL + + +/********************************************************* + * Interface address. + ****/ + +struct ifaddrmsg +{ + unsigned char ifa_family; + unsigned char ifa_prefixlen; /* The prefix length */ + unsigned char ifa_flags; /* Flags */ + unsigned char ifa_scope; /* See above */ + int ifa_index; /* Link index */ +/* + struct rtattr ifa_data[0]; + */ +}; + +enum +{ + IFA_UNSPEC, + IFA_ADDRESS, + IFA_LOCAL, + IFA_LABEL, + IFA_BROADCAST, + IFA_ANYCAST +}; + +#define IFA_MAX IFA_ANYCAST + +/* ifa_flags */ + +#define IFA_F_SECONDARY 1 + + +#define IFA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifaddrmsg)))) + +/* + Important comment: + IFA_ADDRESS is prefix address, rather than local interface address. + It makes no difference for normally configured broadcast interfaces, + but for point-to-point IFA_ADDRESS is DESTINATION address, + local address is supplied in IFA_LOCAL attribute. + */ + +/************************************************************** + * Neighbour discovery. + ****/ + +struct ndmsg +{ + unsigned char nd_family; + int nd_ifindex; /* Link index */ + unsigned nd_flags; +/* + struct rtattr nd_data[0]; + */ +}; + +enum +{ + NDA_UNSPEC, + NDA_DST, + NDA_LLADDR, +}; + +#define NDA_MAX NDA_LLADDR + +#define NDA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg)))) + +/**** + * General form of address family dependent message. 
+ ****/ + +struct rtgenmsg +{ + unsigned char rtgen_family; +}; + +/***************************************************************** + * Link layer specific messages. + ****/ + +/* struct ifinfomsg + * passes link level specific information, not dependent + * on network protocol. + */ + +struct ifinfomsg +{ + unsigned char ifi_family; /* Dummy */ + unsigned char ifi_addrlen; /* Length of HW address */ + unsigned short ifi_pad__; + int ifi_index; /* Link index */ + int ifi_link; /* Physical device */ + char ifi_name[IFNAMSIZ]; + struct sockaddr ifi_address; /* HW address */ + struct sockaddr ifi_broadcast; /* HW broadcast */ + unsigned ifi_flags; /* IFF_* flags */ + int ifi_mtu; /* Link mtu */ + char ifi_qdiscname[IFNAMSIZ];/* Id of packet scheduler */ + int ifi_qdisc; /* Packet scheduler handle */ +}; + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neiher of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* ifi_link. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. tunnel), ifi_link + can point to real physical interface (f.e. 
for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +#define RTMGRP_LINK 1 +#define RTMGRP_NOTIFY 2 + +#define RTMGRP_IPV4_IFADDR 0x10 +#define RTMGRP_IPV4_NDISC 0x20 +#define RTMGRP_IPV4_ROUTE 0x40 +#define RTMGRP_IPV4_MROUTE 0x80 + +#define RTMGRP_IPV6_IFADDR 0x100 +#define RTMGRP_IPV6_NDISC 0x200 +#define RTMGRP_IPV6_ROUTE 0x400 +#define RTMGRP_IPV6_MROUTE 0x800 + + +#ifdef __KERNEL__ + +struct kern_rta +{ + void *rta_dst; + void *rta_src; + int *rta_iif; + int *rta_oif; + void *rta_gw; + u32 *rta_priority; + void *rta_prefsrc; + unsigned *rta_window; + unsigned *rta_rtt; + unsigned *rta_mtu; + unsigned char *rta_ifname; +}; + +struct kern_ifa +{ + void *ifa_address; + void *ifa_local; + unsigned char *ifa_label; + void *ifa_broadcast; + void *ifa_anycast; +}; + + +extern atomic_t rtnl_rlockct; +extern struct wait_queue *rtnl_wait; + +#ifdef CONFIG_RTNETLINK +extern struct sock *rtnl; + +struct rtnetlink_link +{ + int (*doit)(struct sk_buff *, struct nlmsghdr*, void *attr); + int (*dumpit)(struct sk_buff *, struct netlink_callback *cb); +}; + +extern struct rtnetlink_link * rtnetlink_links[NPROTO]; +extern int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb); + + +extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); + +#define RTA_PUT(skb, attrtype, attrlen, data) \ +({ if (skb_tailroom(skb) < RTA_SPACE(attrlen)) goto rtattr_failure; \ + __rta_fill(skb, attrtype, attrlen, data); }) + +extern unsigned long rtnl_wlockct; + +/* NOTE: these locks are not interrupt safe, are not SMP safe, + * they are even not atomic. 8)8)8) ... and it is not a bug. + * Really, if these locks will be programmed correctly, + * all the addressing/routing machine would become SMP safe, + * but is absolutely useless at the moment, because all the kernel + * is not reenterable in any case. 
--ANK + * + * Well, atomic_* and set_bit provide the only thing here: + * gcc is confused not to overoptimize them, that's all. + * I remember as gcc splitted ++ operation, but cannot reproduce + * it with gcc-2.7.*. --ANK + * + * One more note: rwlock facility should be written and put + * to a kernel wide location: f.e. current implementation of semaphores + * (especially, for x86) looks like a wonder. It would be good + * to have something similar for rwlock. Recursive lock could be also + * useful thing. --ANK + */ + +extern __inline__ int rtnl_shlock_nowait(void) +{ + atomic_inc(&rtnl_rlockct); + if (test_bit(0, &rtnl_wlockct)) { + atomic_dec(&rtnl_rlockct); + return -EAGAIN; + } + return 0; +} + +extern __inline__ void rtnl_shlock(void) +{ + while (rtnl_shlock_nowait()) + sleep_on(&rtnl_wait); +} + +/* Check for possibility to PROMOTE shared lock to exclusive. + Shared lock must be already grabbed with rtnl_shlock*(). + */ + +extern __inline__ int rtnl_exlock_nowait(void) +{ + if (atomic_read(&rtnl_rlockct) > 1) + return -EAGAIN; + if (test_and_set_bit(0, &rtnl_wlockct)) + return -EAGAIN; + return 0; +} + +extern __inline__ void rtnl_exlock(void) +{ + while (rtnl_exlock_nowait()) + sleep_on(&rtnl_wait); +} + +#if 0 +extern __inline__ void rtnl_shunlock(void) +{ + atomic_dec(&rtnl_rlockct); + if (atomic_read(&rtnl_rlockct) <= 1) { + wake_up(&rtnl_wait); + if (rtnl->receive_queue.qlen) + rtnl->data_ready(rtnl, 0); + } +} +#else + +/* The problem: inline requires to include and, hence, + almost all of net includes :-( + */ + +#define rtnl_shunlock() ({ \ + atomic_dec(&rtnl_rlockct); \ + if (atomic_read(&rtnl_rlockct) <= 1) { \ + wake_up(&rtnl_wait); \ + if (rtnl->receive_queue.qlen) \ + rtnl->data_ready(rtnl, 0); \ + } \ +}) +#endif + +/* Release exclusive lock. Note, that we do not wake up rtnetlink socket, + * it will be done later after releasing shared lock. 
+ */ + +extern __inline__ void rtnl_exunlock(void) +{ + clear_bit(0, &rtnl_wlockct); + wake_up(&rtnl_wait); +} + +#else + +extern __inline__ void rtnl_shlock(void) +{ + while (atomic_read(&rtnl_rlockct)) + sleep_on(&rtnl_wait); + atomic_inc(&rtnl_rlockct); +} + +extern __inline__ void rtnl_shunlock(void) +{ + if (atomic_dec_and_test(&rtnl_rlockct)) + wake_up(&rtnl_wait); +} + +extern __inline__ void rtnl_exlock(void) +{ +} + +extern __inline__ void rtnl_exunlock(void) +{ +} + +#endif + +extern void rtnl_lock(void); +extern void rtnl_unlock(void); +extern void rtnetlink_init(void); + +#endif /* __KERNEL__ */ + + +#endif /* __LINUX_RTNETLINK_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a2617b15cdab..3668712d85b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -90,15 +90,10 @@ struct sk_buff arp; /* Has IP/ARP resolution finished */ unsigned char tries, /* Times tried */ inclone, /* Inline clone */ - priority, pkt_type, /* Packet class */ pkt_bridged, /* Tracker for bridging */ ip_summed; /* Driver fed us an IP checksum */ -#define PACKET_HOST 0 /* To us */ -#define PACKET_BROADCAST 1 /* To all */ -#define PACKET_MULTICAST 2 /* To group */ -#define PACKET_OTHERHOST 3 /* To someone else */ -#define PACKET_NDISC 17 /* Outgoing NDISC packet */ + __u32 priority; atomic_t users; /* User count - see datagram.c,tcp.c */ unsigned short protocol; /* Packet protocol from driver. 
*/ unsigned short security; /* Security level of packet */ @@ -447,13 +442,17 @@ here: ; return skb->data; } -extern __inline__ unsigned char * skb_pull(struct sk_buff *skb, unsigned int len) +extern __inline__ char *__skb_pull(struct sk_buff *skb, unsigned int len) { + skb->len-=len; + return skb->data+=len; +} + +extern __inline__ unsigned char * skb_pull(struct sk_buff *skb, unsigned int len) +{ if (len > skb->len) return NULL; - skb->data+=len; - skb->len-=len; - return skb->data; + return __skb_pull(skb,len); } extern __inline__ int skb_headroom(struct sk_buff *skb) @@ -472,11 +471,16 @@ extern __inline__ void skb_reserve(struct sk_buff *skb, unsigned int len) skb->tail+=len; } +extern __inline__ void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + skb->len = len; + skb->tail = skb->data+len; +} + extern __inline__ void skb_trim(struct sk_buff *skb, unsigned int len) { if (skb->len > len) { - skb->len = len; - skb->tail = skb->data+len; + __skb_trim(skb, len); } } @@ -515,8 +519,15 @@ extern __inline__ void skb_orphan(struct sk_buff *skb) skb->sk = NULL; } +extern __inline__ void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb=skb_dequeue(list))!=NULL) + kfree_skb(skb,0); +} + extern struct sk_buff * skb_recv_datagram(struct sock *sk,unsigned flags,int noblock, int *err); -extern unsigned int datagram_poll(struct socket *sock, poll_table *wait); +extern unsigned int datagram_poll(struct socket *sock, struct poll_table_struct *wait); extern int skb_copy_datagram(struct sk_buff *from, int offset, char *to,int size); extern int skb_copy_datagram_iovec(struct sk_buff *from, int offset, struct iovec *to,int size); extern void skb_free_datagram(struct sock * sk, struct sk_buff *skb); diff --git a/include/linux/socket.h b/include/linux/socket.h index d3365f698da2..7ba4fe3939ab 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -12,8 +12,7 @@ typedef unsigned short sa_family_t; * 1003.1g requires sa_family_t 
and that sa_data is char. */ -struct sockaddr -{ +struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ char sa_data[14]; /* 14 bytes of protocol address */ }; @@ -29,8 +28,7 @@ struct linger { * belong in an obscure libc emulation or the bin. */ -struct msghdr -{ +struct msghdr { void * msg_name; /* Socket name */ int msg_namelen; /* Length of name */ struct iovec * msg_iov; /* Data blocks */ @@ -57,7 +55,8 @@ struct cmsghdr { * Table 5-14 of POSIX 1003.1g */ -#define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr(mhdr, cmsg) +#define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg)) +#define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg)) #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) @@ -65,18 +64,19 @@ struct cmsghdr { #define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len)) -#define CMSG_FIRSTHDR(msg) ((msg)->msg_controllen >= sizeof(struct cmsghdr) ? \ - (struct cmsghdr *)(msg)->msg_control : \ - (struct cmsghdr *)NULL) +#define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? 
\ + (struct cmsghdr *)(ctl) : \ + (struct cmsghdr *)NULL) +#define CMSG_FIRSTHDR(msg) __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen) /* * This mess will go away with glibc */ #ifdef __KERNEL__ -#define KINLINE extern __inline__ +#define __KINLINE extern __inline__ #else -#define KINLINE static +#define __KINLINE static #endif @@ -84,20 +84,23 @@ struct cmsghdr { * Get the next cmsg header */ -KINLINE struct cmsghdr * cmsg_nxthdr(struct msghdr *mhdr, - struct cmsghdr *cmsg) +__KINLINE struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, + struct cmsghdr *__cmsg) { - unsigned char * ptr; + unsigned char * __ptr; - if (cmsg->cmsg_len < sizeof(struct cmsghdr)) - { + if (__cmsg->cmsg_len < sizeof(struct cmsghdr)) return NULL; - } - ptr = ((unsigned char *) cmsg) + CMSG_ALIGN(cmsg->cmsg_len); - if (ptr >= (unsigned char *) mhdr->msg_control + mhdr->msg_controllen) + __ptr = ((unsigned char *) __cmsg) + CMSG_ALIGN(__cmsg->cmsg_len); + if (__ptr >= (unsigned char *) __ctl + __size) return NULL; - return (struct cmsghdr *) ptr; + return (struct cmsghdr *) __ptr; +} + +__KINLINE struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg) +{ + return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } /* "Socket"-level control message types: */ @@ -106,8 +109,7 @@ KINLINE struct cmsghdr * cmsg_nxthdr(struct msghdr *mhdr, #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_CONNECT 0x03 /* rw: struct scm_connect */ -struct ucred -{ +struct ucred { __kernel_pid_t pid; __kernel_uid_t uid; __kernel_gid_t gid; @@ -144,6 +146,9 @@ struct ucred #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define pseudo_AF_KEY 15 /* PF_KEY key management API */ +#define AF_NETLINK 16 +#define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ +#define AF_PACKET 17 /* Packet family */ #define AF_MAX 32 /* For now.. */ /* Protocol families, same as address families. 
*/ @@ -164,6 +169,9 @@ struct ucred #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY pseudo_AF_KEY +#define PF_NETLINK AF_NETLINK +#define PF_ROUTE AF_ROUTE +#define PF_PACKET AF_PACKET #define PF_MAX AF_MAX @@ -196,6 +204,9 @@ struct ucred /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 +/* #define SOL_ICMP 1 No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */ +#define SOL_TCP 6 +#define SOL_UDP 17 #define SOL_IPV6 41 #define SOL_ICMPV6 58 #define SOL_RAW 255 @@ -206,8 +217,7 @@ struct ucred #define SOL_ROSE 260 #define SOL_DECNET 261 #define SOL_X25 262 -#define SOL_TCP 6 -#define SOL_UDP 17 +#define SOL_PACKET 263 /* IPX options */ #define IPX_TYPE 1 @@ -216,24 +226,19 @@ struct ucred #define TCP_NODELAY 1 #define TCP_MAXSEG 2 -/* The various priorities. */ -#define SOPRI_INTERACTIVE 0 -#define SOPRI_NORMAL 1 -#define SOPRI_BACKGROUND 2 - #ifdef __KERNEL__ extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len); -extern unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, - int offset, - int len, int csum); +extern int csum_partial_copy_fromiovecend(unsigned char *kdata, + struct iovec *iov, + int offset, + int len, int *csump); extern int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode); extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen); extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); -extern void put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); #endif #endif /* _LINUX_SOCKET_H */ diff --git a/include/linux/sockios.h b/include/linux/sockios.h index 5147dd23ce40..5465bc6b840c 100644 --- 
a/include/linux/sockios.h +++ b/include/linux/sockios.h @@ -53,18 +53,19 @@ #define SIOCSIFSLAVE 0x8930 #define SIOCADDMULTI 0x8931 /* Multicast address lists */ #define SIOCDELMULTI 0x8932 -#define SIOGIFINDEX 0x8933 /* name -> if_index mapping */ -#define SIOGIFNAME 0x8934 /* if_index -> name mapping */ -#define SIOCGIFCOUNT 0x8935 /* get number of interfaces */ +#define SIOCGIFINDEX 0x8933 /* name -> if_index mapping */ +#define SIOGIFINDEX SIOCGIFINDEX /* misprint compatibility :-) */ +#define SIOCSIFPFLAGS 0x8934 /* set/get extended flags set */ +#define SIOCGIFPFLAGS 0x8935 #define SIOCDIFADDR 0x8936 /* delete PA address */ +#define SIOCSIFHWBROADCAST 0x8937 /* set hardware broadcast addr */ +#define SIOCGIFCOUNT 0x8938 /* get number of devices */ #define SIOCGIFBR 0x8940 /* Bridging support */ #define SIOCSIFBR 0x8941 /* Set bridging options */ /* ARP cache control calls. */ -#define OLD_SIOCDARP 0x8950 /* old delete ARP table entry */ -#define OLD_SIOCGARP 0x8951 /* old get ARP table entry */ -#define OLD_SIOCSARP 0x8952 /* old set ARP table entry */ + /* 0x8950 - 0x8952 * obsolete calls, don't re-use */ #define SIOCDARP 0x8953 /* delete ARP table entry */ #define SIOCGARP 0x8954 /* get ARP table entry */ #define SIOCSARP 0x8955 /* set ARP table entry */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 587aaa847a82..0fe55b2715bf 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -150,14 +150,15 @@ enum NET_IPV4_RFC1812_FILTER, NET_IPV4_LOG_MARTIANS, NET_IPV4_SOURCE_ROUTE, - NET_IPV4_ADDRMASK_AGENT, - NET_IPV4_BOOTP_AGENT, + NET_IPV4_SEND_REDIRECTS, + NET_IPV4_AUTOCONFIG, NET_IPV4_BOOTP_RELAY, - NET_IPV4_FIB_MODEL, + NET_IPV4_PROXY_ARP, NET_IPV4_NO_PMTU_DISC, NET_IPV4_ACCEPT_REDIRECTS, NET_IPV4_SECURE_REDIRECTS, NET_IPV4_RFC1620_REDIRECTS, + NET_IPV4_RTCACHE_FLUSH, NET_IPV4_TCP_SYN_RETRIES, NET_IPV4_IPFRAG_HIGH_THRESH, NET_IPV4_IPFRAG_LOW_THRESH, @@ -176,6 +177,14 @@ enum NET_TCP_STDURG, NET_TCP_SYN_TAILDROP, 
NET_TCP_MAX_SYN_BACKLOG, + NET_IPV4_LOCAL_PORT_RANGE, + NET_IPV4_ICMP_ECHO_IGNORE_ALL, + NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, + NET_IPV4_ICMP_SOURCEQUENCH_RATE, + NET_IPV4_ICMP_DESTUNREACH_RATE, + NET_IPV4_ICMP_TIMEEXCEED_RATE, + NET_IPV4_ICMP_PARAMPROB_RATE, + NET_IPV4_ICMP_ECHOREPLY_RATE }; @@ -198,6 +207,8 @@ enum { NET_IPV6_RTR_SOLICITS, NET_IPV6_RTR_SOLICIT_INTERVAL, NET_IPV6_RTR_SOLICIT_DELAY, + + NET_IPV6_ICMPV6_TIME, }; /* /proc/sys/net/ipx */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ae6a063e32dc..594b3108a27b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -68,4 +68,18 @@ enum { TCP_CLOSING /* now a valid state */ }; +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11) +}; + #endif /* _LINUX_TCP_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 9d2a69100b04..155662f9d085 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -40,6 +40,9 @@ struct dst_entry unsigned window; unsigned pmtu; unsigned rtt; + unsigned long rate_last; /* rate limiting for ICMP */ + unsigned long rate_tokens; + int error; struct neighbour *neighbour; @@ -49,7 +52,7 @@ struct dst_entry int (*output)(struct sk_buff*); struct dst_ops *ops; - + char info[0]; }; @@ -57,12 +60,14 @@ struct dst_entry struct dst_ops { unsigned short family; - struct dst_entry * (*check)(struct dst_entry *, u32 cookie); + struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); struct dst_entry * (*reroute)(struct dst_entry *, struct sk_buff *); void (*destroy)(struct dst_entry *); }; +#ifdef __KERNEL__ + extern struct dst_entry * dst_garbage_list; extern atomic_t dst_total; @@ -122,5 +127,6 @@ void dst_free(struct dst_entry * dst) } __dst_free(dst); } +#endif #endif /* _NET_DST_H */ diff 
--git a/include/net/gc.h b/include/net/gc.h deleted file mode 100644 index 0b28c098ec4d..000000000000 --- a/include/net/gc.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Interface routines assumed by gc() - * - * Copyright (C) Barak A. Pearlmutter. - * Released under the GPL version 2 or later. - * - */ - -typedef struct object *pobj; /* pointer to a guy of the type we gc */ - -/* - * How to mark and unmark objects - */ - -extern void gc_mark(pobj); -extern void gc_unmark(pobj); -extern int gc_marked(pobj); - -/* - * How to count and access an object's children - */ - -extern int n_children(pobj); /* how many children */ -extern pobj child_n(pobj, int); /* child i, numbered 0..n-1 */ - -/* - * How to access the root set - */ - -extern int root_size(void); /* number of things in root set */ -extern pobj root_elt(int); /* element i of root set, numbered 0..n-1 */ - -/* - * How to access the free list - */ - -extern void clear_freelist(void); -extern void add_to_free_list(pobj); - -/* - * How to iterate through all objects in memory - */ - -extern int N_OBJS; -extern pobj obj_number(int); - diff --git a/include/net/icmp.h b/include/net/icmp.h index a936803eb2e2..6e9c541887de 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -33,6 +33,9 @@ extern int icmp_rcv(struct sk_buff *skb, unsigned short len); extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern void icmp_init(struct net_proto_family *ops); +/* Move into dst.h ? 
*/ +extern int xrlim_allow(struct dst_entry *dst, int timeout); + /* CONFIG_IP_TRANSPARENT_PROXY */ extern int icmp_chkaddr(struct sk_buff *skb); diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 728b6662e750..af1128bafea5 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -28,7 +28,7 @@ extern int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, struct scm_cookie *scm); extern int inet_shutdown(struct socket *sock, int how); -extern unsigned int inet_poll(struct socket *sock, poll_table *wait); +extern unsigned int inet_poll(struct socket *sock, struct poll_table_struct *wait); extern int inet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen); diff --git a/include/net/ip.h b/include/net/ip.h index e5d59dd33cd3..29c40ca87409 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #ifndef _SNMP_H @@ -38,21 +40,30 @@ struct inet_skb_parm struct ip_options opt; /* Compiled IP options */ u16 redirport; /* Redirect port */ unsigned char flags; - char vif; #define IPSKB_MASQUERADED 1 #define IPSKB_TRANSLATED 2 -#define IPSKB_TUNNELED 4 +#define IPSKB_FORWARDED 4 }; struct ipcm_cookie { u32 addr; + int oif; struct ip_options *opt; }; #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) +struct ip_ra_chain +{ + struct ip_ra_chain *next; + struct sock *sk; + void (*destructor)(struct sock *); +}; + +extern struct ip_ra_chain *ip_ra_chain; + /* IP flags. 
*/ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ @@ -134,20 +145,23 @@ struct ipv4_config int secure_redirects; int rfc1620_redirects; int rfc1812_filter; - int addrmask_agent; + int send_redirects; int log_martians; int source_route; int multicast_route; - int bootp_agent; + int proxy_arp; int bootp_relay; - int fib_model; + int autoconfig; int no_pmtu_disc; }; extern struct ipv4_config ipv4_config; +extern int sysctl_local_port_range[2]; #define IS_ROUTER (ip_statistics.IpForwarding == 1) +extern int ip_call_ra_chain(struct sk_buff *skb); + /* * Functions provided by ip_fragment.o */ @@ -165,7 +179,7 @@ extern int ip_net_unreachable(struct sk_buff *skb); * Functions provided by ip_options.c */ -extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, u32 daddr, u32 saddr, int is_frag); +extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, u32 daddr, struct rtable *rt, int is_frag); extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); extern void ip_options_fragment(struct sk_buff *skb); extern int ip_options_compile(struct ip_options *opt, struct sk_buff *skb); @@ -179,9 +193,12 @@ extern int ip_options_rcv_srr(struct sk_buff *skb); */ extern void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb); -extern int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **devp); +extern int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc); extern int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen); extern int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen); +extern int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); extern int ipv4_backlog_rcv(struct sock *sk, struct sk_buff *skb); + + #endif /* _IP_H */ diff --git a/include/net/ip_alias.h b/include/net/ip_alias.h deleted file mode 100644 index 683a04276434..000000000000 --- 
a/include/net/ip_alias.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * IP_ALIAS (AF_INET) aliasing definitions. - * - * - * Version: @(#)ip_alias.h 0.43 12/20/95 - * - * Author: Juan Jose Ciarlante, - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#ifndef _IP_ALIAS_H -#define _IP_ALIAS_H - -extern int ip_alias_init(void); -extern int ip_alias_done(void); - -#endif /* _IP_ALIAS_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e96378d777ab..cd46bc7dcd67 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -16,118 +16,190 @@ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H +#include -struct fib_node +struct fib_nh { - struct fib_node *fib_next; - u32 fib_key; - struct fib_info *fib_info; - short fib_metric; - u8 fib_tos; - u8 fib_flag; + struct device *nh_dev; + unsigned nh_flags; + unsigned char nh_scope; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nh_weight; + int nh_power; +#endif + int nh_oif; + u32 nh_gw; }; -#define FIBFLG_DOWN 1 /* Ignore this node */ -#define FIBFLG_THROW 2 /* Class lookup failed */ -#define FIBFLG_REJECT 4 /* Route lookup failed */ - -#define MAGIC_METRIC 0x7FFF - /* * This structure contains data shared by many of routes. 
- */ + */ struct fib_info { struct fib_info *fib_next; struct fib_info *fib_prev; - u32 fib_gateway; - struct device *fib_dev; int fib_refcnt; - unsigned long fib_window; unsigned fib_flags; - unsigned short fib_mtu; - unsigned short fib_irtt; + int fib_protocol; + u32 fib_prefsrc; + unsigned fib_mtu; + unsigned fib_rtt; + unsigned fib_window; + int fib_nhs; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int fib_power; +#endif + struct fib_nh fib_nh[0]; +#define fib_dev fib_nh[0].nh_dev }; -struct fib_zone -{ - struct fib_zone *fz_next; - struct fib_node **fz_hash; - int fz_nent; - int fz_divisor; - u32 fz_hashmask; - int fz_logmask; - u32 fz_mask; -}; -struct fib_class -{ - unsigned char cl_id; - unsigned char cl_auto; - struct fib_zone *fib_zones[33]; - struct fib_zone *fib_zone_list; - int cl_users; -}; +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_rule; +#endif -struct fib_rule +struct fib_result { - struct fib_rule *cl_next; - struct fib_class *cl_class; - u32 cl_src; - u32 cl_srcmask; - u32 cl_dst; - u32 cl_dstmask; - u32 cl_srcmap; - u8 cl_action; - u8 cl_flags; - u8 cl_tos; - u8 cl_preference; - struct device *cl_dev; + u32 *prefix; + unsigned char prefixlen; + unsigned char nh_sel; + unsigned char type; + unsigned char scope; + struct fib_info *fi; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_rule *r; +#endif }; -struct fib_result +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) +#define FIB_RES_RESET(res) ((res).nh_sel = 0) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define FIB_RES_NH(res) ((res).fi->fib_nh[0]) +#define FIB_RES_RESET(res) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? 
: __fib_res_prefsrc(&res)) +#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) +#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) +#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) + +struct fib_table { - struct fib_node *f; - struct fib_rule *fr; - int fm; + unsigned char tb_id; + unsigned tb_stamp; + int (*tb_lookup)(struct fib_table *tb, const struct rt_key *key, struct fib_result *res); + int (*tb_insert)(struct fib_table *table, struct rtmsg *r, + struct kern_rta *rta, struct nlmsghdr *n, + struct netlink_skb_parms *req); + int (*tb_delete)(struct fib_table *table, struct rtmsg *r, + struct kern_rta *rta, struct nlmsghdr *n, + struct netlink_skb_parms *req); + int (*tb_dump)(struct fib_table *table, struct sk_buff *skb, + struct netlink_callback *cb); + int (*tb_flush)(struct fib_table *table); + int (*tb_get_info)(struct fib_table *table, char *buf, + int first, int count); + + unsigned char tb_data[0]; }; -void ip_fib_init(void); -unsigned ip_fib_chk_addr(u32 addr); -int ip_fib_chk_default_gw(u32 addr, struct device*); +#ifndef CONFIG_IP_MULTIPLE_TABLES + +extern struct fib_table *local_table; +extern struct fib_table *main_table; + +extern __inline__ struct fib_table *fib_get_table(int id) +{ + if (id != RT_TABLE_LOCAL) + return main_table; + return local_table; +} -int fib_lookup(struct fib_result *, u32 daddr, u32 src, u8 tos, struct device *devin, - struct device *devout); +extern __inline__ struct fib_table *fib_new_table(int id) +{ + return fib_get_table(id); +} -static __inline__ struct fib_info * -fib_lookup_info(u32 dst, u32 src, u8 tos, struct device *devin, - struct device *devout) +extern __inline__ int fib_lookup(const struct rt_key *key, struct fib_result *res) { - struct fib_result res; - if (fib_lookup(&res, dst, src, tos, devin, devout) < 0) - return NULL; - return res.f->fib_info; + if (local_table->tb_lookup(local_table, key, res)) + return main_table->tb_lookup(main_table, key, res); + return 0; } -static __inline__ struct device * 
get_gw_dev(u32 gw, struct device *dev) +#else /* CONFIG_IP_MULTIPLE_TABLES */ +#define local_table (fib_tables[RT_TABLE_LOCAL]) +#define main_table (fib_tables[RT_TABLE_MAIN]) + +extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; +extern int fib_lookup(const struct rt_key *key, struct fib_result *res); +extern struct fib_table *__fib_new_table(int id); + +extern __inline__ struct fib_table *fib_get_table(int id) { - struct fib_info * fi; + if (id == 0) + id = RT_TABLE_MAIN; - fi = fib_lookup_info(gw, 0, 1, &loopback_dev, dev); - if (fi) - return fi->fib_dev; - return NULL; + return fib_tables[id]; } -extern int ip_rt_event(int event, struct device *dev); -extern int ip_rt_ioctl(unsigned int cmd, void *arg); -extern void ip_rt_change_broadcast(struct device *, u32); -extern void ip_rt_change_dstaddr(struct device *, u32); -extern void ip_rt_change_netmask(struct device *, u32); -extern void ip_rt_multicast_event(struct device *dev); +extern __inline__ struct fib_table *fib_new_table(int id) +{ + if (id == 0) + id = RT_TABLE_MAIN; -extern struct device * ip_dev_find_tunnel(u32 daddr, u32 saddr); + return fib_tables[id] ? 
: __fib_new_table(id); +} +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + +/* Exported by fib_frontend.c */ +extern void ip_fib_init(void); +extern void fib_flush(void); +extern int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb); +extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct device *dev, u32 *spec_dst); +extern void fib_select_multipath(const struct rt_key *key, struct fib_result *res); + +/* Exported by fib_semantics.c */ +extern int ip_fib_check_default(u32 gw, struct device *dev); +extern void fib_release_info(struct fib_info *); +extern int fib_semantic_match(int type, struct fib_info *, + const struct rt_key *, struct fib_result*); +extern struct fib_info *fib_create_info(const struct rtmsg *r, struct kern_rta *rta, + const struct nlmsghdr *, int *err); +extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, struct kern_rta *rta, struct fib_info *fi); +extern int fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, + struct fib_info *fi); +extern int fib_sync_down(u32 local, struct device *dev); +extern int fib_sync_up(struct device *dev); +extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r); +extern void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 mask, char *buffer); +extern u32 __fib_res_prefsrc(struct fib_result *res); + +/* Exported by fib_hash.c */ +extern struct fib_table *fib_hash_init(int id); + +#ifdef CONFIG_IP_MULTIPLE_TABLES +/* Exported by fib_rules.c */ + +extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_newrule(struct 
sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); +extern u32 fib_rules_map_destination(u32 daddr, struct fib_result *res); +extern u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags); +extern void fib_rules_init(void); +#endif #endif _NET_FIB_H diff --git a/include/net/ip_masq.h b/include/net/ip_masq.h index 8b29eeb153fa..4a0b10a55a08 100644 --- a/include/net/ip_masq.h +++ b/include/net/ip_masq.h @@ -88,14 +88,14 @@ extern int ip_masq_init(void); /* * functions called from ip layer */ -extern int ip_fw_masquerade(struct sk_buff **, struct device *); -extern int ip_fw_masq_icmp(struct sk_buff **, struct device *); -extern int ip_fw_demasquerade(struct sk_buff **, struct device *); +extern int ip_fw_masquerade(struct sk_buff **, __u32 maddr); +extern int ip_fw_masq_icmp(struct sk_buff **); +extern int ip_fw_demasquerade(struct sk_buff **); /* * ip_masq obj creation/deletion functions. */ -extern struct ip_masq *ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags); +extern struct ip_masq *ip_masq_new(__u32 maddr, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags); extern void ip_masq_set_expire(struct ip_masq *ms, unsigned long tout); @@ -116,9 +116,9 @@ struct ip_masq_app int (*masq_done_1) /* ip_masq fin. */ (struct ip_masq_app *, struct ip_masq *); int (*pkt_out) /* output (masquerading) hook */ - (struct ip_masq_app *, struct ip_masq *, struct sk_buff **, struct device *); + (struct ip_masq_app *, struct ip_masq *, struct sk_buff **, __u32); int (*pkt_in) /* input (demasq) hook */ - (struct ip_masq_app *, struct ip_masq *, struct sk_buff **, struct device *); + (struct ip_masq_app *, struct ip_masq *, struct sk_buff **); }; /* @@ -147,8 +147,8 @@ extern int ip_masq_unbind_app(struct ip_masq *ms); * output and input app. masquerading hooks. 
* */ -extern int ip_masq_app_pkt_out(struct ip_masq *, struct sk_buff **skb_p, struct device *dev); -extern int ip_masq_app_pkt_in(struct ip_masq *, struct sk_buff **skb_p, struct device *dev); +extern int ip_masq_app_pkt_out(struct ip_masq *, struct sk_buff **skb_p, __u32 maddr); +extern int ip_masq_app_pkt_in(struct ip_masq *, struct sk_buff **skb_p); /* * service routine(s). diff --git a/include/net/ipconfig.h b/include/net/ipconfig.h new file mode 100644 index 000000000000..db64243a6742 --- /dev/null +++ b/include/net/ipconfig.h @@ -0,0 +1,19 @@ +/* + * $Id: ipconfig.h,v 1.2 1997/10/17 12:41:16 mj Exp $ + * + * Copyright (C) 1997 Martin Mares + * + * Automatic IP Layer Configuration + */ + +extern __u32 root_server_addr; +extern u8 root_server_path[]; +extern u32 ic_myaddr; +extern u32 ic_servaddr; +extern u32 ic_gateway; +extern u32 ic_netmask; +extern int ic_bootp_flag; +extern int ic_rarp_flag; +extern int ic_enable; +extern int ic_host_name_set; +extern int ic_set_manually; diff --git a/include/net/ipip.h b/include/net/ipip.h index 64ce7a29dda9..22c464c3c043 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -1,8 +1,33 @@ #ifndef __NET_IPIP_H #define __NET_IPIP_H 1 -extern void ipip_err(struct sk_buff *skb, unsigned char*); -extern int ipip_rcv(struct sk_buff *skb, unsigned short len); - +#include + +/* Keep error state on tunnel for 30 sec */ +#define IPTUNNEL_ERR_TIMEO (30*HZ) + +struct ip_tunnel +{ + struct ip_tunnel *next; + struct device *dev; + struct net_device_stats stat; + + int recursion; /* Depth of hard_start_xmit recursion */ + int err_count; /* Number of arrived ICMP errors */ + unsigned long err_time; /* Time when the last ICMP error arrived */ + + /* These four fields used only by GRE */ + __u32 i_seqno; /* The last seen seqno */ + __u32 o_seqno; /* The last output seqno */ + int hlen; /* Precalculated GRE header length */ + int mlink; + + struct ip_tunnel_parm parms; +}; + +extern int ipip_init(void); +extern int ipgre_init(void); 
+extern int sit_init(void); +extern void sit_cleanup(void); #endif diff --git a/include/net/netlink.h b/include/net/netlink.h deleted file mode 100644 index 17aa555832dc..000000000000 --- a/include/net/netlink.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef __NET_NETLINK_H -#define __NET_NETLINK_H - -#define NET_MAJOR 36 /* Major 18 is reserved for networking */ -#define MAX_LINKS 32 -#define MAX_QBYTES 32768 /* Maximum bytes in the queue */ - -#include - -extern int netlink_attach(int unit, int (*function)(int,struct sk_buff *skb)); -extern int netlink_donothing(int, struct sk_buff *skb); -extern void netlink_detach(int unit); -extern int netlink_post(int unit, struct sk_buff *skb); -extern int init_netlink(void); - -/* - * skb should fit one page. This choice is good for headerless malloc. - */ -#define NLMSG_GOODSIZE (PAGE_SIZE - ((sizeof(struct sk_buff)+0xF)&~0xF)-32) - -#define NLMSG_RECOVERY_TIMEO (HZ/2) /* If deleivery was failed, - retry after */ - -struct nlmsg_ctl -{ - struct timer_list nlmsg_timer; - struct sk_buff *nlmsg_skb; /* Partially built skb */ - int nlmsg_unit; - int nlmsg_delay; /* Time to delay skb send*/ - int nlmsg_maxsize; /* Maximal message size */ - int nlmsg_force; /* post immediately */ - unsigned long nlmsg_overrun_start; /* seqno starting lossage*/ - unsigned long nlmsg_overrun_end; /* the last lost message */ - char nlmsg_overrun; /* overrun flag */ -}; - -void* nlmsg_send(struct nlmsg_ctl*, unsigned long type, int len, - unsigned long seq, unsigned long pid); -void nlmsg_transmit(struct nlmsg_ctl*); - -extern __inline__ void nlmsg_ack(struct nlmsg_ctl* ctl, unsigned long seq, - unsigned long pid, int err) -{ - int *r; - - start_bh_atomic(); - r = nlmsg_send(ctl, NLMSG_ACK, sizeof(r), seq, pid); - if (r) - *r = err; - end_bh_atomic(); -} - - -#define NETLINK_ROUTE 0 /* Routing/device hook */ -#define NETLINK_SKIP 1 /* Reserved for ENskip */ -#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ -#define NETLINK_FIREWALL 
3 /* Firewalling hook */ -#define NETLINK_FREE 4 /* PSI devices - 4 to 7 (obsolete) */ -#define NETLINK_ARPD 8 /* ARP daemon for big switched networks */ -#define NETLINK_IPSEC 10 /* IPSEC (JI) */ -#define NETLINK_ROUTE6 11 /* Af_inet6 route communication channel */ -#define NETLINK_IP6_FW 13 /* IPv6 firewall trap outs */ -#define NETLINK_DNRT 14 /* DECnet routing messages */ -#define NETLINK_TAPBASE 16 /* 16->31 are the ethertap devices */ -#endif diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h new file mode 100644 index 000000000000..b83c1048bd8b --- /dev/null +++ b/include/net/pkt_sched.h @@ -0,0 +1,164 @@ +#ifndef __NET_PKT_SCHED_H +#define __NET_PKT_SCHED_H + +#include + +struct Qdisc_ops +{ + struct Qdisc_ops *next; + char id[IFNAMSIZ]; + int refcnt; + int priv_size; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *); + struct sk_buff * (*dequeue)(struct Qdisc *); + void (*reset)(struct Qdisc *); + void (*destroy)(struct Qdisc *); + int (*init)(struct Qdisc *, void *arg); + int (*control)(struct Qdisc *, void *); +}; + +struct Qdisc_head +{ + struct Qdisc_head *forw; +}; + +extern struct Qdisc_head qdisc_head; + +struct Qdisc +{ + struct Qdisc_head h; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *dev); + struct sk_buff * (*dequeue)(struct Qdisc *dev); + struct Qdisc_ops *ops; + int handle; + struct Qdisc *parent; + struct sk_buff_head q; + struct device *dev; + struct sk_buff_head failure_q; + unsigned long dropped; + unsigned long tx_last; + unsigned long tx_timeo; + + char data[0]; +}; + + +/* Yes, it is slow for [34]86, but we have no choice. + 10 msec resolution is appropriate only for bandwidth < 32Kbit/sec. + + RULE: + Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth + + Normal IP packet size ~ 512byte, hence: + + 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for + 10Mbit ethernet. + + 10msec resolution -> <50Kbit/sec. 
+ + The result: [34]86 is not good choice for QoS router :-( + */ + + +typedef struct timeval psched_time_t; + +/* On 64bit architecures it would be clever to define: +typedef u64 psched_time_t; + and make all this boring arithmetics directly + */ + +#ifndef SCHEDULE_ONLY_LOW_BANDWIDTH +#define PSCHED_GET_TIME(stamp) do_gettimeofday(&(stamp)) +#else +#define PSCHED_GET_TIME(stamp) ((stamp) = xtime) +#endif + +#define PSCHED_TDIFF(tv1, tv2) \ +({ \ + int __delta_sec = (tv1).tv_sec - (tv2).tv_sec; \ + int __delta = (tv1).tv_usec - (tv2).tv_usec; \ + if (__delta_sec) { \ + switch (__delta_sec) { \ + default: \ + __delta = 0; \ + case 2: \ + __delta += 1000000; \ + case 1: \ + __delta += 1000000; \ + } \ + } \ + __delta; \ +}) + +#define PSCHED_TDIFF_SAFE(tv1, tv2, bound, guard) \ +({ \ + int __delta_sec = (tv1).tv_sec - (tv2).tv_sec; \ + int __delta = (tv1).tv_usec - (tv2).tv_usec; \ + switch (__delta_sec) { \ + default: \ + __delta = (bound); guard; break; \ + case 2: \ + __delta += 1000000; \ + case 1: \ + __delta += 1000000; \ + case 0: ; \ + } \ + __delta; \ +}) + +#define PSCHED_US2JIFFIE(usecs) (((usecs)+(1000000/HZ-1))/(1000000/HZ)) + +#define PSCHED_TLESS(tv1, tv2) (((tv1).tv_usec < (tv2).tv_usec && \ + (tv1).tv_sec < (tv2).tv_sec) || \ + (tv1).tv_sec < (tv2).tv_sec) + +#define PSCHED_TADD2(tv, delta, tv_res) \ +({ \ + int __delta = (tv).tv_usec + (delta); \ + (tv_res).tv_sec = (tv).tv_sec; \ + if (__delta > 1000000) { (tv_res).tv_sec++; __delta -= 1000000; } \ + (tv_res).tv_sec = __delta; \ +}) + +#define PSCHED_TADD(tv, delta) \ +({ \ + (tv).tv_usec += (delta); \ + if ((tv).tv_usec > 1000000) { (tv).tv_sec++; \ + (tv).tv_usec -= 1000000; } \ +}) + +/* Set/check that undertime is in the "past perfect"; + it depends on concrete representation of system time + */ + +#define PSCHED_SET_PASTPERFECT(t) ((t).tv_sec = 0) +#define PSCHED_IS_PASTPERFECT(t) ((t).tv_sec == 0) + + +extern struct Qdisc noop_qdisc; + +int register_qdisc(struct Qdisc_ops *qops); +int 
unregister_qdisc(struct Qdisc_ops *qops); +void dev_init_scheduler(struct device *dev); +void dev_shutdown(struct device *dev); +void dev_activate(struct device *dev); +void dev_deactivate(struct device *dev); +void qdisc_reset(struct Qdisc *qdisc); +void qdisc_destroy(struct Qdisc *qdisc); +int pktsched_init(void); + +void qdisc_run_queues(void); +int qdisc_restart(struct device *dev); + +extern __inline__ void qdisc_wakeup(struct device *dev) +{ + if (!dev->tbusy) { + struct Qdisc *q = dev->qdisc; + if (qdisc_restart(dev) && q->h.forw == NULL) { + q->h.forw = qdisc_head.forw; + qdisc_head.forw = &q->h; + } + } +} + +#endif diff --git a/include/net/protocol.h b/include/net/protocol.h index 3e60c1b313bc..3c00907ea4fa 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,7 +37,7 @@ struct inet_protocol { int (*handler)(struct sk_buff *skb, unsigned short len); - void (*err_handler)(struct sk_buff *skb, unsigned char *dp); + void (*err_handler)(struct sk_buff *skb, unsigned char *dp, int len); struct inet_protocol *next; unsigned char protocol; unsigned char copy:1; diff --git a/include/net/raw.h b/include/net/raw.h index 064b5bf7a658..4d2e6e98ba54 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -32,6 +32,7 @@ extern struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE]; extern struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr); + unsigned long raddr, unsigned long laddr, + int dif); #endif /* _RAW_H */ diff --git a/include/net/rose.h b/include/net/rose.h index 86f6a6721f72..8e86c14573d6 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -129,10 +129,9 @@ typedef struct { unsigned char cause, diagnostic; unsigned short vs, vr, va, vl; unsigned long t1, t2, t3, hb, idle; - unsigned short fraglen; + struct sk_buff_head ack_queue; struct timer_list timer; struct timer_list idletimer; - struct sk_buff_head frag_queue; struct sock *sk; /* Backlink to socket */ } rose_cb; @@ -181,10 
+180,8 @@ extern void rose_transmit_clear_request(struct rose_neigh *, unsigned int, unsig extern void rose_transmit_link(struct sk_buff *, struct rose_neigh *); /* rose_out.c */ -extern void rose_output(struct sock *, struct sk_buff *); extern void rose_kick(struct sock *); extern void rose_enquiry_response(struct sock *); -extern void rose_check_iframes_acked(struct sock *, unsigned short); /* rose_route.c */ extern void rose_rt_device_down(struct device *); @@ -204,6 +201,8 @@ extern void rose_rt_free(void); /* rose_subr.c */ extern void rose_clear_queues(struct sock *); +extern void rose_frames_acked(struct sock *, unsigned short); +extern void rose_requeue_frames(struct sock *); extern int rose_validate_nr(struct sock *, unsigned short); extern void rose_write_internal(struct sock *, int); extern int rose_decode(struct sk_buff *, int *, int *, int *, int *, int *); diff --git a/include/net/route.h b/include/net/route.h index d309ab63f047..486508ce029b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -22,9 +22,10 @@ #ifndef _ROUTE_H #define _ROUTE_H -#include +#include #include - +#include +#include #define RT_HASH_DIVISOR 256 #define RT_CACHE_MAX_SIZE 256 @@ -37,12 +38,12 @@ /* * Cache invalidations can be delayed by: */ -#define RT_FLUSH_DELAY (2*HZ) +#define RT_FLUSH_DELAY (5*HZ) #define RT_REDIRECT_NUMBER 9 #define RT_REDIRECT_LOAD (HZ/50) /* 20 msec */ #define RT_REDIRECT_SILENCE (RT_REDIRECT_LOAD<<(RT_REDIRECT_NUMBER+1)) - /* 20sec */ +/* 20sec */ #define RT_ERROR_LOAD (1*HZ) @@ -55,7 +56,17 @@ #include -struct rtable +struct rt_key +{ + __u32 dst; + __u32 src; + int iif; + int oif; + __u8 tos; + __u8 scope; +}; + +struct rtable { union { @@ -64,92 +75,76 @@ struct rtable } u; unsigned rt_flags; + unsigned rt_type; - u32 rt_dst; /* Path destination */ - u32 rt_src; /* Path source */ - struct device *rt_src_dev; /* Path source device */ + __u32 rt_dst; /* Path destination */ + __u32 rt_src; /* Path source */ + int rt_iif; /* Info on neighbour 
*/ - u32 rt_gateway; + __u32 rt_gateway; /* Cache lookup keys */ - struct - { - u32 dst; - u32 src; - struct device *src_dev; - struct device *dst_dev; - u8 tos; - } key; + struct rt_key key; /* Miscellaneous cached information */ - u32 rt_spec_dst; /* RFC1122 specific destination */ - u32 rt_src_map; - u32 rt_dst_map; + __u32 rt_spec_dst; /* RFC1122 specific destination */ + +#ifdef CONFIG_IP_ROUTE_NAT + __u32 rt_src_map; + __u32 rt_dst_map; +#endif /* ICMP statistics */ unsigned long last_error; unsigned long errors; }; - -#define RTF_IFBRD (RTF_UP|RTF_MAGIC|RTF_LOCAL|RTF_BROADCAST) -#define RTF_IFLOCAL (RTF_UP|RTF_MAGIC|RTF_LOCAL|RTF_INTERFACE) -#define RTF_IFPREFIX (RTF_UP|RTF_MAGIC|RTF_INTERFACE) - -/* - * Flags not visible at user level. - */ -#define RTF_INTERNAL 0xFFFF8000 /* to get RTF_MAGIC as well... */ - -/* - * Flags saved in FIB. - */ -#define RTF_FIB (RTF_UP|RTF_GATEWAY|RTF_REJECT|RTF_THROW|RTF_STATIC|\ - RTF_XRESOLVE|RTF_NOPMTUDISC|RTF_NOFORWARD|RTF_INTERNAL) - +#ifdef __KERNEL__ extern void ip_rt_init(void); extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw, u32 src, u8 tos, struct device *dev); extern void ip_rt_check_expire(void); extern void ip_rt_advice(struct rtable **rp, int advice); extern void rt_cache_flush(int how); -extern int ip_route_output(struct rtable **, u32 dst, u32 src, u8 tos, struct device *devout); -extern int ip_route_output_dev(struct rtable **, u32 dst, u32 src, u8 tos, int); +extern int ip_route_output(struct rtable **, u32 dst, u32 src, u8 tos, int oif); extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct device *devin); extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); extern void ip_rt_send_redirect(struct sk_buff *skb); -static __inline__ void ip_rt_put(struct rtable * rt) +extern unsigned inet_addr_type(u32 addr); +extern void ip_rt_multicast_event(struct in_device *); +extern int ip_rt_ioctl(unsigned int cmd, void *arg); +extern void 
ip_rt_get_source(u8 *src, struct rtable *rt); + + +extern __inline__ void ip_rt_put(struct rtable * rt) { if (rt) dst_release(&rt->u.dst); } -static __inline__ char rt_tos2priority(u8 tos) +extern __u8 ip_tos2prio[16]; + +extern __inline__ char rt_tos2priority(u8 tos) { - if (tos & IPTOS_LOWDELAY) - return SOPRI_INTERACTIVE; - if (tos & (IPTOS_THROUGHPUT|IPTOS_MINCOST)) - return SOPRI_BACKGROUND; - return SOPRI_NORMAL; + return ip_tos2prio[IPTOS_TOS(tos)>>1]; } - -static __inline__ int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos) +extern __inline__ int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif) { int err; - err = ip_route_output(rp, dst, src, tos, NULL); + err = ip_route_output(rp, dst, src, tos, oif); if (err || (dst && src)) return err; dst = (*rp)->rt_dst; src = (*rp)->rt_src; ip_rt_put(*rp); *rp = NULL; - return ip_route_output(rp, dst, src, tos, NULL); + return ip_route_output(rp, dst, src, tos, oif); } -static __inline__ void ip_ll_header(struct sk_buff *skb) +extern __inline__ void ip_ll_header(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; struct device *dev = rt->u.dst.dev; @@ -169,6 +164,7 @@ static __inline__ void ip_ll_header(struct sk_buff *skb) skb->mac.raw = skb->data; } +#endif #endif /* _ROUTE_H */ diff --git a/include/net/sit.h b/include/net/sit.h deleted file mode 100644 index 98bb5b386d1f..000000000000 --- a/include/net/sit.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * SIT tunneling device - definitions - * Linux INET6 implementation - * - * Authors: - * Pedro Roque - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _NET_SIT_H -#define _NET_SIT_H - -struct sit_mtu_info { - __u32 addr; /* IPv4 destination */ - unsigned long tstamp; /* last use tstamp */ - __u32 mtu; /* Path MTU */ - struct sit_mtu_info *next; -}; - -struct sit_vif { - char name[8]; - struct device *dev; - struct sit_vif *next; -}; - -extern int sit_init(void); -extern void sit_cleanup(void); - -extern struct device * sit_add_tunnel(__u32 dstaddr); - -#define SIT_GC_TIMEOUT (3*60*HZ) -#define SIT_GC_FREQUENCY (2*60*HZ) - -#endif diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 387b848d84eb..04387d8a8d00 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -112,8 +112,8 @@ * int int32 long 32 bits */ -typedef unsigned char byte_t; -typedef unsigned long int32; +typedef __u8 byte_t; +typedef __u32 int32; /* * "state" data for each active tcp conversation on the wire. This is diff --git a/include/net/sock.h b/include/net/sock.h index 1e40d3a2df58..8dd105485ecb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -44,8 +44,10 @@ #include /* struct ipv6_mc_socklist */ #endif +#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) +#include +#endif #include /* struct tcphdr */ -#include #include #include /* struct sk_buff */ @@ -75,8 +77,6 @@ #include #endif -#include - #include /* @@ -98,18 +98,20 @@ struct unix_opt int inflight; }; -/* - * IP packet socket options - */ +#ifdef CONFIG_NETLINK +struct netlink_callback; -struct inet_packet_opt +struct netlink_opt { - struct notifier_block notifier; /* Used when bound */ - struct device *bound_dev; - unsigned long dev_stamp; - struct packet_type *prot_hook; - char device_name[15]; + pid_t pid; + unsigned groups; + pid_t dst_pid; + unsigned dst_groups; + int (*handler)(int unit, struct sk_buff *skb); + atomic_t locks; + struct netlink_callback *cb; }; +#endif /* * Once the IPX ncpd patches are in these are going into protinfo @@ -184,6 +186,12 @@ struct raw6_opt { #endif /* IPV6 */ +#if defined(CONFIG_INET) || 
defined(CONFIG_INET_MODULE) +struct raw_opt { + struct icmp_filter filter; +}; +#endif + struct tcp_opt { @@ -374,6 +382,7 @@ struct sock broadcast, nonagle, bsdism; + int bound_dev_if; unsigned long lingertime; int proc; @@ -401,9 +410,6 @@ struct sock __u32 rcv_saddr; /* Bound address */ struct dst_entry *dst_cache; - - unsigned short max_unacked; - /* * mss is min(mtu, max_window) */ @@ -422,6 +428,9 @@ struct sock union { struct tcp_opt af_tcp; +#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) + struct raw_opt tp_raw4; +#endif #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) struct raw6_opt tp_raw; #endif @@ -435,8 +444,8 @@ struct sock volatile unsigned char state; unsigned short ack_backlog; unsigned short max_ack_backlog; - unsigned char priority; unsigned char debug; + __u32 priority; int rcvbuf; int sndbuf; unsigned short type; @@ -462,8 +471,10 @@ struct sock #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) struct ipx_opt af_ipx; #endif +#if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) + struct packet_opt *af_packet; +#endif #ifdef CONFIG_INET - struct inet_packet_opt af_packet; #ifdef CONFIG_NUTCP struct tcp_opt af_tcp; #endif @@ -482,6 +493,9 @@ struct sock #endif #if defined(CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) dn_cb *dn; +#endif +#ifdef CONFIG_NETLINK + struct netlink_opt af_netlink; #endif } protinfo; @@ -560,7 +574,8 @@ struct proto void (*write_wakeup)(struct sock *sk); void (*read_wakeup)(struct sock *sk); - unsigned int (*poll)(struct socket *sock, poll_table *wait); + unsigned int (*poll)(struct socket *sock, + struct poll_table_struct *wait); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); @@ -783,7 +798,7 @@ extern int sock_no_accept(struct socket *, extern int sock_no_getname(struct socket *, struct sockaddr *, int *, int); extern unsigned int sock_no_poll(struct socket *, - poll_table *); + struct poll_table_struct *); extern int sock_no_ioctl(struct socket *, unsigned int, unsigned 
long); extern int sock_no_listen(struct socket *, int); diff --git a/include/net/tcp.h b/include/net/tcp.h index 57840f7e9d51..f47a273d0006 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -226,6 +226,7 @@ struct open_request; struct or_calltable { void (*rtx_syn_ack) (struct sock *sk, struct open_request *req); void (*destructor) (struct open_request *req); + void (*send_reset) (struct sk_buff *skb); }; struct tcp_v4_open_req { @@ -306,11 +307,6 @@ struct tcp_func { struct open_request *req, struct dst_entry *dst); -#if 0 - __u32 (*init_sequence) (struct sock *sk, - struct sk_buff *skb); -#endif - struct sock * (*get_sock) (struct sk_buff *skb, struct tcphdr *th); @@ -330,15 +326,6 @@ struct tcp_func { void (*addr2sockaddr) (struct sock *sk, struct sockaddr *); - void (*send_reset) (struct sk_buff *skb); - - struct open_request * (*search_open_req) (struct tcp_opt *, void *, - struct tcphdr *, - struct open_request **); - - struct sock * (*cookie_check) (struct sock *, struct sk_buff *, - void *); - int sockaddr_len; }; @@ -371,7 +358,7 @@ extern struct tcp_mib tcp_statistics; extern unsigned short tcp_good_socknum(void); extern void tcp_v4_err(struct sk_buff *skb, - unsigned char *); + unsigned char *, int); extern void tcp_shutdown (struct sock *sk, int how); @@ -399,7 +386,7 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_close(struct sock *sk, unsigned long timeout); extern struct sock * tcp_accept(struct sock *sk, int flags); -extern unsigned int tcp_poll(struct socket *sock, poll_table *wait); +extern unsigned int tcp_poll(struct socket *sock, struct poll_table_struct *wait); extern int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen); @@ -485,8 +472,7 @@ extern void tcp_delack_timer(unsigned long); extern void tcp_probe_timer(unsigned long); extern struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - void *); - + struct open_request *req); /* * TCP slow timer @@ -546,9 +532,9 @@ 
extern unsigned short tcp_select_window(struct sock *sk); extern __inline const int tcp_connected(const int state) { - return(state == TCP_ESTABLISHED || state == TCP_CLOSE_WAIT || - state == TCP_FIN_WAIT1 || state == TCP_FIN_WAIT2 || - state == TCP_SYN_RECV); + return ((1 << state) & + (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_FIN_WAIT2|TCPF_SYN_RECV)); } /* diff --git a/include/net/udp.h b/include/net/udp.h index 0fa99fb1c038..5af3c18b72ab 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -40,7 +40,7 @@ extern unsigned short udp_good_socknum(void); extern struct proto udp_prot; -extern void udp_err(struct sk_buff *, unsigned char *); +extern void udp_err(struct sk_buff *, unsigned char *, int); extern int udp_connect(struct sock *sk, struct sockaddr *usin, int addr_len); diff --git a/include/net/x25.h b/include/net/x25.h index 7b58ad4e30e8..a72bd04265dd 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -123,6 +123,7 @@ typedef struct { unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; unsigned short fraglen; + struct sk_buff_head ack_queue; struct sk_buff_head fragment_queue; struct sk_buff_head interrupt_in_queue; struct sk_buff_head interrupt_out_queue; @@ -183,7 +184,6 @@ extern void x25_link_free(void); extern void x25_output(struct sock *, struct sk_buff *); extern void x25_kick(struct sock *); extern void x25_enquiry_response(struct sock *); -extern void x25_check_iframes_acked(struct sock *, unsigned short); /* x25_route.c */ extern struct device *x25_get_route(x25_address *); @@ -195,6 +195,8 @@ extern void x25_route_free(void); /* x25_subr.c */ extern void x25_clear_queues(struct sock *); +extern void x25_frames_acked(struct sock *, unsigned short); +extern void x25_requeue_frames(struct sock *); extern int x25_validate_nr(struct sock *, unsigned short); extern void x25_write_internal(struct sock *, int); extern int x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *); diff --git 
a/init/main.c b/init/main.c index 0b2055c80ecd..ad8af68075cc 100644 --- a/init/main.c +++ b/init/main.c @@ -696,6 +696,10 @@ __initfunc(static void parse_root_dev(char * line)) { "hdb", 0x0340 }, { "hdc", 0x1600 }, { "hdd", 0x1640 }, + { "hde", 0x2100 }, + { "hdf", 0x2140 }, + { "hdg", 0x2200 }, + { "hdh", 0x2240 }, { "sda", 0x0800 }, { "sdb", 0x0810 }, { "sdc", 0x0820 }, diff --git a/net/Config.in b/net/Config.in index b57dc9e3d87a..9785899bab9d 100644 --- a/net/Config.in +++ b/net/Config.in @@ -3,9 +3,11 @@ # mainmenu_option next_comment comment 'Networking options' -bool 'Kernel/User network link driver' CONFIG_NETLINK +tristate 'Packet socket' CONFIG_PACKET +bool 'Kernel/User netlink socket' CONFIG_NETLINK if [ "$CONFIG_NETLINK" = "y" ]; then bool 'Routing messages' CONFIG_RTNETLINK + tristate 'Netlink device emulation' CONFIG_NETLINK_DEV fi bool 'Network firewalls' CONFIG_FIREWALL if [ "$CONFIG_FIREWALL" = "y" ]; then @@ -14,11 +16,15 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then fi fi bool 'Network aliasing' CONFIG_NET_ALIAS +tristate 'BSD Unix domain sockets' CONFIG_UNIX bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 + if [ "$CONFIG_IPV6" != "n" ]; then + source net/ipv6/Config.in + fi fi fi @@ -48,5 +54,17 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # bool 'Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI # fi tristate 'WAN router' CONFIG_WAN_ROUTER + bool 'CPU is too slow to handle full bandwidth' CONFIG_CPU_IS_SLOW + bool 'QoS and/or fair queueing' CONFIG_NET_SCHED + if [ "$CONFIG_NET_SCHED" = "y" ]; then + tristate 'CBQ packet scheduler' CONFIG_NET_SCH_CBQ + tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ + tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ + tristate 'RED queueing discipline' CONFIG_NET_SCH_RED + tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ + tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF + 
tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO + tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO + fi fi endmenu diff --git a/net/Makefile b/net/Makefile index 09924ff8939e..0f32c8397a30 100644 --- a/net/Makefile +++ b/net/Makefile @@ -9,8 +9,8 @@ MOD_SUB_DIRS := ipv4 ALL_SUB_DIRS := 802 ax25 bridge core ethernet ipv4 ipv6 ipx unix appletalk \ - netrom rose lapb x25 wanrouter sunrpc #decnet -SUB_DIRS := core ethernet unix + netrom rose lapb x25 wanrouter netlink sched packet sunrpc #decnet +SUB_DIRS := core ethernet sched MOD_LIST_NAME := NET_MISC_MODULES ifeq ($(CONFIG_NET),y) @@ -21,6 +21,14 @@ ifeq ($(CONFIG_INET),y) SUB_DIRS += ipv4 endif +ifeq ($(CONFIG_UNIX),y) +SUB_DIRS += unix +else + ifeq ($(CONFIG_UNIX),m) + MOD_SUB_DIRS += unix + endif +endif + ifeq ($(CONFIG_IPV6),y) SUB_DIRS += ipv6 else @@ -29,6 +37,25 @@ else endif endif +ifeq ($(CONFIG_NETLINK),y) +SUB_DIRS += netlink + ifeq ($(CONFIG_NETLINK_DEV),m) + MOD_SUB_DIRS += netlink + endif +endif + +ifeq ($(CONFIG_PACKET),y) +SUB_DIRS += packet +else + ifeq ($(CONFIG_PACKET),m) + MOD_SUB_DIRS += packet + endif +endif + +ifeq ($(CONFIG_NET_SCHED),y) + MOD_SUB_DIRS += sched +endif + ifeq ($(CONFIG_BRIDGE),y) SUB_DIRS += bridge endif @@ -135,31 +162,4 @@ ifeq ($(CONFIG_SYSCTL),y) L_OBJS += sysctl_net.o endif -CONFIG_NETLINK_BUILTIN := -CONFIG_NETLINK_MODULE := - -ifeq ($(CONFIG_NETLINK), y) - CONFIG_NETLINK_BUILTIN = y -endif - -ifeq ($(CONFIG_IPV6), y) - CONFIG_NETLINK_BUILTIN = y -endif - -ifeq ($(CONFIG_NETLINK), m) - CONFIG_NETLINK_MODULE = y -endif - -ifeq ($(CONFIG_IPV6), m) - CONFIG_NETLINK_MODULE = y -endif - -ifdef CONFIG_NETLINK_BUILTIN -L_OBJS += netlink.o -else - ifdef CONFIG_NETLINK_MODULE - M_OBJS += netlink.o - endif -endif - include $(TOPDIR)/Rules.make diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2fa92c4ad88d..e406fc86d3cb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1412,7 +1412,6 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, int 
len, struct /* Datagram frames go straight out of the door as UI */ skb->dev = sk->protinfo.ax25->ax25_dev->dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 89ca64f3fbce..3db7995649ad 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -154,7 +154,6 @@ static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char p skb->arp = 1; skb->dev = ax25_dev->dev; - skb->priority = SOPRI_NORMAL; skb->protocol = htons(ETH_P_AX25); dev_queue_xmit(skb); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 3a8594fba94b..a50822b9043e 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -177,7 +177,6 @@ int ax25_rebuild_header(struct sk_buff *skb) } skb->dev = dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 4550302d731b..ce5018d75b86 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -58,8 +58,16 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2 ax25_dev *ax25_dev; ax25_cb *ax25; - if (skb == NULL) - return 0; + /* + * Take the default packet length for the device if zero is + * specified. + */ + if (paclen == 0) { + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + } /* * Look for an existing connection. 
@@ -339,7 +347,6 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); skb->dev = ax25->ax25_dev->dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); } diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 39dfd7d422f5..98a977182b0c 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -252,7 +252,6 @@ void ax25_return_dm(struct device *dev, ax25_address *src, ax25_address *dest, a dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS); skb->dev = dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); } diff --git a/net/core/Makefile b/net/core/Makefile index b7efbe6b4f5a..2ae7761573c6 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux networking core. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -10,7 +10,7 @@ O_TARGET := core.o O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o + neighbour.o rtnetlink.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o @@ -24,10 +24,6 @@ ifdef CONFIG_FIREWALL OX_OBJS += firewall.o endif -ifdef CONFIG_NET_ALIAS -O_OBJS += net_alias.o -endif - endif include $(TOPDIR)/Rules.make diff --git a/net/core/dev.c b/net/core/dev.c index cfe85df7ebad..ff8548f282d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -15,6 +15,7 @@ * Florian la Roche * Alan Cox * David Hinds + * Alexey Kuznetsov * * Changes: * Alan Cox : device private ioctl copies fields back. 
@@ -61,24 +62,20 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include -#include #include #include -#include +#include #include #include #include #include -#include +#include #include #ifdef CONFIG_KERNELD #include @@ -90,6 +87,7 @@ extern int plip_init(void); #endif + const char *if_port_text[] = { "unknown", "BNC", @@ -100,12 +98,6 @@ const char *if_port_text[] = { "100baseFX" }; -/* - * The list of devices, that are able to output. - */ - -static struct device *dev_up_base; - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -130,16 +122,17 @@ struct packet_type *ptype_base[16]; /* 16 way hashed list */ struct packet_type *ptype_all = NULL; /* Taps */ /* - * Device list lock + * Device list lock. Setting it provides that interface + * will not disappear unexpectedly while kernel sleeps. */ atomic_t dev_lockct = ATOMIC_INIT(0); - + /* * Our notifier list */ -struct notifier_block *netdev_chain=NULL; +static struct notifier_block *netdev_chain=NULL; /* * Device drivers call our routines to queue packets here. We empty the @@ -148,14 +141,6 @@ struct notifier_block *netdev_chain=NULL; static struct sk_buff_head backlog; -/* - * We don't overdo the queue or we will thrash memory badly. - */ - -static int backlog_size = 0; - - - /****************************************************************************************** Protocol management and registration routines @@ -166,7 +151,7 @@ static int backlog_size = 0; * For efficiency */ -static int dev_nit=0; +int netdev_nit=0; /* * Add a protocol ID to the list. 
Now that the input handler is @@ -179,7 +164,7 @@ void dev_add_pack(struct packet_type *pt) int hash; if(pt->type==htons(ETH_P_ALL)) { - dev_nit++; + netdev_nit++; pt->next=ptype_all; ptype_all=pt; } @@ -201,7 +186,7 @@ void dev_remove_pack(struct packet_type *pt) struct packet_type **pt1; if(pt->type==htons(ETH_P_ALL)) { - dev_nit--; + netdev_nit--; pt1=&ptype_all; } else @@ -258,7 +243,6 @@ struct device *dev_getbyhwaddr(unsigned short type, char *ha) for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->type == type && - !(dev->flags&(IFF_LOOPBACK|IFF_NOARP)) && memcmp(dev->dev_addr, ha, dev->addr_len) == 0) return(dev); } @@ -312,19 +296,20 @@ struct device *dev_alloc(const char *name, int *err) void dev_load(const char *name) { - if(!dev_get(name)) { -#ifdef CONFIG_NET_ALIAS - const char *sptr; - - for (sptr=name ; *sptr ; sptr++) if(*sptr==':') break; - if (!(*sptr && *(sptr+1))) -#endif + if(!dev_get(name)) request_module(name); - } } #endif - + +static int +default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header -- BUG!\n", skb->dev->name); + kfree_skb(skb, FREE_WRITE); + return 1; +} + /* * Prepare an interface for use. */ @@ -333,6 +318,13 @@ int dev_open(struct device *dev) { int ret = 0; + /* + * Is it already up? + */ + + if (dev->flags&IFF_UP) + return 0; + /* * Call device private open method */ @@ -341,29 +333,39 @@ int dev_open(struct device *dev) ret = dev->open(dev); /* - * If it went open OK then set the flags + * If it went open OK then: */ if (ret == 0) { + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; + + /* + * Set the flags. 
+ */ dev->flags |= (IFF_UP | IFF_RUNNING); + /* - * Initialise multicasting status + * Initialize multicasting status */ dev_mc_upload(dev); - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - + /* - * Passive non transmitting devices (including - * aliases) need not be on this chain. + * Wakeup transmit queue engine */ - if (!net_alias_is(dev) && dev->tx_queue_len) - { - cli(); - dev->next_up = dev_up_base; - dev_up_base = dev; - sti(); - } + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + } return(ret); } @@ -375,17 +377,24 @@ int dev_open(struct device *dev) int dev_close(struct device *dev) { - int ct=0; - struct device **devp; + if (!(dev->flags&IFF_UP)) + return 0; + + dev_deactivate(dev); + + dev_lock_wait(); /* * Call the device specific close. This cannot fail. * Only if device is UP */ - if ((dev->flags & IFF_UP) && dev->stop) + if (dev->stop) dev->stop(dev); + if (dev->start) + printk("dev_close: bug %s still running\n", dev->name); + /* * Device is now down. */ @@ -397,36 +406,7 @@ int dev_close(struct device *dev) */ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); - /* - * Flush the multicast chain - */ - dev_mc_discard(dev); - - /* - * Purge any queued packets when we down the link - */ - while(ctbuffs[ct]))!=NULL) - kfree_skb(skb,FREE_WRITE); - ct++; - } - /* - * The device is no longer up. Drop it from the list. - */ - - devp = &dev_up_base; - while (*devp) - { - if (*devp == dev) - { - *devp = dev->next_up; - break; - } - devp = &(*devp)->next_up; - } return(0); } @@ -451,7 +431,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * taps currently in use. 
*/ -static void queue_xmit_nit(struct sk_buff *skb, struct device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) { struct packet_type *ptype; get_fast_time(&skb->stamp); @@ -467,180 +447,111 @@ static void queue_xmit_nit(struct sk_buff *skb, struct device *dev) struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) break; - skb2->mac.raw = skb2->data; - skb2->nh.raw = skb2->h.raw = skb2->data + dev->hard_header_len; - ptype->func(skb2, skb->dev, ptype); - } - } -} - -/* - * Send (or queue for sending) a packet. - * - * IMPORTANT: When this is called to resend frames. The caller MUST - * already have locked the sk_buff. Apart from that we do the - * rest of the magic. - */ - -static void do_dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) -{ - unsigned long flags; - struct sk_buff_head *list; - int retransmission = 0; /* used to say if the packet should go */ - /* at the front or the back of the */ - /* queue - front is a retransmit try */ - - /* - * Negative priority is used to flag a frame that is being pulled from the - * queue front as a retransmit attempt. It therefore goes back on the queue - * start on a failure. - */ - - if (pri < 0) - { - pri = -pri-1; - retransmission = 1; - } - -#ifdef CONFIG_NET_DEBUG - if (pri >= DEV_NUMBUFFS) - { - printk(KERN_WARNING "bad priority in do_dev_queue_xmit.\n"); - pri = 1; - } -#endif - - /* - * If we are bridging and this is directly generated output - * pass the frame via the bridge. - */ - -#ifdef CONFIG_BRIDGE - if(skb->pkt_bridged!=IS_BRIDGED && br_stats.flags & BR_UP) - { - if(br_tx_frame(skb)) - return; - } -#endif - - list = dev->buffs + pri; - save_flags(flags); + /* Code, following below is wrong. - /* - * If this isn't a retransmission, use the first packet instead. - * Note: We don't do strict priority ordering here. We will in - * fact kick the queue that is our priority. The dev_tint reload - * does strict priority queueing. 
In effect what we are doing here - * is to add some random jitter to the queues and to do so by - * saving clocks. Doing a perfect priority queue isn't a good idea - * as you get some fascinating timing interactions. - */ + The only reason, why it does work is that + ONLY packet sockets receive outgoing + packets. If such a packet will be (occasionally) + received by normal packet handler, which expects + that mac header is pulled... + */ - if (!retransmission) - { - /* avoid overrunning the device queue.. */ - if (skb_queue_len(list) > dev->tx_queue_len) - { - dev_kfree_skb(skb, FREE_WRITE); - return; - } + /* More sensible variant. skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb2->mac.raw = skb2->data; - /* copy outgoing packets to any sniffer packet handlers */ - if (dev_nit) - queue_xmit_nit(skb,dev); + if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) { + if (net_ratelimit()) + printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + if (dev->hard_header) + skb2->nh.raw += dev->hard_header_len; + } - if (skb_queue_len(list)) { - cli(); - __skb_queue_tail(list, skb); - skb = __skb_dequeue(list); - restore_flags(flags); + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); } } - if (dev->hard_start_xmit(skb, dev) == 0) { - /* - * Packet is now solely the responsibility of the driver - */ - return; - } - - /* - * Transmission failed, put skb back into a list. Once on the list it's safe and - * no longer device locked (it can be freed safely from the device queue) - */ - cli(); - __skb_queue_head(list,skb); - restore_flags(flags); } /* - * Entry point for transmitting frames. + * Fast path for loopback frames. 
*/ +void dev_loopback_xmit(struct sk_buff *skb) +{ + struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); + if (newskb==NULL) + return; + + skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->ip_summed = CHECKSUM_UNNECESSARY; + if (newskb->dst==NULL) + printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); + netif_rx(newskb); +} + int dev_queue_xmit(struct sk_buff *skb) { struct device *dev = skb->dev; - - start_bh_atomic(); + struct Qdisc *q; /* * If the address has not been resolved. Call the device header rebuilder. * This can cover all protocols and technically not just ARP either. + * + * This call must be moved to protocol layer. + * Now it works only for IPv6 and for IPv4 in + * some unusual curcumstances (eql device). --ANK */ - if (!skb->arp) - { - /* - * FIXME: we should make the printk for no rebuild - * header a default rebuild_header routine and drop - * this call. Similarly we should make hard_header - * have a default NULL operation not check conditions. - */ - if (dev->rebuild_header) - { - if (dev->rebuild_header(skb)) - { - end_bh_atomic(); - return 0; - } - } - else - printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header!\n", dev->name); + if (!skb->arp && dev->rebuild_header(skb)) + return 0; + + q = dev->qdisc; + if (q->enqueue) { + start_bh_atomic(); + q->enqueue(skb, q); + qdisc_wakeup(dev); + end_bh_atomic(); + return 0; } - /* - * - * If dev is an alias, switch to its main device. - * "arp" resolution has been made with alias device, so - * arp entries refer to alias, not main. - * - */ + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... - if (net_alias_is(dev)) - skb->dev = dev = net_alias_main_dev(dev); - - do_dev_queue_xmit(skb, dev, skb->priority); - end_bh_atomic(); + Really, it is unlikely that bh protection is necessary here: + virtual devices do not generate EOI events. + However, it is possible, that they rely on bh protection + made by us here. 
+ */ + if (dev->flags&IFF_UP) { + start_bh_atomic(); + if (netdev_nit) + dev_queue_xmit_nit(skb,dev); + if (dev->hard_start_xmit(skb, dev) == 0) { + end_bh_atomic(); + return 0; + } + if (net_ratelimit()) + printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + end_bh_atomic(); + } + kfree_skb(skb, FREE_WRITE); return 0; } -/* - * Fast path for loopback frames. - */ - -void dev_loopback_xmit(struct sk_buff *skb) -{ - struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); - if (newskb==NULL) - return; - skb_pull(newskb, newskb->nh.raw - newskb->data); - newskb->ip_summed = CHECKSUM_UNNECESSARY; - if (newskb->dst==NULL) - printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); - netif_rx(newskb); -} +/*======================================================================= + Receiver rotutines + =======================================================================*/ +int netdev_dropping = 0; +atomic_t netdev_rx_dropped; /* * Receive a packet from a device driver and queue it for the upper @@ -649,15 +560,6 @@ void dev_loopback_xmit(struct sk_buff *skb) void netif_rx(struct sk_buff *skb) { - static int dropping = 0; - - /* - * Any received buffers are un-owned and should be discarded - * when freed. These will be updated later as the frames get - * owners. - */ - - skb->sk = NULL; if(skb->stamp.tv_sec==0) get_fast_time(&skb->stamp); @@ -665,13 +567,14 @@ void netif_rx(struct sk_buff *skb) * Check that we aren't overdoing things. 
*/ - if (!backlog_size) - dropping = 0; - else if (backlog_size > 300) - dropping = 1; + if (!backlog.qlen) + netdev_dropping = 0; + else if (backlog.qlen > 300) + netdev_dropping = 1; - if (dropping) + if (netdev_dropping) { + atomic_inc(&netdev_rx_dropped); kfree_skb(skb, FREE_READ); return; } @@ -681,7 +584,6 @@ void netif_rx(struct sk_buff *skb) */ skb_queue_tail(&backlog,skb); - backlog_size++; /* * If any packet arrived, mark it for processing after the @@ -692,32 +594,37 @@ void netif_rx(struct sk_buff *skb) return; } -/* - * This routine causes all interfaces to try to send some data. - */ - -static void dev_transmit(void) +#ifdef CONFIG_BRIDGE +static inline void handle_bridge(struct skbuff *skb, unsigned short type) { - struct device *dev; - - for (dev = dev_up_base; dev != NULL; dev = dev->next_up) + if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) { - if (dev->flags != 0 && !dev->tbusy) + /* + * We pass the bridge a complete frame. This means + * recovering the MAC header first. + */ + + int offset=skb->data-skb->mac.raw; + cli(); + skb_push(skb,offset); /* Put header back on for bridge */ + if(br_receive_frame(skb)) { - /* - * Kick the device - */ - dev_tint(dev); + sti(); + continue; } + /* + * Pull the MAC header off for the copy going to + * the upper layers. 
+ */ + skb_pull(skb,offset); + sti(); } } +#endif - -/********************************************************************************** - - Receive Queue Processor - -***********************************************************************************/ +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif /* * When we are called the queue is ready to grab, the interrupts are @@ -732,7 +639,15 @@ void net_bh(void) struct packet_type *ptype; struct packet_type *pt_prev; unsigned short type; - int nit = 301; + unsigned long start_time = jiffies; +#ifdef CONFIG_CPU_IS_SLOW + static unsigned long start_busy = 0; + static unsigned long ave_busy = 0; + + if (start_busy == 0) + start_busy = start_time; + net_cpu_congestion = ave_busy>>8; +#endif /* * Can we send anything now? We want to clear the @@ -741,7 +656,8 @@ void net_bh(void) * latency on a transmit interrupt bh. */ - dev_transmit(); + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); /* * Any data left to process. This may occur because a @@ -761,55 +677,43 @@ void net_bh(void) { struct sk_buff * skb = backlog.next; + if (jiffies - start_time > 1) { + /* Give chance to other bottom halves to run */ + mark_bh(NET_BH); + return; + } + /* * We have a packet. Therefore the queue has shrunk */ cli(); __skb_unlink(skb, &backlog); - backlog_size--; sti(); - /* - * We do not want to spin in net_bh infinitely. --ANK - */ - if (--nit <= 0) - { - if (nit == 0) - printk(KERN_WARNING "net_bh: too many loops, dropping...\n"); +#ifdef CONFIG_CPU_IS_SLOW + if (ave_busy > 128*16) { kfree_skb(skb, FREE_WRITE); - continue; + while ((skb = skb_dequeue(&backlog)) != NULL) + kfree_skb(skb, FREE_WRITE); + break; } +#endif + -#ifdef CONFIG_BRIDGE + /* + * Fetch the packet protocol ID. + */ + + type = skb->protocol; + +#ifdef CONFIG_BRIDGE /* * If we are bridging then pass the frame up to the * bridging code (if this protocol is to be bridged). 
* If it is bridged then move on */ - - if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(skb->protocol))) - { - /* - * We pass the bridge a complete frame. This means - * recovering the MAC header first. - */ - - int offset=skb->data-skb->mac.raw; - cli(); - skb_push(skb,offset); /* Put header back on for bridge */ - if(br_receive_frame(skb)) - { - sti(); - continue; - } - /* - * Pull the MAC header off for the copy going to - * the upper layers. - */ - skb_pull(skb,offset); - sti(); - } + handle_bridge(skb, type); #endif /* @@ -822,12 +726,6 @@ void net_bh(void) /* XXX until we figure out every place to modify.. */ skb->h.raw = skb->nh.raw = skb->data; - /* - * Fetch the packet protocol ID. - */ - - type = skb->protocol; - /* * We got a packet ID. Now loop over the "known protocols" * list. There are two lists. The ptype_all list of taps (normally empty) @@ -837,15 +735,17 @@ void net_bh(void) pt_prev = NULL; for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) { - if(pt_prev) - { - struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); - if(skb2) - pt_prev->func(skb2,skb->dev, pt_prev); + if (!ptype->dev || ptype->dev == skb->dev) { + if(pt_prev) + { + struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + if(skb2) + pt_prev->func(skb2,skb->dev, pt_prev); + } + pt_prev=ptype; } - pt_prev=ptype; } - + for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) { if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) @@ -872,7 +772,7 @@ void net_bh(void) pt_prev=ptype; } } /* End of protocol list loop */ - + /* * Is there a last item to send to ? */ @@ -883,16 +783,9 @@ void net_bh(void) * Has an unknown packet has been received ? */ - else + else { kfree_skb(skb, FREE_WRITE); - /* - * Again, see if we can transmit anything now. 
- * [Ought to take this out judging by tests it slows - * us down not speeds us up] - */ -#ifdef XMIT_EVERY - dev_transmit(); -#endif + } } /* End of queue loop */ /* @@ -903,64 +796,47 @@ void net_bh(void) * One last output flush. */ - dev_transmit(); + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + +#ifdef CONFIG_CPU_IS_SLOW +{ + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; +} +#endif } +/* Protocol dependent address dumping routines */ -/* - * This routine is called when an device driver (i.e. an - * interface) is ready to transmit a packet. - */ - -void dev_tint(struct device *dev) +static int (*gifconf[NPROTO])(struct device *dev, char *bufptr, int len); + +int register_gifconf(int family, int (*func)(struct device *dev, char *bufptr, int len)) { - int i; - unsigned long flags; - struct sk_buff_head * head; - - /* - * aliases do not transmit (for now :) ) - */ + if (family<0 || family>=NPROTO) + return -EINVAL; + gifconf[family] = func; + return 0; +} - if (net_alias_is(dev)) { - printk(KERN_DEBUG "net alias %s transmits\n", dev->name); - return; - } - head = dev->buffs; - save_flags(flags); - cli(); +/* + This ioctl is wrong by design. It really existed in some + old SYSV systems, only was named SIOCGIFNUM. + In multiprotocol environment it is just useless. + Well, SIOCGIFCONF is wrong too, but we have to preserve + it by compatibility reasons. - /* - * Work the queues in priority order - */ - for(i = 0;i < DEV_NUMBUFFS; i++,head++) - { + If someone wants to achieve the same effect, please, use undocumented + feature of SIOCGIFCONF: it returns buffer length, if buffer + is not supplied. - while (!skb_queue_empty(head)) { - struct sk_buff *skb; + Let's remove it, until someone started to use it. 
--ANK - skb = head->next; - __skb_unlink(skb, head); - /* - * Stop anyone freeing the buffer while we retransmit it - */ - restore_flags(flags); - /* - * Feed them to the output stage and if it fails - * indicate they re-queue at the front. - */ - do_dev_queue_xmit(skb,dev,-i - 1); - /* - * If we can take no more then stop here. - */ - if (dev->tbusy) - return; - cli(); - } - } - restore_flags(flags); -} + In any case, if someone cannot live without it, it should + be renamed to SIOCGIFNUM. + */ /* @@ -970,20 +846,26 @@ void dev_tint(struct device *dev) static int dev_ifcount(unsigned int *arg) { struct device *dev; - int err; unsigned int count = 0; for (dev = dev_base; dev != NULL; dev = dev->next) count++; - err = copy_to_user(arg, &count, sizeof(unsigned int)); - if (err) - return -EFAULT; - return 0; + return put_user(count, arg); } /* - * Map an interface index to its name (SIOGIFNAME) + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * This call is useful, but I'd remove it too. + * + * The reason is purely aestetical, it is the only call + * from SIOC* family using struct ifreq in reversed manner. + * Besides that, it is pretty silly to put "drawing" facility + * to kernel, it is useful only to print ifindices + * in readable form, is not it? --ANK */ static int dev_ifname(struct ifreq *arg) @@ -1019,7 +901,6 @@ static int dev_ifname(struct ifreq *arg) static int dev_ifconf(char *arg) { struct ifconf ifc; - struct ifreq ifr; struct device *dev; char *pos; unsigned int len; @@ -1031,68 +912,51 @@ static int dev_ifconf(char *arg) err = copy_from_user(&ifc, arg, sizeof(struct ifconf)); if (err) - return -EFAULT; - len = ifc.ifc_len; + return -EFAULT; + pos = ifc.ifc_buf; + if (pos==NULL) + ifc.ifc_len=0; + len = ifc.ifc_len; - /* - * We now walk the device list filling each active device - * into the array. - */ - /* * Loop over the interfaces, and write an info block for each. 
*/ - - dev_lock_wait(); - dev_lock_list(); - for (dev = dev_base; dev != NULL; dev = dev->next) - { - /* - * Have we run out of space here ? - */ - - if (len < sizeof(struct ifreq)) - break; + for (dev = dev_base; dev != NULL; dev = dev->next) { + int i; + for (i=0; iname); - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = dev->family; - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; + if (gifconf[i] == NULL) + continue; + done = gifconf[i](dev, pos, len); - /* - * Write this block to the caller's space. - */ - - err = copy_to_user(pos, &ifr, sizeof(struct ifreq)); - if (err) - return -EFAULT; - pos += sizeof(struct ifreq); - len -= sizeof(struct ifreq); + if (done<0) + return -EFAULT; + + len -= done; + if (pos) + pos += done; + } } - dev_unlock_list(); - /* * All done. Write the updated control block back to the caller. */ - - ifc.ifc_len = (pos - ifc.ifc_buf); - ifc.ifc_req = (struct ifreq *) ifc.ifc_buf; - err = copy_to_user(arg, &ifc, sizeof(struct ifconf)); - if (err) + ifc.ifc_len -= len; + + if (copy_to_user(arg, &ifc, sizeof(struct ifconf))) return -EFAULT; /* * Report how much was filled in */ - return(pos - arg); + return ifc.ifc_len; } - /* * This is invoked by the /proc filesystem handler to display a device * in detail. 
@@ -1105,7 +969,7 @@ static int sprintf_stats(char *buffer, struct device *dev) int size; if (stats) - size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu\n", + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, @@ -1117,7 +981,8 @@ static int sprintf_stats(char *buffer, struct device *dev) stats->tx_packets, stats->tx_errors, stats->tx_dropped, stats->tx_fifo_errors, stats->collisions, stats->tx_carrier_errors + stats->tx_aborted_errors - + stats->tx_window_errors + stats->tx_heartbeat_errors); + + stats->tx_window_errors + stats->tx_heartbeat_errors, + stats->multicast); else size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); @@ -1252,272 +1117,218 @@ int dev_get_wireless_info(char * buffer, char **start, off_t offset, #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ +void dev_set_promiscuity(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; -/* - * Perform the SIOCxIFxxx calls. - * - * The socket layer has seen an ioctl the address family thinks is - * for the device. At this point we get invoked to make a decision - */ - -static int dev_ifsioc(void *arg, unsigned int getset) + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags^old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags&IFF_PROMISC) ? 
"entered" : "leaved"); + } +} + +void dev_set_allmulti(struct device *dev, int inc) { - struct ifreq ifr; - struct device *dev; - int ret, err; + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags^old_flags) + dev_mc_upload(dev); +} + +int dev_change_flags(struct device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; /* - * Fetch the caller's info block into kernel space + * Set the flags on our device. */ - - err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); - if (err) - return -EFAULT; + + dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| + IFF_SLAVE|IFF_MASTER| + IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC)); /* - * See which interface the caller is talking about. - */ - + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + /* - * - * net_alias_dev_get(): dev_get() with added alias naming magic. - * only allow alias creation/deletion if (getset==SIOCSIFADDR) - * + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. */ - -#ifdef CONFIG_KERNELD - dev_load(ifr.ifr_name); -#endif -#ifdef CONFIG_NET_ALIAS - if ((dev = net_alias_dev_get(ifr.ifr_name, getset == SIOCSIFADDR, &err, NULL, NULL)) == NULL) - return(err); -#else - if ((dev = dev_get(ifr.ifr_name)) == NULL) - return(-ENODEV); -#endif - switch(getset) + ret = 0; + if ((old_flags^flags)&IFF_UP) /* Bit is different ? 
*/ + { + if(old_flags&IFF_UP) /* Gone down */ + ret=dev_close(dev); + else /* Come up */ + ret=dev_open(dev); + + if (ret == 0) + dev_mc_upload(dev); + } + + if (dev->flags&IFF_UP && + ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_VOLATILE))) { + printk(KERN_DEBUG "SIFFL %s(%s)\n", dev->name, current->comm); + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + } + + if ((flags^dev->gflags)&IFF_PROMISC) { + int inc = (flags&IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + return ret; +} + +/* + * Perform the SIOCxIFxxx calls. + */ + +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + struct device *dev; + int err; + + if ((dev = dev_get(ifr->ifr_name)) == NULL) + return -ENODEV; + + switch(cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr.ifr_flags = dev->flags; - goto rarok; + ifr->ifr_flags = (dev->flags&~IFF_PROMISC)|(dev->gflags&IFF_PROMISC); + return 0; case SIOCSIFFLAGS: /* Set interface flags */ - { - int old_flags = dev->flags; - - /* - * We are not allowed to potentially close/unload - * a device until we get this lock. - */ - - dev_lock_wait(); - dev_lock_list(); - - /* - * Set the flags on our device. - */ - - dev->flags = (ifr.ifr_flags & ( - IFF_BROADCAST | IFF_DEBUG | IFF_LOOPBACK | IFF_PORTSEL | - IFF_POINTOPOINT | IFF_NOTRAILERS | IFF_RUNNING | IFF_AUTOMEDIA | - IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI | IFF_SLAVE | IFF_MASTER - | IFF_MULTICAST)) | (dev->flags & IFF_UP); - /* - * Load in the correct multicast list now the flags have changed. - */ - - dev_mc_upload(dev); - - /* - * Have we downed the interface. We handle IFF_UP ourselves - * according to user attempts to set it, rather than blindly - * setting it. - */ - - if ((old_flags^ifr.ifr_flags)&IFF_UP) /* Bit is different ? 
*/ - { - if(old_flags&IFF_UP) /* Gone down */ - ret=dev_close(dev); - else /* Come up */ - { - ret=dev_open(dev); - if(ret<0) - dev->flags&=~IFF_UP; /* Open failed */ - } - } - else - ret=0; - /* - * Load in the correct multicast list now the flags have changed. - */ - - dev_mc_upload(dev); - if ((dev->flags&IFF_UP) && ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC))) - { - printk(KERN_DEBUG "SIFFL %s(%s)\n", dev->name, current->comm); - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - } - if ((dev->flags^old_flags)&IFF_PROMISC) { - if (dev->flags&IFF_PROMISC) - printk(KERN_INFO "%s enters promiscuous mode.\n", dev->name); - else - printk(KERN_INFO "%s leave promiscuous mode.\n", dev->name); - } - dev_unlock_list(); - } - break; + return dev_change_flags(dev, ifr->ifr_flags); case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ - - ifr.ifr_metric = dev->metric; - goto rarok; + ifr->ifr_metric = dev->metric; + return 0; case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ - dev->metric = ifr.ifr_metric; - ret=0; - break; + dev->metric = ifr->ifr_metric; + return 0; case SIOCGIFMTU: /* Get the MTU of a device */ - ifr.ifr_mtu = dev->mtu; - goto rarok; + ifr->ifr_mtu = dev->mtu; + return 0; case SIOCSIFMTU: /* Set the MTU of a device */ - - if (ifr.ifr_mtu == dev->mtu) { - ret = 0; - break; - } + if (ifr->ifr_mtu == dev->mtu) + return 0; /* * MTU must be positive. */ - if(ifr.ifr_mtu<68) + if (ifr->ifr_mtu<0) return -EINVAL; if (dev->change_mtu) - ret = dev->change_mtu(dev, ifr.ifr_mtu); - else - { - dev->mtu = ifr.ifr_mtu; - ret = 0; + err = dev->change_mtu(dev, ifr->ifr_mtu); + else { + dev->mtu = ifr->ifr_mtu; + err = 0; } - if (!ret && dev->flags&IFF_UP) { + if (!err && dev->flags&IFF_UP) { printk(KERN_DEBUG "SIFMTU %s(%s)\n", dev->name, current->comm); notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); } - break; - - case SIOCGIFMEM: /* Get the per device memory space. 
We can add this but currently - do not support it */ - ret = -EINVAL; - break; - - case SIOCSIFMEM: /* Set the per device memory buffer space. Not applicable in our case */ - ret = -EINVAL; - break; + return err; case SIOCGIFHWADDR: - memcpy(ifr.ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); - ifr.ifr_hwaddr.sa_family=dev->type; - goto rarok; + memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); + ifr->ifr_hwaddr.sa_family=dev->type; + return 0; case SIOCSIFHWADDR: if(dev->set_mac_address==NULL) return -EOPNOTSUPP; - if(ifr.ifr_hwaddr.sa_family!=dev->type) + if(ifr->ifr_hwaddr.sa_family!=dev->type) return -EINVAL; - ret=dev->set_mac_address(dev,&ifr.ifr_hwaddr); - if (!ret) + err=dev->set_mac_address(dev,&ifr->ifr_hwaddr); + if (!err) notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - break; + return err; + case SIOCSIFHWBROADCAST: + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return 0; + case SIOCGIFMAP: - ifr.ifr_map.mem_start=dev->mem_start; - ifr.ifr_map.mem_end=dev->mem_end; - ifr.ifr_map.base_addr=dev->base_addr; - ifr.ifr_map.irq=dev->irq; - ifr.ifr_map.dma=dev->dma; - ifr.ifr_map.port=dev->if_port; - goto rarok; + ifr->ifr_map.mem_start=dev->mem_start; + ifr->ifr_map.mem_end=dev->mem_end; + ifr->ifr_map.base_addr=dev->base_addr; + ifr->ifr_map.irq=dev->irq; + ifr->ifr_map.dma=dev->dma; + ifr->ifr_map.port=dev->if_port; + return 0; case SIOCSIFMAP: - if(dev->set_config==NULL) - return -EOPNOTSUPP; - return dev->set_config(dev,&ifr.ifr_map); + if (dev->set_config) + return dev->set_config(dev,&ifr->ifr_map); + return -EOPNOTSUPP; case SIOCADDMULTI: - if(dev->set_multicast_list==NULL) - return -EINVAL; - if(ifr.ifr_hwaddr.sa_family!=AF_UNSPEC) + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) return -EINVAL; - dev_mc_add(dev,ifr.ifr_hwaddr.sa_data, dev->addr_len, 1); + 
printk(KERN_DEBUG "SIOCADDMULTI ioctl is deprecated\n"); + dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); return 0; case SIOCDELMULTI: - if(dev->set_multicast_list==NULL) + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) return -EINVAL; - if(ifr.ifr_hwaddr.sa_family!=AF_UNSPEC) - return -EINVAL; - dev_mc_delete(dev,ifr.ifr_hwaddr.sa_data,dev->addr_len, 1); + printk(KERN_DEBUG "SIOCDELMULTI ioctl is deprecated\n"); + dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); return 0; - case SIOGIFINDEX: - ifr.ifr_ifindex = dev->ifindex; - goto rarok; - + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; /* * Unknown or private ioctl */ default: - if((getset >= SIOCDEVPRIVATE) && - (getset <= (SIOCDEVPRIVATE + 15))) { - if(dev->do_ioctl==NULL) - return -EOPNOTSUPP; - ret = dev->do_ioctl(dev, &ifr, getset); - if (!ret) - { - err = copy_to_user(arg,&ifr,sizeof(struct ifreq)); - if (err) - ret = -EFAULT; - } - break; + if(cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; } #ifdef CONFIG_NET_RADIO - if((getset >= SIOCIWFIRST) && (getset <= SIOCIWLAST)) - { - if(dev->do_ioctl==NULL) - return -EOPNOTSUPP; - /* Perform the ioctl */ - ret=dev->do_ioctl(dev, &ifr, getset); - /* If return args... */ - if(IW_IS_GET(getset)) - { - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - { - ret = -EFAULT; - } - } - break; + if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; } #endif /* CONFIG_NET_RADIO */ - ret = -EINVAL; } - return(ret); -/* - * The load of calls that return an ifreq and ok (saves memory). 
- */ -rarok: - err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); - if (err) - err = -EFAULT; - return err; + return -EINVAL; } @@ -1528,47 +1339,98 @@ rarok: int dev_ioctl(unsigned int cmd, void *arg) { + struct ifreq ifr; + int ret; +#ifdef CONFIG_NET_ALIAS + char *colon; +#endif + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + dev_ifconf((char *) arg); + rtnl_shunlock(); + return 0; + } + if (cmd == SIOCGIFCOUNT) { + return dev_ifcount((unsigned int*)arg); + } + if (cmd == SIOCGIFNAME) { + return dev_ifname((struct ifreq *)arg); + } + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_NET_ALIAS + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; +#endif + + /* + * See which interface the caller is talking about. + */ + +#ifdef CONFIG_KERNELD + dev_load(ifr.ifr_name); +#endif + switch(cmd) { - case SIOCGIFCONF: - (void) dev_ifconf((char *) arg); - return 0; - case SIOCGIFCOUNT: - return dev_ifcount((unsigned int *) arg); - case SIOGIFNAME: - return dev_ifname((struct ifreq *)arg); - /* - * Ioctl calls that can be done by all. + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value */ case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFMEM: case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: - case SIOGIFINDEX: - return dev_ifsioc(arg, cmd); + case SIOCGIFINDEX: + ret = dev_ifsioc(&ifr, cmd); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; /* - * Ioctl calls requiring the power of a superuser + * These ioctl calls: + * - require superuser power. + * - require strict serialization. 
+ * - do not return a value */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: - case SIOCSIFMEM: - case SIOCSIFHWADDR: case SIOCSIFMAP: + case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: if (!suser()) return -EPERM; - return dev_ifsioc(arg, cmd); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but currently + do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. Not applicable in our case */ case SIOCSIFLINK: return -EINVAL; @@ -1577,16 +1439,29 @@ int dev_ioctl(unsigned int cmd, void *arg) */ default: - if((cmd >= SIOCDEVPRIVATE) && - (cmd <= (SIOCDEVPRIVATE + 15))) { - return dev_ifsioc(arg, cmd); + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; } #ifdef CONFIG_NET_RADIO - if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) - { - if((IW_IS_SET(cmd)) && (!suser())) - return -EPERM; - return dev_ifsioc(arg, cmd); + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (IW_IS_SET(cmd)) { + if (!suser()) + return -EPERM; + rtnl_lock(); + } + ret = dev_ifsioc(&ifr, cmd); + if (IW_IS_SET(cmd)) + rtnl_unlock(); + if (!ret && IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; } #endif /* CONFIG_NET_RADIO */ return -EINVAL; @@ -1596,9 +1471,103 @@ int dev_ioctl(unsigned int cmd, void *arg) int dev_new_index() { static int ifindex; - return ++ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex=1; + if (dev_get_by_index(ifindex) == NULL) + return ifindex; + } +} + +static int dev_boot_phase = 1; + + +int register_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase) { + printk(KERN_INFO "early initialization of device %s is deferred\n", 
dev->name); + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + *dp = dev; + return 0; + } + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init && dev->init(dev) != 0) + return -EIO; + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + dev_init_scheduler(dev); + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + *dp = dev; + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + return 0; } +int unregister_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase == 0) { + /* If device is running, close it. + It is very bad idea, really we should + complain loudly here, but random hackery + in linux/drivers/net likes it. + */ + if (dev->flags & IFF_UP) + dev_close(dev); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + /* To avoid pointers looking to nowhere, + we wait for end of critical section */ + dev_lock_wait(); + } + + /* And unlink it from device chain. */ + for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev) { + *dp = d->next; + d->next = NULL; + if (dev->destructor) + dev->destructor(dev); + return 0; + } + } + return -ENODEV; +} + + /* * Initialize the DEV module. 
At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -1643,6 +1612,8 @@ __initfunc(int net_dev_init(void)) { struct device *dev, **dp; + pktsched_init(); + /* * Initialise the packet receive queue. */ @@ -1662,6 +1633,7 @@ __initfunc(int net_dev_init(void)) * * Some devices want to be initialized early.. */ + #if defined(CONFIG_LANCE) lance_init(); #endif @@ -1714,6 +1686,7 @@ __initfunc(int net_dev_init(void)) slhc_install(); #endif + /* * Add the devices. * If the call to dev->init fails, the dev is removed @@ -1724,11 +1697,7 @@ __initfunc(int net_dev_init(void)) dp = &dev_base; while ((dev = *dp) != NULL) { - int i; - for (i = 0; i < DEV_NUMBUFFS; i++) { - skb_queue_head_init(dev->buffs + i); - } - + dev->iflink = -1; if (dev->init && dev->init(dev)) { /* @@ -1740,6 +1709,9 @@ __initfunc(int net_dev_init(void)) { dp = &dev->next; dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + dev_init_scheduler(dev); } } @@ -1753,18 +1725,13 @@ __initfunc(int net_dev_init(void)) #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ - /* - * Initialise net_alias engine - * - * - register net_alias device notifier - * - register proc entries: /proc/net/alias_types - * /proc/net/aliases - */ + init_bh(NET_BH, net_bh); -#ifdef CONFIG_NET_ALIAS - net_alias_init(); + dev_boot_phase = 0; + +#ifdef CONFIG_IP_PNP + ip_auto_config(); #endif - init_bh(NET_BH, net_bh); return 0; } diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 4aa6cbb0c33e..eaa1bd058cd6 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -42,7 +42,6 @@ #include #include #include -#include /* @@ -69,19 +68,6 @@ void dev_mc_upload(struct device *dev) if(!(dev->flags&IFF_UP)) return; - /* - * An aliased device should end up with the combined - * multicast list of all its aliases. - * Really, multicasting with logical interfaces is very - * subtle question. 
Now we DO forward multicast packets - * to logical interfcases, that doubles multicast - * traffic but allows mrouted to work. - * Alas, mrouted does not understand aliases even - * in 4.4BSD --ANK - */ - - dev = net_alias_main_dev(dev); - /* * Devices with no set multicast don't get set */ @@ -99,7 +85,6 @@ void dev_mc_upload(struct device *dev) void dev_mc_delete(struct device *dev, void *addr, int alen, int all) { struct dev_mc_list **dmi; - dev = net_alias_main_dev(dev); for(dmi=&dev->mc_list;*dmi!=NULL;dmi=&(*dmi)->next) { @@ -136,8 +121,6 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) { struct dev_mc_list *dmi; - dev = net_alias_main_dev(dev); - for(dmi=dev->mc_list;dmi!=NULL;dmi=dmi->next) { if(memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) @@ -165,12 +148,12 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) void dev_mc_discard(struct device *dev) { - if (net_alias_is(dev)) - return; while(dev->mc_list!=NULL) { struct dev_mc_list *tmp=dev->mc_list; dev->mc_list=dev->mc_list->next; + if (tmp->dmi_users) + printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); kfree_s(tmp,sizeof(*tmp)); } dev->mc_count=0; diff --git a/net/core/iovec.c b/net/core/iovec.c index 9bc21ffc57f2..10aa7a4cc6c0 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -192,69 +192,78 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, * * ip_build_xmit must ensure that when fragmenting only the last * call to this function will be unaligned also. - * - * FIXME: add an error handling path when a copy/checksum from - * user space failed because of a invalid pointer. 
*/ -unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, int offset, - int len, int csum) +int csum_partial_copy_fromiovecend(unsigned char *kdata, + struct iovec *iov, int offset, + int len, int *csump) { - __u32 partial; - __u32 partial_cnt = 0; + int partial_cnt = 0; + int err = 0; + int csum; - while(offset>0) - { - if (offset > iov->iov_len) - { - offset -= iov->iov_len; + do { + int copy = iov->iov_len - offset; - } - else - { - u8 *base; - int copy; + if (copy >= 0) { + u8 *base = iov->iov_base + offset; - base = iov->iov_base + offset; - copy = min(len, iov->iov_len - offset); - offset = 0; + /* Normal case (single iov component) is fastly detected */ + if (len <= copy) { + *csump = csum_partial_copy_from_user(base, kdata, + len, *csump, &err); + return err; + } partial_cnt = copy % 4; - if (partial_cnt) - { + if (partial_cnt) { copy -= partial_cnt; - copy_from_user(&partial, base + copy, - partial_cnt); + err |= copy_from_user(kdata+copy, base+copy, partial_cnt); } - /* - * FIXME: add exception handling to the - * csum functions and set *err when an - * exception occurs. - */ - csum = csum_partial_copy_fromuser(base, kdata, - copy, csum); + *csump = csum_partial_copy_from_user(base, kdata, + copy, *csump, &err); len -= copy + partial_cnt; kdata += copy + partial_cnt; + iov++; + break; } - iov++; - } + iov++; + offset = -copy; + } while (offset > 0); + + csum = *csump; while (len>0) { u8 *base = iov->iov_base; - int copy=min(len, iov->iov_len); + int copy = min(len, iov->iov_len); + /* There is a remnant from previous iov. */ if (partial_cnt) { int par_len = 4 - partial_cnt; - copy_from_user(&partial, base + partial_cnt, par_len); - csum = csum_partial((u8*) &partial, 4, csum); + /* iov component is too short ... 
*/ + if (par_len > copy) { + err |= copy_from_user(kdata, base, copy); + base += copy; + partial_cnt += copy; + kdata += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata-partial_cnt, partial_cnt, csum); + return err; + } + err |= copy_from_user(kdata, base, par_len); + csum = csum_partial(kdata-partial_cnt, 4, csum); base += par_len; copy -= par_len; + len -= par_len; + kdata += par_len; partial_cnt = 0; } @@ -264,16 +273,15 @@ unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, if (partial_cnt) { copy -= partial_cnt; - copy_from_user(&partial, base + copy, - partial_cnt); + err |= copy_from_user(kdata+copy, base + copy, partial_cnt); } } - csum = csum_partial_copy_fromuser(base, kdata, copy, csum); + csum = csum_partial_copy_from_user(base, kdata, copy, csum, &err); len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; } - - return csum; + *csump = csum; + return err; } diff --git a/net/core/net_alias.c b/net/core/net_alias.c deleted file mode 100644 index 807c2e93553b..000000000000 --- a/net/core/net_alias.c +++ /dev/null @@ -1,1464 +0,0 @@ -/* - * NET_ALIAS network device aliasing module. - * - * - * Version: @(#)net_alias.c 0.43 12/20/95 - * - * Authors: Juan Jose Ciarlante, - * Marcelo Fabian Roccasalva, - * - * Features: - * - AF_ independent: net_alias_type objects - * - AF_INET optimized - * - ACTUAL alias devices inserted in dev chain - * - fast hashed alias address lookup - * - net_alias_type objs registration/unreg., module-ables. - * - /proc/net/aliases & /proc/net/alias_types entries - * Fixes: - * JJC : several net_alias_type func. renamed. - * JJC : net_alias_type object methods now pass - * *this. - * JJC : xxx_rcv device selection based on - * addrs - * Andreas Schultz : Kerneld support. - * - * FIXME: - * - User calls sleep/wake_up locking. 
- * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_KERNELD -#include -#endif - -/* - * Only allow the following flags to pass from main device to aliases - */ - -#define NET_ALIAS_IFF_MASK (IFF_UP|IFF_RUNNING|IFF_NOARP|IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_MULTICAST) - -static struct net_alias_type * nat_getbytype(int type); -static int nat_attach_chg(struct net_alias_type *nat, int delta); -static int nat_bind(struct net_alias_type *nat,struct net_alias *alias, struct sockaddr *sa); -static int nat_unbind(struct net_alias_type *nat, struct net_alias *alias); - -static int net_alias_devinit(struct device *dev); -static int net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev); -static int net_alias_devsetup(struct net_alias *alias, struct net_alias_type *nat, struct sockaddr *sa); -static struct net_alias **net_alias_slow_findp(struct net_alias_info *alias_info, struct net_alias *alias); -static struct device *net_alias_dev_create(struct device *main_dev, int slot, int *err, struct sockaddr *sa, void *data); -static struct device *net_alias_dev_delete(struct device *main_dev, int slot, int *err); -static void net_alias_free(struct device *dev); - -/* - * net_alias_type base array, will hold net_alias_type obj hashed list - * heads. 
- */ - -struct net_alias_type *nat_base[16]; - - -/* - * Get net_alias_type ptr by type - */ - -extern __inline__ struct net_alias_type *nat_getbytype(int type) -{ - struct net_alias_type *nat; - for(nat = nat_base[type & 0x0f]; nat ; nat = nat->next) - { - if (nat->type == type) - return nat; - } - return NULL; -} - - -/* - * Get addr32 representation (pre-hashing) of address. - * If NULL nat->get_addr32, assume sockaddr_in struct (IP-ish). - */ - -extern __inline__ __u32 nat_addr32(struct net_alias_type *nat, struct sockaddr *sa) -{ - if (nat->get_addr32) - return nat->get_addr32(nat, sa); - else - return (*(struct sockaddr_in *)sa).sin_addr.s_addr; -} - - -/* - * Hashing code for alias_info->hash_tab entries - * 4 bytes -> 1/2 byte using xor complemented by af - */ - -extern __inline__ unsigned HASH(__u32 addr, int af) -{ - unsigned tmp = addr ^ (addr>>16); /* 4 -> 2 */ - tmp ^= (tmp>>8); /* 2 -> 1 */ - return (tmp^(tmp>>4)^af) & 0x0f; /* 1 -> 1/2 */ -} - - -/* - * get hash key for supplied net alias type and address - * nat must be !NULL - * the purpose here is to map a net_alias_type and a generic - * address to a hash code. 
- */ - -extern __inline__ int nat_hash_key(struct net_alias_type *nat, struct sockaddr *sa) -{ - return HASH(nat_addr32(nat,sa), sa->sa_family); -} - - -/* - * Change net_alias_type number of attachments (bindings) - */ - -static int nat_attach_chg(struct net_alias_type *nat, int delta) -{ - unsigned long flags; - int n_at; - if (!nat) - return -1; - save_flags(flags); - cli(); - n_at = nat->n_attach + delta; - if (n_at < 0) - { - restore_flags(flags); - printk(KERN_WARNING - "net_alias: tried to set n_attach < 0 for (family==%d) nat object.\n", - nat->type); - return -1; - } - nat->n_attach = n_at; - restore_flags(flags); - return 0; -} - - -/* - * Bind alias to its type (family) object and call initialization hook - */ - -extern __inline__ int nat_bind(struct net_alias_type *nat, - struct net_alias *alias, struct sockaddr *sa) -{ - if (nat->alias_init_1) - nat->alias_init_1(nat, alias, sa); - return nat_attach_chg(nat, +1); -} - - -/* - * Unbind alias from type object and call alias destructor - */ - -extern __inline__ int nat_unbind(struct net_alias_type *nat, - struct net_alias *alias) -{ - if (nat->alias_done_1) - nat->alias_done_1(nat, alias); - return nat_attach_chg(nat, -1); -} - - -/* - * Compare device address with given. if NULL nat->dev_addr_chk, - * compare dev->pa_addr with (sockaddr_in) 32 bits address (IP-ish) - */ - -static __inline__ int nat_dev_addr_chk_1(struct net_alias_type *nat, - struct device *dev, struct sockaddr *sa) -{ - if (nat->dev_addr_chk) - return nat->dev_addr_chk(nat, dev, sa); - else - return (dev->pa_addr == (*(struct sockaddr_in *)sa).sin_addr.s_addr); -} - - -/* - * Alias device init() - * do nothing. - */ - -static int net_alias_devinit(struct device *dev) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias_devinit(%s) called.\n", dev->name); -#endif - return 0; -} - - -/* - * 2 options for multicast: - * 1) fake it for aliases. - * 2) allow aliases and actual device to set it. 
- * current choice: option 1 - */ -static void net_alias_setmulticast(struct device *dev) -{ -} - - -/* - * Hard_start_xmit() should not be called. - * ignore ... but shout!. - */ - -static int net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev) -{ - printk(KERN_WARNING "net_alias: net_alias_hard_start_xmit() for %s called (ignored)!!\n", dev->name); - dev_kfree_skb(skb, FREE_WRITE); - return 0; -} - - -static int net_alias_open(struct device * dev) -{ - return 0; -} - -static int net_alias_close(struct device * dev) -{ - return 0; -} - -/* - * setups a new (alias) device - */ - -static int net_alias_devsetup(struct net_alias *alias, - struct net_alias_type *nat, struct sockaddr *sa) -{ - struct device *main_dev; - struct device *dev; - int family; - int i; - - /* - * - * generic device setup based on main_dev info - * - * FIXME: is NULL bitwise 0 for all Linux platforms? - */ - - main_dev = alias->main_dev; - dev = &alias->dev; - memset(dev, '\0', sizeof(struct device)); - family = (sa)? 
sa->sa_family : main_dev->family; - - dev->alias_info = NULL; /* no aliasing recursion */ - dev->my_alias = alias; /* point to alias */ - dev->name = alias->name; - dev->type = main_dev->type; - dev->open = net_alias_open; - dev->stop = net_alias_close; - if (main_dev->set_multicast_list) - dev->set_multicast_list = net_alias_setmulticast; - dev->hard_header_len = main_dev->hard_header_len; - memcpy(dev->broadcast, main_dev->broadcast, MAX_ADDR_LEN); - memcpy(dev->dev_addr, main_dev->dev_addr, MAX_ADDR_LEN); - dev->addr_len = main_dev->addr_len; - dev->init = net_alias_devinit; - dev->hard_start_xmit = net_alias_hard_start_xmit; - dev->flags = main_dev->flags & NET_ALIAS_IFF_MASK & ~IFF_UP; - dev->ifindex = dev_new_index(); - - /* - * Only makes sense if same family (arguable) - */ - - if (family == main_dev->family) - { - dev->metric = main_dev->metric; - dev->mtu = main_dev->mtu; - dev->pa_alen = main_dev->pa_alen; - dev->hard_header = main_dev->hard_header; - dev->hard_header_cache = main_dev->hard_header_cache; - dev->header_cache_update = main_dev->header_cache_update; - dev->rebuild_header = main_dev->rebuild_header; - } - - /* - * Fill in the generic fields of the device structure. - * not actually used, avoids some dev.c #ifdef's - */ - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - dev->family = family; - return 0; -} - - -/* - * Slow alias find (parse the whole hash_tab) - * returns: alias' pointer address - */ - -static struct net_alias **net_alias_slow_findp(struct net_alias_info - *alias_info, struct net_alias *alias) -{ - unsigned idx, n_aliases; - struct net_alias **aliasp; - - /* - * For each alias_info's hash_tab entry, for every alias ... 
- */ - - n_aliases = alias_info->n_aliases; - for (idx=0; idx < 16 ; idx++) - { - for (aliasp = &alias_info->hash_tab[idx];*aliasp; - aliasp = &(*aliasp)->next) - { - if (*aliasp == alias) - return aliasp; - else - if (--n_aliases == 0) - break; /* faster give up */ - } - } - return NULL; -} - - -/* - * Create alias device for main_dev with given slot num. - * if sa==NULL will create a same_family alias device. - */ - -static struct device *net_alias_dev_create(struct device *main_dev, int slot, - int *err, struct sockaddr *sa, void *data) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct net_alias_type *nat; - struct device *dev; - unsigned long flags; - int family; - __u32 addr32; - - /* FIXME: lock */ - - alias_info = main_dev->alias_info; - - /* - * If NULL address given, take family from main_dev - */ - - family = (sa)? sa->sa_family : main_dev->family; - - /* - * Check if wanted family has a net_alias_type object registered - */ - - nat = nat_getbytype(family); - if (!nat) - { -#ifdef CONFIG_KERNELD - char modname[20]; - sprintf (modname,"netalias-%d", family); - request_module(modname); - - nat = nat_getbytype(family); - if (!nat) - { -#endif - printk(KERN_WARNING "net_alias_dev_create(%s:%d): unregistered family==%d\n", - main_dev->name, slot, family); - /* *err = -EAFNOSUPPORT; */ - *err = -EINVAL; - return NULL; -#ifdef CONFIG_KERNELD - } -#endif - } - - /* - * Do not allow creation over downed devices - */ - - *err = -EIO; - - if (! 
(main_dev->flags & IFF_UP) ) - return NULL; - - /* - * If first alias, must also create alias_info - */ - - *err = -ENOMEM; - - if (!alias_info) - { - alias_info = kmalloc(sizeof(struct net_alias_info), GFP_KERNEL); - if (!alias_info) - return NULL; /* ENOMEM */ - memset(alias_info, 0, sizeof(struct net_alias_info)); - } - - if (!(alias = kmalloc(sizeof(struct net_alias), GFP_KERNEL))) - return NULL; /* ENOMEM */ - - memset(alias, 0, sizeof(struct net_alias)); - alias->slot = slot; - alias->main_dev = main_dev; - alias->nat = nat; - alias->next = NULL; - alias->data = data; - sprintf(alias->name, "%s:%d", main_dev->name, slot); - - /* - * Initialise alias' device structure - */ - - net_alias_devsetup(alias, nat, sa); - - dev = &alias->dev; - - save_flags(flags); - cli(); - - /* - * bind alias to its object type - * nat_bind calls nat->alias_init_1 - */ - - nat_bind(nat, alias, sa); - - /* - * If no address passed, take from device (could have been - * set by nat->alias_init_1) - */ - - addr32 = (sa)? nat_addr32(nat, sa) : alias->dev.pa_addr; - - /* - * Store hash key in alias: will speed-up rehashing and deletion - */ - - alias->hash = HASH(addr32, family); - - /* - * Insert alias in hashed linked list - */ - - aliasp = &alias_info->hash_tab[alias->hash]; - alias->next = *aliasp; - *aliasp = alias; - - /* - * If first alias ... 
- */ - - if (!alias_info->n_aliases++) - { - alias_info->taildev = main_dev; - main_dev->alias_info = alias_info; - } - - /* - * add device at tail (just after last main_dev alias) - */ - - dev->next = alias_info->taildev->next; - alias_info->taildev->next = dev; - alias_info->taildev = dev; - restore_flags(flags); - return dev; -} - - -/* - * Delete one main_dev alias (referred by its slot num) - */ - -static struct device *net_alias_dev_delete(struct device *main_dev, int slot, - int *err) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct device *dev; - unsigned n_aliases; - unsigned long flags; - struct net_alias_type *nat; - struct device *prevdev; - - /* FIXME: lock */ - *err = -ENODEV; - - if (main_dev == NULL) - return NULL; - - /* - * Does main_dev have aliases? - */ - - alias_info = main_dev->alias_info; - if (!alias_info) - return NULL; /* ENODEV */ - - n_aliases = alias_info->n_aliases; - - /* - * Find device that holds the same slot number (could also - * be strcmp() ala dev_get). - */ - - for (prevdev=main_dev, alias = NULL; - prevdev->next && n_aliases; prevdev = prevdev->next) - { - if (!(alias = prevdev->next->my_alias)) - { - printk(KERN_ERR "net_alias_dev_delete(): incorrect non-alias device after maindev\n"); - continue; /* or should give up? 
*/ - } - if (alias->slot == slot) - break; - alias = NULL; - n_aliases--; - } - - if (!alias) - return NULL; /* ENODEV */ - - dev = &alias->dev; - - /* - * Find alias hashed entry - */ - - for(aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; - aliasp = &(*aliasp)->next) - { - if(*aliasp == alias) - break; - } - - /* - * If not found (???), try a full search - */ - - if (*aliasp != alias) - { - if ((aliasp = net_alias_slow_findp(alias_info, alias))) - printk(KERN_WARNING "net_alias_dev_delete(%s): bad hashing recovered\n", alias->name); - else - { - printk(KERN_ERR "net_alias_dev_delete(%s): unhashed alias!\n",alias->name); - return NULL; /* ENODEV */ - } - } - nat = alias->nat; - - save_flags(flags); - cli(); - - /* - * Unbind alias from alias_type obj. - */ - - nat_unbind(nat, alias); - - /* - * Is alias at tail? - */ - - if ( dev == alias_info->taildev ) - alias_info->taildev = prevdev; - - /* - * Unlink and close device - */ - prevdev->next = dev->next; - dev_close(dev); - - /* - * Unlink alias - */ - - *aliasp = (*aliasp)->next; - if (--alias_info->n_aliases == 0) /* last alias */ - main_dev->alias_info = NULL; - - restore_flags(flags); - - /* - * Now free structures - */ - - kfree_s(alias, sizeof(struct net_alias)); - if (main_dev->alias_info == NULL) - kfree_s(alias_info, sizeof(struct net_alias_info)); - - /* - * Deletion ok (*err=0), NULL device returned. - */ - - *err = 0; - return NULL; -} - -/* - * Free all main device aliasing stuff - * will be called on dev_close(main_dev) - */ - -static void net_alias_free(struct device *main_dev) -{ - struct net_alias_info *alias_info; - struct net_alias *alias; - struct net_alias_type *nat; - struct device *dev; - unsigned long flags; - - /* - * Do I really have aliases? 
- */ - - if (!(alias_info = main_dev->alias_info)) - return; - - /* - * Fast device link "short-circuit": set main_dev->next to - * device after last alias - */ - - save_flags(flags); - cli(); - - dev = main_dev->next; - main_dev->next = alias_info->taildev->next; - main_dev->alias_info = NULL; - alias_info->taildev->next = NULL; - - restore_flags(flags); - - /* - * Loop over alias devices, free and dev_close() - */ - - while (dev) - { - if (net_alias_is(dev)) - { - alias = dev->my_alias; - if (alias->main_dev == main_dev) - { - /* - * unbind alias from alias_type object - */ - nat = alias->nat; - if (nat) - { - nat_unbind(nat, alias); - } /* else error/printk ??? */ - - dev_close(dev); - dev = dev->next; - - kfree_s(alias, sizeof(struct net_alias)); - continue; - } - else - printk(KERN_ERR "net_alias_free(%s): '%s' is not my alias\n", - main_dev->name, alias->name); - } - else - { - printk(KERN_ERR "net_alias_free(%s): found a non-alias after device!\n", - main_dev->name); - } - dev = dev->next; - } - - kfree_s(alias_info, sizeof(alias_info)); - return; -} - -/* - * dev_get() with added alias naming magic. - */ - -struct device *net_alias_dev_get(char *dev_name, int aliasing_ok, int *err, - struct sockaddr *sa, void *data) -{ - struct device *dev; - char *sptr,*eptr; - int slot = 0; - int delete = 0; - - *err = -ENODEV; - if ((dev=dev_get(dev_name))) - return dev; - - /* - * Want alias naming magic? 
- */ - - if (!aliasing_ok) - return NULL; - - if (!dev_name || !*dev_name) - return NULL; - - /* - * Find the first ':' , must be followed by, at least, 1 char - */ - - sptr=strchr(dev_name,':'); - if (sptr==NULL || !sptr[1]) - return NULL; - -#if 0 - for (sptr=dev_name ; *sptr ; sptr++) - if(*sptr==':') - break; - if (!*sptr || !*(sptr+1)) - return NULL; -#endif - /* - * Seems to be an alias name, fetch main device - */ - - *sptr='\0'; - if (!(dev=dev_get(dev_name))) - return NULL; - *sptr++=':'; - - /* - * Fetch slot number - */ - - slot = simple_strtoul(sptr,&eptr,10); - if (slot >= NET_ALIAS_MAX_SLOT) - return NULL; - - /* - * If last char is '-', it is a deletion request - */ - - if (eptr[0] == '-' && !eptr[1] ) - delete++; - else if (eptr[0]) - return NULL; - - /* - * Well... let's work. - */ - - if (delete) - return net_alias_dev_delete(dev, slot, err); - else - return net_alias_dev_create(dev, slot, err, sa, data); -} - - -/* - * Rehash alias device with address supplied. - */ - -int net_alias_dev_rehash(struct device *dev, struct sockaddr *sa) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct device *main_dev; - unsigned long flags; - struct net_alias_type *o_nat, *n_nat; - unsigned n_hash; - - /* - * Defensive ... - */ - - if (dev == NULL) - return -1; - if ( (alias = dev->my_alias) == NULL ) - return -1; - - if (!sa) - { - printk(KERN_ERR "net_alias_rehash(): NULL sockaddr passed\n"); - return -1; - } - - /* - * Defensive. should not happen. - */ - - if ( (main_dev = alias->main_dev) == NULL ) - { - printk(KERN_ERR "net_alias_rehash for %s: NULL maindev\n", alias->name); - return -1; - } - - /* - * Defensive. should not happen. - */ - - if (!(alias_info=main_dev->alias_info)) - { - printk(KERN_ERR "net_alias_rehash for %s: NULL alias_info\n", alias->name); - return -1; - } - - /* - * Will the request also change device family? 
- */ - - o_nat = alias->nat; - if (!o_nat) - { - printk(KERN_ERR "net_alias_rehash(%s): unbound alias.\n", alias->name); - return -1; - } - - /* - * Point to new alias_type obj. - */ - - if (o_nat->type == sa->sa_family) - n_nat = o_nat; - else - { - n_nat = nat_getbytype(sa->sa_family); - if (!n_nat) - { - printk(KERN_ERR "net_alias_rehash(%s): unreg family==%d.\n", alias->name, sa->sa_family); - return -1; - } - } - - /* - * New hash key. if same as old AND same type (family) return; - */ - - n_hash = nat_hash_key(n_nat, sa); - if (n_hash == alias->hash && o_nat == n_nat ) - return 0; - - /* - * Find alias in hashed list - */ - - for (aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; - aliasp = &(*aliasp)->next) - { - if (*aliasp == alias) - break; - } - - /* - * Not found (???). try a full search - */ - - if(!*aliasp) - { - if ((aliasp = net_alias_slow_findp(alias_info, alias))) - { - printk(KERN_WARNING - "net_alias_rehash(%s): bad hashing recovered\n", alias->name); - } - else - { - printk(KERN_ERR "net_alias_rehash(%s): unhashed alias!\n", alias->name); - return -1; - } - } - - save_flags(flags); - cli(); - - /* - * If type (family) changed, unlink from old type object (o_nat) - * Will call o_nat->alias_done_1() - */ - - if (o_nat != n_nat) - nat_unbind(o_nat, alias); - - /* - * If diff hash key, change alias position in hashed list - */ - - if (n_hash != alias->hash) - { - *aliasp = (*aliasp)->next; - alias->hash = n_hash; - aliasp = &alias_info->hash_tab[n_hash]; - alias->next = *aliasp; - *aliasp = alias; - } - - /* - * If type (family) changed link to new type object (n_nat) - * will call n_nat->alias_init_1() - */ - - if (o_nat != n_nat) - nat_bind(n_nat, alias, sa); - - restore_flags(flags); - return 0; -} - - - - -/* - * Implements /proc/net/alias_types entry - * Shows net_alias_type objects registered. 
- */ - -int net_alias_types_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) -{ - off_t pos=0, begin=0; - int len=0; - struct net_alias_type *nat; - unsigned idx; - len=sprintf(buffer,"type name n_attach\n"); - for (idx=0 ; idx < 16 ; idx++) - { - for (nat = nat_base[idx]; nat ; nat = nat->next) - { - len += sprintf(buffer+len, "%-7d %-15s %-7d\n", - nat->type, nat->name,nat->n_attach); - pos=begin+len; - if(posoffset+length) - break; - } - } - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - - -/* - * Implements /proc/net/aliases entry, shows alias devices. - * calls alias nat->alias_print_1 if not NULL and formats everything - * to a fixed rec. size without using local (stack) buffers - * - */ - -#define NET_ALIASES_RECSIZ 64 - -int net_alias_getinfo(char *buffer, char **start, off_t offset, - int length, int dummy) -{ - off_t pos=0, begin=0; - int len=0; - int dlen; - struct net_alias_type *nat; - struct net_alias *alias; - struct device *dev; - - len=sprintf(buffer,"%-*s\n",NET_ALIASES_RECSIZ-1,"device family address"); - for (dev = dev_base; dev ; dev = dev->next) - { - if (net_alias_is(dev)) - { - alias = dev->my_alias; - nat = alias->nat; - dlen=sprintf(buffer+len, "%-16s %-6d ", alias->name, alias->dev.family); - - /* - * Call alias_type specific print function. 
- */ - - if (nat->alias_print_1) - dlen += nat->alias_print_1(nat, alias, buffer+len+dlen, NET_ALIASES_RECSIZ - dlen); - else - dlen += sprintf(buffer+len+dlen, "-"); - - /* - * Fill with spaces if needed - */ - - if (dlen < NET_ALIASES_RECSIZ) - memset(buffer+len+dlen, ' ', NET_ALIASES_RECSIZ - dlen); - - /* - * Truncate to NET_ALIASES_RECSIZ - */ - - len += NET_ALIASES_RECSIZ; - buffer[len-1] = '\n'; - - pos=begin+len; - if(posoffset+length) - break; - } - } - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - - -/* - * Notifier for devices events - */ - -int net_alias_device_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct device *dev = ptr; - - if (event == NETDEV_DOWN) - { -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias: NETDEV_DOWN for %s received\n", dev->name); -#endif - if (net_alias_has(dev)) - net_alias_free(dev); - } - - if (event == NETDEV_UP) - { -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias: NETDEV_UP for %s received\n", dev->name); -#endif - dev->alias_info = 0; - } - - return NOTIFY_DONE; -} - - -/* - * Device aliases address comparison workhorse - * No checks for nat and alias_info, must be !NULL - */ - -extern __inline__ struct device *nat_addr_chk(struct net_alias_type *nat, - struct net_alias_info *alias_info, struct sockaddr *sa, int flags_on, int flags_off) -{ - struct net_alias *alias; - for(alias = alias_info->hash_tab[nat_hash_key(nat,sa)]; - alias; alias = alias->next) - { - if (alias->dev.family != sa->sa_family) - continue; - - /* - * Nat_dev_addr_chk_1 will call type specific address - * cmp function. - */ - - if (alias->dev.flags & flags_on && - !(alias->dev.flags & flags_off) && - nat_dev_addr_chk_1(nat,&alias->dev,sa)) - return &alias->dev; - } - return NULL; -} - -/* - * Nat_addr_chk enough for protocols whose addr is (fully) stored at - * pa_addr. Note that nat pointer is ignored because of static comparison. 
- */ - -extern __inline__ struct device *nat_addr_chk32(struct net_alias_type *nat, - struct net_alias_info *alias_info, int family, __u32 addr32, - int flags_on, int flags_off) -{ - struct net_alias *alias; - for (alias=alias_info->hash_tab[HASH(addr32,family)]; - alias; alias=alias->next) - { - if (alias->dev.family != family) - continue; - /* - * "hard" (static) comparison between addr32 and pa_addr. - */ - - if (alias->dev.flags & flags_on && !(alias->dev.flags & flags_off) && - addr32 == alias->dev.pa_addr) - return &alias->dev; - } - return NULL; -} - -/* - * Returns alias device with specified address AND flags_on AND flags_off, - * else NULL. - * Intended for main devices. - */ - -struct device *net_alias_dev_chk(struct device *main_dev, - struct sockaddr *sa,int flags_on, int flags_off) -{ - struct net_alias_info *alias_info = main_dev->alias_info; - struct net_alias_type *nat; - - /* - * Only if main_dev has aliases - */ - - if (!alias_info) - return NULL; - - /* - * Get alias_type object for sa->sa_family. - */ - - nat = nat_getbytype(sa->sa_family); - if (!nat) - return NULL; - - return nat_addr_chk(nat, alias_info, sa, flags_on, flags_off); -} - -/* - * net_alias_dev_chk enough for protocols whose addr is (fully) stored - * at pa_addr. - */ - -struct device *net_alias_dev_chk32(struct device *main_dev, int family, - __u32 addr32, int flags_on, int flags_off) -{ - struct net_alias_info *alias_info = main_dev->alias_info; - - /* - * only if main_dev has aliases - */ - - if (!alias_info) - return NULL; - return nat_addr_chk32(NULL, alias_info, family, addr32, - flags_on, flags_off); -} - - -/* - * Select closest (main or alias) device to addresses given. If - * there is no further info available, return main_dev (for easier - * calling arrangement). 
- * - * Should be called early at xxx_rcv() time for device selection - */ - -struct device *net_alias_dev_rcv_sel(struct device *main_dev, - struct sockaddr *sa_src, struct sockaddr *sa_dst) -{ - int family; - struct net_alias_type *nat; - struct net_alias_info *alias_info; - struct device *dev; - - if (main_dev == NULL) - return NULL; - - /* - * If not aliased, don't bother any more - */ - - if ((alias_info = main_dev->alias_info) == NULL) - return main_dev; - - /* - * Find out family - */ - - family = (sa_src)? sa_src->sa_family : - ((sa_dst)? sa_dst->sa_family : AF_UNSPEC); - - if (family == AF_UNSPEC) - return main_dev; - - /* - * Get net_alias_type object for this family - */ - - if ( (nat = nat_getbytype(family)) == NULL ) - return main_dev; - - /* - * First step: find out if dst addr is main_dev's or one of its - * aliases' - */ - - if (sa_dst) - { - if (nat_dev_addr_chk_1(nat, main_dev,sa_dst)) - return main_dev; - - dev = nat_addr_chk(nat, alias_info, sa_dst, IFF_UP, 0); - - if (dev != NULL) - return dev; - } - - /* - * Second step: find the rcv addr 'closest' alias through nat - * method call - */ - - if ( sa_src == NULL || nat->dev_select == NULL) - return main_dev; - - dev = nat->dev_select(nat, main_dev, sa_src); - - if (dev == NULL || dev->family != family) - return main_dev; - - /* - * Dev ok only if it is alias of main_dev - */ - - dev = net_alias_is(dev)? - ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; - - /* - * Do not return NULL. - */ - - return (dev)? dev : main_dev; - -} - -/* - * dev_rcv_sel32: dev_rcv_sel for 'pa_addr' protocols. 
- */ - -struct device *net_alias_dev_rcv_sel32(struct device *main_dev, int family, - __u32 src, __u32 dst) -{ - struct net_alias_type *nat; - struct net_alias_info *alias_info; - struct sockaddr_in sin_src; - struct device *dev; - - if (main_dev == NULL) - return NULL; - - /* - * If not aliased, don't bother any more - */ - - if ((alias_info = main_dev->alias_info) == NULL) - return main_dev; - - /* - * Early return if dst is main_dev's address - */ - - if (dst == main_dev->pa_addr) - return main_dev; - - if (family == AF_UNSPEC) - return main_dev; - - /* - * Get net_alias_type object for this family - */ - - if ( (nat = nat_getbytype(family)) == NULL ) - return main_dev; - - /* - * First step: find out if dst address one of main_dev aliases' - */ - - if (dst) - { - dev = nat_addr_chk32(nat, alias_info, family, dst, IFF_UP, 0); - if (dev) - return dev; - } - - /* - * Second step: find the rcv addr 'closest' alias through nat - * method call - */ - - if ( src == 0 || nat->dev_select == NULL) - return main_dev; - - sin_src.sin_family = family; - sin_src.sin_addr.s_addr = src; - - dev = nat->dev_select(nat, main_dev, (struct sockaddr *)&sin_src); - - if (dev == NULL || dev->family != family) - return main_dev; - - /* - * Dev ok only if it is alias of main_dev - */ - - dev = net_alias_is(dev)? - ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; - - /* - * Do not return NULL. - */ - - return (dev)? 
dev : main_dev; -} - - -/* - * Device event hook - */ - -static struct notifier_block net_alias_dev_notifier = -{ - net_alias_device_event, - NULL, - 0 -}; - -#ifndef ALIAS_USER_LAND_DEBUG -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_net_alias_types = { - PROC_NET_ALIAS_TYPES, 11, "alias_types", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - net_alias_types_getinfo -}; -static struct proc_dir_entry proc_net_aliases = { - PROC_NET_ALIASES, 7, "aliases", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - net_alias_getinfo -}; -#endif -#endif - -/* - * Net_alias initialisation called from net_dev_init(). - */ - -__initfunc(void net_alias_init(void)) -{ - - /* - * Register device events notifier - */ - - register_netdevice_notifier(&net_alias_dev_notifier); - - /* - * Register /proc/net entries - */ - -#ifndef ALIAS_USER_LAND_DEBUG -#ifdef CONFIG_PROC_FS - proc_net_register(&proc_net_alias_types); - proc_net_register(&proc_net_aliases); -#endif -#endif - -} - -/* - * Net_alias type object registering func. - */ - -int register_net_alias_type(struct net_alias_type *nat, int type) -{ - unsigned hash; - unsigned long flags; - if (!nat) - { - printk(KERN_ERR "register_net_alias_type(): NULL arg\n"); - return -EINVAL; - } - nat->type = type; - nat->n_attach = 0; - hash = nat->type & 0x0f; - save_flags(flags); - cli(); - nat->next = nat_base[hash]; - nat_base[hash] = nat; - restore_flags(flags); - return 0; -} - -/* - * Net_alias type object unreg. - */ - -int unregister_net_alias_type(struct net_alias_type *nat) -{ - struct net_alias_type **natp; - unsigned hash; - unsigned long flags; - - if (!nat) - { - printk(KERN_ERR "unregister_net_alias_type(): NULL arg\n"); - return -EINVAL; - } - - /* - * Only allow unregistration if it has no attachments - */ - - if (nat->n_attach) - { - printk(KERN_ERR "unregister_net_alias_type(): has %d attachments. 
failed\n", - nat->n_attach); - return -EINVAL; - } - hash = nat->type & 0x0f; - save_flags(flags); - cli(); - for (natp = &nat_base[hash]; *natp ; natp = &(*natp)->next) - { - if (nat==(*natp)) - { - *natp = nat->next; - restore_flags(flags); - return 0; - } - } - restore_flags(flags); - printk(KERN_ERR "unregister_net_alias_type(type=%d): not found!\n", nat->type); - return -EINVAL; -} - diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file mode 100644 index 000000000000..795e0d062ad9 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,436 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +atomic_t rtnl_rlockct; +struct wait_queue *rtnl_wait; + + +void rtnl_lock() +{ + rtnl_shlock(); + rtnl_exlock(); +} + +void rtnl_unlock() +{ + rtnl_exunlock(); + rtnl_shunlock(); +} + +#ifdef CONFIG_RTNETLINK +struct sock *rtnl; + +unsigned long rtnl_wlockct; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +#define _S 1 /* superuser privileges required */ +#define _X 2 /* exclusive access to tables required */ +#define _G 4 /* GET request */ + +static unsigned char rtm_properties[RTM_MAX-RTM_BASE+1] = +{ + _S|_X, /* RTM_NEWLINK */ + _S|_X, /* RTM_DELLINK */ + _G, /* RTM_GETLINK */ + 0, + + _S|_X, /* RTM_NEWADDR */ + _S|_X, /* RTM_DELADDR */ + _G, /* RTM_GETADDR */ + 0, + + _S|_X, /* RTM_NEWROUTE */ + _S|_X, /* RTM_DELROUTE */ + _G, /* RTM_GETROUTE */ + 0, + + _S|_X, /* RTM_NEWNEIGH */ + _S|_X, /* RTM_DELNEIGH */ + _G, /* RTM_GETNEIGH */ + 0, + + _S|_X, /* RTM_NEWRULE */ + _S|_X, /* RTM_DELRULE */ + _G, /* RTM_GETRULE */ + 0 +}; + +static int rtnetlink_get_rta(struct kern_rta *rta, struct rtattr *attr, int attrlen) +{ + void **rta_data = (void**)rta; + + while (RTA_OK(attr, attrlen)) { + int type = attr->rta_type; + if (type != RTA_UNSPEC) { + if (type > RTA_MAX) + return -EINVAL; + rta_data[type-1] = RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +static int rtnetlink_get_ifa(struct kern_ifa *ifa, struct rtattr *attr, int attrlen) +{ + void **ifa_data = (void**)ifa; + + while (RTA_OK(attr, attrlen)) { + int type = attr->rta_type; + if (type != IFA_UNSPEC) { + if (type > IFA_MAX) + return -EINVAL; + ifa_data[type-1] = RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +void __rta_fill(struct 
sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); +} + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, + int type, pid_t pid, u32 seq) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_addrlen = dev->addr_len; + r->ifi_address.sa_family = dev->type; + memcpy(&r->ifi_address.sa_data, dev->dev_addr, dev->addr_len); + r->ifi_broadcast.sa_family = dev->type; + memcpy(&r->ifi_broadcast.sa_data, dev->broadcast, dev->addr_len); + r->ifi_flags = dev->flags; + r->ifi_mtu = dev->mtu; + r->ifi_index = dev->ifindex; + r->ifi_link = dev->iflink; + strncpy(r->ifi_name, dev->name, IFNAMSIZ-1); + r->ifi_qdiscname[0] = 0; + r->ifi_qdisc = dev->qdisc_sleeping->handle; + if (dev->qdisc_sleeping->ops) + strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id); + return skb->len; + +nlmsg_failure: + return -1; +} + +int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct device *dev; + + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq) <= 0) + break; + } + cb->args[0] = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct device *dev) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifinfomsg)); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0) < 0) { + kfree_skb(skb, 0); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); +} + +static int rtnetlink_done(struct 
netlink_callback *cb) +{ + if (NETLINK_CREDS(cb->skb)->uid == 0 && cb->nlh->nlmsg_flags&NLM_F_ATOMIC) + rtnl_shunlock(); + return 0; +} + +/* Process one rtnetlink message. */ + +extern __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + union { + struct kern_rta rta; + struct kern_ifa ifa; + } u; + struct rtmsg *rtm; + struct ifaddrmsg *ifm; + int exclusive = 0; + int family; + int type; + int err; + + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + type = nlh->nlmsg_type; + if (type < RTM_BASE) + return 0; + if (type > RTM_MAX) + goto err_inval; + + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family > NPROTO || rtnetlink_links[family] == NULL) { + *errp = -EAFNOSUPPORT; + return -1; + } + if (rtm_properties[type-RTM_BASE]&_S) { + if (NETLINK_CREDS(skb)->uid) { + *errp = -EPERM; + return -1; + } + } + if (rtm_properties[type-RTM_BASE]&_G && nlh->nlmsg_flags&NLM_F_DUMP) { + if (rtnetlink_links[family][type-RTM_BASE].dumpit == NULL) + goto err_inval; + + /* Super-user locks all the tables to get atomic snapshot */ + if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_inc(&rtnl_rlockct); + if ((*errp = netlink_dump_start(rtnl, skb, nlh, + rtnetlink_links[family][type-RTM_BASE].dumpit, + rtnetlink_done)) != 0) { + if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_dec(&rtnl_rlockct); + return -1; + } + skb_pull(skb, NLMSG_ALIGN(nlh->nlmsg_len)); + return -1; + } + if (rtm_properties[type-RTM_BASE]&_X) { + if (rtnl_exlock_nowait()) { + *errp = 0; + return -1; + } + exclusive = 1; + } + + memset(&u, 0, sizeof(u)); + + switch (nlh->nlmsg_type) { + case RTM_NEWROUTE: + case RTM_DELROUTE: + case RTM_GETROUTE: + case RTM_NEWRULE: + case RTM_DELRULE: + case RTM_GETRULE: + rtm = NLMSG_DATA(nlh); + if (nlh->nlmsg_len < sizeof(*rtm)) + goto err_inval; + + if (rtm->rtm_optlen && + 
rtnetlink_get_rta(&u.rta, RTM_RTA(rtm), rtm->rtm_optlen) < 0) + goto err_inval; + break; + + case RTM_NEWADDR: + case RTM_DELADDR: + case RTM_GETADDR: + ifm = NLMSG_DATA(nlh); + if (nlh->nlmsg_len < sizeof(*ifm)) + goto err_inval; + + if (nlh->nlmsg_len > NLMSG_LENGTH(sizeof(*ifm)) && + rtnetlink_get_ifa(&u.ifa, IFA_RTA(ifm), + nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifm))) < 0) + goto err_inval; + break; + + case RTM_NEWLINK: + case RTM_DELLINK: + case RTM_GETLINK: + case RTM_NEWNEIGH: + case RTM_DELNEIGH: + case RTM_GETNEIGH: + /* Not urgent and even not necessary */ + default: + goto err_inval; + } + + if (rtnetlink_links[family][type-RTM_BASE].doit == NULL) + goto err_inval; + err = rtnetlink_links[family][type-RTM_BASE].doit(skb, nlh, (void *)&u); + + if (exclusive) + rtnl_exunlock(); + *errp = err; + return err; + +err_inval: + if (exclusive) + rtnl_exunlock(); + *errp = -EINVAL; + return -1; +} + +/* + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. + */ + +extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + int rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (rtnetlink_rcv_msg(skb, nlh, &err)) { + /* Not error, but we must interrupt processing here: + * Note, that in this case we do not pull message + * from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags&NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +/* + * rtnetlink input queue processing routine: + * - try to acquire shared lock. If it is failed, defer processing. 
+ * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, + * that will occur, when a dump started and/or acquisition of + * exclusive lock failed. + */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->receive_queue, skb); + else + kfree_skb(skb, FREE_READ); + break; + } + kfree_skb(skb, FREE_READ); + } + + rtnl_shunlock(); +} + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev); + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block rtnetlink_dev_notifier = { + rtnetlink_event, + NULL, + 0 +}; + + +__initfunc(void rtnetlink_init(void)) +{ +#ifdef RTNL_DEBUG + printk("Initializing RT netlink socket\n"); +#endif + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + if (rtnl == NULL) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + register_netdevice_notifier(&rtnetlink_dev_notifier); +} + + + +#endif diff --git a/net/core/scm.c b/net/core/scm.c index e5fa793a7052..5a6d24c4017f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -205,25 +205,25 @@ error: return err; } -void put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); int err; if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { msg->msg_flags |= MSG_CTRUNC; - return; + return 0; /* XXX: return error? check spec. 
*/ } if (msg->msg_controllen < cmlen) { msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - err = put_user(level, &cm->cmsg_level); - if (!err) - err = put_user(type, &cm->cmsg_type); - if (!err) - err = put_user(cmlen, &cm->cmsg_len); + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + err = copy_to_user(cm, &cmhdr, sizeof cmhdr); if (!err) err = copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)); if (!err) { @@ -231,6 +231,7 @@ void put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } + return err; } void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06c321e4fc03..6baf37c0352a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -176,7 +176,7 @@ struct sk_buff *alloc_skb(unsigned int size,int priority) skb->dst = NULL; skb->destructor = NULL; memset(skb->cb, 0, sizeof(skb->cb)); - skb->priority = SOPRI_NORMAL; + skb->priority = 0; atomic_inc(&net_skbcount); atomic_set(&skb->users, 1); diff --git a/net/core/sock.c b/net/core/sock.c index 65cee3b62169..16d5435ed89c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -101,6 +101,7 @@ #include #include #include +#include #include #include @@ -143,6 +144,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, int valbool; int err; struct linger ling; + struct ifreq req; int ret = 0; /* @@ -241,7 +243,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_PRIORITY: - if (val >= 0 && val < DEV_NUMBUFFS) + if (val >= 0 && val <= 7) sk->priority = val; else return(-EINVAL); @@ -317,6 +319,46 @@ int sock_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; break; #endif + case SO_BINDTODEVICE: + /* Bind this socket to a particular device like "eth0", + * as specified in an ifreq structure. If the device + * is "", socket is NOT bound to a device. 
+ */ + + if (!valbool) { + sk->bound_dev_if = 0; + } + else { + if (copy_from_user(&req, optval, sizeof(req)) < 0) + return -EFAULT; + + /* Remove any cached route for this socket. */ + if (sk->dst_cache) { + ip_rt_put((struct rtable*)sk->dst_cache); + sk->dst_cache = NULL; + } + + if (req.ifr_ifrn.ifrn_name[0] == '\0') { + sk->bound_dev_if = 0; + } + else { + struct device *dev = dev_get(req.ifr_ifrn.ifrn_name); + if (!dev) + return -EINVAL; + sk->bound_dev_if = dev->ifindex; + if (sk->daddr) { + int ret; + ret = ip_route_output((struct rtable**)&sk->dst_cache, + sk->daddr, sk->saddr, + sk->ip_tos, sk->bound_dev_if); + if (ret) + return ret; + } + } + } + return 0; + + /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: @@ -627,7 +669,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne produce annoying no free page messages still.... */ skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); if(!skb) - skb=sock_wmalloc(sk, fallback, 0, GFP_KERNEL); + skb=sock_wmalloc(sk, fallback, 0, sk->allocation); } /* @@ -669,7 +711,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne * In any case I'd delete this check at all, or * change it to: */ - if (atomic_read(&sk->wmem_alloc) + size >= sk->sndbuf) + if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) #endif { sk->socket->flags &= ~SO_NOSPACE; @@ -967,7 +1009,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->allocation = GFP_KERNEL; sk->rcvbuf = sysctl_rmem_default*2; sk->sndbuf = sysctl_wmem_default*2; - sk->priority = SOPRI_NORMAL; sk->state = TCP_CLOSE; sk->zapped = 1; sk->socket = sock; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index bdc6b37fd582..47417a27a9a1 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -232,6 +232,13 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) return htons(ETH_P_802_2); } +int eth_header_parse(struct sk_buff *skb, unsigned char *haddr) +{ + 
struct ethhdr *eth = skb->mac.ethernet; + memcpy(haddr, eth->h_source, ETH_ALEN); + return ETH_ALEN; +} + int eth_header_cache(struct dst_entry *dst, struct neighbour *neigh, struct hh_cache *hh) { diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 3a5ac3b04590..ea50576aba47 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -2,6 +2,25 @@ # IP configuration # bool 'IP: multicasting' CONFIG_IP_MULTICAST +bool 'IP: advanced router' CONFIG_IP_ADVANCED_ROUTER +if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then + define_bool CONFIG_RTNETLINK y + bool 'IP: policy routing' CONFIG_IP_MULTIPLE_TABLES + bool 'IP: equal cost multipath' CONFIG_IP_ROUTE_MULTIPATH + bool 'IP: use TOS value as routing key' CONFIG_IP_ROUTE_TOS + bool 'IP: verbose route monitoring' CONFIG_IP_ROUTE_VERBOSE + bool 'IP: large routing tables' CONFIG_IP_ROUTE_LARGE_TABLES + if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'IP: fast network address translation' CONFIG_IP_ROUTE_NAT + fi +fi +bool 'IP: kernel level autoconfiguration' CONFIG_IP_PNP +if [ "$CONFIG_IP_PNP" = "y" ]; then + bool ' BOOTP support' CONFIG_IP_PNP_BOOTP + bool ' RARP support' CONFIG_IP_PNP_RARP +# not yet ready.. +# bool ' ARP support' CONFIG_IP_PNP_ARP +fi if [ "$CONFIG_FIREWALL" = "y" ]; then bool 'IP: firewalling' CONFIG_IP_FIREWALL if [ "$CONFIG_IP_FIREWALL" = "y" ]; then @@ -9,23 +28,29 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK fi bool 'IP: firewall packet logging' CONFIG_IP_FIREWALL_VERBOSE - bool 'IP: masquerading' CONFIG_IP_MASQUERADE - if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then - comment 'Protocol-specific masquerading support will be built as modules.' 
- fi bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY bool 'IP: always defragment' CONFIG_IP_ALWAYS_DEFRAG fi fi bool 'IP: accounting' CONFIG_IP_ACCT +bool 'IP: masquerading' CONFIG_IP_MASQUERADE +if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' +fi bool 'IP: optimize as router not host' CONFIG_IP_ROUTER tristate 'IP: tunneling' CONFIG_NET_IPIP +tristate 'IP: GRE tunnels over IP' CONFIG_NET_IPGRE if [ "$CONFIG_IP_MULTICAST" = "y" ]; then + if [ "$CONFIG_NET_IPGRE" != "n" ]; then + bool 'IP: broadcast GRE over IP' CONFIG_NET_IPGRE_BROADCAST + fi bool 'IP: multicast routing' CONFIG_IP_MROUTE + if [ "$CONFIG_IP_MROUTE" = "y" ]; then + bool 'IP: PIM-SM version 1 support' CONFIG_IP_PIMSM_V1 + bool 'IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2 + fi fi -if [ "$CONFIG_NET_ALIAS" = "y" ]; then - tristate 'IP: aliasing support' CONFIG_IP_ALIAS -fi +tristate 'IP: aliasing support' CONFIG_IP_ALIAS if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then if [ "$CONFIG_NETLINK" = "y" ]; then bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD @@ -33,9 +58,9 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then fi bool 'IP: TCP syncookie support (not enabled per default) ' CONFIG_SYN_COOKIES comment '(it is safe to leave these untouched)' -bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP +#bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP tristate 'IP: Reverse ARP' CONFIG_INET_RARP -bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY +#bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY #bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF bool 'IP: Drop source routed frames' CONFIG_IP_NOSR bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 2428ccc55f57..759def7ea5e9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,17 +8,25 @@ # 
Note 2! The CFLAGS definition is now in the main makefile... O_TARGET := ipv4.o -IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ +IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\ raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o ip_fw.o \ - sysctl_net_ipv4.o fib.o ip_nat_dumb.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o IPV4X_OBJS := MOD_LIST_NAME := IPV4_MODULES M_OBJS := +ifeq ($(CONFIG_IP_MULTIPLE_TABLES),y) +IPV4_OBJS += fib_rules.o +endif + +ifeq ($(CONFIG_IP_ROUTE_NAT),y) +IPV4_OBJS += ip_nat_dumb.o +endif + ifeq ($(CONFIG_IP_MROUTE),y) IPV4_OBJS += ipmr.o endif @@ -32,10 +40,18 @@ else endif ifeq ($(CONFIG_NET_IPIP),y) -IPV4_OBJS += ipip.o +IPV4X_OBJS += ipip.o else ifeq ($(CONFIG_NET_IPIP),m) - M_OBJS += ipip.o + MX_OBJS += ipip.o + endif +endif + +ifeq ($(CONFIG_NET_IPGRE),y) +IPV4X_OBJS += ip_gre.o +else + ifeq ($(CONFIG_NET_IPGRE),m) + MX_OBJS += ip_gre.o endif endif @@ -44,19 +60,15 @@ IPV4X_OBJS += ip_masq.o ip_masq_app.o M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o endif -ifeq ($(CONFIG_IP_ALIAS),y) -IPV4_OBJS += ip_alias.o -else - ifeq ($(CONFIG_IP_ALIAS),m) - M_OBJS += ip_alias.o - endif -endif - ifeq ($(CONFIG_SYN_COOKIES),y) IPV4_OBJS += syncookies.o # module not supported, because it would be too messy. endif +ifeq ($(CONFIG_IP_PNP),y) +IPV4_OBJS += ipconfig.o +endif + ifdef CONFIG_INET O_OBJS := $(IPV4_OBJS) OX_OBJS := $(IPV4X_OBJS) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4bf4bf1660c2..ca3ff3213749 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: @(#)af_inet.c (from sock.c) 1.0.17 06/02/93 + * Version: $Id: af_inet.c,v 1.58 1997/10/29 20:27:21 kuznet Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -94,14 +95,15 @@ #include #include #include +#include #include #include +#ifdef CONFIG_IP_MROUTE +#include +#endif #ifdef CONFIG_IP_MASQUERADE #include #endif -#ifdef CONFIG_IP_ALIAS -#include -#endif #ifdef CONFIG_BRIDGE #include #endif @@ -115,13 +117,13 @@ #define min(a,b) ((a)<(b)?(a):(b)) extern int sysctl_core_destroy_delay; -extern struct proto packet_prot; + extern int raw_get_info(char *, char **, off_t, int, int); extern int snmp_get_info(char *, char **, off_t, int, int); extern int afinet_get_info(char *, char **, off_t, int, int); extern int tcp_get_info(char *, char **, off_t, int, int); extern int udp_get_info(char *, char **, off_t, int, int); - +extern void ip_mc_drop_socket(struct sock *sk); #ifdef CONFIG_DLCI extern int dlci_ioctl(unsigned int, void*); @@ -165,9 +167,8 @@ static __inline__ void kill_sk_now(struct sock *sk) /* No longer exists. */ del_from_prot_sklist(sk); - /* This is gross, but needed for SOCK_PACKET -DaveM */ - if(sk->prot->unhash) - sk->prot->unhash(sk); + /* Remove from protocol hash chains. */ + sk->prot->unhash(sk); if(sk->opt) kfree(sk->opt); @@ -321,13 +322,24 @@ static int inet_create(struct socket *sock, int protocol) struct sock *sk; struct proto *prot; + /* Compatibility */ + if (sock->type == SOCK_PACKET) { + static int warned; + if (net_families[AF_PACKET]==NULL) + return -ESOCKTNOSUPPORT; + if (!warned++) + printk(KERN_INFO "%s uses obsolete (AF_INET,SOCK_PACKET)\n", current->comm); + return net_families[AF_PACKET]->create(sock, protocol); + } + sock->state = SS_UNCONNECTED; sk = sk_alloc(AF_INET, GFP_KERNEL); if (sk == NULL) goto do_oom; - /* Note for tcp that also wiped the dummy_th block for us. */ - if(sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET) { + switch (sock->type) { + case SOCK_STREAM: + /* Note for tcp that also wiped the dummy_th block for us. 
*/ if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; @@ -338,7 +350,10 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_pmtudisc = IP_PMTUDISC_WANT; prot = &tcp_prot; sock->ops = &inet_stream_ops; - } else if(sock->type == SOCK_DGRAM) { + break; + case SOCK_SEQPACKET: + goto free_and_badtype; + case SOCK_DGRAM: if (protocol && protocol != IPPROTO_UDP) goto free_and_noproto; protocol = IPPROTO_UDP; @@ -346,21 +361,26 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; - } else if(sock->type == SOCK_RAW || sock->type == SOCK_PACKET) { + break; + case SOCK_RAW: if (!suser()) goto free_and_badperm; if (!protocol) goto free_and_noproto; - prot = (sock->type == SOCK_RAW) ? &raw_prot : &packet_prot; + prot = &raw_prot; sk->reuse = 1; sk->ip_pmtudisc = IP_PMTUDISC_DONT; sk->num = protocol; sock->ops = &inet_dgram_ops; - } else { + if (protocol == IPPROTO_RAW) + sk->ip_hdrincl = 1; + break; + default: goto free_and_badtype; } sock_init_data(sock,sk); + sk->destruct = NULL; sk->zapped=0; @@ -378,11 +398,6 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_ttl=ip_statistics.IpDefaultTTL; - if(sk->type==SOCK_RAW && protocol==IPPROTO_RAW) - sk->ip_hdrincl=1; - else - sk->ip_hdrincl=0; - sk->ip_mc_loop=1; sk->ip_mc_ttl=1; sk->ip_mc_index=0; @@ -398,11 +413,10 @@ static int inet_create(struct socket *sock, int protocol) * creation time automatically * shares. */ - sk->dummy_th.source = ntohs(sk->num); + sk->dummy_th.source = htons(sk->num); - /* This is gross, but needed for SOCK_PACKET -DaveM */ - if(sk->prot->hash) - sk->prot->hash(sk); + /* Add to protocol hash chains. */ + sk->prot->hash(sk); add_to_prot_sklist(sk); } @@ -482,7 +496,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned short snum; int chk_addr_ret; - /* If the socket has its own bind function then use it. 
(RAW and PACKET) */ + /* If the socket has its own bind function then use it. (RAW) */ if(sk->prot->bind) return sk->prot->bind(sk, uaddr, addr_len); @@ -503,12 +517,12 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (snum < PROT_SOCK && !suser()) return(-EACCES); - chk_addr_ret = __ip_chk_addr(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && - chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) { + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { #ifdef CONFIG_IP_TRANSPARENT_PROXY /* Superuser may bind to any address to allow transparent proxying. */ - if(!suser()) + if(chk_addr_ret != RTN_UNICAST || !suser()) #endif return -EADDRNOTAVAIL; /* Source address MUST be ours! */ } @@ -521,7 +535,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * which case the sending device address is used. */ sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; - if(chk_addr_ret == IS_MULTICAST || chk_addr_ret == IS_BROADCAST) + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. 
*/ @@ -529,7 +543,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EADDRINUSE; sk->num = snum; - sk->dummy_th.source = ntohs(snum); + sk->dummy_th.source = htons(snum); sk->daddr = 0; sk->dummy_th.dest = 0; sk->prot->rehash(sk); @@ -868,9 +882,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCDARP: case SIOCGARP: case SIOCSARP: - case OLD_SIOCDARP: - case OLD_SIOCGARP: - case OLD_SIOCSARP: return(arp_ioctl(cmd,(void *) arg)); case SIOCDRARP: case SIOCGRARP: @@ -889,10 +900,12 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: return(devinet_ioctl(cmd,(void *) arg)); case SIOCGIFCONF: case SIOCGIFFLAGS: - case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMETRIC: @@ -908,9 +921,10 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFMAP: case SIOCSIFSLAVE: case SIOCGIFSLAVE: - case SIOGIFINDEX: - case SIOGIFNAME: - case SIOCGIFCOUNT: + case SIOCGIFINDEX: + case SIOCGIFNAME: + case SIOCGIFCOUNT: + case SIOCSIFHWBROADCAST: return(dev_ioctl(cmd,(void *) arg)); case SIOCGIFBR: @@ -1105,6 +1119,16 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) icmp_init(&inet_family_ops); + /* I wish inet_add_protocol had no constructor hook... 
+ I had to move IPIP from net/ipv4/protocol.c :-( --ANK + */ +#ifdef CONFIG_NET_IPIP + ipip_init(); +#endif +#ifdef CONFIG_NET_IPGRE + ipgre_init(); +#endif + /* * Set the firewalling up */ @@ -1114,21 +1138,13 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) #ifdef CONFIG_IP_MASQUERADE ip_masq_init(); #endif - + /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) ip_mr_init(); #endif - - /* - * Initialise AF_INET alias type (register net_alias_type) - */ - -#if defined(CONFIG_IP_ALIAS) - ip_alias_init(); -#endif #ifdef CONFIG_INET_RARP rarp_ioctl_hook = rarp_ioctl; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 472f648117dd..26cc21977797 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,4 +1,6 @@ /* linux/net/inet/arp.c + * + * Version: $Id: arp.c,v 1.56 1997/11/24 12:51:47 freitag Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -58,6 +60,8 @@ * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. 
*/ /* RFC1122 Status: @@ -105,7 +109,6 @@ #include #endif #endif -#include #ifdef CONFIG_ARPD #include #endif @@ -251,6 +254,7 @@ static atomic_t arp_unres_size = ATOMIC_INIT(0); #ifdef CONFIG_ARPD static int arpd_not_running; static int arpd_stamp; +struct sock *arpd_sk; #endif static void arp_check_expire (unsigned long); @@ -428,8 +432,6 @@ static void arpd_send(int req, u32 addr, struct device * dev, char *ha, static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) { - if (arpd_not_running) - return; arpd_send(ARPD_UPDATE, ip, dev, ha, jiffies); } @@ -440,8 +442,6 @@ static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) static __inline__ void arpd_lookup(u32 addr, struct device * dev) { - if (arpd_not_running) - return; arpd_send(ARPD_LOOKUP, addr, dev, NULL, 0); } @@ -451,13 +451,11 @@ static __inline__ void arpd_lookup(u32 addr, struct device * dev) static __inline__ void arpd_flush(struct device * dev) { - if (arpd_not_running) - return; arpd_send(ARPD_FLUSH, 0, dev, NULL, 0); } -static int arpd_callback(int minor, struct sk_buff *skb) +static int arpd_callback(struct sk_buff *skb, struct sock *sk) { struct device * dev; struct arpd_request *retreq; @@ -484,7 +482,9 @@ static int arpd_callback(int minor, struct sk_buff *skb) /* * Invalid mapping: drop it and send ARP broadcast. 
*/ - arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, dev->pa_addr, NULL, + arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, + inet_select_addr(dev, retreq->ip, RT_SCOPE_LINK), + NULL, dev->dev_addr, NULL); } else @@ -658,8 +658,8 @@ static void arp_check_expire(unsigned long dummy) entry->timer.expires = jiffies + ARP_CONFIRM_TIMEOUT; add_timer(&entry->timer); arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, - dev, dev->pa_addr, entry->u.neigh.ha, - dev->dev_addr, NULL); + dev, inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), + entry->u.neigh.ha, dev->dev_addr, NULL); #if RT_CACHE_DEBUG >= 2 printk("arp_expire: %08x requires confirmation\n", entry->ip); #endif @@ -710,7 +710,8 @@ static void arp_expire_request (unsigned long arg) /* Set new timer. */ entry->timer.expires = jiffies + sysctl_arp_res_time; add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), entry->retries > sysctl_arp_max_tries ? 
entry->u.neigh.ha : NULL, dev->dev_addr, NULL); return; @@ -749,7 +750,8 @@ static void arp_expire_request (unsigned long arg) entry->timer.expires = jiffies + sysctl_arp_dead_res_time; add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), NULL, dev->dev_addr, NULL); return; } @@ -797,9 +799,7 @@ static struct arp_table * arp_alloc(int how) entry = (struct arp_table *)neigh_alloc(sizeof(struct arp_table), &arp_neigh_ops); - - if (entry != NULL) - { + if (entry != NULL) { atomic_set(&entry->u.neigh.refcnt, 1); if (how) @@ -953,19 +953,19 @@ static __inline__ struct arp_table *arp_lookup(u32 paddr, struct device * dev) for (entry = arp_tables[HASH(paddr)]; entry != NULL; entry = entry->u.next) if (entry->ip == paddr && entry->u.neigh.dev == dev) - return entry; - return NULL; + break; + return entry; } static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev) { switch (addr_hint) { - case IS_MYADDR: + case RTN_LOCAL: printk(KERN_DEBUG "ARP: arp called for own IP address\n"); memcpy(haddr, dev->dev_addr, dev->addr_len); return 1; - case IS_MULTICAST: + case RTN_MULTICAST: if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802 || dev->type==ARPHRD_FDDI) { @@ -985,7 +985,7 @@ static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, s * If a device does not support multicast broadcast the stuff (eg AX.25 for now) */ - case IS_BROADCAST: + case RTN_BROADCAST: memcpy(haddr, dev->broadcast, dev->addr_len); return 1; } @@ -1007,11 +1007,17 @@ static void arp_start_resolution(struct arp_table *entry) else #endif arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, - dev->pa_addr, NULL, dev->dev_addr, NULL); + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), NULL, + dev->dev_addr, NULL); } /* * Create a new unresolved entry. 
+ * + * NOTE: Always make sure no possibility of sleeping is introduced here, + * since nearly all callers are inside of BH atomic. Don't let + * the arp_alloc() fool you, at neigh_alloc() it is using GFP_ATOMIC + * always. */ struct arp_table * arp_new_entry(u32 paddr, struct device *dev, struct sk_buff *skb) @@ -1049,7 +1055,6 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) struct device *dev = skb->dev; u32 paddr; struct arp_table *entry; - unsigned long hash; if (!skb->dst) { printk(KERN_DEBUG "arp_find called with dst==NULL\n"); @@ -1058,14 +1063,11 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(__ip_chk_addr(paddr), haddr, paddr, dev)) { - if (skb) - skb->arp = 1; + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) { + skb->arp = 1; return 0; } - hash = HASH(paddr); - start_bh_atomic(); /* @@ -1079,8 +1081,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) { entry->u.neigh.lastused = jiffies; memcpy(haddr, entry->u.neigh.ha, dev->addr_len); - if (skb) - skb->arp = 1; + skb->arp = 1; end_bh_atomic(); return 0; } @@ -1090,24 +1091,17 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) * queue the packet with the previous attempt */ - if (skb != NULL) - { - if (entry->last_updated) - { - if (entry->u.neigh.arp_queue.qlen < ARP_MAX_UNRES_PACKETS) - skb_queue_tail(&entry->u.neigh.arp_queue, skb); - else - kfree_skb(skb, FREE_WRITE); - } - /* - * If last_updated==0 host is dead, so - * drop skb's and set socket error. - */ + if (entry->last_updated) { + if (entry->u.neigh.arp_queue.qlen < ARP_MAX_UNRES_PACKETS) + skb_queue_tail(&entry->u.neigh.arp_queue, skb); else - { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); kfree_skb(skb, FREE_WRITE); - } + } else { + /* If last_updated==0 host is dead, so + * drop skb's and set socket error. 
+ */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + kfree_skb(skb, FREE_WRITE); } end_bh_atomic(); return 1; @@ -1115,7 +1109,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) entry = arp_new_entry(paddr, dev, skb); - if (skb != NULL && !entry) + if (entry == NULL) kfree_skb(skb, FREE_WRITE); end_bh_atomic(); @@ -1129,12 +1123,13 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, struct device *dev = dst->dev; u32 paddr = rt->rt_gateway; struct arp_table *entry; - unsigned long hash; if (!neigh) { - if ((rt->rt_flags & RTF_MULTICAST) && - (dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802)) + if (rt->rt_type == RTN_MULTICAST && + (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_IEEE802 || + dev->type == ARPHRD_FDDI)) { u32 taddr; haddr[0]=0x01; @@ -1148,12 +1143,12 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, haddr[3]=taddr&0x7f; return 1; } - if (rt->rt_flags & (RTF_BROADCAST|RTF_MULTICAST)) + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { memcpy(haddr, dev->broadcast, dev->addr_len); return 1; } - if (rt->rt_flags & RTF_LOCAL) + if (rt->rt_flags & RTCF_LOCAL) { printk(KERN_DEBUG "ARP: arp called for own IP address\n"); memcpy(haddr, dev->dev_addr, dev->addr_len); @@ -1162,8 +1157,6 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, return 0; } - hash = HASH(paddr); - start_bh_atomic(); entry = (struct arp_table*)neigh; @@ -1187,17 +1180,14 @@ struct neighbour* arp_find_neighbour(struct dst_entry *dst, int resolve) struct device *dev = rt->u.dst.dev; u32 paddr = rt->rt_gateway; struct arp_table *entry; - unsigned long hash; if (dst->ops->family != AF_INET) return NULL; if ((dev->flags & (IFF_LOOPBACK|IFF_NOARP)) || - (rt->rt_flags & (RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST))) + (rt->rt_flags & (RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST))) return NULL; - hash = HASH(paddr); - start_bh_atomic(); /* @@ -1213,8 +1203,10 @@ struct neighbour* arp_find_neighbour(struct dst_entry *dst, int 
resolve) return (struct neighbour*)entry; } - if (!resolve) + if (!resolve) { + end_bh_atomic(); return NULL; + } entry = arp_new_entry(paddr, dev, NULL); @@ -1256,17 +1248,19 @@ void arp_send(int type, int ptype, u32 dest_ip, */ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) - + dev->hard_header_len, GFP_ATOMIC); + + dev->hard_header_len + 15, GFP_ATOMIC); if (skb == NULL) { printk(KERN_DEBUG "ARP: no memory to send an arp packet\n"); return; } - skb_reserve(skb, dev->hard_header_len); + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->nh.raw = skb->data; arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); skb->arp = 1; skb->dev = dev; - skb->protocol = htons (ETH_P_ARP); + skb->protocol = __constant_htons (ETH_P_ARP); /* * Fill the device header for the ARP frame @@ -1295,7 +1289,7 @@ void arp_send(int type, int ptype, u32 dest_ip, arp->ar_pro = (dev->type != ARPHRD_AX25) ? htons(ETH_P_IP) : htons(AX25_P_IP); #endif #else - arp->ar_pro = htons(ETH_P_IP); + arp->ar_pro = __constant_htons(ETH_P_IP); #endif arp->ar_hln = dev->addr_len; arp->ar_pln = 4; @@ -1319,6 +1313,20 @@ void arp_send(int type, int ptype, u32 dest_ip, dev_queue_xmit(skb); } +static __inline__ int arp_check_published(u32 tip, struct device *dev) +{ + struct arp_table *entry; + + for (entry = arp_proxy_list; entry; entry = entry->u.next) { + if (!((entry->ip^tip)&entry->mask) && + ((!entry->u.neigh.dev && + (!(entry->flags & ATF_COM) || entry->hatype == dev->type)) + || entry->u.neigh.dev == dev) ) + break; + } + + return entry && !(entry->flags & ATF_DONTPUB); +} /* * Receive an arp request by the device layer. 
@@ -1331,6 +1339,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct rtable *rt; unsigned char *sha, *tha; u32 sip, tip; + u16 dev_type = dev->type; /* * The hardware length of the packet should match the hardware length @@ -1339,45 +1348,38 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * is not from an IP number. We can't currently handle this, so toss * it. */ -#if defined(CONFIG_FDDI) || defined(CONFIG_AP1000) - if (dev->type == ARPHRD_FDDI) +#if defined(CONFIG_FDDI) + if (dev_type == ARPHRD_FDDI) { /* * According to RFC 1390, FDDI devices should accept ARP hardware types * of 1 (Ethernet). However, to be more robust, we'll accept hardware * types of either 1 (Ethernet) or 6 (IEEE 802.2). */ + if (arp->ar_hln != dev->addr_len || ((ntohs(arp->ar_hrd) != ARPHRD_ETHER) && (ntohs(arp->ar_hrd) != ARPHRD_IEEE802)) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || arp->ar_pln != 4) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; } else { if (arp->ar_hln != dev->addr_len || - dev->type != ntohs(arp->ar_hrd) || + dev_type != ntohs(arp->ar_hrd) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || arp->ar_pln != 4) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; } #else if (arp->ar_hln != dev->addr_len || - dev->type != ntohs(arp->ar_hrd) || + dev_type != ntohs(arp->ar_hrd) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || - arp->ar_pln != 4) { - kfree_skb(skb, FREE_READ); - return 0; - } + arp->ar_pln != 4) + goto out; #endif /* @@ -1387,24 +1389,18 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * problem, so toss the packet. 
*/ - switch (dev->type) + switch (dev_type) { #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) case ARPHRD_AX25: if(arp->ar_pro != htons(AX25_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; #endif #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) case ARPHRD_NETROM: if(arp->ar_pro != htons(AX25_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; #endif case ARPHRD_ETHER: @@ -1412,23 +1408,19 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) case ARPHRD_METRICOM: case ARPHRD_IEEE802: case ARPHRD_FDDI: + case ARPHRD_IPGRE: if(arp->ar_pro != htons(ETH_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; default: printk(KERN_ERR "ARP: dev->type mangled!\n"); - kfree_skb(skb, FREE_READ); - return 0; + goto out; } /* * Extract fields */ - sha=arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -1440,21 +1432,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (LOOPBACK(tip) || MULTICAST(tip)) { - kfree_skb(skb, FREE_READ); - return 0; - } - if (ip_route_input(skb, tip, sip, 0, dev)) { - kfree_skb(skb, FREE_READ); - return 0; - } - dev = skb->dev; - rt = (struct rtable*)skb->dst; - if (dev->type != ntohs(arp->ar_hrd) || dev->flags&IFF_NOARP || - rt->rt_flags&RTF_BROADCAST) { - kfree_skb(skb, FREE_READ); - return 0; - } + if (LOOPBACK(tip) || MULTICAST(tip)) + goto out; /* * Process entry. The idea here is we want to send a reply if it is a @@ -1472,31 +1451,31 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * and in the case of requests for us we add the requester to the arp * cache. 
*/ - if (arp->ar_op == htons(ARPOP_REQUEST)) { - struct arp_table *entry; - - for (entry = arp_proxy_list; entry; entry = entry->u.next) { - if (!((entry->ip^tip)&entry->mask) && - ((!entry->u.neigh.dev && - (!(entry->flags & ATF_COM) || entry->hatype == dev->type)) - || entry->u.neigh.dev == dev) ) - break; - } - - if (entry && !(entry->flags & ATF_DONTPUB)) { - char *ha = (entry->flags & ATF_COM) ? entry->u.neigh.ha : dev->dev_addr; - - if (rt->rt_flags&(RTF_LOCAL|RTF_NAT) || - (!(rt->rt_flags&RTCF_DOREDIRECT) && - rt->u.dst.dev != dev)) - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,ha,sha); - } + int addr_type; + struct in_device *in_dev = dev->ip_ptr; + + if (ip_route_input(skb, tip, sip, 0, dev)) + goto out; + rt = (struct rtable*)skb->dst; + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL || (rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + ((in_dev && IN_DEV_PROXY_ARP(in_dev) && IN_DEV_FORWARD(in_dev)) || + arp_check_published(tip, dev)))) + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + if (arp->ar_op != htons(ARPOP_REPLY) || + inet_addr_type(sip) != RTN_UNICAST) + goto out; } start_bh_atomic(); - arp_update(sip, sha, dev, 0, !RT_LOCALADDR(rt->rt_flags) && dev->type != ARPHRD_METRICOM); + arp_update(sip, sha, dev, 0, arp->ar_op == htons(ARPOP_REPLY)); end_bh_atomic(); + +out: kfree_skb(skb, FREE_READ); return 0; } @@ -1554,13 +1533,13 @@ int arp_req_set(struct arpreq *r, struct device * dev) if ((r->arp_flags & ATF_PERM) && !(r->arp_flags & ATF_COM)) return -EINVAL; - err = ip_route_output(&rt, ip, 0, 1, dev); + err = ip_route_output(&rt, ip, 0, 1, dev ? 
dev->ifindex : 0); if (err) return err; if (!dev) dev = rt->u.dst.dev; - if (rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) { - if (rt->rt_flags&RTF_BROADCAST && + if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { + if (rt->rt_flags&RTCF_BROADCAST && dev->type == ARPHRD_METRICOM && r->arp_ha.sa_family == ARPHRD_METRICOM) { memcpy(dev->broadcast, r->arp_ha.sa_data, dev->addr_len); @@ -1578,7 +1557,7 @@ int arp_req_set(struct arpreq *r, struct device * dev) if (dev && r->arp_ha.sa_family != dev->type) return -EINVAL; - + start_bh_atomic(); if (!(r->arp_flags & ATF_PUBL)) @@ -1991,7 +1970,7 @@ __initfunc(void arp_init (void)) #endif #ifdef CONFIG_ARPD - netlink_attach(NETLINK_ARPD, arpd_callback); + arpd_sk = netlink_kernel_create(NETLINK_ARPD, arpd_callback); #endif } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c12417c5280a..269361e355e5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,6 +1,8 @@ /* * NET3 IP device support routines. * + * Version: $Id: devinet.c,v 1.14 1997/10/10 22:40:44 davem Exp $ + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -13,9 +15,13 @@ * * Additional Authors: * Alan Cox, + * Alexey Kuznetsov, + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. */ -#include /* For CONFIG_IP_CLASSLESS */ +#include #include #include @@ -34,72 +40,336 @@ #include #include #include -#include -#include -#include -#include -#include #include -#include -#include +#include +#include #include -#include +#include +#include #ifdef CONFIG_KERNELD #include #endif -extern struct notifier_block *netdev_chain; +#include +#include +#include -/* - * Determine a default network mask, based on the IP address. 
+#ifdef CONFIG_RTNETLINK +static void rtmsg_ifa(int event, struct in_ifaddr *); +#else +#define rtmsg_ifa(a,b) do { } while(0) +#endif + +static struct notifier_block *inetaddr_chain; +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); + + +int inet_ifa_count; +int inet_dev_count; + +static struct in_ifaddr * inet_alloc_ifa(void) +{ + struct in_ifaddr *ifa; + + ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + inet_ifa_count++; + } + + return ifa; +} + +static __inline__ void inet_free_ifa(struct in_ifaddr *ifa) +{ + kfree_s(ifa, sizeof(*ifa)); + inet_ifa_count--; +} + +struct in_device *inetdev_init(struct device *dev) +{ + struct in_device *in_dev; + + in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + return NULL; + inet_dev_count++; + memset(in_dev, 0, sizeof(*in_dev)); + in_dev->dev = dev; + dev->ip_ptr = in_dev; + ip_mc_init_dev(in_dev); + return in_dev; +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + + in_dev->dev->ip_ptr = NULL; + kfree(in_dev); +} + +struct in_ifaddr * inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) +{ + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) + return ifa; + } + } endfor_ifa(in_dev); + return NULL; +} + +static void +inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) +{ + struct in_ifaddr *ifa1 = *ifap; + struct in_ifaddr *ifa; + + /* 1. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 2. 
Deleting primary ifaddr forces deletion all secondaries */ + + if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) { + while ((ifa=*ifap) != NULL) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap = &ifa->ifa_next; + continue; + } + *ifap = ifa->ifa_next; + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } + } + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. + */ + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + inet_free_ifa(ifa1); + if (in_dev->ifa_list == NULL) + inetdev_destroy(in_dev); + } +} + +static int +inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) +{ + struct in_ifaddr *ifa1, **ifap, **last_primary; + + if (ifa->ifa_local == 0) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap=&in_dev->ifa_list; (ifa1=*ifap)!=NULL; ifap=&ifa1->ifa_next) { + if (!(ifa1->ifa_flags&IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + ifap = last_primary; + + cli(); + ifa->ifa_next = *ifap; + *ifap = ifa; + sti(); + + /* Send message first, then call notifier. 
+ Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int +inet_set_ifa(struct device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) { + in_dev = inetdev_init(dev); + if (in_dev == NULL) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + } + ifa->ifa_dev = in_dev; + if (LOOPBACK(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(in_dev, ifa); +} + +struct in_device *inetdev_by_index(int ifindex) +{ + struct device *dev; + dev = dev_get_by_index(ifindex); + if (dev) + return dev->ip_ptr; + return NULL; +} + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask) +{ + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +#ifdef CONFIG_RTNETLINK + +/* rtm_{add|del} functions are not reenterable, so that + this structure can be made static */ -static unsigned long ip_get_mask(unsigned long addr) +int +inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - unsigned long dst; + struct kern_ifa *k_ifa = arg; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa, **ifap; - if (ZERONET(addr)) - return(0L); /* special case */ - - dst = ntohl(addr); - if (IN_CLASSA(dst)) - return(htonl(IN_CLASSA_NET)); - if (IN_CLASSB(dst)) - return(htonl(IN_CLASSB_NET)); - if (IN_CLASSC(dst)) - return(htonl(IN_CLASSC_NET)); - - /* - * Something else, probably a multicast. 
- */ - - return(0); + if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL) + return -EADDRNOTAVAIL; + + for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) { + if ((k_ifa->ifa_local && memcmp(k_ifa->ifa_local, &ifa->ifa_local, 4)) || + (k_ifa->ifa_label && strcmp(k_ifa->ifa_label, ifa->ifa_label)) || + (k_ifa->ifa_address && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(*(u32*)k_ifa->ifa_address, ifa)))) + continue; + inet_del_ifa(in_dev, ifap, 1); + return 0; + } + + return -EADDRNOTAVAIL; } +int +inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct kern_ifa *k_ifa = arg; + struct device *dev; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa; -/* - * This checks bitmasks for the ioctl calls for devices. + if (ifm->ifa_prefixlen > 32 || k_ifa->ifa_local == NULL) + return -EINVAL; + + if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL) + return -ENODEV; + + if ((in_dev = dev->ip_ptr) == NULL) { + in_dev = inetdev_init(dev); + if (!in_dev) + return -ENOBUFS; + } + + if ((ifa = inet_alloc_ifa()) == NULL) + return -ENOBUFS; + + if (k_ifa->ifa_address == NULL) + k_ifa->ifa_address = k_ifa->ifa_local; + memcpy(&ifa->ifa_local, k_ifa->ifa_local, 4); + memcpy(&ifa->ifa_address, k_ifa->ifa_address, 4); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + if (k_ifa->ifa_broadcast) + memcpy(&ifa->ifa_broadcast, k_ifa->ifa_broadcast, 4); + if (k_ifa->ifa_anycast) + memcpy(&ifa->ifa_anycast, k_ifa->ifa_anycast, 4); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = in_dev; + if (k_ifa->ifa_label) + memcpy(ifa->ifa_label, k_ifa->ifa_label, IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + return inet_insert_ifa(in_dev, ifa); +} + +#endif + +/* + * Determine a default network mask, based on the IP address. 
*/ - -static inline int bad_mask(__u32 mask, __u32 addr) + +static __inline__ int inet_abc_len(u32 addr) { - if (addr & (mask = ~mask)) - return 1; - mask = ntohl(mask); - if (mask & (mask+1)) - return 1; - return 0; + if (ZERONET(addr)) + return 0; + + addr = ntohl(addr); + if (IN_CLASSA(addr)) + return 8; + if (IN_CLASSB(addr)) + return 16; + if (IN_CLASSC(addr)) + return 24; + + /* + * Something else, probably a multicast. + */ + + return -1; } - + int devinet_ioctl(unsigned int cmd, void *arg) { struct ifreq ifr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; struct device *dev; - __u32 addr; -#ifdef CONFIG_NET_ALIAS - int err; +#ifdef CONFIG_IP_ALIAS + char *colon; #endif + int exclusive = 0; + int ret = 0; /* * Fetch the caller's info block into kernel space @@ -107,191 +377,483 @@ int devinet_ioctl(unsigned int cmd, void *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_IP_ALIAS + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; +#endif - /* - * See which interface the caller is talking about. - */ - - /* - * - * net_alias_dev_get(): dev_get() with added alias naming magic. 
- * only allow alias creation/deletion if (getset==SIOCSIFADDR) - * - */ - #ifdef CONFIG_KERNELD dev_load(ifr.ifr_name); -#endif +#endif -#ifdef CONFIG_NET_ALIAS - if ((dev = net_alias_dev_get(ifr.ifr_name, cmd == SIOCSIFADDR, &err, NULL, NULL)) == NULL) - return(err); -#else - if ((dev = dev_get(ifr.ifr_name)) == NULL) - return(-ENODEV); + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + case SIOCGIFPFLAGS: /* Get per device sysctl controls */ + /* Note that this ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + if (!suser()) + return -EACCES; + rtnl_lock(); + exclusive = 1; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + case SIOCSIFPFLAGS: /* Set per device sysctl controls */ + if (!suser()) + return -EACCES; + if (sin->sin_family != AF_INET) + return -EINVAL; + rtnl_lock(); + exclusive = 1; + break; + default: + return -EINVAL; + } + + + if ((dev = dev_get(ifr.ifr_name)) == NULL) { + ret = -ENODEV; + goto done; + } + +#ifdef CONFIG_IP_ALIAS + if (colon) + *colon = ':'; #endif - if (cmd != SIOCSIFADDR && dev->family != AF_INET) - return(-EINVAL); + if ((in_dev=dev->ip_ptr) != NULL) { + for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next) + if (strcmp(ifr.ifr_name, ifa->ifa_label) == 0) + break; + } - switch(cmd) - { - case SIOCGIFADDR: /* Get interface address (and family) */ - if (ifr.ifr_addr.sa_family == AF_UNSPEC) - { - memcpy(ifr.ifr_hwaddr.sa_data, dev->dev_addr, MAX_ADDR_LEN); - 
ifr.ifr_hwaddr.sa_family = dev->type; - } - else - { - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_port = 0; - } - break; - - case SIOCSIFADDR: /* Set interface address (and family) */ - - if (!suser()) - return -EPERM; + if (ifa == NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) { + ret = -EADDRNOTAVAIL; + goto done; + } - /* - * BSDism. SIOCSIFADDR family=AF_UNSPEC sets the - * physical address. We can cope with this now. - */ - - if(ifr.ifr_addr.sa_family==AF_UNSPEC) - { - int ret; - if(dev->set_mac_address==NULL) - return -EOPNOTSUPP; - ret = dev->set_mac_address(dev,&ifr.ifr_addr); - if (!ret) - notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - return ret; - } - if(ifr.ifr_addr.sa_family!=AF_INET) - return -EINVAL; + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; - addr = (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr; + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; - dev_lock_wait(); - dev_lock_list(); + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; - if (dev->family == AF_INET && addr == dev->pa_addr) { - dev_unlock_list(); - return 0; - } + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; - if (dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + case SIOCGIFPFLAGS: + ifr.ifr_flags = in_dev->flags; + goto rarok; - /* - * if dev is an alias, must rehash to update - * address change - */ + case SIOCSIFFLAGS: +#ifdef CONFIG_IP_ALIAS + if (colon) { + if (ifa == NULL) { + ret = -EADDRNOTAVAIL; + break; + } + if (!(ifr.ifr_flags&IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } +#endif + ret = 
dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFPFLAGS: + in_dev->flags = ifr.ifr_flags; + break; -#ifdef CONFIG_NET_ALIAS - if (net_alias_is(dev)) - net_alias_dev_rehash(dev, &ifr.ifr_addr); + case SIOCSIFADDR: /* Set interface address (and family) */ + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + + if (!ifa) { + if ((ifa = inet_alloc_ifa()) == NULL) { + ret = -ENOBUFS; + break; + } +#ifdef CONFIG_IP_ALIAS + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else #endif - dev->pa_addr = addr; - dev->ip_flags |= IFF_IP_ADDR_OK; - dev->ip_flags &= ~(IFF_IP_BRD_OK|IFF_IP_MASK_OK); - dev->family = AF_INET; - if (dev->flags & IFF_POINTOPOINT) { - dev->pa_mask = 0xFFFFFFFF; - dev->pa_brdaddr = 0xFFFFFFFF; + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { - dev->pa_mask = ip_get_mask(dev->pa_addr); - dev->pa_brdaddr = dev->pa_addr|~dev->pa_mask; + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_anycast = 0; + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); } - if (dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - dev_unlock_list(); - return 0; - - case SIOCGIFBRDADDR: /* Get the broadcast address */ - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_addr.s_addr = dev->pa_brdaddr; - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_port = 0; + + ifa->ifa_address = + ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags&IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask; + } + ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ - if (!suser()) - return -EPERM; - - addr = 
(*(struct sockaddr_in *)&ifr.ifr_broadaddr).sin_addr.s_addr; - - if (dev->flags & IFF_UP) - ip_rt_change_broadcast(dev, addr); - dev->pa_brdaddr = addr; - dev->ip_flags |= IFF_IP_BRD_OK; - return 0; - - case SIOCGIFDSTADDR: /* Get the destination address (for point-to-point links) */ - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_addr.s_addr = dev->pa_dstaddr; - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_port = 0; + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } break; - case SIOCSIFDSTADDR: /* Set the destination address (for point-to-point links) */ - if (!suser()) - return -EPERM; - addr = (*(struct sockaddr_in *)&ifr.ifr_dstaddr).sin_addr.s_addr; - if (addr == dev->pa_dstaddr) - return 0; - if (dev->flags & IFF_UP) - ip_rt_change_dstaddr(dev, addr); - dev->pa_dstaddr = addr; - return 0; - - case SIOCGIFNETMASK: /* Get the netmask for the interface */ - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_addr.s_addr = dev->pa_mask; - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_port = 0; + case SIOCSIFDSTADDR: /* Set the destination address */ + if (ifa->ifa_address != sin->sin_addr.s_addr) { + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ - if (!suser()) - return -EPERM; - addr = (*(struct sockaddr_in *)&ifr.ifr_netmask).sin_addr.s_addr; - - if (addr == dev->pa_mask) { - dev->ip_flags |= IFF_IP_MASK_OK; - return 0; - } /* * The mask we set must be legal. 
*/ - if (bad_mask(addr, 0)) - return -EINVAL; - if (addr == htonl(0xFFFFFFFE)) - return -EINVAL; - if (dev->flags & IFF_UP) - ip_rt_change_netmask(dev, addr); - dev->pa_mask = addr; - dev->ip_flags |= IFF_IP_MASK_OK; - dev->ip_flags &= ~IFF_IP_BRD_OK; - return 0; - default: - return -EINVAL; - + if (bad_mask(sin->sin_addr.s_addr, 0)) { + ret = -EINVAL; + break; + } + + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + inet_set_ifa(dev, ifa); + } + break; } +done: + if (exclusive) + rtnl_unlock(); + return ret; + +rarok: if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; return 0; } + +static int +inet_gifconf(struct device *dev, char *buf, int len) +{ + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + struct ifreq ifr; + int done=0; + + if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL) + return 0; + + for ( ; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < sizeof(ifr)) + return done; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) + return -EFAULT; + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } + return done; +} + +u32 inet_select_addr(struct device *dev, u32 dst, int scope) +{ + u32 addr = 0; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return 0; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + addr = ifa->ifa_local; + if (!dst || inet_ifa_match(dst, ifa)) + return addr; + } endfor_ifa(in_dev); + + return addr; +} + +/* + * Device notifier + */ + +int 
register_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inetaddr_chain, nb); +} + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inetaddr_chain,nb); +} + +static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + if (in_dev) + printk(KERN_DEBUG "inetdev_event: bug\n"); + dev->ip_ptr = NULL; + break; + case NETDEV_UP: + if (dev == &loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + inet_insert_ifa(in_dev, ifa); + } + } + ip_mc_up(in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block ip_netdev_notifier={ + inetdev_event, + NULL, + 0 +}; + +#ifdef CONFIG_RTNETLINK + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + pid_t pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_prefixlen) + RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); + if (ifa->ifa_broadcast) + RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast); + if (ifa->ifa_anycast) + RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast); + if (ifa->ifa_label[0]) + 
RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_put(skb, b - skb->tail); + return -1; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + struct device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + if ((in_dev = dev->ip_ptr) == NULL) + continue; + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) + goto done; + } + } +done: + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr * ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + return; + } + if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb, 0); + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); +} + + +static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { inet_rtm_newaddr, NULL, }, + { inet_rtm_deladdr, NULL, }, + { NULL, inet_dump_ifaddr, }, + { NULL, NULL, }, + + { inet_rtm_newroute, NULL, }, + { inet_rtm_delroute, NULL, }, + { inet_rtm_getroute, inet_dump_fib, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + +#ifdef CONFIG_IP_MULTIPLE_TABLES + 
{ inet_rtm_newrule, NULL, }, + { inet_rtm_delrule, NULL, }, + { NULL, inet_dump_rules, }, + { NULL, NULL, }, +#else + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +#endif +}; + +#endif /* CONFIG_RTNETLINK */ + +#ifdef CONFIG_IP_PNP_BOOTP + +/* + * Addition and deletion of fake interface addresses + * for sending of BOOTP packets. In this case, we must + * set the local address to zero which is not permitted + * otherwise. + */ + +__initfunc(int inet_add_bootp_addr(struct device *dev)) +{ + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + + if (!in_dev && !(in_dev = inetdev_init(dev))) + return -ENOBUFS; + if (!(ifa = inet_alloc_ifa())) + return -ENOBUFS; + ifa->ifa_dev = in_dev; + in_dev->ifa_list = ifa; + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + return 0; +} + +__initfunc(void inet_del_bootp_addr(struct device *dev)) +{ + if (dev->ip_ptr) + inetdev_destroy(dev->ip_ptr); +} + +#endif + +__initfunc(void devinet_init(void)) +{ + register_gifconf(AF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); +#ifdef CONFIG_RTNETLINK + rtnetlink_links[AF_INET] = inet_rtnetlink_table; +#endif +} diff --git a/net/ipv4/fib.c b/net/ipv4/fib.c deleted file mode 100644 index f444718a7cb3..000000000000 --- a/net/ipv4/fib.c +++ /dev/null @@ -1,2077 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 Forwarding Information Base. - * - * Authors: Alexey Kuznetsov, - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * - * NOTE: This file is scheduled to be removed from kernel. 
- * The natural place for router FIB is user level - * routing daemon (it has to keep its copy in any case) - * - * Kernel should keep only interface routes and, - * if host is not router, default gateway. - * - * We have good proof that it is feasible and efficient - - * multicast routing. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct fib_class local_class = {RT_CLASS_LOCAL, }; -static struct fib_class default_class = {RT_CLASS_DEFAULT, }; -static struct fib_class main_class = {RT_CLASS_MAIN, }; -static struct fib_class *fib_classes[RT_CLASS_MAX+1]; - -static struct fib_rule *fib_rules; - -static struct fib_info *fib_info_list; - -static int fib_stamp; - -static int rtmsg_process(struct nlmsghdr *n, struct in_rtmsg *r); - - -#ifdef CONFIG_RTNETLINK - -static unsigned rt_nl_flags; -static int rt_nl_owner = -1; - -/* - * Default mode is delayed for 0.5sec batch delivery. - * If someone starts to use user->level calls, - * we turn on synchronous message passing. 
- */ - -#define RTMSG_DELAY (HZ/2) - -static struct nlmsg_ctl rtmsg_ctl = { - { NULL, NULL, 0, 0L, NULL }, - NULL, - NETLINK_ROUTE, - RTMSG_DELAY, - NLMSG_GOODSIZE, - 0, 0, 0, 0 -}; - -static void __rtmsg_ack(struct nlmsghdr *n, int err); - -static __inline__ void rtmsg_ack(struct nlmsghdr *n, int err) -{ - if (n->nlmsg_seq && rt_nl_flags&RTCTL_ACK) - __rtmsg_ack(n, err); -} - -static void rtmsg_fib(unsigned long type, struct fib_node *f, int logmask, - struct fib_class *class, struct nlmsghdr *n); -static void rtmsg_dev(unsigned long type, struct device *dev, struct nlmsghdr *n); -#define rtmsg_kick() ({ if (rtmsg_ctl.nlmsg_skb) nlmsg_transmit(&rtmsg_ctl); }) - -#else -#define rtmsg_fib(a,b,c,d,e) -#define rtmsg_dev(a,b,c) -#define rtmsg_ack(a,b) -#define rtmsg_kick() -#endif - - -/* - * FIB locking. - */ - -static struct wait_queue *fib_wait; -static atomic_t fib_users = ATOMIC_INIT(0); - -static void fib_lock(void) -{ - while (atomic_read(&fib_users)) - sleep_on(&fib_wait); - atomic_inc(&fib_users); - dev_lock_list(); -} - -static void fib_unlock(void) -{ - dev_unlock_list(); - if (atomic_dec_and_test(&fib_users)) { - rtmsg_kick(); - wake_up(&fib_wait); - } -} - -/* - * Check if a mask is acceptable. - */ - -static __inline__ int bad_mask(u32 mask, u32 addr) -{ - if (addr & (mask = ~mask)) - return 1; - mask = ntohl(mask); - if (mask & (mask+1)) - return 1; - return 0; -} - -/* - * Evaluate mask length. - */ - -static __inline__ int fib_logmask(u32 mask) -{ - if (!(mask = ntohl(mask))) - return 32; - return ffz(~mask); -} - -/* - * Create mask from mask length. 
- */ - -static __inline__ u32 fib_mask(int logmask) -{ - if (logmask >= 32) - return 0; - return htonl(~((1<cl_id = id; - fib_classes[id] = class; - return class; -} - -static struct fib_class *fib_empty_class(void) -{ - int id; - for (id = 1; id <= RT_CLASS_MAX; id++) - if (fib_classes[id] == NULL) - return fib_alloc_class(id); - return NULL; -} - -static int fib_rule_delete(struct in_rtrulemsg *r, struct device *dev, struct nlmsghdr *n) -{ - u32 src = r->rtrmsg_src.s_addr; - u32 dst = r->rtrmsg_dst.s_addr; - u32 srcmask = fib_netmask(r->rtrmsg_srclen); - u32 dstmask = fib_netmask(r->rtrmsg_dstlen); - struct fib_rule *cl, **clp; - - for (clp=&fib_rules; (cl=*clp) != NULL; clp=&cl->cl_next) { - if (src == cl->cl_src && - srcmask == cl->cl_srcmask && - dst == cl->cl_dst && - dstmask == cl->cl_dstmask && - r->rtrmsg_tos == cl->cl_tos && - dev == cl->cl_dev && - r->rtrmsg_action == cl->cl_action && - (!r->rtrmsg_preference || r->rtrmsg_preference == cl->cl_preference) && - (!r->rtrmsg_class || (cl && r->rtrmsg_class == cl->cl_class->cl_id))) { - cli(); - *clp = cl->cl_next; - sti(); - if (cl->cl_class) - cl->cl_class->cl_users--; - kfree(cl); - return 0; - } - } - return -ESRCH; -} - -static int fib_rule_add(struct in_rtrulemsg *r, struct device *dev, struct nlmsghdr *n) -{ - u32 src = r->rtrmsg_src.s_addr; - u32 dst = r->rtrmsg_dst.s_addr; - u32 srcmask = fib_netmask(r->rtrmsg_srclen); - u32 dstmask = fib_netmask(r->rtrmsg_dstlen); - - struct fib_rule *cl, *new_cl, **clp; - struct fib_class *class = NULL; - - if ((src&~srcmask) || (dst&~dstmask)) - return -EINVAL; - if (dev && net_alias_main_dev(dev) != dev) - return -ENODEV; - - if (!r->rtrmsg_class) { - if (r->rtrmsg_action==RTP_GO || r->rtrmsg_action==RTP_NAT - || r->rtrmsg_action==RTP_MASQUERADE) { - if ((class = fib_empty_class()) == NULL) - return -ENOMEM; - class->cl_auto = 1; - } else if (r->rtrmsg_rtmsgs) - return -EINVAL; - } else if ((class = fib_alloc_class(r->rtrmsg_class)) == NULL) - return -ENOMEM; - - 
new_cl = kmalloc(sizeof(*new_cl), GFP_KERNEL); - if (!new_cl) - return -ENOMEM; - new_cl->cl_src = src; - new_cl->cl_srcmask = srcmask; - new_cl->cl_dst = dst; - new_cl->cl_dstmask = dstmask; - new_cl->cl_dev = dev; - new_cl->cl_srcmap = r->rtrmsg_srcmap.s_addr; - new_cl->cl_tos = r->rtrmsg_tos; - new_cl->cl_action = r->rtrmsg_action; - new_cl->cl_flags = r->rtrmsg_flags; - new_cl->cl_preference = r->rtrmsg_preference; - new_cl->cl_class = class; - if (class) - class->cl_users++; - - clp = &fib_rules; - - if (!new_cl->cl_preference) { - cl = fib_rules; - if (cl && (cl = cl->cl_next) != NULL) { - clp = &fib_rules->cl_next; - if (cl->cl_preference) - new_cl->cl_preference = cl->cl_preference - 1; - } - } - - while ( (cl = *clp) != NULL ) { - if (cl->cl_preference >= new_cl->cl_preference) - break; - clp = &cl->cl_next; - } - - new_cl->cl_next = cl; - cli(); - *clp = new_cl; - sti(); - - if (r->rtrmsg_rtmsgs) { - n->nlmsg_type = RTMSG_NEWROUTE; - r->rtrmsg_rtmsg->rtmsg_class = class->cl_id; - return rtmsg_process(n, r->rtrmsg_rtmsg); - } - return 0; -} - - -#define FZ_MAX_DIVISOR 1024 - -static __inline__ u32 fib_hash(u32 key, u32 mask) -{ - u32 h; - h = key^(key>>20); - h = h^(h>>10); - h = h^(h>>5); - return h & mask; -} - -static __inline__ struct fib_node ** fz_hash_p(u32 key, struct fib_zone *fz) -{ - return &fz->fz_hash[fib_hash(key, fz->fz_hashmask)]; -} - -static __inline__ struct fib_node * fz_hash(u32 key, struct fib_zone *fz) -{ - return fz->fz_hash[fib_hash(key, fz->fz_hashmask)]; -} - -/* - * Free FIB node. - */ - -static void fib_free_node(struct fib_node * f) -{ - struct fib_info * fi = f->fib_info; - if (fi && !--fi->fib_refcnt) { -#if RT_CACHE_DEBUG >= 2 - printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev ? 
fi->fib_dev->name : "null"); -#endif - if (fi->fib_next) - fi->fib_next->fib_prev = fi->fib_prev; - if (fi->fib_prev) - fi->fib_prev->fib_next = fi->fib_next; - if (fi == fib_info_list) - fib_info_list = fi->fib_next; - } - kfree_s(f, sizeof(struct fib_node)); -} - -static __inline__ int fib_flags_trans(unsigned flags) -{ - if (flags & RTF_BROADCAST) - return IS_BROADCAST; - if (flags & RTF_MULTICAST) - return IS_MULTICAST; - if (flags & RTF_LOCAL) - return IS_MYADDR; - return 0; -} - -unsigned ip_fib_chk_addr(u32 addr) -{ - struct fib_zone * fz; - struct fib_node * f; - - /* - * Accept both `all ones' and `all zeros' as BROADCAST. - * (Support old BSD in other words). This old BSD - * support will go very soon as it messes other things - * up. - */ - - if (addr == INADDR_ANY || addr == 0xFFFFFFFF) - return RTF_LOCAL|RTF_BROADCAST; - - if ((addr & htonl(0x7F000000L)) == htonl(0x7F000000L)) - return RTF_LOCAL|RTF_INTERFACE; - - if (MULTICAST(addr)) - return RTF_MULTICAST; - - addr = ntohl(addr); - for (fz = local_class.fib_zone_list; fz; fz = fz->fz_next) { - u32 key = (addr&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key != f->fib_key || (f->fib_flag & FIBFLG_DOWN)) - continue; - if (!f->fib_info) - return 0; - return f->fib_info->fib_flags&RTF_ADDRCLASSMASK; - } - } - - return 0; -} - -int __ip_chk_addr(unsigned long addr) -{ - return fib_flags_trans(ip_fib_chk_addr(addr)); -} - -/* - * Find the first device with a given source address. 
- */ - -struct device *ip_dev_find(unsigned long addr, char *name) -{ - struct fib_zone * fz = local_class.fib_zones[0]; - u32 key; - struct fib_node * f; - - key = (ntohl(addr)&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key == f->fib_key && - !(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - f->fib_info->fib_flags == (RTF_IFLOCAL&~RTF_UP)) { - if (!name || strcmp(name, f->fib_info->fib_dev->name) == 0) - return f->fib_info->fib_dev; - } - } - - return NULL; -} - -/* - * Find tunnel with a given source and destination. - */ - -struct device *ip_dev_find_tunnel(u32 daddr, u32 saddr) -{ - struct fib_zone * fz = local_class.fib_zones[0]; - u32 key; - struct fib_node * f; - - key = (ntohl(daddr)&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key == f->fib_key && - !(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - f->fib_info->fib_flags == (RTF_IFLOCAL&~RTF_UP)) { - struct device *dev = f->fib_info->fib_dev; - if (dev->type == ARPHRD_TUNNEL && - dev->pa_dstaddr == saddr) - return dev; - } - if (!f->fib_info) - return NULL; - } - - return NULL; -} - - -int ip_fib_chk_default_gw(u32 addr, struct device *dev) -{ - struct fib_rule *cl; - struct fib_node * f; - - for (cl = fib_rules; cl; cl = cl->cl_next) { - if (cl->cl_srcmask || cl->cl_dstmask || cl->cl_tos || - cl->cl_dev || cl->cl_action != RTP_GO || !cl->cl_class || - !cl->cl_class->fib_zones[32]) - continue; - for (f = cl->cl_class->fib_zones[32]->fz_hash[0]; f; f = f->fib_next) { - struct fib_info *fi = f->fib_info; - if (!(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - fi->fib_gateway == addr && - fi->fib_dev == dev && - fi->fib_flags&RTF_GATEWAY) - return 0; - } - } - return -1; -} - - -/* - * Main lookup routine. 
- */ - - -int -fib_lookup(struct fib_result *res, u32 daddr, u32 src, u8 tos, - struct device *devin, struct device *devout) -{ - struct fib_node * f; - struct fib_rule * cl; - u32 dst; - int local = tos & 1; - - tos &= IPTOS_TOS_MASK; - dst = ntohl(daddr); - - for (cl = fib_rules; cl; cl=cl->cl_next) { - struct fib_zone * fz; - - if (((src^cl->cl_src) & cl->cl_srcmask) || - ((daddr^cl->cl_dst) & cl->cl_dstmask) || - (cl->cl_tos && cl->cl_tos != tos) || - (cl->cl_dev && cl->cl_dev != devin)) - continue; - - switch (cl->cl_action) { - case RTP_GO: - case RTP_NAT: - case RTP_MASQUERADE: - default: - break; - case RTP_UNREACHABLE: - return -ENETUNREACH; - case RTP_DROP: - return -EINVAL; - case RTP_PROHIBIT: - return -EACCES; - } - - for (fz = cl->cl_class->fib_zone_list; fz; fz = fz->fz_next) { - u32 key = (dst&fz->fz_mask)>>fz->fz_logmask; - - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key != f->fib_key || - (f->fib_flag & FIBFLG_DOWN) || - (f->fib_tos && f->fib_tos != tos)) - continue; - if (f->fib_flag & FIBFLG_THROW) - goto next_class; - if (f->fib_flag & FIBFLG_REJECT) - return -ENETUNREACH; - if (devout && f->fib_info->fib_dev != devout) - continue; - if (!local || !(f->fib_info->fib_flags&RTF_GATEWAY)) { - res->f = f; - res->fr = cl; - res->fm = fz->fz_logmask; - return 0; - } - } - } -next_class: - } - return -ENETUNREACH; -} - -static int fib_autopublish(int op, struct fib_node *f, int logmask) -{ - struct fib_zone *fz; - struct fib_node *f1; - struct arpreq r; - u32 addr = htonl(f->fib_key<fib_flag || LOOPBACK(addr) || - (!RT_LOCALADDR(f->fib_info->fib_flags) && - !(f->fib_info->fib_flags&RTF_NAT))) - return 0; - - memset(&r, 0, sizeof(struct arpreq)); - r.arp_flags = ATF_PUBL|ATF_PERM|ATF_MAGIC; - if (logmask) - r.arp_flags |= ATF_NETMASK; - ((struct sockaddr_in*)&r.arp_pa)->sin_family = AF_INET; - ((struct sockaddr_in*)&r.arp_pa)->sin_addr.s_addr = addr; - ((struct sockaddr_in*)&r.arp_netmask)->sin_family = AF_INET; - ((struct 
sockaddr_in*)&r.arp_netmask)->sin_addr.s_addr = fib_mask(logmask); - - if (op) - return arp_req_set(&r, NULL); - - fz = local_class.fib_zones[logmask]; - - for (f1 = fz_hash(f->fib_key, fz); f1; f1=f1->fib_next) { - if (f->fib_key != f1->fib_key || f1->fib_flag || - (!RT_LOCALADDR(f1->fib_info->fib_flags) && - !(f1->fib_info->fib_flags&RTF_NAT))) - continue; - return 0; - } - - return arp_req_delete(&r, NULL); -} - -#define FIB_SCAN(f, fp) \ -for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fib_next) - -#define FIB_SCAN_KEY(f, fp, key) \ -for ( ; ((f) = *(fp)) != NULL && (f)->fib_key == (key); (fp) = &(f)->fib_next) - -#define FIB_CONTINUE(f, fp) \ -{ \ - fp = &f->fib_next; \ - continue; \ -} - -static int fib_delete(struct in_rtmsg * r, struct device *dev, - struct fib_class *class, struct nlmsghdr *n) -{ - struct fib_node **fp, *f; - struct fib_zone *fz = class->fib_zones[32-r->rtmsg_prefixlen]; - int logmask = 32 - r->rtmsg_prefixlen; - u32 dst = ntohl(r->rtmsg_prefix.s_addr); - u32 gw = r->rtmsg_gateway.s_addr; - short metric = r->rtmsg_metric; - u8 tos = r->rtmsg_tos; - u8 fibflg = 0; - int found=0; - unsigned flags; - u32 key; - - flags = r->rtmsg_flags; - if (flags & RTF_REJECT) - fibflg |= FIBFLG_REJECT; - else if (flags & RTF_THROW) - fibflg |= FIBFLG_THROW; - flags &= ~(RTF_UP|RTF_REJECT|RTF_THROW); - - if (fz != NULL) { - key = (dst&fz->fz_mask)>>logmask; - fp = fz_hash_p(key, fz); - - FIB_SCAN(f, fp) { - if (f->fib_key == key) - break; - } - FIB_SCAN_KEY(f, fp, key) { - if (f->fib_tos == tos) - break; - } - - while ((f = *fp) != NULL && f->fib_key == key && f->fib_tos == tos) { - struct fib_info * fi = f->fib_info; - - /* - * If metric was not specified (<0), match all metrics. 
- */ - if (metric >= 0 && f->fib_metric != metric) - FIB_CONTINUE(f, fp); - - if (flags & RTF_MAGIC) { - /* "Magic" deletions require exact match */ - if (!fi || (fi->fib_flags^flags) || - fi->fib_dev != dev || - fi->fib_gateway != gw) - FIB_CONTINUE(f, fp); - } else { - /* - * Device, gateway, reject and throw are - * also checked if specified. - */ - if ((dev && fi && fi->fib_dev != dev) || - (gw && fi && fi->fib_gateway != gw) || - (fibflg && (f->fib_flag^fibflg)&~FIBFLG_DOWN)) - FIB_CONTINUE(f, fp); - } - cli(); - /* It's interesting, can this operation be not atomic? */ - *fp = f->fib_next; - sti(); - if (class == &local_class) - fib_autopublish(0, f, logmask); - rtmsg_fib(RTMSG_DELROUTE, f, logmask, class, n); - fib_free_node(f); - found++; - } - fz->fz_nent -= found; - } - - if (found) { - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - rtmsg_ack(n, ESRCH); - return -ESRCH; -} - -static struct fib_info * fib_create_info(struct device * dev, struct in_rtmsg *r) -{ - struct fib_info * fi; - unsigned flags = r->rtmsg_flags; - u32 gw = r->rtmsg_gateway.s_addr; - unsigned short mtu; - unsigned short irtt; - unsigned long window; - - mtu = dev ? dev->mtu : 0; - if (flags&RTF_MSS && r->rtmsg_mtu < mtu && r->rtmsg_mtu >= 68) - mtu = r->rtmsg_mtu; - window = (flags & RTF_WINDOW) ? r->rtmsg_window : 0; - irtt = (flags & RTF_IRTT) ? r->rtmsg_rtt : TCP_TIMEOUT_INIT; - - flags &= RTF_FIB; - - for (fi=fib_info_list; fi; fi = fi->fib_next) { - if (fi->fib_gateway != gw || - fi->fib_dev != dev || - fi->fib_flags != flags || - fi->fib_mtu != mtu || - fi->fib_window != window || - fi->fib_irtt != irtt) - continue; - fi->fib_refcnt++; -#if RT_CACHE_DEBUG >= 2 - printk("fib_create_info: fi %08x/%s/%04x is duplicate\n", fi->fib_gateway, fi->fib_dev ? 
fi->fib_dev->name : "null", fi->fib_flags); -#endif - return fi; - } - fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL); - if (!fi) - return NULL; - memset(fi, 0, sizeof(struct fib_info)); - fi->fib_flags = flags; - fi->fib_dev = dev; - fi->fib_gateway = gw; - fi->fib_mtu = mtu; - fi->fib_window = window; - fi->fib_refcnt++; - fi->fib_next = fib_info_list; - fi->fib_prev = NULL; - fi->fib_irtt = irtt; - if (fib_info_list) - fib_info_list->fib_prev = fi; - fib_info_list = fi; -#if RT_CACHE_DEBUG >= 2 - printk("fib_create_info: fi %08x/%s/%04x is created\n", fi->fib_gateway, fi->fib_dev ? fi->fib_dev->name : "null", fi->fib_flags); -#endif - return fi; -} - -static __inline__ void fib_rebuild_zone(struct fib_zone *fz, - struct fib_node **old_ht, - int old_divisor) -{ - int i; - struct fib_node **ht = fz->fz_hash; - u32 hashmask = fz->fz_hashmask; - struct fib_node *f, **fp, *next; - unsigned hash; - - for (i=0; ifib_next; - f->fib_next = NULL; - hash = fib_hash(f->fib_key, hashmask); - for (fp = &ht[hash]; *fp; fp = &(*fp)->fib_next) - /* NONE */; - *fp = f; - } - } -} - -static void fib_rehash_zone(struct fib_zone *fz) -{ - struct fib_node **ht, **old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case 16: - new_divisor = 256; - new_hashmask = 0xFF; - break; - case 256: - new_divisor = 1024; - new_hashmask = 0x3FF; - break; - default: - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } -#if RT_CACHE_DEBUG >= 2 - printk("fib_rehash_zone: hash for zone %d grows from %d\n", fz->fz_logmask, old_divisor); -#endif - - ht = kmalloc(new_divisor*sizeof(struct rtable*), GFP_KERNEL); - - if (ht) { - memset(ht, 0, new_divisor*sizeof(struct fib_node*)); - start_bh_atomic(); - old_ht = fz->fz_hash; - fz->fz_hash = ht; - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - fib_rebuild_zone(fz, old_ht, old_divisor); - fib_stamp++; - end_bh_atomic(); - 
kfree(old_ht); - } -} - -static struct fib_zone * -fib_new_zone(struct fib_class *class, int logmask) -{ - int i; - struct fib_zone *fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL); - if (!fz) - return NULL; - - memset(fz, 0, sizeof(struct fib_zone)); - if (logmask < 32) { - fz->fz_divisor = 16; - fz->fz_hashmask = 0xF; - } else { - fz->fz_divisor = 1; - fz->fz_hashmask = 0; - } - fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL); - if (!fz->fz_hash) { - kfree(fz); - return NULL; - } - memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*)); - fz->fz_logmask = logmask; - fz->fz_mask = ntohl(fib_mask(logmask)); - for (i=logmask-1; i>=0; i--) - if (class->fib_zones[i]) - break; - start_bh_atomic(); - if (i<0) { - fz->fz_next = class->fib_zone_list; - class->fib_zone_list = fz; - } else { - fz->fz_next = class->fib_zones[i]->fz_next; - class->fib_zones[i]->fz_next = fz; - } - class->fib_zones[logmask] = fz; - fib_stamp++; - end_bh_atomic(); - return fz; -} - -static int fib_create(struct in_rtmsg *r, struct device *dev, - struct fib_class *class, struct nlmsghdr *n) -{ - struct fib_node *f, *f1, **fp; - struct fib_node **dup_fp = NULL; - struct fib_zone * fz; - struct fib_info * fi; - - long logmask = 32L - r->rtmsg_prefixlen; /* gcc bug work-around: must be "L" and "long" */ - u32 dst = ntohl(r->rtmsg_prefix.s_addr); - u32 gw = r->rtmsg_gateway.s_addr; - short metric = r->rtmsg_metric; - unsigned flags = r->rtmsg_flags; - u8 tos = r->rtmsg_tos; - u8 fibflg = 0; - u32 key; - - /* - * Allocate an entry and fill it in. 
- */ - - f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); - if (f == NULL) { - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - - memset(f, 0, sizeof(struct fib_node)); - - if (!(flags & RTF_UP)) - fibflg = FIBFLG_DOWN; - if (flags & RTF_REJECT) - fibflg |= FIBFLG_REJECT; - else if (flags & RTF_THROW) - fibflg |= FIBFLG_THROW; - - flags &= ~(RTF_UP|RTF_REJECT|RTF_THROW); - r->rtmsg_flags = flags; - - fi = NULL; - if (!(fibflg & (FIBFLG_REJECT|FIBFLG_THROW))) { - if ((fi = fib_create_info(dev, r)) == NULL) { - kfree_s(f, sizeof(struct fib_node)); - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - f->fib_info = fi; - flags = fi->fib_flags; - } - - f->fib_key = key = dst>>logmask; - f->fib_metric = metric; - f->fib_tos = tos; - f->fib_flag = fibflg; - fz = class->fib_zones[logmask]; - - if (!fz && !(fz = fib_new_zone(class, logmask))) { - fib_free_node(f); - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - - if (fz->fz_nent > (fz->fz_divisor<<2) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (!logmask || (1<<(32-logmask)) > fz->fz_divisor)) - fib_rehash_zone(fz); - - fp = fz_hash_p(key, fz); - - /* - * Scan list to find the first route with the same destination - */ - FIB_SCAN(f1, fp) { - if (f1->fib_key == key) - break; - } - - /* - * Find route with the same destination and tos. - */ - FIB_SCAN_KEY(f1, fp, dst) { - if (f1->fib_tos <= tos) - break; - } - - /* - * Find route with the same destination/tos and less (or equal) metric. - * "Magic" additions go to the end of list. - */ - for ( ; (f1 = *fp) != NULL && f1->fib_key == key && f1->fib_tos == tos; - fp = &f1->fib_next) { - if (f1->fib_metric >= metric && metric != MAGIC_METRIC) - break; - - /* - * Record route with the same destination/tos/gateway/dev, - * but less metric. 
- */ - if (!dup_fp) { - struct fib_info *fi1 = f1->fib_info; - - if ((fibflg^f1->fib_flag) & ~FIBFLG_DOWN) - continue; - if (fi == fi1 || - (fi && fi1 && - fi->fib_dev == fi1->fib_dev && - fi->fib_gateway == fi1->fib_gateway && - !(flags&RTF_MAGIC))) - dup_fp = fp; - } - } - - /* - * Is it already present? - */ - - if (f1 && f1->fib_key == key && f1->fib_tos == tos && - f1->fib_metric == metric && f1->fib_info == fi) { - fib_free_node(f); - - if (fibflg == f1->fib_flag) { - rtmsg_ack(n, EEXIST); - return -EEXIST; - } else { - fib_stamp++; - f1->fib_flag = fibflg; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - } - - /* - * Do not add "magic" route, if better one is already present. - */ - if ((flags & RTF_MAGIC) && dup_fp) { - fib_free_node(f); - rtmsg_ack(n, EEXIST); - return -EEXIST; - } - - /* - * Insert new entry to the list. - */ - - cli(); - f->fib_next = f1; - *fp = f; - sti(); - fz->fz_nent++; - if (class == &local_class && !dup_fp) - fib_autopublish(1, f, logmask); - rtmsg_fib(RTMSG_NEWROUTE, f, logmask, class, n); - - if (flags & RTF_MAGIC) { - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - - /* - * Clean routes with the same destination,tos,gateway and device, - * but different metric. - */ - fp = dup_fp ? 
: &f->fib_next; - - while ((f1 = *fp) != NULL && f1->fib_key == key && f1->fib_tos == tos) { - if (f1 == f || ((f1->fib_flag^fibflg)&~FIBFLG_DOWN)) - FIB_CONTINUE(f1, fp); - - if (f1->fib_info != fi && - (!fi || !f1->fib_info || - f1->fib_info->fib_gateway != gw || - f1->fib_info->fib_dev != dev)) - FIB_CONTINUE(f1, fp); - - cli(); - *fp = f1->fib_next; - sti(); - fz->fz_nent--; - rtmsg_fib(RTMSG_DELROUTE, f1, logmask, class, n); - fib_free_node(f1); - } - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; -} - -static int fib_flush_list(struct fib_node ** fp, struct device *dev, - int logmask, struct fib_class *class) -{ - int found = 0; - struct fib_node *f; - - while ((f = *fp) != NULL) { - if (!f->fib_info || f->fib_info->fib_dev != dev) - FIB_CONTINUE(f, fp); - cli(); - *fp = f->fib_next; - sti(); - if (class == &local_class) - fib_autopublish(0, f, logmask); -#ifdef CONFIG_RTNETLINK - if (rt_nl_flags&RTCTL_FLUSH) - rtmsg_fib(RTMSG_DELROUTE, f, logmask, class, 0); -#endif - fib_free_node(f); - found++; - } - return found; -} - -static void fib_flush(struct device *dev) -{ - struct fib_class *class; - struct fib_rule *cl, **clp; - struct fib_zone *fz; - int found = 0; - int i, tmp, cl_id; - - - for (cl_id = RT_CLASS_MAX; cl_id>=0; cl_id--) { - if ((class = fib_classes[cl_id])==NULL) - continue; - for (fz = class->fib_zone_list; fz; fz = fz->fz_next) { - tmp = 0; - for (i=fz->fz_divisor-1; i>=0; i--) - tmp += fib_flush_list(&fz->fz_hash[i], dev, - fz->fz_logmask, class); - fz->fz_nent -= tmp; - found += tmp; - } - } - - clp = &fib_rules; - while ( (cl=*clp) != NULL) { - if (cl->cl_dev != dev) { - clp = &cl->cl_next; - continue; - } - found++; - cli(); - *clp = cl->cl_next; - sti(); - kfree(cl); - } - - if (found) { - fib_stamp++; - rt_cache_flush(1); - } -} - -#ifdef CONFIG_PROC_FS - -static unsigned __inline__ fib_flag_trans(u8 fibflg) -{ - unsigned ret = RTF_UP; - if (!fibflg) - return ret; - if (fibflg & FIBFLG_DOWN) - ret &= ~RTF_UP; - if 
(fibflg & FIBFLG_REJECT) - ret |= RTF_REJECT; - if (fibflg & FIBFLG_THROW) - ret |= RTF_THROW; - return ret; -} - -/* - * Called from the PROCfs module. This outputs /proc/net/route. - * - * We preserve the old format but pad the buffers out. This means that - * we can spin over the other entries as we read them. Remember the - * gated BGP4 code could need to read 60,000+ routes on occasion (that's - * about 7Mb of data). To do that ok we will need to also cache the - * last route we got to (reads will generally be following on from - * one another without gaps). - */ - -static int fib_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - struct fib_class *class; - struct fib_zone *fz; - struct fib_node *f; - int len=0; - off_t pos=0; - char temp[129]; - int i; - int cl_id; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\tTOS\tClass"); - len = 128; - } - - fib_lock(); - - for (cl_id=RT_CLASS_MAX-1; cl_id >= 0; cl_id--) { - class = fib_classes[cl_id]; - if (!class) - continue; - for (fz=class->fib_zone_list; fz; fz = fz->fz_next) - { - int maxslot; - struct fib_node ** fp; - - if (fz->fz_nent == 0) - continue; - - if (pos + 128*fz->fz_nent <= offset) { - pos += 128*fz->fz_nent; - len = 0; - continue; - } - - maxslot = fz->fz_divisor; - fp = fz->fz_hash; - - for (i=0; i < maxslot; i++, fp++) { - - for (f = *fp; f; f = f->fib_next) - { - struct fib_info * fi; - unsigned flags; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len=0; - continue; - } - - fi = f->fib_info; - flags = fib_flag_trans(f->fib_flag); - - if (fi) - flags |= fi->fib_flags; - sprintf(temp, "%s\t%08lX\t%08X\t%04X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%02x\t%02x", - fi && fi->fib_dev ? fi->fib_dev->name : "*", htonl(f->fib_key<fz_logmask), fi ? fi->fib_gateway : 0, - flags, 0, 0, f->fib_metric, - htonl(fz->fz_mask), fi ? 
(int)fi->fib_mtu : 0, fi ? fi->fib_window : 0, fi ? (int)fi->fib_irtt : 0, f->fib_tos, class->cl_id); - sprintf(buffer+len,"%-127s\n",temp); - - len += 128; - if (pos >= offset+length) - goto done; - } - } - } - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos - offset; - if (len>length) - len = length; - return len; -} - -static int fib_local_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - struct fib_zone *fz; - struct fib_node *f; - int len=0; - off_t pos=0; - char temp[129]; - int i; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\tTOS\tClass"); - len = 128; - } - - fib_lock(); - - for (fz=local_class.fib_zone_list; fz; fz = fz->fz_next) - { - int maxslot; - struct fib_node ** fp; - - if (fz->fz_nent == 0) - continue; - - if (pos + 128*fz->fz_nent <= offset) - { - pos += 128*fz->fz_nent; - len = 0; - continue; - } - - maxslot = fz->fz_divisor; - fp = fz->fz_hash; - - for (i=0; i < maxslot; i++, fp++) - { - - for (f = *fp; f; f = f->fib_next) - { - unsigned flags; - struct fib_info * fi; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len=0; - continue; - } - - fi = f->fib_info; - flags = fib_flag_trans(f->fib_flag); - - if (fi) - flags |= fi->fib_flags; - sprintf(temp, "%s\t%08lX\t%08X\t%X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%02x\t%02x", - fi && fi->fib_dev ? fi->fib_dev->name : "*", - htonl(f->fib_key<fz_logmask), - fi ? fi->fib_gateway : 0, - flags, 0, 0, f->fib_metric, - htonl(fz->fz_mask), fi ? (int)fi->fib_mtu : 0, fi ? fi->fib_window : 0, fi ? 
(int)fi->fib_irtt : 0, f->fib_tos, RT_CLASS_LOCAL); - sprintf(buffer+len,"%-127s\n",temp); - - len += 128; - if (pos >= offset+length) - goto done; - } - } - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos - offset; - if (len>length) - len = length; - return len; -} - -static int fib_rules_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - int len=0; - off_t pos=0; - char temp[129]; - struct fib_rule *cl; - - pos = 128; - - if (offset<128) { - sprintf(buffer,"%-127s\n","Pref\tSource\t\tSrcMask\t\tDst\t\tDstMask\t\tIface\tTOS\tClass\tFlags\tSrcMap\n"); - len = 128; - } - - - fib_lock(); - - for (cl = fib_rules; cl; cl = cl->cl_next) { - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) { - len = 0; - continue; - } - - sprintf(temp, "%d\t%08X\t%08X\t%08X\t%08X\t%s\t%02X\t%02x\t%02X\t%02X\t%08X", - cl->cl_preference, - cl->cl_src, cl->cl_srcmask, - cl->cl_dst, cl->cl_dstmask, - cl->cl_dev ? cl->cl_dev->name : "*", - cl->cl_tos, cl->cl_class ? 
cl->cl_class->cl_id : 0, - cl->cl_flags, cl->cl_action, cl->cl_srcmap - ); - sprintf(buffer+len,"%-127s\n",temp); - len += 128; - if (pos >= offset+length) - goto done; - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos-offset; - if (len>length) - len = length; - return len; -} - -static int fib_class_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - int len=0; - off_t pos=0; - char temp[129]; - int i; - struct fib_class *cl; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Class\tSize\n"); - len = 128; - } - - - fib_lock(); - - for (i = RT_CLASS_MAX; i>=0; i--) - { - int sz = 0; - struct fib_zone *fz; - - if ((cl=fib_classes[i])==NULL) - continue; - - for (fz=cl->fib_zone_list; fz; fz=fz->fz_next) - sz += fz->fz_nent; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len = 0; - continue; - } - - sprintf(temp, "%d\t%d\n", cl->cl_id, sz); - sprintf(buffer+len,"%-127s\n",temp); - len += 128; - if (pos >= offset+length) - goto done; - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos-offset; - if (len>length) - len = length; - return len; -} - -#endif - -static int rtmsg_process(struct nlmsghdr *n, struct in_rtmsg *r) -{ - unsigned long cmd=n->nlmsg_type; - struct device * dev = NULL; - struct fib_class *class; - - if ((cmd != RTMSG_NEWROUTE && cmd != RTMSG_DELROUTE) || - (r->rtmsg_flags & (RTF_MAGIC|RTF_XRESOLVE|RTF_REINSTATE)) || - r->rtmsg_prefixlen > 32 || - (r->rtmsg_tos & ~IPTOS_TOS_MASK)) { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - - /* Reject/throw directives have no interface/gateway specification */ - - if (r->rtmsg_flags & (RTF_REJECT|RTF_THROW)) { - r->rtmsg_ifindex = 0; - r->rtmsg_gateway.s_addr = 0; - r->rtmsg_flags &= ~RTF_GATEWAY; - } - - /* Silly metric hack, it is preserved for "compatibility", - * though I do not know any program using it. 
- */ - - r->rtmsg_metric--; - if (cmd == RTMSG_NEWROUTE && r->rtmsg_metric < 0) - r->rtmsg_metric = 0; - - if (cmd == RTMSG_DELROUTE) - r->rtmsg_flags &= RTF_FIB; - - if (r->rtmsg_ifindex) { - dev = dev_get_by_index(r->rtmsg_ifindex); - if (!dev) { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - } - - if (r->rtmsg_gateway.s_addr && !(r->rtmsg_flags&RTF_NAT)) { - struct fib_info *fi; - - fi = fib_lookup_info(r->rtmsg_gateway.s_addr, 0, 1, - &loopback_dev, dev); - if (fi) { - if (fi->fib_flags&(RTF_BROADCAST|RTF_MULTICAST) && - cmd != RTMSG_DELROUTE) - return -EINVAL; - dev = fi->fib_dev; - if (fi->fib_flags&RTF_LOCAL) { - r->rtmsg_flags &= ~RTF_GATEWAY; - r->rtmsg_gateway.s_addr = 0; - } - } else if (cmd != RTMSG_DELROUTE) - return -ENETUNREACH; - - /* If gateway is not found in routing table, - * we could assume that user knows that he does. - * It is link layer problem to decide reachable - * this gateway or not. Good example is tunnel interface. - * Another example is ethernet, ARP could (in theory) - * resolve addresses, even if we had no routes. - */ - } - - if (dev && (dev->flags&IFF_LOOPBACK)) { - if (r->rtmsg_flags&RTF_GATEWAY) - return -EINVAL; - /* - * Loopback routes: we declare them local addresses. - * It is the only reasonable solution to avoid - * loopback routing loops. 
- */ - r->rtmsg_flags |= RTF_LOCAL|RTF_INTERFACE; - } - - if (r->rtmsg_flags&RTF_GATEWAY) { - if (!dev && cmd != RTMSG_DELROUTE) { - rtmsg_ack(n, ENETUNREACH); - return -ENETUNREACH; - } - } else { - if (!dev && !(r->rtmsg_flags & (RTF_NAT|RTF_REJECT|RTF_THROW)) && - cmd != RTMSG_DELROUTE) { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - } - - if (dev && dev->family != AF_INET) - { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - - if (r->rtmsg_class == 0) { - if (r->rtmsg_flags&(RTF_LOCAL|RTF_NAT)) - r->rtmsg_class = RT_CLASS_LOCAL; - else if ((r->rtmsg_flags&RTF_GATEWAY) && - (ipv4_config.fib_model==2 || - (ipv4_config.fib_model==1 && !r->rtmsg_prefixlen))) - r->rtmsg_class = RT_CLASS_DEFAULT; - else - r->rtmsg_class = RT_CLASS_MAIN; - } - - if ((class = fib_classes[r->rtmsg_class]) == NULL) - { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - - return (cmd == RTMSG_NEWROUTE ? fib_create : fib_delete)(r, dev, class, n); -} - - -static int rtrulemsg_process(struct nlmsghdr *n, struct in_rtrulemsg *r) -{ - unsigned long cmd=n->nlmsg_type; - struct device * dev = NULL; - - if ((cmd != RTMSG_NEWRULE && cmd != RTMSG_DELRULE) || - r->rtrmsg_srclen > 32 || r->rtrmsg_dstlen > 32 || - (r->rtrmsg_tos & ~IPTOS_TOS_MASK)) - return -EINVAL; - - if (r->rtrmsg_ifindex) { - dev = dev_get_by_index(r->rtrmsg_ifindex); - if (!dev) - return -ENODEV; - if (dev->family != AF_INET) - return -ENODEV; - } - - if (cmd == RTMSG_DELRULE) - return fib_rule_delete(r, dev, n); - - return fib_rule_add(r, dev, n); -} - - -static int ifmsg_process(struct nlmsghdr *n, struct in_ifmsg *r) -{ - unsigned long cmd=n->nlmsg_type; - - if (cmd != RTMSG_NEWDEVICE && cmd != RTMSG_DELDEVICE) { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - rtmsg_ack(n, EINVAL); - return -EINVAL; -} - -static int rtcmsg_process(struct nlmsghdr *n, struct in_rtctlmsg *r) -{ -#ifdef CONFIG_RTNETLINK - if (r->rtcmsg_flags&RTCTL_DELAY) - rtmsg_ctl.nlmsg_delay = r->rtcmsg_delay; - if (r->rtcmsg_flags&RTCTL_OWNER) - rt_nl_owner = 
n->nlmsg_pid; - rt_nl_flags = r->rtcmsg_flags; - return 0; -#else - return -EINVAL; -#endif -} - -static int get_rt_from_user(struct in_rtmsg *rtm, void *arg) -{ - struct rtentry r; - - if (copy_from_user(&r, arg, sizeof(struct rtentry))) - return -EFAULT; - if (r.rt_dev) { - struct device *dev; - char devname[16]; - - if (copy_from_user(devname, r.rt_dev, 15)) - return -EFAULT; - devname[15] = 0; - dev = dev_get(devname); - if (!dev) - return -ENODEV; - rtm->rtmsg_ifindex = dev->ifindex; - } - - rtm->rtmsg_flags = r.rt_flags; - - if (r.rt_dst.sa_family != AF_INET) - return -EAFNOSUPPORT; - rtm->rtmsg_prefix = ((struct sockaddr_in*)&r.rt_dst)->sin_addr; - - if (rtm->rtmsg_flags&RTF_HOST) { - rtm->rtmsg_flags &= ~RTF_HOST; - rtm->rtmsg_prefixlen = 32; - } else { - u32 mask = ((struct sockaddr_in*)&r.rt_genmask)->sin_addr.s_addr; - if (r.rt_genmask.sa_family != AF_INET) { - printk(KERN_DEBUG "%s forgot to specify route netmask.\n", current->comm); - if (r.rt_genmask.sa_family) - return -EAFNOSUPPORT; - } - if (bad_mask(mask, rtm->rtmsg_prefix.s_addr)) - return -EINVAL; - rtm->rtmsg_prefixlen = 32 - fib_logmask(mask); - } - if ((rtm->rtmsg_flags & RTF_GATEWAY) && - r.rt_gateway.sa_family != AF_INET) - return -EAFNOSUPPORT; - rtm->rtmsg_gateway = ((struct sockaddr_in*)&r.rt_gateway)->sin_addr; - rtm->rtmsg_rtt = r.rt_irtt; - rtm->rtmsg_window = r.rt_window; - rtm->rtmsg_mtu = r.rt_mtu; - rtm->rtmsg_class = r.rt_class; - rtm->rtmsg_metric = r.rt_metric; - rtm->rtmsg_tos = r.rt_tos; - return 0; -} - - -/* - * Handle IP routing ioctl calls. 
These are used to manipulate the routing tables - */ - -int ip_rt_ioctl(unsigned int cmd, void *arg) -{ - int err; - union - { - struct in_rtmsg rtmsg; - struct in_ifmsg ifmsg; - struct in_rtrulemsg rtrmsg; - struct in_rtctlmsg rtcmsg; - } m; - struct nlmsghdr dummy_nlh; - - memset(&m, 0, sizeof(m)); - dummy_nlh.nlmsg_seq = 0; - dummy_nlh.nlmsg_pid = current->pid; - - switch (cmd) - { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ - if (!suser()) - return -EPERM; - err = get_rt_from_user(&m.rtmsg, arg); - if (err) - return err; - fib_lock(); - dummy_nlh.nlmsg_type = cmd == SIOCDELRT ? RTMSG_DELROUTE - : RTMSG_NEWROUTE; - err = rtmsg_process(&dummy_nlh, &m.rtmsg); - fib_unlock(); - return err; - case SIOCRTMSG: - if (!suser()) - return -EPERM; - if (copy_from_user(&dummy_nlh, arg, sizeof(dummy_nlh))) - return -EFAULT; - switch (dummy_nlh.nlmsg_type) - { - case RTMSG_NEWROUTE: - case RTMSG_DELROUTE: - if (dummy_nlh.nlmsg_len < sizeof(m.rtmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtmsg, arg+sizeof(dummy_nlh), sizeof(m.rtmsg))) - return -EFAULT; - fib_lock(); - err = rtmsg_process(&dummy_nlh, &m.rtmsg); - fib_unlock(); - return err; - case RTMSG_NEWRULE: - case RTMSG_DELRULE: - if (dummy_nlh.nlmsg_len < sizeof(m.rtrmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtrmsg, arg+sizeof(dummy_nlh), sizeof(m.rtrmsg))) - return -EFAULT; - fib_lock(); - err = rtrulemsg_process(&dummy_nlh, &m.rtrmsg); - fib_unlock(); - return err; - case RTMSG_NEWDEVICE: - case RTMSG_DELDEVICE: - if (dummy_nlh.nlmsg_len < sizeof(m.ifmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.ifmsg, arg+sizeof(dummy_nlh), sizeof(m.ifmsg))) - return -EFAULT; - fib_lock(); - err = ifmsg_process(&dummy_nlh, &m.ifmsg); - fib_unlock(); - return err; - case RTMSG_CONTROL: - if (dummy_nlh.nlmsg_len < sizeof(m.rtcmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtcmsg, arg+sizeof(dummy_nlh), 
sizeof(m.rtcmsg))) - return -EFAULT; - fib_lock(); - err = rtcmsg_process(&dummy_nlh, &m.rtcmsg); - fib_unlock(); - return err; - default: - return -EINVAL; - } - } - - return -EINVAL; -} - -#ifdef CONFIG_RTNETLINK - -/* - * Netlink hooks for IP - */ - - -static void -rtmsg_fib(unsigned long type, struct fib_node *f, int logmask, - struct fib_class *class, struct nlmsghdr *n) -{ - struct in_rtmsg *r; - struct fib_info *fi; - - if (n && !(rt_nl_flags&RTCTL_ECHO) && rt_nl_owner == n->nlmsg_pid) - return; - - start_bh_atomic(); - r = nlmsg_send(&rtmsg_ctl, type, sizeof(*r), n ? n->nlmsg_seq : 0, - n ? n->nlmsg_pid : 0); - if (r) { - r->rtmsg_prefix.s_addr = htonl(f->fib_key<rtmsg_prefixlen = 32 - logmask; - r->rtmsg_metric= f->fib_metric; - r->rtmsg_tos = f->fib_tos; - r->rtmsg_class=class->cl_id; - r->rtmsg_flags = fib_flag_trans(f->fib_flag); - - if ((fi = f->fib_info) != NULL) { - r->rtmsg_gateway.s_addr = fi->fib_gateway; - r->rtmsg_flags |= fi->fib_flags; - r->rtmsg_mtu = fi->fib_mtu; - r->rtmsg_window = fi->fib_window; - r->rtmsg_rtt = fi->fib_irtt; - r->rtmsg_ifindex = fi->fib_dev ? fi->fib_dev->ifindex : 0; - } - } - end_bh_atomic(); -} - -static void -__rtmsg_ack(struct nlmsghdr *n, int err) -{ - nlmsg_ack(&rtmsg_ctl, n->nlmsg_seq, n->nlmsg_pid, err); -} - - -static void -rtmsg_dev(unsigned long type, struct device *dev, struct nlmsghdr *n) -{ - struct in_ifmsg *r; - - start_bh_atomic(); - r = nlmsg_send(&rtmsg_ctl, type, sizeof(*r), n ? n->nlmsg_seq : 0, - n ? 
n->nlmsg_pid : 0); - if (r) - { - memset(r, 0, sizeof(*r)); - r->ifmsg_lladdr.sa_family = dev->type; - memcpy(&r->ifmsg_lladdr.sa_data, dev->dev_addr, dev->addr_len); - r->ifmsg_prefix.s_addr = dev->pa_addr; - if (dev->flags & IFF_POINTOPOINT || dev->type == ARPHRD_TUNNEL) - r->ifmsg_brd.s_addr = dev->pa_dstaddr; - else - r->ifmsg_brd.s_addr = dev->pa_brdaddr; - r->ifmsg_flags = dev->flags; - r->ifmsg_mtu = dev->mtu; - r->ifmsg_metric = dev->metric; - r->ifmsg_prefixlen = 32 - fib_logmask(dev->pa_mask); - r->ifmsg_index = dev->ifindex; - strcpy(r->ifmsg_name, dev->name); - } - end_bh_atomic(); -} - -static int fib_netlink_call(int minor, struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - int totlen = 0; - int err = 0; - - fib_lock(); - while (skb->len >= sizeof(*nlh)) { - int rlen; - nlh = (struct nlmsghdr *)skb->data; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (skb->len < rlen) - break; - totlen += rlen; - err = 0; - skb_pull(skb, rlen); - switch (nlh->nlmsg_type) { - case RTMSG_NEWROUTE: - case RTMSG_DELROUTE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtmsg_process(nlh, (struct in_rtmsg*)nlh->nlmsg_data); - break; - case RTMSG_NEWRULE: - case RTMSG_DELRULE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtrulemsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtrulemsg_process(nlh, (struct in_rtrulemsg*)nlh->nlmsg_data); - break; - case RTMSG_NEWDEVICE: - case RTMSG_DELDEVICE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_ifmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = ifmsg_process(nlh, (struct in_ifmsg*)nlh->nlmsg_data); - break; - case RTMSG_CONTROL: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtctlmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtcmsg_process(nlh, (struct in_rtctlmsg*)nlh->nlmsg_data); - break; - default: - break; - } - } - kfree_skb(skb, FREE_READ); - fib_unlock(); - 
if (!err || rt_nl_flags&RTCTL_ACK) - return totlen; - return err; -} - -#endif - - -static int fib_magic(int op, unsigned flags, u32 dst, u32 mask, struct device *dev) -{ - struct nlmsghdr n; - struct in_rtmsg r; - memset(&r, 0, sizeof(r)); - n.nlmsg_seq=0; - n.nlmsg_pid=0; - r.rtmsg_metric = MAGIC_METRIC; - r.rtmsg_prefix.s_addr = dst; - if (dev->flags&IFF_LOOPBACK) - flags |= RTF_LOCAL; - r.rtmsg_flags = flags; - r.rtmsg_prefixlen = 32 - fib_logmask(mask); - - return (op == RTMSG_NEWROUTE ? fib_create : fib_delete) - (&r, dev, (flags&RTF_LOCAL) ? &local_class : &main_class, &n); -} - -static void ip_rt_del_broadcasts(struct device *dev) -{ - u32 net = dev->pa_addr&dev->pa_mask; - - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, net, ~0, dev); - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, net|~dev->pa_mask, ~0, dev); -} - -static void ip_rt_add_broadcasts(struct device *dev, u32 brd, u32 mask) -{ - u32 net = dev->pa_addr&mask; - - if (dev->flags&IFF_BROADCAST) - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, brd, ~0, dev); - - if (net && !(mask&htonl(1))) { - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, net, ~0, dev); - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, net|~mask, ~0, dev); - } -} - -void ip_rt_change_broadcast(struct device *dev, u32 new_brd) -{ - fib_lock(); - printk(KERN_DEBUG "%s changes brd %08X -> %08X\n", - dev->name, (u32)dev->pa_brdaddr, new_brd); - if (!ZERONET(dev->pa_addr) && dev->flags&IFF_BROADCAST) { - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - ip_rt_add_broadcasts(dev, new_brd, dev->pa_mask); - } - fib_unlock(); -} - -void ip_rt_change_dstaddr(struct device *dev, u32 dstaddr) -{ - fib_lock(); - if (!ZERONET(dev->pa_addr) && (dev->flags&IFF_POINTOPOINT) && dev->type != ARPHRD_TUNNEL) { - printk(KERN_DEBUG "%s changes dst %08X -> %08X\n", - dev->name, (u32)dev->pa_dstaddr, dstaddr); - fib_magic(RTMSG_DELROUTE, 
RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - if (dstaddr) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, dstaddr, ~0, dev); - } - fib_unlock(); -} - -void ip_rt_change_netmask(struct device *dev, u32 mask) -{ - u32 net; - - fib_lock(); - printk(KERN_DEBUG "%s changes netmask %08X -> %08X\n", - dev->name, (u32)dev->pa_mask, mask); - if (ZERONET(dev->pa_addr)) { - fib_unlock(); - return; - } - net = dev->pa_addr&dev->pa_mask; - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_del_broadcasts(dev); - if (mask != 0xFFFFFFFF && dev->flags&IFF_POINTOPOINT) - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - - if (mask != 0xFFFFFFFF) - dev->flags &= ~IFF_POINTOPOINT; - - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - net = dev->pa_addr&mask; - if (net) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, net, mask, dev); - ip_rt_add_broadcasts(dev, dev->pa_addr, mask); - fib_unlock(); -} - -int ip_rt_event(int event, struct device *dev) -{ - fib_lock(); - if (event == NETDEV_DOWN) { - fib_flush(dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - fib_unlock(); - return NOTIFY_DONE; - } - if (event == NETDEV_CHANGE) { - printk(KERN_DEBUG "%s(%s) changes state fl=%08x pa=%08X/%08X brd=%08X dst=%08X\n", - dev->name, current->comm, dev->flags, (u32)dev->pa_addr, (u32)dev->pa_mask, - (u32)dev->pa_brdaddr, (u32)dev->pa_dstaddr); - if (!(dev->flags&IFF_BROADCAST)) - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - if (!(dev->flags&IFF_POINTOPOINT)) - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - else { - u32 net = dev->pa_addr&dev->pa_mask; - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_del_broadcasts(dev); - } - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - } - - if ((event == NETDEV_UP || event == NETDEV_CHANGE) && !ZERONET(dev->pa_addr)) { - if (dev->flags&IFF_POINTOPOINT) { - 
dev->pa_mask = 0xFFFFFFFF; - dev->ip_flags &= ~IFF_IP_MASK_OK; - dev->flags &= ~IFF_BROADCAST; - dev->pa_brdaddr = 0; - } - - if (event == NETDEV_UP) - printk(KERN_DEBUG "%s UP fl=%08x pa=%08X/%08X brd=%08X dst=%08X\n", - dev->name, dev->flags, (u32)dev->pa_addr, - (u32)dev->pa_mask, (u32)dev->pa_brdaddr, (u32)dev->pa_dstaddr); - - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - - if (dev->flags&IFF_POINTOPOINT) { - if (dev->pa_dstaddr && dev->type != ARPHRD_TUNNEL) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - } else { - u32 net = dev->pa_addr&dev->pa_mask; - - if (net) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_add_broadcasts(dev, dev->pa_brdaddr, dev->pa_mask); - } - fib_magic(RTMSG_NEWROUTE, RTF_IFLOCAL, dev->pa_addr, ~0, dev); - if (dev == &loopback_dev) { - if (dev->pa_addr != htonl(INADDR_LOOPBACK)) { - u32 mask = htonl(0xFF000000); - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, - htonl(INADDR_LOOPBACK)&mask, - mask, dev); - fib_magic(RTMSG_NEWROUTE, RTF_IFLOCAL, - htonl(INADDR_LOOPBACK), - mask, dev); - } - } - } - if (event == NETDEV_CHANGEMTU || event == NETDEV_CHANGEADDR) - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - fib_unlock(); - return NOTIFY_DONE; -} - - -__initfunc(void ip_fib_init(void)) -{ - struct in_rtrulemsg r; - -#ifdef CONFIG_PROC_FS - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_ROUTE, 5, "route", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTCLASSES, 10, "rt_classes", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_class_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTLOCAL, 8, "rt_local", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_local_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTRULES, 8, "rt_rules", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_rules_get_info - 
}); -#endif /* CONFIG_PROC_FS */ - - fib_classes[RT_CLASS_LOCAL] = &local_class; - fib_classes[RT_CLASS_MAIN] = &main_class; - fib_classes[RT_CLASS_DEFAULT] = &default_class; - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_LOCAL; - r.rtrmsg_preference = 0; - fib_rule_add(&r, NULL, NULL); - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_DEFAULT; - r.rtrmsg_preference = 255; - fib_rule_add(&r, NULL, NULL); - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_MAIN; - r.rtrmsg_preference = 254; - fib_rule_add(&r, NULL, NULL); - -#ifdef CONFIG_RTNETLINK - netlink_attach(NETLINK_ROUTE, fib_netlink_call); -#endif -} diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c new file mode 100644 index 000000000000..16d72fcd2981 --- /dev/null +++ b/net/ipv4/fib_frontend.c @@ -0,0 +1,572 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: FIB frontend. + * + * Version: $Id: fib_frontend.c,v 1.4 1997/11/09 20:05:23 kuznet Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define FFprint(a...) 
printk(KERN_DEBUG a) + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +#define RT_TABLE_MIN RT_TABLE_MAIN + +struct fib_table *local_table; +struct fib_table *main_table; + +#else + +#define RT_TABLE_MIN 1 + +struct fib_table *fib_tables[RT_TABLE_MAX+1]; + +struct fib_table *__fib_new_table(int id) +{ + struct fib_table *tb; + + tb = fib_hash_init(id); + if (!tb) + return NULL; + fib_tables[id] = tb; + return tb; +} + + +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + +void fib_flush(void) +{ + int flushed = 0; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_table *tb; + int id; + + for (id = RT_TABLE_MAX; id>0; id--) { + if ((tb = fib_get_table(id))==NULL) + continue; + flushed += tb->tb_flush(tb); + } +#else /* CONFIG_IP_MULTIPLE_TABLES */ + flushed += main_table->tb_flush(main_table); + flushed += local_table->tb_flush(local_table); +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + if (flushed) + rt_cache_flush(RT_FLUSH_DELAY); +} + + +#ifdef CONFIG_PROC_FS + +/* + * Called from the PROCfs module. This outputs /proc/net/route. + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ + +static int +fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int first = offset/128; + char *ptr = buffer; + int count = (length+127)/128; + int len; + + *start = buffer + offset%128; + + if (--first < 0) { + sprintf(buffer, "%-127s\n", "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT"); + --count; + ptr += 128; + first = 0; + } + + /* rtnl_shlock(); -- it is pointless at the moment --ANK */ + if (main_table && count > 0) { + int n = main_table->tb_get_info(main_table, ptr, first, count); + count -= n; + ptr += n*128; + } + /* rtnl_shunlock(); */ + len = ptr - *start; + if (len >= length) + return length; + if (len >= 0) + return len; + return 0; +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Find the first device with a given source address. 
+ */ + +struct device * ip_dev_find(u32 addr) +{ + struct rt_key key; + struct fib_result res; + + memset(&key, 0, sizeof(key)); + key.dst = addr; + key.scope = RT_SCOPE_UNIVERSE; + + if (!local_table || local_table->tb_lookup(local_table, &key, &res) + || res.type != RTN_LOCAL) + return NULL; + + return FIB_RES_DEV(res); +} + +unsigned inet_addr_type(u32 addr) +{ + struct rt_key key; + struct fib_result res; + + if (ZERONET(addr) || BADCLASS(addr)) + return RTN_BROADCAST; + if (MULTICAST(addr)) + return RTN_MULTICAST; + + memset(&key, 0, sizeof(key)); + key.dst = addr; + + if (local_table) { + if (local_table->tb_lookup(local_table, &key, &res) == 0) + return res.type; + return RTN_UNICAST; + } + return RTN_BROADCAST; +} + +/* Given (packet source, input interface) and optional (dst, oif, tos): + - (main) check, that source is valid i.e. not broadcast or our local + address. + - figure out what "logical" interface this packet arrived + and calculate "specific destination" address. + - check, that packet arrived from expected physical interface. 
+ */ + +int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct device *dev, u32 *spec_dst) +{ + struct in_device *in_dev = dev->ip_ptr; + struct rt_key key; + struct fib_result res; + + key.dst = src; + key.src = dst; + key.tos = tos; + key.oif = 0; + key.iif = oif; + key.scope = RT_SCOPE_UNIVERSE; + + if (in_dev == NULL) + return -EINVAL; + if (fib_lookup(&key, &res)) + goto last_resort; + if (res.type != RTN_UNICAST) + return -EINVAL; + *spec_dst = FIB_RES_PREFSRC(res); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) +#else + if (FIB_RES_DEV(res) == dev) +#endif + return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + + if (in_dev->ifa_list == NULL) + goto last_resort; + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; + key.oif = dev->ifindex; + if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) { + *spec_dst = FIB_RES_PREFSRC(res); + return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + } + return 0; + +last_resort: + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; + *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + return 0; +} + +#ifndef CONFIG_IP_NOSIOCRT + +/* + * Handle IP routing ioctl calls. 
These are used to manipulate the routing tables + */ + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + int err; + struct kern_rta rta; + struct rtentry r; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!suser()) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; + rtnl_lock(); + err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, arg); + if (err == 0) { + if (cmd == SIOCDELRT) { + struct fib_table *tb = fib_get_table(req.rtm.rtm_table); + err = -ESRCH; + if (tb) + err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); + } else { + struct fib_table *tb = fib_new_table(req.rtm.rtm_table); + err = -ENOBUFS; + if (tb) + err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + } + } + rtnl_unlock(); + return err; + } + return -EINVAL; +} + +#else + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + return -EINVAL; +} + +#endif + +#ifdef CONFIG_RTNETLINK + +int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct kern_rta *rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + tb = fib_get_table(r->rtm_table); + if (tb) + return tb->tb_delete(tb, r, rta, nlh, &NETLINK_CB(skb)); + return -ESRCH; +} + +int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct kern_rta *rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + tb = fib_new_table(r->rtm_table); + if (tb) + return tb->tb_insert(tb, r, rta, nlh, &NETLINK_CB(skb)); + return -ENOBUFS; +} + +int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct fib_table *tb; + + s_t = cb->args[0]; + if (s_t == 0) + s_t = cb->args[0] = RT_TABLE_MIN; + + for (t=s_t; t<=RT_TABLE_MAX; t++) { + if (t < s_t) continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + if ((tb = fib_get_table(t))==NULL) 
+ continue; + if (tb->tb_dump(tb, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +#endif + +/* Prepare and feed intra-kernel routing request. + Really, it should be netlink message, but :-( netlink + can be not configured, so that we feed it directly + to fib engine. It is legal, because all events occur + only when netlink is already locked. + */ + +static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa) +{ + struct fib_table * tb; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = fib_new_table(RT_TABLE_MAIN); + else + tb = fib_new_table(RT_TABLE_LOCAL); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->tb_id; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void fib_add_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct device *dev = in_dev->dev; + struct in_ifaddr *prim = ifa; + u32 mask = ifa->ifa_mask; + u32 addr = ifa->ifa_local; + u32 prefix = ifa->ifa_address&mask; + + if (ifa->ifa_flags&IFA_F_SECONDARY) + prim = inet_ifa_byprefix(in_dev, prefix, mask); + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + + if (!(dev->flags&IFF_UP)) + return; + + /* Add broadcast address, if it is explicitly assigned. 
*/ + if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY)) { + fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + + /* Add network specific broadcasts, when it takes a sense */ + if (ifa->ifa_prefixlen < 31) { + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + } + } +} + +static void fib_del_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct device *dev = in_dev->dev; + struct in_ifaddr *ifa1; + struct in_ifaddr *prim = ifa; + u32 brd = ifa->ifa_address|~ifa->ifa_mask; + u32 any = ifa->ifa_address&ifa->ifa_mask; +#define LOCAL_OK 1 +#define BRD_OK 2 +#define BRD0_OK 4 +#define BRD1_OK 8 + unsigned ok = 0; + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + else + prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + + /* Deletion is more complicated than add. + We should take care of not to delete too much :-) + + Scan address list to be sure that addresses are really gone. 
+ */ + + for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa->ifa_local == ifa1->ifa_local) + ok |= LOCAL_OK; + if (ifa->ifa_broadcast == ifa1->ifa_broadcast) + ok |= BRD_OK; + if (brd == ifa1->ifa_broadcast) + ok |= BRD1_OK; + if (any == ifa1->ifa_broadcast) + ok |= BRD0_OK; + } + + if (!(ok&BRD_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + if (!(ok&BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok&BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (!(ok&LOCAL_OK)) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + + /* Check, that this local address finally disappeared. */ + if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + /* And the last, but not the least thing. + We must flush stray FIB entries. + + First of all, we scan fib_info list searching + for stray nexthop entries, then ignite fib_flush. + */ + if (fib_sync_down(ifa->ifa_local, NULL)) + fib_flush(); + } + } +#undef LOCAL_OK +#undef BRD_OK +#undef BRD0_OK +#undef BRD1_OK +} + +static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); + rt_cache_flush(2*HZ); + break; + case NETDEV_DOWN: + fib_del_ifaddr(ifa); + rt_cache_flush(1*HZ); + break; + } + return NOTIFY_DONE; +} + +static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct in_device *in_dev = dev->ip_ptr; + + if (!in_dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + rt_cache_flush(2*HZ); + break; + case NETDEV_DOWN: + if (fib_sync_down(0, dev)) + fib_flush(); + rt_cache_flush(0); + break; + case NETDEV_UNREGISTER: + if (in_dev->ifa_list) + printk("About to crash!\n"); 
+ rt_cache_flush(0); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block fib_inetaddr_notifier = { + fib_inetaddr_event, + NULL, + 0 +}; + +struct notifier_block fib_netdev_notifier = { + fib_netdev_event, + NULL, + 0 +}; + +__initfunc(void ip_fib_init(void)) +{ +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_ROUTE, 5, "route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + fib_get_procinfo + }); +#endif /* CONFIG_PROC_FS */ + +#ifndef CONFIG_IP_MULTIPLE_TABLES + local_table = fib_hash_init(RT_TABLE_LOCAL); + main_table = fib_hash_init(RT_TABLE_MAIN); +#else + fib_rules_init(); +#endif + + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); +} + diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c new file mode 100644 index 000000000000..afa6f7fe0b68 --- /dev/null +++ b/net/ipv4/fib_hash.c @@ -0,0 +1,754 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * Version: $Id: fib_hash.c,v 1.1 1997/11/09 19:53:13 kuznet Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define FTprint(a...) +/* +printk(KERN_DEBUG a) + */ + +/* + These bizarre types are just to force strict type checking. 
+ When I reversed order of bytes and changed to natural mask lengths, + I forgot to make fixes in several places. Now I am lazy to return + it back. + */ + +typedef struct { + u32 datum; +} fn_key_t; + +typedef struct { + u32 datum; +} fn_hash_idx_t; + +struct fib_node +{ + struct fib_node *fn_next; + struct fib_info *fn_info; +#define FIB_INFO(f) ((f)->fn_info) + fn_key_t fn_key; + u8 fn_tos; + u8 fn_type; + u8 fn_scope; + u8 fn_state; +}; + +#define FN_S_ZOMBIE 1 +#define FN_S_ACCESSED 2 + +static int fib_hash_zombies; + +struct fn_zone +{ + struct fn_zone *fz_next; /* Next not empty zone */ + struct fib_node **fz_hash; /* Hash table pointer */ + int fz_nent; /* Number of entries */ + + int fz_divisor; /* Hash divisor */ + u32 fz_hashmask; /* (1<fz_hashmask) + + int fz_order; /* Zone order */ + u32 fz_mask; +#define FZ_MASK(fz) ((fz)->fz_mask) +}; + +/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask + can be cheaper than memory lookup, so that FZ_* macros are used. + */ + +struct fn_hash +{ + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + +static __inline__ fn_hash_idx_t fn_hash(fn_key_t key, struct fn_zone *fz) +{ + u32 h = ntohl(key.datum)>>(32 - fz->fz_order); + h ^= (h>>20); + h ^= (h>>10); + h ^= (h>>5); + h &= FZ_HASHMASK(fz); + return *(fn_hash_idx_t*)&h; +} + +#define fz_key_0(key) ((key).datum = 0) +#define fz_prefix(key,fz) ((key).datum) + +static __inline__ fn_key_t fz_key(u32 dst, struct fn_zone *fz) +{ + fn_key_t k; + k.datum = dst & FZ_MASK(fz); + return k; +} + +static __inline__ struct fib_node ** fz_chain_p(fn_key_t key, struct fn_zone *fz) +{ + return &fz->fz_hash[fn_hash(key, fz).datum]; +} + +static __inline__ struct fib_node * fz_chain(fn_key_t key, struct fn_zone *fz) +{ + return fz->fz_hash[fn_hash(key, fz).datum]; +} + +extern __inline__ int fn_key_eq(fn_key_t a, fn_key_t b) +{ + return a.datum == b.datum; +} + +#define FZ_MAX_DIVISOR 1024 + +#ifdef CONFIG_IP_ROUTE_LARGE_TABLES + +static __inline__ void 
fn_rebuild_zone(struct fn_zone *fz, + struct fib_node **old_ht, + int old_divisor) +{ + int i; + struct fib_node *f, **fp, *next; + + for (i=0; ifn_next; + f->fn_next = NULL; + for (fp = fz_chain_p(f->fn_key, fz); *fp; fp = &(*fp)->fn_next) + /* NONE */; + *fp = f; + } + } +} + +static void fn_rehash_zone(struct fn_zone *fz) +{ + struct fib_node **ht, **old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = fz->fz_divisor; + + switch (old_divisor) { + case 16: + new_divisor = 256; + new_hashmask = 0xFF; + break; + case 256: + new_divisor = 1024; + new_hashmask = 0x3FF; + break; + default: + printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); + return; + } +#if RT_CACHE_DEBUG >= 2 + printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); +#endif + + ht = kmalloc(new_divisor*sizeof(struct fib_node*), GFP_KERNEL); + + if (ht) { + memset(ht, 0, new_divisor*sizeof(struct fib_node*)); + start_bh_atomic(); + old_ht = fz->fz_hash; + fz->fz_hash = ht; + fz->fz_hashmask = new_hashmask; + fz->fz_divisor = new_divisor; + fn_rebuild_zone(fz, old_ht, old_divisor); + end_bh_atomic(); + kfree(old_ht); +FTprint("REHASHED ZONE: order %d mask %08x hash %d/%08x\n", fz->fz_order, fz->fz_mask, fz->fz_divisor, fz->fz_hashmask); + } +} +#endif /* CONFIG_IP_ROUTE_LARGE_TABLES */ + +static void fn_free_node(struct fib_node * f) +{ + fib_release_info(FIB_INFO(f)); + kfree_s(f, sizeof(struct fib_node)); +} + + +static struct fn_zone * +fn_new_zone(struct fn_hash *table, int z) +{ + int i; + struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL); + if (!fz) + return NULL; + + memset(fz, 0, sizeof(struct fn_zone)); + if (z) { + fz->fz_divisor = 16; + fz->fz_hashmask = 0xF; + } else { + fz->fz_divisor = 1; + fz->fz_hashmask = 0; + } + fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL); + if (!fz->fz_hash) { + kfree(fz); + return NULL; + } + memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct 
fib_node*)); + fz->fz_order = z; + fz->fz_mask = inet_make_mask(z); + + /* Find the first not empty zone with more specific mask */ + for (i=z+1; i<=32; i++) + if (table->fn_zones[i]) + break; + start_bh_atomic(); + if (i>32) { + /* No more specific masks, we are the first. */ + fz->fz_next = table->fn_zone_list; + table->fn_zone_list = fz; + } else { + fz->fz_next = table->fn_zones[i]->fz_next; + table->fn_zones[i]->fz_next = fz; + } + table->fn_zones[z] = fz; + end_bh_atomic(); +FTprint("NEW ZONE: order %d mask %08x hash %d/%08x\n", fz->fz_order, fz->fz_mask, fz->fz_divisor, fz->fz_hashmask); + return fz; +} + +static int +fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) +{ + int err; + struct fn_zone *fz; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + + for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + struct fib_node *f; + fn_key_t k = fz_key(key->dst, fz); + int matched = 0; + + for (f = fz_chain(k, fz); f; f = f->fn_next) { + if (!fn_key_eq(k, f->fn_key) +#ifdef CONFIG_IP_ROUTE_TOS + || (f->fn_tos && f->fn_tos != key->tos) +#endif + ) { + if (matched) + return 1; + continue; + } + matched = 1; + f->fn_state |= FN_S_ACCESSED; + + if (f->fn_state&FN_S_ZOMBIE) + continue; + if (f->fn_scope < key->scope) + continue; + + err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res); + if (err == 0) { + res->type = f->fn_type; + res->scope = f->fn_scope; + res->prefixlen = fz->fz_order; + res->prefix = &fz_prefix(f->fn_key, fz); + return 0; + } + if (err < 0) + return err; + } + } + return 1; +} + +#define FIB_SCAN(f, fp) \ +for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) + +#define FIB_SCAN_KEY(f, fp, key) \ +for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) + +#define FIB_CONTINUE(f, fp) \ +{ \ + fp = &f->fn_next; \ + continue; \ +} + +#ifdef CONFIG_RTNETLINK +static void rtmsg_fib(int, struct fib_node*, int, int, + struct nlmsghdr *n, + struct netlink_skb_parms *); +#else 
+#define rtmsg_fib(a, b, c, d, e, f) +#endif + + +static int +fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node *new_f, *f, **fp; + struct fn_zone *fz; + struct fib_info *fi; + + int z = r->rtm_dst_len; + int type = r->rtm_type; +#ifdef CONFIG_IP_ROUTE_TOS + u8 tos = r->rtm_tos; +#endif + fn_key_t key; + unsigned state = 0; + int err; + +FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type, rta->rta_dst ? +*(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1, +rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0); + if (z > 32) + return -EINVAL; + fz = table->fn_zones[z]; + if (!fz && !(fz = fn_new_zone(table, z))) + return -ENOBUFS; + + fz_key_0(key); + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + if ((fi = fib_create_info(r, rta, n, &err)) == NULL) { +FTprint("fib_create_info err=%d\n", err); + return err; + } + +#ifdef CONFIG_IP_ROUTE_LARGE_TABLES + if (fz->fz_nent > (fz->fz_divisor<<2) && + fz->fz_divisor < FZ_MAX_DIVISOR && + (z==32 || (1< fz->fz_divisor)) + fn_rehash_zone(fz); +#endif + + fp = fz_chain_p(key, fz); + + /* + * Scan list to find the first route with the same destination + */ + FIB_SCAN(f, fp) { + if (fn_key_eq(f->fn_key,key)) + break; + } + +#ifdef CONFIG_IP_ROUTE_TOS + /* + * Find route with the same destination and tos. 
+ */ + FIB_SCAN_KEY(f, fp, key) { + if (f->fn_tos <= tos) + break; + } +#endif + + if (f && fn_key_eq(f->fn_key, key) +#ifdef CONFIG_IP_ROUTE_TOS + && f->fn_tos == tos +#endif + ) { + state = f->fn_state; + if (n->nlmsg_flags&NLM_F_EXCL && !(state&FN_S_ZOMBIE)) + return -EEXIST; + if (n->nlmsg_flags&NLM_F_REPLACE) { + struct fib_info *old_fi = FIB_INFO(f); + if (old_fi != fi) { + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + start_bh_atomic(); + FIB_INFO(f) = fi; + f->fn_type = r->rtm_type; + f->fn_scope = r->rtm_scope; + end_bh_atomic(); + rtmsg_fib(RTM_NEWROUTE, f, z, tb->tb_id, n, req); + } + state = f->fn_state; + f->fn_state = 0; + fib_release_info(old_fi); + if (state&FN_S_ACCESSED) + rt_cache_flush(RT_FLUSH_DELAY); + return 0; + } + for ( ; (f = *fp) != NULL && fn_key_eq(f->fn_key, key) +#ifdef CONFIG_IP_ROUTE_TOS + && f->fn_tos == tos +#endif + ; fp = &f->fn_next) { + state |= f->fn_state; + if (f->fn_type == type && f->fn_scope == r->rtm_scope + && FIB_INFO(f) == fi) { + fib_release_info(fi); + if (f->fn_state&FN_S_ZOMBIE) { + f->fn_state = 0; + rtmsg_fib(RTM_NEWROUTE, f, z, tb->tb_id, n, req); + if (state&FN_S_ACCESSED) + rt_cache_flush(RT_FLUSH_DELAY); + return 0; + } + return -EEXIST; + } + } + } else { + if (!(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + } + + new_f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); + if (new_f == NULL) { + fib_release_info(fi); + return -ENOBUFS; + } + + memset(new_f, 0, sizeof(struct fib_node)); + + new_f->fn_key = key; +#ifdef CONFIG_IP_ROUTE_TOS + new_f->fn_tos = tos; +#endif + new_f->fn_type = type; + new_f->fn_scope = r->rtm_scope; + FIB_INFO(new_f) = fi; + + /* + * Insert new entry to the list. 
+ */ + + start_bh_atomic(); + new_f->fn_next = f; + *fp = new_f; + end_bh_atomic(); + fz->fz_nent++; + + rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req); + rt_cache_flush(RT_FLUSH_DELAY); + return 0; +} + + +static int +fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node **fp, *f; + int z = r->rtm_dst_len; + struct fn_zone *fz; + fn_key_t key; +#ifdef CONFIG_IP_ROUTE_TOS + u8 tos = r->rtm_tos; +#endif + +FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? + *(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1); + if (z > 32) + return -EINVAL; + if ((fz = table->fn_zones[z]) == NULL) + return -ESRCH; + + fz_key_0(key); + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + fp = fz_chain_p(key, fz); + + FIB_SCAN(f, fp) { + if (fn_key_eq(f->fn_key, key)) + break; + } +#ifdef CONFIG_IP_ROUTE_TOS + FIB_SCAN_KEY(f, fp, key) { + if (f->fn_tos == tos) + break; + } +#endif + + while ((f = *fp) != NULL && fn_key_eq(f->fn_key, key) +#ifdef CONFIG_IP_ROUTE_TOS + && f->fn_tos == tos +#endif + ) { + struct fib_info * fi = FIB_INFO(f); + + if ((f->fn_state&FN_S_ZOMBIE) || + (r->rtm_type && f->fn_type != r->rtm_type) || + (r->rtm_scope && f->fn_scope != r->rtm_scope) || + (r->rtm_protocol && fi->fib_protocol != r->rtm_protocol) || + fib_nh_match(r, n, rta, fi)) + FIB_CONTINUE(f, fp); + break; + } + if (!f) + return -ESRCH; +#if 0 + *fp = f->fn_next; + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + fn_free_node(f); + fz->fz_nent--; + rt_cache_flush(0); +#else + f->fn_state |= FN_S_ZOMBIE; + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + if (f->fn_state&FN_S_ACCESSED) { + f->fn_state &= ~FN_S_ACCESSED; + rt_cache_flush(RT_FLUSH_DELAY); + } + if (++fib_hash_zombies > 128) + fib_flush(); +#endif 
+ return 0; +} + +extern __inline__ int +fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table) +{ + int found = 0; + struct fib_node *f; + + while ((f = *fp) != NULL) { + struct fib_info *fi = FIB_INFO(f); + + if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + *fp = f->fn_next; + fn_free_node(f); + found++; + continue; + } + fp = &f->fn_next; + } + return found; +} + +static int fn_hash_flush(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz; + int found = 0; + + fib_hash_zombies = 0; + for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + int tmp = 0; + for (i=fz->fz_divisor-1; i>=0; i--) + tmp += fn_flush_list(&fz->fz_hash[i], fz->fz_order, table); + fz->fz_nent -= tmp; + found += tmp; + } + return found; +} + + +#ifdef CONFIG_PROC_FS + +static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int count) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz; + int pos = 0; + int n = 0; + + for (fz=table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + struct fib_node *f; + int maxslot = fz->fz_divisor; + struct fib_node **fp = fz->fz_hash; + + if (fz->fz_nent == 0) + continue; + + if (pos + fz->fz_nent <= first) { + pos += fz->fz_nent; + continue; + } + + for (i=0; i < maxslot; i++, fp++) { + for (f = *fp; f; f = f->fn_next) { + if (++pos <= first) + continue; + fib_node_get_info(f->fn_type, + f->fn_state&FN_S_ZOMBIE, + FIB_INFO(f), + fz_prefix(f->fn_key, fz), + FZ_MASK(fz), buffer); + buffer += 128; + if (++n >= count) + return n; + } + } + } + return n; +} +#endif + + +#ifdef CONFIG_RTNETLINK + +extern __inline__ int +fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz, + struct fib_node *f) +{ + int i, s_i; + + s_i = cb->args[3]; + for (i=0; f; i++, f=f->fn_next) { + if (i < s_i) continue; + if (f->fn_state&FN_S_ZOMBIE) continue; + if (fib_dump_info(skb, 
NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, (f->fn_state&FN_S_ZOMBIE) ? 0 : f->fn_type, f->fn_scope, + &f->fn_key, fz->fz_order, f->fn_tos, + f->fn_info) < 0) { + cb->args[3] = i; + return -1; + } + } + cb->args[3] = i; + return skb->len; +} + +extern __inline__ int +fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz) +{ + int h, s_h; + + s_h = cb->args[2]; + for (h=0; h < fz->fz_divisor; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(int)); + if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL) + continue; + if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) { + cb->args[2] = h; + return -1; + } + } + cb->args[2] = h; + return skb->len; +} + +static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +{ + int m, s_m; + struct fn_zone *fz; + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + + s_m = cb->args[1]; + for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { + if (m < s_m) continue; + if (m > s_m) + memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int)); + if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { + cb->args[1] = m; + return -1; + } + } + cb->args[1] = m; + return skb->len; +} + +static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + pid_t pid = req ? 
req->pid : 0; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, + f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos, + FIB_INFO(f)) < 0) { + kfree_skb(skb, 0); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + if (n->nlmsg_flags&NLM_F_ECHO) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + if (n->nlmsg_flags&NLM_F_ECHO) + netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +} + +#endif /* CONFIG_RTNETLINK */ + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_table * fib_hash_init(int id) +#else +__initfunc(struct fib_table * fib_hash_init(int id)) +#endif +{ + struct fib_table *tb; + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); + if (tb == NULL) + return NULL; + tb->tb_id = id; + tb->tb_lookup = fn_hash_lookup; + tb->tb_insert = fn_hash_insert; + tb->tb_delete = fn_hash_delete; + tb->tb_flush = fn_hash_flush; +#ifdef CONFIG_RTNETLINK + tb->tb_dump = fn_hash_dump; +#endif +#ifdef CONFIG_PROC_FS + tb->tb_get_info = fn_hash_get_info; +#endif + memset(tb->tb_data, 0, sizeof(struct fn_hash)); + return tb; +} diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c new file mode 100644 index 000000000000..c593d758f8f1 --- /dev/null +++ b/net/ipv4/fib_rules.c @@ -0,0 +1,363 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: policy rules. 
+ * + * Version: $Id: fib_rules.c,v 1.2 1997/10/10 22:40:49 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define FRprintk(a...) + +struct fib_rule +{ + struct fib_rule *r_next; + unsigned r_preference; + unsigned char r_table; + unsigned char r_action; + unsigned char r_dst_len; + unsigned char r_src_len; + u32 r_src; + u32 r_srcmask; + u32 r_dst; + u32 r_dstmask; + u32 r_srcmap; + u8 r_flags; + u8 r_tos; + int r_ifindex; + char r_ifname[IFNAMSIZ]; +}; + +static struct fib_rule default_rule = { NULL, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST, }; +static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST, }; +static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, }; + +static struct fib_rule *fib_rules = &local_rule; + +int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct kern_rta *rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, **rp; + + for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { + if ((!rta->rta_src || memcmp(rta->rta_src, &r->r_src, 4) == 0) && + rtm->rtm_src_len == r->r_src_len && + rtm->rtm_dst_len == r->r_dst_len && + (!rta->rta_dst || memcmp(rta->rta_dst, &r->r_dst, 4) == 0) && + rtm->rtm_tos == r->r_tos && + rtm->rtm_type == r->r_action && + (!rta->rta_priority || *rta->rta_priority == r->r_preference) && + (!rta->rta_ifname || strcmp(rta->rta_ifname, r->r_ifname) == 0) && + (!rtm->rtm_table || (r && rtm->rtm_table 
== r->r_table))) { + *rp = r->r_next; + if (r != &default_rule && r != &main_rule && r != &local_rule) + kfree(r); + return 0; + } + } + return -ESRCH; +} + +/* Allocate new unique table id */ + +static struct fib_table *fib_empty_table(void) +{ + int id; + + for (id = 1; id <= RT_TABLE_MAX; id++) + if (fib_tables[id] == NULL) + return __fib_new_table(id); + return NULL; +} + + +int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct kern_rta *rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, *new_r, **rp; + unsigned char table_id; + + if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 || + (rtm->rtm_tos & ~IPTOS_TOS_MASK)) + return -EINVAL; + + table_id = rtm->rtm_table; + if (table_id == RT_TABLE_UNSPEC) { + struct fib_table *table; + if (rtm->rtm_type == RTN_UNICAST || rtm->rtm_type == RTN_NAT) { + if ((table = fib_empty_table()) == NULL) + return -ENOBUFS; + table_id = table->tb_id; + } + } + + new_r = kmalloc(sizeof(*new_r), GFP_KERNEL); + if (!new_r) + return -ENOMEM; + memset(new_r, 0, sizeof(*new_r)); + if (rta->rta_src) + memcpy(&new_r->r_src, rta->rta_src, 4); + if (rta->rta_dst) + memcpy(&new_r->r_dst, rta->rta_dst, 4); + if (rta->rta_gw) + memcpy(&new_r->r_srcmap, rta->rta_gw, 4); + new_r->r_src_len = rtm->rtm_src_len; + new_r->r_dst_len = rtm->rtm_dst_len; + new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len); + new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len); + new_r->r_tos = rtm->rtm_tos; + new_r->r_action = rtm->rtm_type; + new_r->r_flags = rtm->rtm_flags; + if (rta->rta_priority) + new_r->r_preference = *rta->rta_priority; + new_r->r_table = table_id; + if (rta->rta_ifname) { + struct device *dev; + memcpy(new_r->r_ifname, rta->rta_ifname, IFNAMSIZ); + new_r->r_ifindex = -1; + dev = dev_get(rta->rta_ifname); + if (dev) + new_r->r_ifindex = dev->ifindex; + } + + rp = &fib_rules; + if (!new_r->r_preference) { + r = fib_rules; + if (r && (r = r->r_next) != NULL) { + rp = &fib_rules->r_next; + if 
(r->r_preference) + new_r->r_preference = r->r_preference - 1; + } + } + + while ( (r = *rp) != NULL ) { + if (r->r_preference > new_r->r_preference) + break; + rp = &r->r_next; + } + + new_r->r_next = r; + *rp = new_r; + return 0; +} + +u32 fib_rules_map_destination(u32 daddr, struct fib_result *res) +{ + u32 mask = inet_make_mask(res->prefixlen); + return (daddr&~mask)|res->fi->fib_nh->nh_gw; +} + +u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags) +{ + struct fib_rule *r = res->r; + + if (r->r_action == RTN_NAT) { + int addrtype = inet_addr_type(r->r_srcmap); + + if (addrtype == RTN_NAT) { + /* Packet is from translated source; remember it */ + saddr = (saddr&~r->r_srcmask)|r->r_srcmap; + *flags |= RTCF_SNAT; + } else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) { + /* Packet is from masqueraded source; remember it */ + saddr = r->r_srcmap; + *flags |= RTCF_MASQ; + } + } + return saddr; +} + +static void fib_rules_detach(struct device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == dev->ifindex) + r->r_ifindex = -1; + } +} + +static void fib_rules_attach(struct device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) + r->r_ifindex = dev->ifindex; + } +} + +int fib_lookup(const struct rt_key *key, struct fib_result *res) +{ + int err; + struct fib_rule *r, *policy; + struct fib_table *tb; + + u32 daddr = key->dst; + u32 saddr = key->src; + +FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); + for (r = fib_rules; r; r=r->r_next) { + if (((saddr^r->r_src) & r->r_srcmask) || + ((daddr^r->r_dst) & r->r_dstmask) || +#ifdef CONFIG_IP_TOS_ROUTING + (r->r_tos && r->r_tos != key->tos) || +#endif + (r->r_ifindex && r->r_ifindex != key->iif)) + continue; + +FRprintk("tb %d r %d ", r->r_table, r->r_action); + switch (r->r_action) { + case RTN_UNICAST: + policy = NULL; + break; + case RTN_NAT: + policy = r; + break; + 
case RTN_UNREACHABLE: + return -ENETUNREACH; + default: + case RTN_BLACKHOLE: + return -EINVAL; + case RTN_PROHIBIT: + return -EACCES; + } + + if ((tb = fib_get_table(r->r_table)) == NULL) + continue; + err = tb->tb_lookup(tb, key, res); + if (err == 0) { +FRprintk("ok\n"); + res->r = policy; + return 0; + } + if (err < 0) + return err; +FRprintk("RCONT "); + } +FRprintk("FAILURE\n"); + return -ENETUNREACH; +} + +static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + + if (event == NETDEV_UNREGISTER) + fib_rules_detach(dev); + else if (event == NETDEV_REGISTER) + fib_rules_attach(dev); + return NOTIFY_DONE; +} + + +struct notifier_block fib_rules_notifier = { + fib_rules_event, + NULL, + 0 +}; + +#ifdef CONFIG_RTNETLINK + +extern __inline__ int inet_fill_rule(struct sk_buff *skb, + struct fib_rule *r, + struct netlink_callback *cb) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = r->r_dst_len; + rtm->rtm_src_len = r->r_src_len; + rtm->rtm_tos = r->r_tos; + rtm->rtm_table = r->r_table; + rtm->rtm_protocol = 0; + rtm->rtm_scope = 0; + rtm->rtm_nhs = 0; + rtm->rtm_type = r->r_action; + rtm->rtm_optlen = 0; + rtm->rtm_flags = r->r_flags; + + if (r->r_dst_len) + RTA_PUT(skb, RTA_DST, 4, &r->r_dst); + if (r->r_src_len) + RTA_PUT(skb, RTA_SRC, 4, &r->r_src); + if (r->r_ifname[0]) + RTA_PUT(skb, RTA_IFNAME, IFNAMSIZ, &r->r_ifname); + if (r->r_preference) + RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); + if (r->r_srcmap) + RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_put(skb, b - skb->tail); + return -1; +} + +int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = 
cb->args[0]; + struct fib_rule *r; + + for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { + if (idx < s_idx) + continue; + if (inet_fill_rule(skb, r, cb) < 0) + break; + } + cb->args[0] = idx; + + return skb->len; +} + +#endif /* CONFIG_RTNETLINK */ + +__initfunc(void fib_rules_init(void)) +{ + register_netdevice_notifier(&fib_rules_notifier); +} diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c new file mode 100644 index 000000000000..8f3e70cad343 --- /dev/null +++ b/net/ipv4/fib_semantics.c @@ -0,0 +1,908 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: semantics. + * + * Version: $Id: fib_semantics.c,v 1.5 1997/10/10 22:40:50 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define FSprintk(a...) 
+ +static struct fib_info *fib_info_list; + +#define for_fib_info() { struct fib_info *fi; \ + for (fi = fib_info_list; fi; fi = fi->fib_next) + +#define endfor_fib_info() } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ +for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ +for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +/* Hope, that gcc will optimize it to get rid of dummy loop */ + +#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ +for (nhsel=0; nhsel < 1; nhsel++) + +#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ +for (nhsel=0; nhsel < 1; nhsel++) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define endfor_nexthops(fi) } + + +static struct +{ + int error; + u8 scope; +} fib_props[RTA_MAX+1] = { + { 0, RT_SCOPE_NOWHERE}, /* RTN_UNSPEC */ + { 0, RT_SCOPE_UNIVERSE}, /* RTN_UNICAST */ + { 0, RT_SCOPE_HOST}, /* RTN_LOCAL */ + { 0, RT_SCOPE_LINK}, /* RTN_BROADCAST */ + { 0, RT_SCOPE_LINK}, /* RTN_ANYCAST */ + { 0, RT_SCOPE_UNIVERSE}, /* RTN_MULTICAST */ + { -EINVAL, RT_SCOPE_UNIVERSE}, /* RTN_BLACKHOLE */ + { -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */ + { -EACCES, RT_SCOPE_UNIVERSE}, /* RTN_PROHIBIT */ + { 1, RT_SCOPE_UNIVERSE}, /* RTN_THROW */ +#ifdef CONFIG_IP_ROUTE_NAT + { 0, RT_SCOPE_HOST}, /* RTN_NAT */ +#else + { -EINVAL, RT_SCOPE_NOWHERE}, /* RTN_NAT */ +#endif + { -EINVAL, RT_SCOPE_NOWHERE} /* RTN_XRESOLVE */ +}; + +/* Release a nexthop info record */ + +void fib_release_info(struct fib_info *fi) +{ + if (fi && !--fi->fib_refcnt) { + if (fi->fib_next) + fi->fib_next->fib_prev = fi->fib_prev; + if (fi->fib_prev) + fi->fib_prev->fib_next = fi->fib_next; + if (fi == fib_info_list) + fib_info_list = fi->fib_next; + kfree(fi); + } +} + +extern 
__inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +{ + const struct fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight != onh->nh_weight || +#endif + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi) +{ + for_fib_info() { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_mtu == fi->fib_mtu && + nfi->fib_rtt == fi->fib_rtt && + nfi->fib_window == fi->fib_window && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } endfor_fib_info(); + return NULL; +} + +/* Check, that the gateway is already configured. + Used only by redirect accept routine. + */ + +int ip_fib_check_default(u32 gw, struct device *dev) +{ + for_fib_info() { + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for_nexthops(fi) { + if (nh->nh_dev == dev && nh->nh_gw == gw && + !(nh->nh_flags&RTNH_F_DEAD)) + return 0; + } endfor_nexthops(fi); + } endfor_fib_info(); + return -1; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) +{ + while (RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(u32*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +static int +fib_get_nhs(struct fib_info *fi, const struct nlmsghdr *nlh, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTM_RTNH(r); + int nhlen = RTM_NHLEN(nlh, r); + +printk("get nhs %d/%d\n", r->rtm_nhs, nhlen); + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = 
nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + if (attrlen) + nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +printk("Got nh: via %08x dev %d w %d fl %02x\n", nh->nh_gw, nh->nh_oif, + nh->nh_weight, nh->nh_flags); + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + return 0; +} + +#endif + +int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, + struct fib_info *fi) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + struct rtnexthop *nhp; + int nhlen; +#endif + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0)) + return 0; + return 1; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (r->rtm_nhs == 0) + return 0; + + nhp = RTM_RTNH(r); + nhlen = RTM_NHLEN(nlh, r); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + u32 gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + if (gw && gw != nh->nh_gw) + return 1; + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); +#endif + return 0; +} + + +/* + Picture + ------- + + Semantics of nexthop is very messy by historical reasons. + We have to take into account, that: + a) gateway can be actually local interface address, + so that gatewayed route is direct. + b) gateway must be on-link address, possibly + described not by an ifaddr, but also by a direct route. + c) If both gateway and interface are specified, they should not + contradict. + d) If we use tunnel routes, gateway could be not on-link. + + Attempt to reconcile all of these (alas, self-contradictory) conditions + results in pretty ugly and hairy code with obscure logic. + + I choosed to generalized it instead, so that the size + of code does not increase practically, but it becomes + much more general. 
+ Every prefix is assigned a "scope" value: "host" is local address, + "link" is direct route, + [ ... "site" ... "interior" ... ] + and "universe" is true gateway route with global meaning. + + Every prefix refers to a set of "nexthop"s (gw, oif), + where gw must have narrower scope. This recursion stops + when gw has LOCAL scope or if "nexthop" is declared ONLINK, + which means that gw is forced to be on link. + + Code is still hairy, but now it is apparently logically + consistent and very flexible. F.e. as by-product it allows + to co-exists in peace independent exterior and interior + routing processes. + + Normally it looks as following. + + {universe prefix} -> (gw, oif) [scope link] + | + |-> {link prefix} -> (gw, oif) [scope local] + | + |-> {local prefix} (terminal node) + */ + +static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct rt_key key; + struct fib_result res; + +#ifdef CONFIG_IP_ROUTE_PERVASIVE + if (nh->nh_flags&RTNH_F_PERVASIVE) + return 0; +#endif + if (nh->nh_flags&RTNH_F_ONLINK) { + struct device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = dev_get_by_index(nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + memset(&key, 0, sizeof(key)); + key.dst = nh->nh_gw; + key.oif = nh->nh_oif; + key.scope = r->rtm_scope + 1; + + /* It is not necessary, but requires a bit of thinking */ + if (key.scope < RT_SCOPE_LINK) + key.scope = RT_SCOPE_LINK; + + if ((err = fib_lookup(&key, &res)) != 0) + return err; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = FIB_RES_DEV(res); + } else { + struct in_device *in_dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + in_dev = inetdev_by_index(nh->nh_oif); + if (in_dev == NULL) + 
return -ENODEV; + if (!(in_dev->dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = in_dev->dev; + nh->nh_scope = RT_SCOPE_HOST; + } + return 0; +} + +struct fib_info * +fib_create_info(const struct rtmsg *r, struct kern_rta *rta, + const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct fib_info *fi = NULL; + struct fib_info *ofi; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nhs = r->rtm_nhs ? : 1; +#else + const int nhs = 1; +#endif + + /* Fast check to catch the most weird cases */ + if (fib_props[r->rtm_type].scope > r->rtm_scope) { + printk("Einval 1\n"); + goto err_inval; + } + + fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + err = -ENOBUFS; + if (fi == NULL) + goto failure; + memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh)); + + fi->fib_protocol = r->rtm_protocol; + fi->fib_nhs = nhs; + fi->fib_flags = r->rtm_flags; + if (rta->rta_mtu) + fi->fib_mtu = *rta->rta_mtu; + if (rta->rta_rtt) + fi->fib_rtt = *rta->rta_rtt; + if (rta->rta_window) + fi->fib_window = *rta->rta_window; + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); + + if (r->rtm_nhs) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if ((err = fib_get_nhs(fi, nlh, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) + goto err_inval; +#else + goto err_inval; +#endif + } else { + struct fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 4); + nh->nh_flags = r->rtm_flags; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = 1; +#endif + } + +#ifdef CONFIG_IP_ROUTE_NAT + if (r->rtm_type == RTN_NAT) { + if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) + goto err_inval; + memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 4); + goto link_it; + } +#endif + + if (fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || r->rtm_nhs) + goto err_inval; + goto link_it; + } + + if 
(r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct fib_nh *nh = fi->fib_nh; + + /* Local address is added. */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = fib_check_nh(r, fi, nh)) != 0) { + if (err == -EINVAL) + printk("Einval 2\n"); + goto failure; + } + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) + if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) { + printk("Einval 3\n"); + goto err_inval; + } + } + +link_it: + if ((ofi = fib_find_info(fi)) != NULL) { + kfree(fi); + ofi->fib_refcnt++; + return ofi; + } + + fi->fib_refcnt++; + fi->fib_next = fib_info_list; + fi->fib_prev = NULL; + if (fib_info_list) + fib_info_list->fib_prev = fi; + fib_info_list = fi; + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) + kfree(fi); + return NULL; +} + +int +fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res) +{ + int err = fib_props[type].error; + + if (err == 0) { + if (fi->fib_flags&RTNH_F_DEAD) + return 1; + + res->fi = fi; + + switch (type) { +#ifdef CONFIG_IP_ROUTE_NAT + case RTN_NAT: + FIB_RES_RESET(*res); + return 0; +#endif + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + for_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + continue; + if (!key->oif || key->oif == nh->nh_oif) + break; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (nhsel < fi->fib_nhs) { + res->nh_sel = nhsel; + return 0; + } +#else + if (nhsel < 1) + return 0; +#endif + endfor_nexthops(fi); + return 1; + default: + printk(KERN_DEBUG "impossible 102\n"); + return -EINVAL; + } + } + return err; +} + +/* Find appropriate source 
address to this destination */ + +u32 __fib_res_prefsrc(struct fib_result *res) +{ + return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); +} + +#ifdef CONFIG_RTNETLINK + +int +fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, + struct fib_info *fi) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + unsigned char *o; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = tos; + rtm->rtm_table = tb_id; + rtm->rtm_type = type; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + rtm->rtm_nhs = 0; + + o = skb->tail; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 4, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_mtu) + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &fi->fib_mtu); + if (fi->fib_window) + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &fi->fib_window); + if (fi->fib_rtt) + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt); + if (fi->fib_prefsrc) + RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); + } + rtm->rtm_optlen = skb->tail - o; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight-1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); + nhp->rtnh_len = skb->tail - (unsigned char*)nhp; + rtm->rtm_nhs++; + } endfor_nexthops(fi); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return 
skb->len; + +nlmsg_failure: +rtattr_failure: + skb_put(skb, b - skb->tail); + return -1; +} + +#endif /* CONFIG_RTNETLINK */ + +#ifndef CONFIG_IP_NOSIOCRT + +int +fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r) +{ + int plen; + u32 *ptr; + + memset(rtm, 0, sizeof(*rtm)); + memset(rta, 0, sizeof(*rta)); + + if (r->rt_dst.sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* Check mask for validity: + a) it must be contiguous. + b) destination must have all host bits clear. + c) if application forgot to set correct family (AF_INET), + reject request unless it is absolutely clear i.e. + both family and mask are zero. + */ + plen = 32; + ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr; + if (!(r->rt_flags&RTF_HOST)) { + u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr; + if (r->rt_genmask.sa_family != AF_INET) { + if (mask || r->rt_genmask.sa_family) + return -EAFNOSUPPORT; + } + if (bad_mask(mask, *ptr)) + return -EINVAL; + plen = inet_mask_len(mask); + } + + nl->nlmsg_flags = NLM_F_REQUEST; + nl->nlmsg_pid = 0; + nl->nlmsg_seq = 0; + nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); + if (cmd == SIOCDELRT) { + nl->nlmsg_type = RTM_DELROUTE; + nl->nlmsg_flags = 0; + } else { + nl->nlmsg_type = RTM_NEWROUTE; + nl->nlmsg_flags = NLM_F_CREATE; + rtm->rtm_protocol = RTPROT_BOOT; + if (plen != 0) + nl->nlmsg_flags |= NLM_F_REPLACE; + } + + rtm->rtm_dst_len = plen; + rta->rta_dst = ptr; + + if (r->rt_flags&RTF_REJECT) { + rtm->rtm_scope = RT_SCOPE_HOST; + rtm->rtm_type = RTN_UNREACHABLE; + return 0; + } + rtm->rtm_scope = RT_SCOPE_LINK; + rtm->rtm_type = RTN_UNICAST; + + if (r->rt_dev) { +#ifdef CONFIG_IP_ALIAS + char *colon; +#endif + struct device *dev; + char devname[IFNAMSIZ]; + + if (copy_from_user(devname, r->rt_dev, 15)) + return -EFAULT; + devname[IFNAMSIZ-1] = 0; +#ifdef CONFIG_IP_ALIAS + colon = strchr(devname, ':'); + if (colon) + *colon = 0; +#endif + dev = dev_get(devname); + if 
(!dev) + return -ENODEV; + rta->rta_oif = &dev->ifindex; +#ifdef CONFIG_IP_ALIAS + if (colon) { + struct in_ifaddr *ifa; + struct in_device *in_dev = dev->ip_ptr; + if (!in_dev) + return -ENODEV; + *colon = ':'; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + if (strcmp(ifa->ifa_label, devname) == 0) + break; + if (ifa == NULL) + return -ENODEV; + rta->rta_prefsrc = &ifa->ifa_local; + } +#endif + } + + ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; + if (r->rt_gateway.sa_family == AF_INET && *ptr) { + rta->rta_gw = ptr; + if (r->rt_flags&RTF_GATEWAY) + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + } + + if (cmd == SIOCDELRT) + return 0; + + if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) + return -EINVAL; + + /* Ugly conversion from rtentry types to unsigned */ + + if (r->rt_flags&RTF_IRTT) { + rta->rta_rtt = (unsigned*)&r->rt_pad3; + *rta->rta_rtt = r->rt_irtt; + } + if (r->rt_flags&RTF_WINDOW) { + rta->rta_window = (unsigned*)&r->rt_window; + if (sizeof(*rta->rta_window) != sizeof(r->rt_window)) + *rta->rta_window = r->rt_window; + } + if (r->rt_flags&RTF_MTU) { + rta->rta_mtu = (unsigned*)&r->rt_mtu; + if (sizeof(*rta->rta_mtu) != sizeof(r->rt_mtu)) + *rta->rta_mtu = r->rt_mtu; + } + return 0; +} + +#endif + +/* + Update FIB if: + - local address disappeared -> we must delete all the entries + referring to it. + - device went down -> we must shutdown all nexthops going via it. 
+ */ + +int fib_sync_down(u32 local, struct device *dev) +{ + int ret = 0; + + for_fib_info() { + if (local && fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } else if (dev && fi->fib_nhs) { + int dead = 0; + + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != RT_SCOPE_NOWHERE) { + nh->nh_flags |= RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; +#endif + dead++; + } + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } endfor_fib_info(); + return ret; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* + Dead device goes up. We wake up dead nexthops. + It takes sense only on multipath routes. + */ + +int fib_sync_up(struct device *dev) +{ + int ret = 0; + + if (!(dev->flags&IFF_UP)) + return 0; + + for_fib_info() { + int alive = 0; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || dev->ip_ptr == NULL) + continue; + alive++; + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + } endfor_nexthops(fi) + + if (alive == fi->fib_nhs) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } endfor_fib_info(); + return ret; +} + +/* + The algorithm is suboptimal, but it provides really + fair weighted route distribution. + */ + +void fib_select_multipath(const struct rt_key *key, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + int w; + + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; +#if 1 + if (power <= 0) { + printk(KERN_CRIT "impossible 777\n"); + return; + } +#endif + } + + + /* w should be random number [0..fi->fib_power-1], + it is pretty bad approximation. 
+ */ + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + return; + } + } + } endfor_nexthops(fi); + +#if 1 + printk(KERN_CRIT "impossible 888\n"); +#endif + return; +} +#endif + + +#ifdef CONFIG_PROC_FS + +static unsigned fib_flag_trans(int type, int dead, u32 mask, struct fib_info *fi) +{ + static unsigned type2flags[RTN_MAX+1] = { + 0, 0, 0, 0, 0, 0, 0, RTF_REJECT, RTF_REJECT, 0, 0, 0 + }; + unsigned flags = type2flags[type]; + + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == 0xFFFFFFFF) + flags |= RTF_HOST; + if (!dead) + flags |= RTF_UP; + return flags; +} + +void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 mask, char *buffer) +{ + int len; + unsigned flags = fib_flag_trans(type, dead, mask, fi); + + if (fi) { + len = sprintf(buffer, "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", prefix, + fi->fib_nh->nh_gw, flags, 0, 0, 0, + mask, fi->fib_mtu, fi->fib_window, fi->fib_rtt); + } else { + len = sprintf(buffer, "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + prefix, 0, + flags, 0, 0, 0, + mask, 0, 0, 0); + } + memset(buffer+len, 0, 127-len); + buffer[127] = '\n'; +} + +#endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 667d2352c6e3..e66efde90748 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,6 +3,8 @@ * * Alan Cox, * + * Version: $Id: icmp.c,v 1.35 1997/10/19 18:17:13 freitag Exp $ + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -41,6 +43,10 @@ * Andi Kleen : Check all packet lengths properly * and moved all kfree_skb() up to * icmp_rcv. 
+ * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a token + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. * * RFC1122 (Host Requirements -- Comm. Layer) Status: * (boy, are there a lot of rules for ICMP) @@ -77,7 +83,7 @@ * [Solaris 2.X seems to assert EPROTO when this occurs] -- AC * 3.2.2.6 (Echo Request/Reply) * MUST reply to ECHO_REQUEST, and give app to do ECHO stuff (OK, OK) - * MAY discard broadcast ECHO_REQUESTs. (We don't, but that's OK.) + * MAY discard broadcast ECHO_REQUESTs. (Configurable with a sysctl.) * MUST reply using same source address as the request was sent to. * We're OK for unicast ECHOs, and it doesn't say anything about * how to handle broadcast ones, since it's optional. @@ -293,39 +299,9 @@ struct icmp_err icmp_err_convert[] = { { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */ }; -/* - * A spare long used to speed up statistics updating - */ - -unsigned long dummy; - -/* - * ICMP transmit rate limit control structures. We use a relatively simple - * approach to the problem: For each type of ICMP message with rate limit - * we count the number of messages sent during some time quantum. If this - * count exceeds given maximal value, we ignore all messages not separated - * from the last message sent at least by specified time.
- */ - -#define XRLIM_CACHE_SIZE 16 /* How many destination hosts do we cache */ - -struct icmp_xrl_cache /* One entry of the ICMP rate cache */ -{ - __u32 daddr; /* Destination address */ - unsigned long counter; /* Message counter */ - unsigned long next_reset; /* Time of next reset of the counter */ - unsigned long last_access; /* Time of last access to this entry (LRU) */ - unsigned int restricted; /* Set if we're in restricted mode */ - unsigned long next_packet; /* When we'll allow a next packet if restricted */ -}; - -struct icmp_xrlim -{ - unsigned long timeout; /* Time quantum for rate measuring */ - unsigned long limit; /* Maximal number of messages per time quantum allowed */ - unsigned long delay; /* How long we wait between packets when restricting */ - struct icmp_xrl_cache cache[XRLIM_CACHE_SIZE]; /* Rate cache */ -}; +/* Control parameters for ECHO replies. */ +int sysctl_icmp_echo_ignore_all = 0; +int sysctl_icmp_echo_ignore_broadcasts = 0; /* * ICMP control array. This specifies what to do with each ICMP. @@ -336,8 +312,8 @@ struct icmp_control unsigned long *output; /* Address to increment on output */ unsigned long *input; /* Address to increment on input */ void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len); - unsigned long error; /* This ICMP is classed as an error message */ - struct icmp_xrlim *xrlim; /* Transmit rate limit control structure or NULL for no limits */ + short error; /* This ICMP is classed as an error message */ + int *timeout; /* Rate limit */ }; static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; @@ -369,100 +345,47 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i; * Send an ICMP frame. */ - -/* - * Initialize the transmit rate limitation mechanism.
- */ - -#ifndef CONFIG_NO_ICMP_LIMIT - -__initfunc(static void xrlim_init(void)) -{ - int type, entry; - struct icmp_xrlim *xr; - - for (type=0; type<=NR_ICMP_TYPES; type++) { - xr = icmp_pointers[type].xrlim; - if (xr) { - for (entry=0; entrycache[entry].daddr = INADDR_NONE; - } - } -} - /* * Check transmit rate limitation for given message. + * The rate information is held in the destination cache now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits (we allow - * in the source) + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. */ - -static int xrlim_allow(int type, __u32 addr) +#define XRLIM_BURST_FACTOR 6 +int xrlim_allow(struct dst_entry *dst, int timeout) { - struct icmp_xrlim *r; - struct icmp_xrl_cache *c; unsigned long now; - if (type > NR_ICMP_TYPES) /* No time limit present */ - return 1; - r = icmp_pointers[type].xrlim; - if (!r) + now = jiffies; + dst->rate_tokens += now - dst->rate_last; + if (dst->rate_tokens > 6*timeout) + dst->rate_tokens = XRLIM_BURST_FACTOR*timeout; + if (dst->rate_tokens >= timeout) { + dst->rate_tokens -= timeout; return 1; + } + return 0; +} - for (c = r->cache; c < &r->cache[XRLIM_CACHE_SIZE]; c++) - /* Cache lookup */ - if (c->daddr == addr) - break; - - now = jiffies; /* Cache current time (saves accesses to volatile variable) */ +static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +{ + struct dst_entry *dst = &rt->u.dst; - if (c == &r->cache[XRLIM_CACHE_SIZE]) { /* Cache miss */ - unsigned long oldest = now; /* Find the oldest entry to replace */ - struct icmp_xrl_cache *d; - c = r->cache; - for (d = r->cache; d < &r->cache[XRLIM_CACHE_SIZE]; d++) - if (!d->daddr) { /* Unused entry */ - c = d; - break; - } else if (d->last_access < oldest) { - oldest = d->last_access; - c = d; 
- } - c->last_access = now; /* Fill the entry with new data */ - c->daddr = addr; - c->counter = 1; - c->next_reset = now + r->timeout; - c->restricted = 0; + if (type > NR_ICMP_TYPES || !icmp_pointers[type].timeout) return 1; - } - c->last_access = now; - if (c->next_reset > now) { /* Let's increment the counter */ - c->counter++; - if (c->counter == r->limit) { /* Limit exceeded, start restrictions */ - c->restricted = 1; - c->next_packet = now + r->delay; - return 0; - } - if (c->restricted) { /* Any restrictions pending? */ - if (c->next_packet > now) - return 0; - c->next_packet = now + r->delay; - return 1; - } - } else { /* Reset the counter */ - if (c->counter < r->limit) /* Switch off all restrictions */ - c->restricted = 0; - c->next_reset = now + r->timeout; - c->counter = 0; - } + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return 1; - return 1; /* Send the packet */ + return xrlim_allow(dst, *(icmp_pointers[type].timeout)); } -#endif /* CONFIG_NO_ICMP_LIMIT */ - /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ @@ -530,7 +453,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), NULL)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) return; ip_build_xmit(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+sizeof(struct icmphdr), @@ -578,7 +501,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) */ if (!rt) return; - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST)) + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) return; @@ -610,34 +533,30 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) } } - /* - * Check the rate limit - */ - -#ifndef CONFIG_NO_ICMP_LIMIT - if (!xrlim_allow(type, 
iph->saddr)) - return; -#endif /* * Construct source address and options. */ saddr = iph->daddr; - if (!(rt->rt_flags&RTF_LOCAL)) + if (!(rt->rt_flags&RTCF_LOCAL)) saddr = 0; tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), NULL)) + /* XXX: use a more aggressive expire for routes created by + * this call (not longer than the rate limit timeout). + * It could be also worthwhile to not put them into ipv4 + * fast routing cache at first. + */ + if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) return; - if (ip_options_echo(&icmp_param.replyopts, skb_in)) { - ip_rt_put(rt); - return; - } + if (ip_options_echo(&icmp_param.replyopts, skb_in)) + goto ende; + /* * Prepare data for ICMP header. @@ -655,10 +574,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) ipc.opt = &icmp_param.replyopts; if (icmp_param.replyopts.srr) { ip_rt_put(rt); - if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), NULL)) + if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0)) return; } + if (!icmpv4_xrlim_allow(rt, type, code)) + goto ende; + /* RFC says return as much as we can without exceeding 576 bytes. */ room = rt->u.dst.pmtu; @@ -674,6 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) icmp_param.data_len+sizeof(struct icmphdr), &ipc, rt, MSG_DONTWAIT); +ende: ip_rt_put(rt); } @@ -753,7 +676,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) * get the other vendor to fix their kit. 
*/ - if(__ip_chk_addr(iph->daddr)==IS_BROADCAST) + if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) printk("%s sent an invalid ICMP error to a broadcast.\n", @@ -770,12 +693,12 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) hash = iph->protocol & (MAX_INET_PROTOS - 1); if ((raw_sk = raw_v4_htable[hash]) != NULL) { - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr); + raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); while (raw_sk) { raw_err(raw_sk, skb); raw_sk = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, skb->dev->ifindex); } } @@ -797,7 +720,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) /* appropriate protocol layer (MUST), as per 3.2.2. */ if (iph->protocol == ipprot->protocol && ipprot->err_handler) - ipprot->err_handler(skb, dp); + ipprot->err_handler(skb, dp, len); ipprot = nextip; } @@ -850,18 +773,18 @@ static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, int len) * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring echo requests, MUST have default=NOT. * See also WRT handling of options once they are done and working. 
*/ - + static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, int len) { -#ifndef CONFIG_IP_IGNORE_ECHO_REQUESTS - struct icmp_bxm icmp_param; - - icmp_param.icmph=*icmph; - icmp_param.icmph.type=ICMP_ECHOREPLY; - icmp_param.data_ptr=(icmph+1); - icmp_param.data_len=len; - icmp_reply(&icmp_param, skb); -#endif + if (!sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_ECHOREPLY; + icmp_param.data_ptr=(icmph+1); + icmp_param.data_len=len; + icmp_reply(&icmp_param, skb); + } } /* @@ -928,32 +851,16 @@ static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len) * Gratuitous mask announcements suffer from the same problem. * RFC1812 explains it, but still allows to use ADDRMASK, * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK */ - + static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len) { - struct icmp_bxm icmp_param; - struct rtable *rt = (struct rtable*)skb->dst; - struct device *dev = skb->dev; - - if (!ipv4_config.addrmask_agent || - len < 4 || - ZERONET(rt->rt_src) || - rt->rt_src_dev != rt->u.dst.dev || - !(rt->rt_flags&RTCF_DIRECTSRC) || - (rt->rt_flags&RTF_GATEWAY) || - !(dev->ip_flags&IFF_IP_ADDR_OK) || - !(dev->ip_flags&IFF_IP_MASK_OK)) { - icmp_statistics.IcmpInErrors++; - return; - } - - icmp_param.icmph.type=ICMP_ADDRESSREPLY; - icmp_param.icmph.code=0; - icmp_param.icmph.un.echo = icmph->un.echo; - icmp_param.data_ptr=&dev->pa_mask; - icmp_param.data_len=4; - icmp_reply(&icmp_param, skb); + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. 
Who is it?\n"); } /* @@ -965,27 +872,29 @@ static void icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int l { struct rtable *rt = (struct rtable*)skb->dst; struct device *dev = skb->dev; + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; u32 mask; if (!ipv4_config.log_martians || + !IS_ROUTER || + !in_dev || !in_dev->ifa_list || len < 4 || - !(rt->rt_flags&RTCF_DIRECTSRC) || - (rt->rt_flags&RTF_GATEWAY) || - !(dev->ip_flags&IFF_IP_ADDR_OK) || - !(dev->ip_flags&IFF_IP_MASK_OK)) { - icmp_statistics.IcmpInErrors++; + !(rt->rt_flags&RTCF_DIRECTSRC)) return; - } mask = *(u32*)&icmph[1]; - if (mask != dev->pa_mask && net_ratelimit()) + for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) + return; + } + if (net_ratelimit()) printk(KERN_INFO "Wrong address mask %08lX from %08lX/%s\n", ntohl(mask), ntohl(rt->rt_src), dev->name); } static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len) { - return; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -1000,8 +909,8 @@ static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len) */ /* This should work with the new hashes now. 
-DaveM */ -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport); -extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport); +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); int icmp_chkaddr(struct sk_buff *skb) { @@ -1017,7 +926,7 @@ int icmp_chkaddr(struct sk_buff *skb) { struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); if (!sk) return 0; if (sk->saddr != iph->saddr) return 0; if (sk->daddr != iph->daddr) return 0; @@ -1031,9 +940,9 @@ int icmp_chkaddr(struct sk_buff *skb) { struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); - sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source); + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); if (!sk) return 0; - if (sk->saddr != iph->saddr && __ip_chk_addr(iph->saddr) != IS_MYADDR) + if (sk->saddr != iph->saddr && inet_addr_type(iph->saddr) != RTN_LOCAL) return 0; /* * This packet may have come from us. @@ -1067,46 +976,59 @@ int icmp_rcv(struct sk_buff *skb, unsigned short len) if(len < sizeof(struct icmphdr) || ip_compute_csum((unsigned char *) icmph, len) || icmph->type > NR_ICMP_TYPES) - { - icmp_statistics.IcmpInErrors++; - kfree_skb(skb, FREE_READ); - return 0; - } + goto error; /* * Parse the ICMP message */ - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST)) { + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { /* - * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently ignored (we don't as it is used - * by some network mapping tools). - * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded if to broadcast/multicast. 
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. */ + if (icmph->type == ICMP_ECHO && + sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) { - icmp_statistics.IcmpInErrors++; - kfree_skb(skb, FREE_READ); - return(0); + goto error; } } len -= sizeof(struct icmphdr); (*icmp_pointers[icmph->type].input)++; (icmp_pointers[icmph->type].handler)(icmph, skb, len); + +drop: kfree_skb(skb, FREE_READ); return 0; +error: icmp_statistics.IcmpInErrors++; goto drop; } /* - * This table defined limits of ICMP sending rate for various ICMP messages. + * A spare long used to speed up statistics updating */ + +static unsigned long dummy; -/* + * Configurable rate limits. + * Send at most one packet per time. + * Someone should check if these default values are correct. + */ +int sysctl_icmp_sourcequench_time = 1*HZ; +int sysctl_icmp_destunreach_time = 1*HZ; +int sysctl_icmp_timeexceed_time = 1*HZ; +int sysctl_icmp_paramprob_time = 1*HZ; +int sysctl_icmp_echoreply_time = 0; /* don't limit it by default. */ /* * This table is the definition of how we handle ICMP.
@@ -1114,38 +1036,38 @@ static struct icmp_xrlim static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = { /* ECHO REPLY (0) */ - { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, &sysctl_icmp_echoreply_time}, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* DEST UNREACH (3) */ - { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &xrl_unreach }, + { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &sysctl_icmp_destunreach_time }, /* SOURCE QUENCH (4) */ - { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, NULL }, + { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, &sysctl_icmp_sourcequench_time }, /* REDIRECT (5) */ - { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* ECHO (8) */ - { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, 
&icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* TIME EXCEEDED (11) */ - { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &xrl_generic }, + { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &sysctl_icmp_timeexceed_time }, /* PARAMETER PROBLEM (12) */ /* FIXME: RFC1122 3.2.2.5 - MUST pass PARAM_PROB messages to transport layer */ - { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &xrl_generic }, + { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &sysctl_icmp_paramprob_time }, /* TIMESTAMP (13) */ - { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, NULL }, + { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, }, /* TIMESTAMP REPLY (14) */ - { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, NULL }, + { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, }, /* INFO (15) */ - { &dummy, &dummy, icmp_discard, 0, NULL }, + { &dummy, &dummy, icmp_discard, 0, }, /* INFO REPLY (16) */ - { &dummy, &dummy, icmp_discard, 0, NULL }, + { &dummy, &dummy, icmp_discard, 0, }, /* ADDR MASK (17) */ - { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, NULL }, + { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, }, /* ADDR MASK REPLY (18) */ - { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, NULL } + { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, } }; __initfunc(void icmp_init(struct net_proto_family *ops)) @@ -1166,8 +1088,4 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) icmp_socket->sk->allocation=GFP_ATOMIC; icmp_socket->sk->num = 256; /* Don't receive any data */ 
icmp_socket->sk->ip_ttl = MAXTTL; -#ifndef CONFIG_NO_ICMP_LIMIT - xrlim_init(); -#endif } - diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fbc5403fcfc5..1c59f54629a8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,6 +8,8 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * + * Version: $Id: igmp.c,v 1.22 1997/10/29 20:27:24 kuznet Exp $ + * * Authors: * Alan Cox * @@ -65,9 +67,11 @@ * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. */ +#include #include #include #include @@ -79,141 +83,52 @@ #include #include #include +#include +#include +#include #include +#include #include #include #include -#include #include -#include #include +#ifdef CONFIG_IP_MROUTE +#include +#endif -int sysctl_igmp_max_host_report_delay = IGMP_MAX_HOST_REPORT_DELAY; -int sysctl_igmp_timer_scale = IGMP_TIMER_SCALE; -int sysctl_igmp_age_threshold = IGMP_AGE_THRESHOLD; - -/* - * If time expired, change the router type to IGMP_NEW_ROUTER. 
- */ - -static void ip_router_timer_expire(unsigned long data) -{ - struct ip_router_info *i=(struct ip_router_info *)data; - - del_timer(&i->timer); - i->type=IGMP_NEW_ROUTER; /* Revert to new multicast router */ - i->time=0; -} - -/* - * Multicast router info manager - */ -struct ip_router_info *ip_router_info_head=(struct ip_router_info *)0; +#ifdef CONFIG_IP_MULTICAST -/* - * Get the multicast router info on that device - */ +/* Parameter names and values are taken from igmp-v2-06 draft */ -static struct ip_router_info *igmp_get_mrouter_info(struct device *dev) -{ - register struct ip_router_info *i; +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 - for(i=ip_router_info_head;i!=NULL;i=i->next) - { - if (i->dev == dev) - { - return i; - } - } - /* - * Not found. Create a new entry. The default is IGMP V2 router - */ - - i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); - if(i==NULL) - return NULL; - i->dev = dev; - i->type = IGMP_NEW_ROUTER; - i->time = sysctl_igmp_age_threshold; - i->next = ip_router_info_head; - ip_router_info_head = i; - - init_timer(&i->timer); - i->timer.data=(unsigned long)i; - i->timer.function=&ip_router_timer_expire; - - return i; -} +#define IGMP_Initial_Report_Delay (1*HZ) -/* - * Set the multicast router info on that device +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. 
*/ -static struct ip_router_info *igmp_set_mrouter_info(struct device *dev,int type,int time) -{ - register struct ip_router_info *i; - - for(i=ip_router_info_head;i!=NULL;i=i->next) - { - if (i->dev == dev) - { - if(i->type==IGMP_OLD_ROUTER) - { - del_timer(&i->timer); - } - - i->type = type; - i->time = time; - - if(i->type==IGMP_OLD_ROUTER) - { - i->timer.expires=jiffies+i->time*HZ; - add_timer(&i->timer); - } - return i; - } - } - - /* - * Not found. Create a new entry. - */ - i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); - if(i==NULL) - return NULL; - i->dev = dev; - i->type = type; - i->time = time; - i->next = ip_router_info_head; - ip_router_info_head = i; - - init_timer(&i->timer); - i->timer.data=(unsigned long)i; - i->timer.function=&ip_router_timer_expire; - if(i->type==IGMP_OLD_ROUTER) - { - i->timer.expires=jiffies+i->time*HZ; - add_timer(&i->timer); - } - - return i; -} - +#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && jiffies - (in_dev)->mr_v1_seen < 0) /* * Timer management */ -static void igmp_stop_timer(struct ip_mc_list *im) +static __inline__ void igmp_stop_timer(struct ip_mc_list *im) { - if (im->tm_running) - { - del_timer(&im->timer); - im->tm_running=0; - } - else - printk(KERN_DEBUG "igmp_stop_timer() called with timer not running by %p\n",__builtin_return_address(0)); + if (im->tm_running) { + del_timer(&im->timer); + im->tm_running=0; + } } extern __inline__ unsigned int random(void) @@ -223,17 +138,13 @@ extern __inline__ unsigned int random(void) return seed^jiffies; } -/* - * Inlined as it's only called once. 
- */ - -static void igmp_start_timer(struct ip_mc_list *im,unsigned char max_resp_time) +static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv; - if(im->tm_running) + if (im->tm_running) return; - tv=random()%(max_resp_time*HZ/sysctl_igmp_timer_scale); /* Pick a number any number 8) */ - im->timer.expires=jiffies+tv; + tv=random() % max_delay; + im->timer.expires=jiffies+tv+2; im->tm_running=1; add_timer(&im->timer); } @@ -244,20 +155,32 @@ static void igmp_start_timer(struct ip_mc_list *im,unsigned char max_resp_time) #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) -static void igmp_send_report(struct device *dev, u32 group, int type) +static int igmp_send_report(struct device *dev, u32 group, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; + u32 dst; - if (ip_route_output(&rt, group, 0, 0, dev)) - return; + /* According to IGMPv2 specs, LEAVE messages are + * sent to all-routers group. + */ + dst = group; + if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + + if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) + return -1; + if (rt->rt_src == 0) { + ip_rt_put(rt); + return -1; + } skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); - return; + return -1; } skb->dst = &rt->u.dst; @@ -272,7 +195,7 @@ static void igmp_send_report(struct device *dev, u32 group, int type) iph->tos = 0; iph->frag_off = 0; iph->ttl = 1; - iph->daddr = group; + iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; iph->tot_len = htons(IGMP_SIZE); @@ -290,115 +213,140 @@ static void igmp_send_report(struct device *dev, u32 group, int type) ih->group=group; ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - skb->dst->output(skb); + return skb->dst->output(skb); } static void igmp_timer_expire(unsigned long data) { struct ip_mc_list *im=(struct ip_mc_list *)data; - struct ip_router_info *r; + struct in_device 
*in_dev = im->interface; + int err; im->tm_running=0; - r=igmp_get_mrouter_info(im->interface); - if(r==NULL) - return; - if(r->type==IGMP_NEW_ROUTER) - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); + + if (IGMP_V1_SEEN(in_dev)) + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); else - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); - im->reporter = 1; -} + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); -static void igmp_init_timer(struct ip_mc_list *im) -{ - im->tm_running=0; - init_timer(&im->timer); - im->timer.data=(unsigned long)im; - im->timer.function=&igmp_timer_expire; -} + /* Failed. Retry later. */ + if (err) { + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + return; + } + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; +} -static void igmp_heard_report(struct device *dev, u32 group, u32 source) +static void igmp_heard_report(struct in_device *in_dev, u32 group) { struct ip_mc_list *im; /* Timers are only set for non-local groups */ + if (LOCAL_MCAST(group)) return; - for (im=dev->ip_mc_list; im!=NULL; im=im->next) { + for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { - if (im->tm_running) - igmp_stop_timer(im); - if (source != dev->pa_addr) - im->reporter = 0; + igmp_stop_timer(im); + im->reporter = 0; + im->unsolicit_count = 0; return; } } } -static void igmp_heard_query(struct device *dev, unsigned char max_resp_time, +static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, u32 group) { - struct ip_mc_list *im; - int mrouter_type; + struct ip_mc_list *im; + int max_delay; - /* - * The max_resp_time is in units of 1/10 second. 
- */ - if(max_resp_time>0) { - mrouter_type=IGMP_NEW_ROUTER; + max_delay = max_resp_time*(HZ/IGMP_TIMER_SCALE); - if (igmp_set_mrouter_info(dev,mrouter_type,0)==NULL) - return; - /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. those that belong - * to a "local" group (224.0.0.X) - * - For timers already running check if they need to - * be reset. - * - Use the igmp->igmp_code field as the maximum - * delay possible - */ - for(im=dev->ip_mc_list;im!=NULL;im=im->next) { - if (group && group != im->multiaddr) - continue; - if(im->tm_running) { - if(im->timer.expires>jiffies+max_resp_time*HZ/sysctl_igmp_timer_scale) { - igmp_stop_timer(im); - igmp_start_timer(im,max_resp_time); - } - } else if (!LOCAL_MCAST(im->multiaddr)) - igmp_start_timer(im,max_resp_time); - } - } else { - mrouter_type=IGMP_OLD_ROUTER; - max_resp_time=sysctl_igmp_max_host_report_delay*sysctl_igmp_timer_scale; + if (max_resp_time == 0) { + /* Alas, old v1 router presents here. */ - if(igmp_set_mrouter_info(dev,mrouter_type,sysctl_igmp_age_threshold)==NULL) - return; + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + IGMP_V1_Router_Present_Timeout; + group = 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. 
+ * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (group && group != im->multiaddr) + continue; + if (LOCAL_MCAST(im->multiaddr)) + continue; + im->unsolicit_count = 0; + if (im->tm_running && im->timer.expires-jiffies > max_delay) + igmp_stop_timer(im); + igmp_start_timer(im, max_delay); + } +} - /* - * Start the timers in all of our membership records for - * the interface on which the query arrived, except those - * that are already running and those that belong to a - * "local" group (224.0.0.X). - */ +int igmp_rcv(struct sk_buff *skb, unsigned short len) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih = skb->h.igmph; + struct in_device *in_dev = skb->dev->ip_ptr; - for(im=dev->ip_mc_list;im!=NULL;im=im->next) { - if(!im->tm_running && !LOCAL_MCAST(im->multiaddr)) - igmp_start_timer(im,max_resp_time); - } + if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len) + || in_dev==NULL) { + kfree_skb(skb, FREE_READ); + return 0; + } + + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, ih->code, ih->group); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMP_HOST_NEW_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (((struct rtable*)skb->dst)->key.iif == 0) + break; + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + return pim_rcv_v1(skb, len); +#endif + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); } + kfree_skb(skb, FREE_READ); + return 0; } +#endif + /* * Map a multicast IP onto multicast MAC for type ethernet. 
*/ -extern __inline__ void ip_mc_map(unsigned long addr, char *buf) +extern __inline__ void ip_mc_map(u32 addr, char *buf) { addr=ntohl(addr); buf[0]=0x01; @@ -415,15 +363,16 @@ extern __inline__ void ip_mc_map(unsigned long addr, char *buf) * Add a filter to a device */ -void ip_mc_filter_add(struct device *dev, unsigned long addr) +static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) { char buf[6]; - ip_rt_multicast_event(dev); - if(!(dev->flags & IFF_MULTICAST)) + struct device *dev = in_dev->dev; + + if (!(dev->flags & IFF_MULTICAST)) return; - if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) return; /* Only do ethernet or FDDI for now */ - ip_mc_map(addr,buf); + ip_mc_map(addr, buf); dev_mc_add(dev,buf,ETH_ALEN,0); } @@ -431,70 +380,49 @@ void ip_mc_filter_add(struct device *dev, unsigned long addr) * Remove a filter from a device */ -void ip_mc_filter_del(struct device *dev, unsigned long addr) +static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) { char buf[6]; - ip_rt_multicast_event(dev); - if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + struct device *dev = in_dev->dev; + + if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) return; /* Only do ethernet or FDDI for now */ ip_mc_map(addr,buf); dev_mc_delete(dev,buf,ETH_ALEN,0); } -extern __inline__ void igmp_group_dropped(struct ip_mc_list *im) +static void igmp_group_dropped(struct ip_mc_list *im) { - del_timer(&im->timer); - if (im->reporter) - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); ip_mc_filter_del(im->interface, im->multiaddr); -} -extern __inline__ void igmp_group_added(struct ip_mc_list *im) -{ - struct ip_router_info *r; - igmp_init_timer(im); - ip_mc_filter_add(im->interface, im->multiaddr); - r=igmp_get_mrouter_info(im->interface); - if(r==NULL) +#ifdef CONFIG_IP_MULTICAST + if (LOCAL_MCAST(im->multiaddr)) return; - if(r->type==IGMP_NEW_ROUTER) - 
igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); - else - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); + + start_bh_atomic(); + igmp_stop_timer(im); + end_bh_atomic(); + + if (im->reporter && !IGMP_V1_SEEN(im->interface)) + igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); +#endif } -int igmp_rcv(struct sk_buff *skb, unsigned short len) +static void igmp_group_added(struct ip_mc_list *im) { - /* This basically follows the spec line by line -- see RFC1112 */ - struct igmphdr *ih = skb->h.igmph; + ip_mc_filter_add(im->interface, im->multiaddr); - if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len)) { - kfree_skb(skb, FREE_READ); - return 0; - } - - switch (ih->type) { - case IGMP_HOST_MEMBERSHIP_QUERY: - igmp_heard_query(skb->dev, ih->code, ih->group); - break; - case IGMP_HOST_MEMBERSHIP_REPORT: - case IGMP_HOST_NEW_MEMBERSHIP_REPORT: - igmp_heard_report(skb->dev, ih->group, skb->nh.iph->saddr); - break; - case IGMP_DVMRP: - case IGMP_PIM: - case IGMP_TRACE: - case IGMP_HOST_LEAVE_MESSAGE: - case IGMP_MTRACE: - case IGMP_MTRACE_RESP: - break; - default: - NETDEBUG(printk(KERN_DEBUG "Unknown IGMP type=%d\n", ih->type)); - } - kfree_skb(skb, FREE_READ); - return 0; +#ifdef CONFIG_IP_MULTICAST + if (LOCAL_MCAST(im->multiaddr)) + return; + + start_bh_atomic(); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + end_bh_atomic(); +#endif } + /* * Multicast list managers */ @@ -504,143 +432,210 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len) * A socket has joined a multicast group on device dev. 
*/ -static void ip_mc_inc_group(struct device *dev, unsigned long addr) +void ip_mc_inc_group(struct in_device *in_dev, u32 addr) { - struct ip_mc_list *i; - for(i=dev->ip_mc_list;i!=NULL;i=i->next) - { - if(i->multiaddr==addr) - { + struct ip_mc_list *i, *im; + + im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + + for (i=in_dev->mc_list; i; i=i->next) { + if (i->multiaddr == addr) { i->users++; + if (im) + kfree(im); return; } } - i=(struct ip_mc_list *)kmalloc(sizeof(*i), GFP_KERNEL); - if(!i) + if (!im) return; - i->users=1; - i->interface=dev; - i->multiaddr=addr; - i->next=dev->ip_mc_list; - igmp_group_added(i); - dev->ip_mc_list=i; + im->users=1; + im->interface=in_dev; + im->multiaddr=addr; +#ifdef CONFIG_IP_MULTICAST + im->tm_running=0; + init_timer(&im->timer); + im->timer.data=(unsigned long)im; + im->timer.function=&igmp_timer_expire; + im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->reporter = 0; +#endif + im->next=in_dev->mc_list; + in_dev->mc_list=im; + if (in_dev->dev->flags & IFF_UP) { + igmp_group_added(im); + ip_rt_multicast_event(in_dev); + } + return; } /* * A socket has left a multicast group on device dev */ -static void ip_mc_dec_group(struct device *dev, unsigned long addr) +int ip_mc_dec_group(struct in_device *in_dev, u32 addr) { - struct ip_mc_list **i; - for(i=&(dev->ip_mc_list);(*i)!=NULL;i=&(*i)->next) - { - if((*i)->multiaddr==addr) - { - if(--((*i)->users) == 0) - { - struct ip_mc_list *tmp= *i; - igmp_group_dropped(tmp); - *i=(*i)->next; - kfree_s(tmp,sizeof(*tmp)); + struct ip_mc_list *i, **ip; + + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { + if (i->multiaddr==addr) { + if (--i->users == 0) { + *ip = i->next; + if (in_dev->dev->flags & IFF_UP) { + igmp_group_dropped(i); + ip_rt_multicast_event(in_dev); + } + kfree_s(i, sizeof(*i)); } - return; + return 0; } } + return -ESRCH; } -/* - * Device going down: Clean up. 
- */ +/* Device going down */ -void ip_mc_drop_device(struct device *dev) +void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *i; - struct ip_mc_list *j; - start_bh_atomic(); - for(i=dev->ip_mc_list;i!=NULL;i=j) - { - j=i->next; - if(i->tm_running) - del_timer(&i->timer); - kfree_s(i,sizeof(*i)); - } - dev->ip_mc_list=NULL; - end_bh_atomic(); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_dropped(i); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_added(i); } /* - * Device going up. Make sure it is in all hosts + * Device is about to be destroyed: clean up. */ -void ip_mc_allhost(struct device *dev) +void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; - for(i=dev->ip_mc_list;i!=NULL;i=i->next) - if(i->multiaddr==IGMP_ALL_HOSTS) - return; - i=(struct ip_mc_list *)kmalloc(sizeof(*i), GFP_KERNEL); - if(!i) - return; - i->users=1; - i->interface=dev; - i->multiaddr=IGMP_ALL_HOSTS; - i->tm_running=0; - i->next=dev->ip_mc_list; - dev->ip_mc_list=i; - ip_mc_filter_add(i->interface, i->multiaddr); + + while ((i = in_dev->mc_list) != NULL) { + in_dev->mc_list = i->next; + kfree_s(i, sizeof(*i)); + } +} + +/* Initialize multicasting on an IP interface */ + +void ip_mc_init_dev(struct in_device *in_dev) +{ + in_dev->mc_list = NULL; + in_dev->mr_v1_seen = 0; + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); +} + +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +{ + struct rtable *rt; + struct device *dev = NULL; + + if (imr->imr_address.s_addr) { + dev = ip_dev_find(imr->imr_address.s_addr); + if (!dev) + return NULL; + } + + if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) { + dev = rt->u.dst.dev; + ip_rt_put(rt); + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + return dev->ip_ptr; + } + return NULL; } /* * Join a socket to a group */ -int ip_mc_join_group(struct sock *sk , struct device *dev, 
unsigned long addr) +int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) { - int unused= -1; - int i; - if(!MULTICAST(addr)) + int err; + u32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml, *i; + struct in_device *in_dev; + int count = 0; + + if (!MULTICAST(addr)) return -EINVAL; - if(sk->ip_mc_list==NULL) - { - if((sk->ip_mc_list=(struct ip_mc_socklist *)kmalloc(sizeof(*sk->ip_mc_list), GFP_KERNEL))==NULL) - return -ENOMEM; - memset(sk->ip_mc_list,'\0',sizeof(*sk->ip_mc_list)); - } - for(i=0;iip_mc_list->multiaddr[i]==addr && sk->ip_mc_list->multidev[i]==dev) - return -EADDRINUSE; - if(sk->ip_mc_list->multidev[i]==NULL) - unused=i; + + rtnl_shlock(); + + if (!imr->imr_ifindex) + in_dev = ip_mc_find_dev(imr); + else + in_dev = inetdev_by_index(imr->imr_ifindex); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; } - if(unused==-1) - return -ENOBUFS; - sk->ip_mc_list->multiaddr[unused]=addr; - sk->ip_mc_list->multidev[unused]=dev; - ip_mc_inc_group(dev,addr); - return 0; + iml = (struct ip_mc_socklist *)kmalloc(sizeof(*iml), GFP_KERNEL); + + err = -EADDRINUSE; + for (i=sk->ip_mc_list; i; i=i->next) { + if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { + /* New style additions are reference counted */ + if (imr->imr_address.s_addr == 0) { + i->count++; + err = 0; + } + goto done; + } + count++; + } + err = -ENOBUFS; + if (iml == NULL || count >= IP_MAX_MEMBERSHIPS) + goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next = sk->ip_mc_list; + iml->count = 1; + sk->ip_mc_list = iml; + ip_mc_inc_group(in_dev, addr); + iml = NULL; + err = 0; +done: + rtnl_shunlock(); + if (iml) + kfree(iml); + return err; } /* * Ask a socket to leave a group. 
*/ -int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { - int i; - if(!MULTICAST(addr)) - return -EINVAL; - if(sk->ip_mc_list==NULL) - return -EADDRNOTAVAIL; - - for(i=0;iip_mc_list->multiaddr[i]==addr && sk->ip_mc_list->multidev[i]==dev) - { - sk->ip_mc_list->multidev[i]=NULL; - ip_mc_dec_group(dev,addr); + struct ip_mc_socklist *iml, **imlp; + + for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { + if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && + iml->multi.imr_address.s_addr==imr->imr_address.s_addr && + (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { + struct in_device *in_dev; + if (--iml->count) + return 0; + *imlp = iml->next; + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + if (in_dev) + ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); + kfree_s(iml, sizeof(*iml)); return 0; } } @@ -653,69 +648,63 @@ int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) void ip_mc_drop_socket(struct sock *sk) { - int i; - - if(sk->ip_mc_list==NULL) - return; - - for(i=0;iip_mc_list->multidev[i]) - { - ip_mc_dec_group(sk->ip_mc_list->multidev[i], sk->ip_mc_list->multiaddr[i]); - sk->ip_mc_list->multidev[i]=NULL; - } + struct ip_mc_socklist *iml; + + while ((iml=sk->ip_mc_list) != NULL) { + struct in_device *in_dev; + sk->ip_mc_list = iml->next; + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + kfree_s(iml, sizeof(*iml)); } - kfree_s(sk->ip_mc_list,sizeof(*sk->ip_mc_list)); - sk->ip_mc_list=NULL; } -/* - * Write an multicast group list table for the IGMP daemon to - * read. 
- */ +#ifdef CONFIG_IP_MULTICAST int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) { off_t pos=0, begin=0; struct ip_mc_list *im; - unsigned long flags; int len=0; struct device *dev; - len=sprintf(buffer,"Device : Count\tGroup Users Timer\tReporter\n"); - save_flags(flags); - cli(); + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); for(dev = dev_base; dev; dev = dev->next) { - if(dev->flags&IFF_UP) - { - len+=sprintf(buffer+len,"%-10s: %5d\n", - dev->name, dev->mc_count); - for(im = dev->ip_mc_list; im; im = im->next) - { - len+=sprintf(buffer+len, - "\t\t\t%08lX %5d %d:%08lX\t%d\n", - im->multiaddr, im->users, - im->tm_running, im->timer.expires-jiffies, im->reporter); - pos=begin+len; - if(posoffset+length) - break; - } - } + struct in_device *in_dev = dev->ip_ptr; + char *querier = "NONE"; + + if (in_dev == NULL) + continue; + + querier = IGMP_V1_SEEN(in_dev) ? "V1" : "V2"; + + len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", + dev->ifindex, dev->name, dev->mc_count, querier); + + for (im = in_dev->mc_list; im; im = im->next) { + len+=sprintf(buffer+len, + "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->timer.expires-jiffies, im->reporter); + + pos=begin+len; + if(posoffset+length) + break; + } } - restore_flags(flags); *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) - len=length; + len=length; return len; } +#endif diff --git a/net/ipv4/ip_alias.c b/net/ipv4/ip_alias.c deleted file mode 100644 index a78eef17a95d..000000000000 --- a/net/ipv4/ip_alias.c +++ /dev/null @@ -1,170 +0,0 @@ -/* - * IP_ALIAS (AF_INET) aliasing module. - * - * - * Version: @(#)ip_alias.c 0.43 12/20/95 - * - * Author: Juan Jose Ciarlante, - * - * Fixes: - * JJC : ip_alias_dev_select method. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef ALIAS_USER_LAND_DEBUG -#include "net_alias.h" -#include "ip_alias.h" -#include "user_stubs.h" -#endif - -#include -#include - -/* - * AF_INET alias init - */ - -static int ip_alias_init_1(struct net_alias_type *this, struct net_alias *alias, struct sockaddr *sa) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("alias_init(%s) called.\n", alias->name); -#endif - MOD_INC_USE_COUNT; - return 0; -} - -/* - * AF_INET alias done - */ - -static int ip_alias_done_1(struct net_alias_type *this, struct net_alias *alias) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("alias_done(%s) called.\n", alias->name); -#endif - MOD_DEC_USE_COUNT; - return 0; -} - -/* - * Print alias address info - */ - -int ip_alias_print_1(struct net_alias_type *this, struct net_alias *alias, char *buf, int len) -{ - char *p; - - p = (char *) &alias->dev.pa_addr; - return sprintf(buf, "%d.%d.%d.%d", - (p[0] & 255), (p[1] & 255), (p[2] & 255), (p[3] & 255)); -} - -struct device *ip_alias_dev_select(struct net_alias_type *this, struct device *main_dev, struct sockaddr *sa) -{ - __u32 addr; -#if 0 - struct rtable *rt; -#endif - struct device *dev=NULL; - - /* - * Defensive... - */ - - if (main_dev == NULL) - return NULL; - - /* - * Get u32 address. - */ - - addr = (sa)? (*(struct sockaddr_in *)sa).sin_addr.s_addr : 0; - if (addr == 0) - return NULL; - - /* - * Find 'closest' device to address given. any other suggestions? ... 
- * net_alias module will check if returned device is main_dev's alias - */ - -#if 0 - rt = ip_rt_route(addr, 0); - if(rt) - { - dev=rt->rt_dev; - ip_rt_put(rt); - } -#endif - return dev; -} - -/* - * net_alias AF_INET type defn. - */ - -struct net_alias_type ip_alias_type = -{ - AF_INET, /* type */ - 0, /* n_attach */ - "ip", /* name */ - NULL, /* get_addr32() */ - NULL, /* dev_addr_chk() */ - ip_alias_dev_select, /* dev_select() */ - ip_alias_init_1, /* alias_init_1() */ - ip_alias_done_1, /* alias_done_1() */ - ip_alias_print_1, /* alias_print_1() */ - NULL /* next */ -}; - -/* - * ip_alias module initialization - */ - -__initfunc(int ip_alias_init(void)) -{ - return register_net_alias_type(&ip_alias_type, AF_INET); -} - -/* - * ip_alias module done - */ - -int ip_alias_done(void) -{ - return unregister_net_alias_type(&ip_alias_type); -} - -#ifdef MODULE - -int init_module(void) -{ - if (ip_alias_init() != 0) - return -EIO; - return 0; -} - -void cleanup_module(void) -{ - if (ip_alias_done() != 0) - printk(KERN_INFO "ip_alias: can't remove module"); -} - -#endif /* MODULE */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0726f3bb44c8..8f48894a4d20 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,6 +5,8 @@ * * The IP forwarding functionality. * + * Version: $Id: ip_forward.c,v 1.32 1997/10/24 17:16:06 kuznet Exp $ + * * Authors: see ip.c * * Fixes: @@ -76,10 +78,13 @@ int ip_forward(struct sk_buff *skb) int fw_res = 0; #endif - if (skb->pkt_type != PACKET_HOST) { - kfree_skb(skb,FREE_WRITE); - return 0; + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; } + + if (skb->pkt_type != PACKET_HOST) + goto drop; /* * According to the RFC, we must first decrease the TTL field. 
If @@ -90,27 +95,25 @@ int ip_forward(struct sk_buff *skb) iph = skb->nh.iph; rt = (struct rtable*)skb->dst; +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 1 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + if (((xtime.tv_usec&0xF)< 0x1C) + goto drop; + } +#endif + + #ifdef CONFIG_TRANSPARENT_PROXY if (ip_chk_sock(skb)) - return ip_local_deliver(skb); + goto local_pkt; #endif - if (ip_decrease_ttl(iph) <= 0) { - /* Tell the sender its packet died... */ - icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); - kfree_skb(skb, FREE_WRITE); - return -1; - } - - if (opt->is_strictroute && (rt->rt_flags&RTF_GATEWAY)) { - /* - * Strict routing permits no gatewaying - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); - kfree_skb(skb, FREE_WRITE); - return -1; - } + if (ip_decrease_ttl(iph) <= 0) + goto too_many_hops; + if (opt->is_strictroute && (rt->rt_flags&RTF_GATEWAY)) + goto sr_failed; /* * Having picked a route we can now send the frame out @@ -139,19 +142,23 @@ int ip_forward(struct sk_buff *skb) */ if (dev2->flags & IFF_UP) { - if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) { - ip_statistics.IpFragFails++; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - kfree_skb(skb, FREE_WRITE); - return -1; - } + if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) + goto frag_needed; - if (rt->rt_flags&RTCF_NAT) { +#ifdef CONFIG_IP_ROUTE_NAT + if (rt->rt_flags & RTCF_NAT) { + if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { + struct sk_buff *skb2; + skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); + kfree_skb(skb, FREE_WRITE); + skb = skb2; + } if (ip_do_nat(skb)) { kfree_skb(skb, FREE_WRITE); return -1; } } +#endif #ifdef CONFIG_IP_MASQUERADE if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) { @@ -168,7 +175,7 @@ int ip_forward(struct sk_buff *skb) * and skip the firewall checks */ if (iph->protocol == IPPROTO_ICMP) { - if ((fw_res = ip_fw_masq_icmp(&skb, 
dev2)) < 0) { + if ((fw_res = ip_fw_masq_icmp(&skb)) < 0) { kfree_skb(skb, FREE_READ); return -1; } @@ -179,7 +186,8 @@ int ip_forward(struct sk_buff *skb) } if (rt->rt_flags&RTCF_MASQ) goto skip_call_fw_firewall; -#endif +#endif /* CONFIG_IP_MASQUERADE */ + #ifdef CONFIG_FIREWALL fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL, &skb); switch (fw_res) { @@ -205,7 +213,16 @@ skip_call_fw_firewall: */ if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) && (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) { - if (ip_fw_masquerade(&skb, dev2) < 0) { + u32 maddr; + +#ifdef CONFIG_IP_ROUTE_NAT + maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0; + + if (maddr == 0) +#endif + maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); + + if (ip_fw_masquerade(&skb, maddr) < 0) { kfree_skb(skb, FREE_READ); return -1; } @@ -238,10 +255,36 @@ skip_call_fw_firewall: ip_statistics.IpForwDatagrams++; - if (opt->optlen) - ip_forward_options(skb); - + if (opt->optlen == 0) { + ip_send(skb); + return 0; + } + ip_forward_options(skb); ip_send(skb); } return 0; + +#ifdef CONFIG_TRANSPARENT_PROXY +local_pkt: +#endif + return ip_local_deliver(skb); + +frag_needed: + ip_statistics.IpFragFails++; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + goto drop; + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb,FREE_WRITE); + return -1; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 5edcb4a9c8ec..637fe022e974 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.26 1997/09/04 22:35:00 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.29 1997/11/22 12:31:05 freitag Exp $ * * Authors: Fred N. 
van Kempen * Alan Cox @@ -130,7 +130,7 @@ static struct ipfrag *ip_frag_create(int offset, int end, /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and return the queue entry address if found. */ -static inline struct ipq *ip_find(struct iphdr *iph) +static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) { __u16 id = iph->id; __u32 saddr = iph->saddr; @@ -314,7 +314,8 @@ static struct sk_buff *ip_glue(struct ipq *qp) len = qp->ihlen + qp->len; if(len>65535) { - printk(KERN_INFO "Oversized IP packet from %d.%d.%d.%d.\n", NIPQUAD(qp->iph->saddr)); + if (net_ratelimit()) + printk(KERN_INFO "Oversized IP packet from %d.%d.%d.%d.\n", NIPQUAD(qp->iph->saddr)); ip_statistics.IpReasmFails++; ip_free(qp); return NULL; @@ -322,7 +323,7 @@ static struct sk_buff *ip_glue(struct ipq *qp) if ((skb = dev_alloc_skb(len)) == NULL) { ip_statistics.IpReasmFails++; - NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", qp)); + NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", qp)); ip_free(qp); return NULL; } @@ -390,7 +391,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ip_evictor(); /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */ - qp = ip_find(iph); + qp = ip_find(iph, skb->dst); /* Is this a non-fragmented datagram? */ offset = ntohs(iph->frag_off); @@ -435,7 +436,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) /* Attempt to construct an oversize packet. 
*/ if(ntohs(iph->tot_len)+(int)offset>65535) { - printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", NIPQUAD(iph->saddr)); + if (net_ratelimit()) + printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", NIPQUAD(iph->saddr)); frag_kfree_skb(skb, FREE_READ); ip_statistics.IpReasmFails++; return NULL; diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index fa5917957901..9f8123afd181 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -6,6 +6,8 @@ * license in recognition of the original copyright. * -- Alan Cox. * + * $Id: ip_fw.c,v 1.29 1997/10/10 22:41:01 davem Exp $ + * * Ported from BSD to Linux, * Alan Cox 22/Nov/1994. * Zeroing /proc and other additions @@ -104,7 +106,7 @@ #include #include #include -#include +#include #include #include #include @@ -165,6 +167,10 @@ static int *policies[] = #endif +#ifdef CONFIG_IP_FIREWALL_NETLINK +struct sock *ipfwsk; +#endif + /* * Returns 1 if the port is matched by the vector, 0 otherwise */ @@ -375,15 +381,6 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_ if (!match) continue; - /* - * Look for a VIA address match - */ - if(f->fw_via.s_addr && rif) - { - if(rif->pa_addr!=f->fw_via.s_addr) - continue; /* Mismatch */ - } - /* * Look for a VIA device match */ @@ -651,6 +648,11 @@ static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; + } else if (ftmp->fw_via.s_addr) { + if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) + ftmp->fw_viadev = (struct device *) -1; + else + memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -695,6 +697,11 @@ static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; + } else if (ftmp->fw_via.s_addr) { 
+ if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) + ftmp->fw_viadev = (struct device *) -1; + else + memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -955,12 +962,6 @@ int ip_fw_ctl(int stage, void *m, int len) if ( !(viadev = dev_get(ipfwp->fwp_vianame)) ) { #ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: invalid device \"%s\"\n", ipfwp->fwp_vianame); -#endif - return(EINVAL); - } else if ( viadev->pa_addr != ipfwp->fwp_via.s_addr ) { -#ifdef DEBUG_IP_FIREWALL - printk("ip_fw_ctl: device \"%s\" has another IP address\n", - ipfwp->fwp_vianame); #endif return(EINVAL); } else if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) { @@ -1066,6 +1067,7 @@ int ip_fw_ctl(int stage, void *m, int len) } #endif /* CONFIG_IP_FIREWALL */ +#ifdef CONFIG_PROC_FS #if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) static int ip_chain_procinfo(int stage, char *buffer, char **start, @@ -1120,9 +1122,9 @@ static int ip_chain_procinfo(int stage, char *buffer, char **start, ntohl(i->fw_dst.s_addr),ntohl(i->fw_dmsk.s_addr), (i->fw_vianame)[0] ? 
i->fw_vianame : "-", ntohl(i->fw_via.s_addr),i->fw_flg); - /* 9 is enough for a 32 bit box but the counters are 64bit on + /* 10 is enough for a 32 bit box but the counters are 64bit on the Alpha and Ultrapenguin */ - len+=sprintf(buffer+len,"%u %u %-19lu %-19lu", + len+=sprintf(buffer+len,"%u %u %-20lu %-20lu", i->fw_nsp,i->fw_ndp, i->fw_pcnt,i->fw_bcnt); for (p = 0; p < IP_FW_MAX_PORTS; p++) len+=sprintf(buffer+len, " %u", i->fw_pts[p]); @@ -1192,6 +1194,7 @@ static int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset, reset); } #endif +#endif #ifdef CONFIG_IP_FIREWALL @@ -1323,8 +1326,7 @@ __initfunc(void ip_fw_init(void)) /* Register for device up/down reports */ register_netdevice_notifier(&ipfw_dev_notifier); #endif - #ifdef CONFIG_IP_FIREWALL_NETLINK - netlink_attach(NETLINK_FIREWALL, netlink_donothing); /* XXX */ -#endif /* CONFIG_IP_FIREWALL_NETLINK */ + ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL); +#endif } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c new file mode 100644 index 000000000000..dbd62e27eeca --- /dev/null +++ b/net/ipv4/ip_gre.c @@ -0,0 +1,1191 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPV6 +#include +#include +#include +#endif + +/* + Problems & solutions + -------------------- + + 1. The most important issue is detecting local dead loops. 
+ They would cause complete host lockup in transmit, which + would be "resolved" by stack overflow or, if queueing is enabled, + with infinite looping in net_bh. + + We cannot track such dead loops during route installation, + it is infeasible task. The most general solutions would be + to keep skb->encapsulation counter (sort of local ttl), + and silently drop packet when it expires. It is the best + solution, but it supposes maintaing new variable in ALL + skb, even if no tunneling is used. + + Current solution: t->recursion lock breaks dead loops. It looks + like dev->tbusy flag, but I preferred new variable, because + the semantics is different. One day, when hard_start_xmit + will be multithreaded we will have to use skb->encapsulation. + + + + 2. Networking dead loops would not kill routers, but would really + kill network. IP hop limit plays role of "t->recursion" in this case, + if we copy it from packet being encapsulated to upper header. + It is very good solution, but it introduces two problems: + + - Routing protocols, using packets with ttl=1 (OSPF, RIP2), + do not work over tunnels. + - traceroute does not work. I planned to relay ICMP from tunnel, + so that this problem would be solved and traceroute output + would even more informative. This idea appeared to be wrong: + only Linux complies to rfc1812 now (yes, guys, Linux is the only + true router now :-)), all routers (at least, in neighbourhood of mine) + return only 8 bytes of payload. It is the end. + + Hence, if we want that OSPF worked or traceroute said something reasonable, + we should search for another solution. + + One of them is to parse packet trying to detect inner encapsulation + made by our node. It is difficult or even impossible, especially, + taking into account fragmentation. TO be short, tt is not solution at all. + + Current solution: The solution was UNEXPECTEDLY SIMPLE. + We force DF flag on tunnels with preconfigured hop limit, + that is ALL. 
:-) Well, it does not remove the problem completely, + but exponential growth of network traffic is changed to linear + (branches, that exceed pmtu are pruned) and tunnel mtu + quickly degrades to value <68, where looping stops. + Yes, it is not good if there exists a router in the loop, + which does not force DF, even when encapsulating packets have DF set. + But it is not our problem! Nobody could accuse us, we made + all that we could make. Even if it is your gated who injected + fatal route to network, even if it were you who configured + fatal static route: you are innocent. :-) + + + + 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain + practically identical code. It would be good to glue them + together, but it is not very evident, how to make them modular. + sit is integral part of IPv6, ipip and gre are naturally modular. + We could extract common parts (hash table, ioctl etc) + to a separate module (ip_tunnel.c). + + Alexey Kuznetsov. + */ + +static int ipgre_tunnel_init(struct device *dev); + +/* Fallback tunnel: no source, no destination, no key, no options */ + +static int ipgre_fb_tunnel_init(struct device *dev); + +static struct device ipgre_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init, +}; + +static struct ip_tunnel ipgre_fb_tunnel = { + NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", } +}; + +/* Tunnel hash table */ + +/* + 4 hash tables: + + 3: (remote,local) + 2: (remote,*) + 1: (*,local) + 0: (*,*) + + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keyless packets, if not matched by configured keyless tunnels + will match fallback tunnel. 
+ */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static struct ip_tunnel *tunnels[4][HASH_SIZE]; + +#define tunnels_r_l (tunnels[3]) +#define tunnels_r (tunnels[2]) +#define tunnels_l (tunnels[1]) +#define tunnels_wc (tunnels[0]) + +/* Given src, dst and key, find the appropriate input tunnel. */ + +static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(key); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_r[h0^h1]; t; t = t->next) { + if (remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr || + (local == t->parms.iph.daddr && MULTICAST(local))) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_wc[h1]; t; t = t->next) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + if (ipgre_fb_tunnel_dev.flags&IFF_UP) + return &ipgre_fb_tunnel; + return NULL; +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + u32 key = parms->i_key; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (key == t->parms.i_key) + return t; + } + } + if (!create) + return NULL; + + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + 
 memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipgre_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "gre%d", i); + if (dev_get(dev->name) == NULL) + break; + } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); + } + if (register_netdevice(dev) < 0) + goto failed; + + start_bh_atomic(); + nt->next = t; + *tp = nt; + end_bh_atomic(); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; +} + +static void ipgre_tunnel_destroy(struct device *dev) +{ + struct ip_tunnel *t, **tp; + struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv; + u32 remote = t0->parms.iph.daddr; + u32 local = t0->parms.iph.saddr; + unsigned h = HASH(t0->parms.i_key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (t == t0) { + *tp = t->next; + if (dev != &ipgre_fb_tunnel_dev) { + kfree(dev); + MOD_DEC_USE_COUNT; + } + break; + } + } +} + + +void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft state for keyed + GRE tunnels with enabled checksum. Tell them "thank you". + + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standards established + by themselves??? 
+ */ + + struct iphdr *iph = (struct iphdr*)dp; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int grehlen = (iph->ihl<<2) + 4; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + u16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (len < grehlen) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? 
*(((u32*)p) + (grehlen>>2) - 1) : 0); + if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + return; + + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + struct iphdr *eiph; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + u16 flags; + int grehlen = (iph->ihl<<2) + 4; + struct sk_buff *skb2; + struct rtable *rt; + + if (p[1] != __constant_htons(ETH_P_IP)) + return; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_CSUM) + grehlen += 4; + if (flags&GRE_KEY) + grehlen += 4; + if (flags&GRE_SEQ) + grehlen += 4; + } + if (len < grehlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + grehlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < (iph->ihl<<2)) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - grehlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necesary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < grehlen+68) + return; + rel_info -= grehlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2, FREE_WRITE); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2, FREE_WRITE); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > skb2->dst->pmtu) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dst->pmtu = rel_info; + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2, FREE_WRITE); +#endif +} + +int ipgre_rcv(struct sk_buff *skb, unsigned short len) +{ + struct iphdr *iph = skb->nh.iph; + u8 *h = skb->h.raw; + u16 flags = *(u16*)h; + u16 csum = 0; + u32 key = 0; + u32 seqno = 0; + struct ip_tunnel *tunnel; + int offset = 4; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. 
+ */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop; + + if (flags&GRE_CSUM) { + csum = ip_compute_csum(h, len); + offset += 4; + } + if (flags&GRE_KEY) { + key = *(u32*)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(u32*)(h + offset)); + offset += 4; + } + } + + if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb_pull(skb, h + offset - skb->data); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->ip_summed = 0; + skb->protocol = *(u16*)(h + 2); + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + /* Looped back packet, drop it! */ + if (((struct rtable*)skb->dst)->key.iif == 0) + goto drop; + tunnel->stat.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->stat.rx_crc_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && seqno - tunnel->i_seqno < 0)) { + tunnel->stat.rx_fifo_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return(0); + } + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + +drop: + kfree_skb(skb, FREE_READ); + return(0); +} + +static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *tiph; + u8 tos; + u16 df; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + 
int gre_hlen; + u32 dst; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (dev->hard_header) { + gre_hlen = 0; + tiph = (struct iphdr*)skb->data; + } else { + gre_hlen = tunnel->hlen; + tiph = &tunnel->parms.iph; + } + + if ((dst = tiph->daddr) == 0) { + /* NBMA tunnel */ + + if (skb->dst == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + rt = (struct rtable*)skb->dst; + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + int addr_type; + struct nd_neigh *neigh = (struct nd_neigh *) skb->dst->neighbour; + + if (neigh == NULL) + goto tx_error; + + addr6 = &neigh->ndn_addr; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } +#endif + else + goto tx_error; + } + + tos = tiph->tos; + if (tos&1) { + if (skb->protocol == __constant_htons(ETH_P_IP)) + tos = old_iph->tos; + tos &= ~1; + } + + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + df = tiph->frag_off; + mtu = rt->u.dst.pmtu - tunnel->hlen; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == 
__constant_htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + + if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= 576) { + if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + skb->dst->pmtu = mtu; + } + } + + if (mtu >= 576 && mtu < skb->len - tunnel->hlen + gre_hlen) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + + if (skb->protocol == __constant_htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); +#endif + } else + tunnel->err_count = 0; + } + + skb->h.raw = skb->nh.raw; + + max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen; + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; + } + dev_kfree_skb(skb, FREE_WRITE); + skb = new_skb; + } + + skb->nh.raw = skb_push(skb, gre_hlen); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_GRE; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) { + if (skb->protocol == __constant_htons(ETH_P_IP)) + iph->ttl = old_iph->ttl; +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) + iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; +#endif + else + iph->ttl = ip_statistics.IpDefaultTTL; + } + + ((u16*)(iph+1))[0] = tunnel->parms.o_flags; + ((u16*)(iph+1))[1] = skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + } + } + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + stats->tx_bytes += skb->len; + stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; + return 0; + +tx_error_icmp: + if (skb->protocol == __constant_htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); +#endif + +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; +} + +static int +ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipgre_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } 
+ t = ipgre_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + if (dev == &ipgre_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipgre_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipgre_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +#ifdef CONFIG_NET_IPGRE_BROADCAST +/* Nice toy. 
Unfortunately, useless in real life :-) + It allows to construct virtual multiprotocol broadcast "LAN" + over the Internet, provided multicast routing is tuned. + + + I have no idea was this bicycle invented before me, + so that I had to set ARPHRD_IPGRE to a random value. + I have an impression, that Cisco could make something similar, + but this feature is apparently missing in IOS<=11.2(8). + + I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks + with broadcast 224.66.66.66. If you have access to mbone, play with me :-) + + ping -t 255 224.66.66.66 + + If nobody answers, mbone does not work. + + ip tunnel add Universe mode gre remote 224.66.66.66 local ttl 255 + ip addr add 10.66.66./24 dev Universe + ifconfig Universe up + ifconfig Universe add fe80::/10 + ifconfig Universe add fec0:6666:6666::/96 + ftp 10.66.66.66 + ... + ftp fec0:6666:6666::193.233.7.65 + ... + + */ + +static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); + u16 *p = (u16*)(iph+1); + + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. 
+ */ + + if (saddr) + memcpy(&iph->saddr, saddr, 4); + + if (daddr) { + memcpy(&iph->daddr, daddr, 4); + return t->hlen; + } + if (iph->daddr && !MULTICAST(iph->daddr)) + return t->hlen; + + return -t->hlen; +} + +static int ipgre_rebuild_header(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + struct iphdr *iph = (struct iphdr *)skb->data; + u16 *p = (u16*)(iph + 1); + struct neighbour *neigh = NULL; + + if (skb->dst) + neigh = skb->dst->neighbour; + + if (neigh) + return neigh->ops->resolve((void*)&iph->daddr, skb); + + if (p[1] == __constant_htons(ETH_P_IP)) + return arp_find((void*)&iph->daddr, skb); + + if (net_ratelimit()) + printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n", + dev->name, (int)p[1]); + return 0; +} + +static int ipgre_open(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + MOD_INC_USE_COUNT; + if (MULTICAST(t->parms.iph.daddr)) { + struct rtable *rt; + if (ip_route_output(&rt, t->parms.iph.daddr, + t->parms.iph.saddr, RT_TOS(t->parms.iph.tos), + t->parms.link)) { + MOD_DEC_USE_COUNT; + return -EADDRNOTAVAIL; + } + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (dev->ip_ptr == NULL) { + MOD_DEC_USE_COUNT; + return -EADDRNOTAVAIL; + } + t->mlink = dev->ifindex; + ip_mc_inc_group(dev->ip_ptr, t->parms.iph.daddr); + } + return 0; +} + +static int ipgre_close(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + if (MULTICAST(t->parms.iph.daddr) && t->mlink) { + dev = dev_get_by_index(t->mlink); + if (dev && dev->ip_ptr) + ip_mc_dec_group(dev->ip_ptr, t->parms.iph.daddr); + } + MOD_DEC_USE_COUNT; + return 0; +} + +#endif + +static void ipgre_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipgre_tunnel_destroy; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->get_stats = ipgre_tunnel_get_stats; + dev->do_ioctl = ipgre_tunnel_ioctl; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev_init_buffers(dev); + + 
dev->type = ARPHRD_IPGRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = 1500 - sizeof(struct iphdr) - 4; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipgre_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = 1500; + int addend = sizeof(struct iphdr) + 4; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipgre_tunnel_init_gen(dev); + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + + dev->flags |= IFF_POINTOPOINT; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + if (!iph->saddr) + return -EINVAL; + dev->flags = IFF_BROADCAST; + dev->hard_header = ipgre_header; + dev->rebuild_header = ipgre_rebuild_header; + dev->open = ipgre_open; + dev->stop = ipgre_close; + } +#endif + } + + if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + return 0; +} + +#ifdef MODULE +static int ipgre_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipgre_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int 
ipgre_fb_tunnel_init(struct device *dev)) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct iphdr *iph; + + ipgre_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipgre_fb_tunnel_open; + dev->stop = ipgre_fb_tunnel_close; +#endif + + iph = &ipgre_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_GRE; + iph->ihl = 5; + tunnel->hlen = sizeof(struct iphdr) + 4; + + tunnels_wc[0] = &ipgre_fb_tunnel; + return 0; +} + + +static struct inet_protocol ipgre_protocol = { + ipgre_rcv, /* GRE handler */ + ipgre_err, /* TUNNEL error control */ + 0, /* next */ + IPPROTO_GRE, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "GRE" /* name */ +}; + + +/* + * And now the modules code and kernel interface. + */ + +#ifdef MODULE +int init_module(void) +#else +__initfunc(int ipgre_init(void)) +#endif +{ + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel; + ipgre_fb_tunnel_dev.name = ipgre_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipgre_fb_tunnel_dev); +#else + register_netdevice(&ipgre_fb_tunnel_dev); +#endif + + inet_add_protocol(&ipgre_protocol); + return 0; +} + +#ifdef MODULE + +void cleanup_module(void) +{ + if ( inet_del_protocol(&ipgre_protocol) < 0 ) + printk(KERN_INFO "ipgre close: can't remove protocol\n"); + + unregister_netdev(&ipgre_fb_tunnel_dev); +} + +#endif diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2642832e3d28..1c3c2da7a7bc 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: @(#)ip.c 1.0.16b 9/1/93 + * Version: $Id: ip_input.c,v 1.24 1997/10/24 17:15:58 kuznet Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -153,8 +153,7 @@ #endif #include #include -#include -#include +#include #include /* @@ -184,13 +183,55 @@ int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) #define CONFIG_IP_ALWAYS_DEFRAG 1 #endif +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + type = skb->h.icmph->type; + if (type < 32) + return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); + + /* Do not block unknown ICMP types */ + return 0; +} + +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = skb->nh.iph->protocol; + struct sock *last = NULL; + + for (ra = ip_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && sk->num == protocol) { + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (skb == NULL) + return 1; + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + return 1; + } + return 0; +} int ip_local_deliver(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; -#ifdef CONFIG_IP_MASQUERADE - struct device *dev = skb->dev; -#endif struct inet_protocol *ipprot; struct sock *raw_sk=NULL; unsigned char hash; @@ -214,7 +255,7 @@ int ip_local_deliver(struct sk_buff *skb) * Do we need to de-masquerade this packet? 
*/ { - int ret = ip_fw_demasquerade(&skb, dev); + int ret = ip_fw_demasquerade(&skb); if (ret < 0) { kfree_skb(skb, FREE_WRITE); return 0; @@ -256,22 +297,23 @@ int ip_local_deliver(struct sk_buff *skb) if((raw_sk = raw_v4_htable[hash]) != NULL) { struct sock *sknext = NULL; struct sk_buff *skb1; - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr); + raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); if(raw_sk) { /* Any raw sockets */ do { /* Find the next */ sknext = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr); - if(sknext) + iph->saddr, iph->daddr, skb->dev->ifindex); + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) { + if (sknext == NULL) + break; skb1 = skb_clone(skb, GFP_ATOMIC); - else - break; /* One pending raw socket left */ - if(skb1) - { - if(ipsec_sk_policy(raw_sk,skb1)) - raw_rcv(raw_sk, skb1); - else - kfree_skb(skb1, FREE_WRITE); + if(skb1) + { + if(ipsec_sk_policy(raw_sk,skb1)) + raw_rcv(raw_sk, skb1); + else + kfree_skb(skb1, FREE_WRITE); + } } raw_sk = sknext; } while(raw_sk!=NULL); @@ -350,15 +392,6 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct ip_options * opt = NULL; int err; -#ifdef CONFIG_NET_IPV6 - /* - * Intercept IPv6 frames. We dump ST-II and invalid types just below.. - */ - - if(iph->version == 6) - return ipv6_rcv(skb,dev,pt); -#endif - /* * When interface is in promisc. mode, drop all the crap * that it receives, do not truing to analyse it. @@ -398,13 +431,18 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). 
*/ - - skb_trim(skb, ntohs(iph->tot_len)); + __skb_trim(skb, ntohs(iph->tot_len)); if (skb->dst == NULL) { err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev); if (err) goto drop; +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + goto drop; + } +#endif } #ifdef CONFIG_IP_ALWAYS_DEFRAG @@ -425,12 +463,12 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) opt = &(IPCB(skb)->opt); if (opt->srr) { if (!ipv4_config.source_route) { - if (ipv4_config.log_martians) + if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_INFO "source route option %08lx -> %08lx\n", ntohl(iph->saddr), ntohl(iph->daddr)); goto drop; } - if (RT_LOCALADDR(((struct rtable*)skb->dst)->rt_flags) && + if (((struct rtable*)skb->dst)->rt_type == RTN_LOCAL && ip_options_rcv_srr(skb)) goto drop; } diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index 2d2fd37171f1..8c300e1556f1 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -339,7 +339,7 @@ static void masq_expire(unsigned long data) * given boundaries MASQ_BEGIN and MASQ_END. 
*/ -struct ip_masq * ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) +struct ip_masq * ip_masq_new(__u32 maddr, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) { struct ip_masq *ms, *mst; int ports_tried, *free_ports_p; @@ -377,7 +377,7 @@ struct ip_masq * ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 s ms->flags |= IP_MASQ_F_NO_DADDR; /* get masq address from rif */ - ms->maddr = dev->pa_addr; + ms->maddr = maddr; for (ports_tried = 0; ports_tried < *free_ports_p; ports_tried++){ save_flags(flags); @@ -449,7 +449,7 @@ static void recalc_check(struct udphdr *uh, __u32 saddr, uh->check=0xFFFF; } -int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) +int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) { struct sk_buff *skb=*skb_ptr; struct iphdr *iph = skb->nh.iph; @@ -489,7 +489,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) if (ms==NULL) { - ms = ip_masq_new(dev, iph->protocol, + ms = ip_masq_new(maddr, iph->protocol, iph->saddr, portptr[0], iph->daddr, portptr[1], 0); @@ -512,7 +512,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) * Attempt ip_masq_app call. * will fix ip_masq and iph seq stuff */ - if (ip_masq_app_pkt_out(ms, skb_ptr, dev) != 0) + if (ip_masq_app_pkt_out(ms, skb_ptr, maddr) != 0) { /* * skb has possibly changed, update pointers. 
@@ -572,7 +572,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) ip_send_check(iph); #ifdef DEBUG_CONFIG_IP_MASQUERADE - printk("O-routed from %lX:%X over %s\n",ntohl(ms->maddr),ntohs(ms->mport),dev->name); + printk("O-routed from %lX:%X via %lX\n",ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); #endif return 0; @@ -586,7 +586,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) * Currently handles error types - unreachable, quench, ttl exceeded */ -int ip_fw_masq_icmp(struct sk_buff **skb_p, struct device *dev) +int ip_fw_masq_icmp(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -685,7 +685,7 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, struct device *dev) * Currently handles error types - unreachable, quench, ttl exceeded */ -int ip_fw_demasq_icmp(struct sk_buff **skb_p, struct device *dev) +int ip_fw_demasq_icmp(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -778,7 +778,7 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p, struct device *dev) * this function. */ -int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) +int ip_fw_demasquerade(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -789,7 +789,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) switch (iph->protocol) { case IPPROTO_ICMP: - return(ip_fw_demasq_icmp(skb_p, dev)); + return(ip_fw_demasq_icmp(skb_p)); case IPPROTO_TCP: case IPPROTO_UDP: /* Make sure packet is in the masq range */ @@ -869,7 +869,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) * will fix ip_masq and iph ack_seq stuff */ - if (ip_masq_app_pkt_in(ms, skb_p, dev) != 0) + if (ip_masq_app_pkt_in(ms, skb_p) != 0) { /* * skb has changed, update pointers. 
@@ -937,6 +937,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) return 0; } +#ifdef CONFIG_PROC_FS /* * /proc/net entry */ @@ -999,7 +1000,6 @@ done: return len; } -#ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_ipmsqhst = { PROC_NET_IPMSQHST, 13, "ip_masquerade", S_IFREG | S_IRUGO, 1, 0, 0, diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c index 9862add850db..dcf777c143d2 100644 --- a/net/ipv4/ip_masq_app.c +++ b/net/ipv4/ip_masq_app.c @@ -306,7 +306,7 @@ static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *m * returns (new - old) skb->len diff. */ -int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct ip_masq_app * mapp; struct iphdr *iph; @@ -351,7 +351,7 @@ int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct devic if ( mapp->pkt_out == NULL ) return 0; - diff = mapp->pkt_out(mapp, ms, skb_p, dev); + diff = mapp->pkt_out(mapp, ms, skb_p, maddr); /* * Update ip_masq seq stuff if len has changed. @@ -369,7 +369,7 @@ int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct devic * returns (new - old) skb->len diff. */ -int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p) { struct ip_masq_app * mapp; struct iphdr *iph; @@ -414,7 +414,7 @@ int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, struct device if ( mapp->pkt_in == NULL ) return 0; - diff = mapp->pkt_in(mapp, ms, skb_p, dev); + diff = mapp->pkt_in(mapp, ms, skb_p); /* * Update ip_masq seq stuff if len has changed. 
diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c index 4d5568d0a466..4cb88d925fd3 100644 --- a/net/ipv4/ip_masq_ftp.c +++ b/net/ipv4/ip_masq_ftp.c @@ -50,7 +50,7 @@ masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -118,7 +118,7 @@ masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb ip_masq_set_expire(n_ms,0); } else { - n_ms = ip_masq_new(dev, IPPROTO_TCP, + n_ms = ip_masq_new(maddr, IPPROTO_TCP, htonl(from), htons(port), iph->daddr, 0, IP_MASQ_F_NO_DPORT); diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c index a1be56f818bd..b2e325ce6bc9 100644 --- a/net/ipv4/ip_masq_irc.c +++ b/net/ipv4/ip_masq_irc.c @@ -51,7 +51,7 @@ masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -167,7 +167,7 @@ masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb * connection is requested by another client. 
*/ - n_ms = ip_masq_new(dev, IPPROTO_TCP, + n_ms = ip_masq_new(maddr, IPPROTO_TCP, htonl(s_addr),htons(s_port), 0, 0, IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c index 08a062bc7f52..482096f2b9fd 100644 --- a/net/ipv4/ip_masq_quake.c +++ b/net/ipv4/ip_masq_quake.c @@ -73,7 +73,7 @@ masq_quake_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p) { struct sk_buff *skb; struct iphdr *iph; @@ -158,7 +158,7 @@ masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **sk } int -masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -234,7 +234,7 @@ masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **s memcpy(&udp_port, data, 2); - n_ms = ip_masq_new(dev, IPPROTO_UDP, + n_ms = ip_masq_new(maddr, IPPROTO_UDP, ms->saddr, htons(udp_port), ms->daddr, ms->dport, 0); diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index 52f439102b60..26b5cd4da314 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -2,7 +2,7 @@ * IP_MASQ_RAUDIO - Real Audio masquerading module * * - * Version: @(#)$Id: ip_masq_raudio.c,v 1.6 1997/04/29 09:38:26 mj Exp $ + * Version: @(#)$Id: ip_masq_raudio.c,v 1.7 1997/09/16 18:43:40 kuznet Exp $ * * Author: Nigel Metheringham * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne] @@ -88,7 +88,7 @@ masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_raudio_out (struct ip_masq_app 
*mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -154,7 +154,7 @@ masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff ** if (ntohs(msg_id) == 1) { /* This is a message detailing the UDP port to be used */ memcpy(&udp_port, p, 2); - n_ms = ip_masq_new(dev, IPPROTO_UDP, + n_ms = ip_masq_new(maddr, IPPROTO_UDP, ms->saddr, udp_port, ms->daddr, 0, IP_MASQ_F_NO_DPORT); diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c index 1d510af42337..06e9be8fb2fc 100644 --- a/net/ipv4/ip_nat_dumb.c +++ b/net/ipv4/ip_nat_dumb.c @@ -5,6 +5,8 @@ * * Dumb Network Address Translation. * + * Version: $Id: ip_nat_dumb.c,v 1.2 1997/10/10 22:41:05 davem Exp $ + * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 80baf83642d6..14b423f2fff2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,6 +5,8 @@ * * The options processing module for ip.c * + * Version: $Id: ip_options.c,v 1.12 1997/10/10 22:41:08 davem Exp $ + * * Authors: A.N.Kuznetsov * */ @@ -15,10 +17,10 @@ #include #include #include +#include #include #include #include -#include /* * Write options to IP header, record destination address to @@ -32,7 +34,7 @@ */ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, - u32 daddr, u32 saddr, int is_frag) + u32 daddr, struct rtable *rt, int is_frag) { unsigned char * iph = skb->nh.raw; @@ -46,9 +48,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, if (!is_frag) { if (opt->rr_needaddr) - memcpy(iph+opt->rr+iph[opt->rr+2]-5, &saddr, 4); + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); if (opt->ts_needaddr) - memcpy(iph+opt->ts+iph[opt->ts+2]-9, &saddr, 4); + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); if (opt->ts_needtime) { struct timeval tv; __u32 midtime; @@ -147,7 +149,7 @@ int ip_options_echo(struct ip_options * 
dopt, struct sk_buff * skb) if (((struct timestamp*)(dptr+1))->flags == IPOPT_TS_PRESPEC) { __u32 addr; memcpy(&addr, sptr+soffset-9, 4); - if (__ip_chk_addr(addr) == 0) { + if (inet_addr_type(addr) == RTN_UNICAST) { dopt->ts_needtime = 0; dopt->ts_needaddr = 0; soffset -= 8; @@ -248,6 +250,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) unsigned char * optptr; int optlen; unsigned char * pp_ptr = NULL; + struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; if (!opt) { opt = &(IPCB(skb)->opt); @@ -328,7 +331,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) goto error; } if (skb) { - memcpy(&optptr[optptr[2]-1], &skb->dev->pa_addr, 4); + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; @@ -371,7 +374,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) } opt->ts = optptr - iph; if (skb) { - memcpy(&optptr[ts->ptr-1], &skb->dev->pa_addr, 4); + memcpy(&optptr[ts->ptr-1], &rt->rt_spec_dst, 4); timeptr = (__u32*)&optptr[ts->ptr+3]; } opt->ts_needaddr = 1; @@ -387,7 +390,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) { u32 addr; memcpy(&addr, &optptr[ts->ptr-1], 4); - if (__ip_chk_addr(addr) == 0) + if (inet_addr_type(addr) == RTN_UNICAST) break; if (skb) timeptr = (__u32*)&optptr[ts->ptr+3]; @@ -521,7 +524,7 @@ void ip_forward_options(struct sk_buff *skb) if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; - memcpy(&optptr[optptr[2]-5], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[optptr[2]-5], rt); opt->is_changed = 1; } if (opt->srr_is_hit) { @@ -540,20 +543,20 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - memcpy(&optptr[srrptr-1], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[srrptr-1], rt); skb->nh.iph->daddr = rt->rt_dst; optptr[2] = srrptr+4; } else printk(KERN_CRIT "ip_forward(): Argh! 
Destination lost!\n"); if (opt->ts_needaddr) { optptr = raw + opt->ts; - memcpy(&optptr[optptr[2]-9], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[optptr[2]-9], rt); opt->is_changed = 1; } - if (opt->is_changed) { - opt->is_changed = 0; - ip_send_check(skb->nh.iph); - } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(skb->nh.iph); } } @@ -571,16 +574,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (!opt->srr) return 0; - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST|RTF_NAT) - || skb->pkt_type != PACKET_HOST) + if (skb->pkt_type != PACKET_HOST) return -EINVAL; - - if (!(rt->rt_flags & RTF_LOCAL)) { + if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, 16); return -EINVAL; } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { @@ -591,16 +594,15 @@ int ip_options_rcv_srr(struct sk_buff *skb) rt = (struct rtable*)skb->dst; skb->dst = NULL; - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, - net_alias_main_dev(skb->dev)); + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = (struct rtable*)skb->dst; - if (err || rt2->rt_flags&(RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) { + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); skb->dst = &rt->u.dst; return -EINVAL; } ip_rt_put(rt); - if (!(rt2->rt_flags&RTF_LOCAL)) + if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ memcpy(&iph->daddr, &optptr[srrptr-1], 4); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4f070ed0b297..106236c939ce 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: @(#)ip.c 1.0.16b 9/1/93 + * Version: $Id: ip_output.c,v 1.40 1997/10/12 17:01:48 kuznet Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -67,7 +67,7 @@ #include #include #include -#include +#include #include static void __inline__ ip_ll_header_reserve(struct sk_buff *skb) @@ -92,7 +92,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, daddr = opt->faddr; err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), NULL); + (sk->localroute||0), sk->bound_dev_if); if (err) { ip_statistics.IpOutNoRoutes++; @@ -130,7 +130,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTF_NOPMTUDISC)) + !(rt->rt_flags & RTCF_NOPMTUDISC)) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -143,8 +143,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, { iph->ihl += opt->optlen>>2; skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, - rt->u.dst.dev->pa_addr, 0); + ip_options_build(skb, opt, final_daddr, rt, 0); } ip_rt_put(rt); @@ -170,9 +169,10 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) rt = (struct rtable*)sk->dst_cache; if (!rt || rt->u.dst.obsolete) { + sk->dst_cache = NULL; ip_rt_put(rt); err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), NULL); + (sk->localroute||0), sk->bound_dev_if); if (err) return err; sk->dst_cache = &rt->u.dst; @@ -210,7 +210,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTF_NOPMTUDISC)) + !(rt->rt_flags & RTCF_NOPMTUDISC)) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -223,7 +223,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) return 0; iph->ihl += opt->optlen>>2; skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt->u.dst.dev->pa_addr, 0); + ip_options_build(skb, opt, 
final_daddr, rt, 0); return 0; } @@ -242,17 +242,35 @@ int ip_mc_output(struct sk_buff *skb) #ifdef CONFIG_IP_ACCT ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); #endif - +#ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags & RTCF_NAT) ip_do_nat(skb); +#endif /* * Multicasts are looped back for other local users */ - - if (rt->rt_flags&RTF_MULTICAST && !(dev->flags&IFF_LOOPBACK)) { - if (sk==NULL || sk->ip_mc_loop) - dev_loopback_xmit(skb); + + if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) { +#ifndef CONFIG_IP_MROUTE +#if 1 + /* It should never occur. Delete it eventually. --ANK */ + if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) + printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n"); + else +#endif +#else + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. + */ + if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) +#endif + dev_loopback_xmit(skb); /* Multicasts with ttl 0 must not go beyond the host */ @@ -262,9 +280,15 @@ int ip_mc_output(struct sk_buff *skb) } } - if ((rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST)) == (RTF_LOCAL|RTF_BROADCAST) && - !(dev->flags&IFF_LOOPBACK)) + if (rt->rt_flags&RTCF_BROADCAST) { +#if 1 + /* It should never occur. Delete it eventually. 
--ANK */ + if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) + printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n"); + else +#endif dev_loopback_xmit(skb); + } if (dev->flags & IFF_UP) { dev_queue_xmit(skb); @@ -291,8 +315,10 @@ int ip_output(struct sk_buff *skb) ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); #endif +#ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags&RTCF_NAT) ip_do_nat(skb); +#endif if (dev->flags & IFF_UP) { dev_queue_xmit(skb); @@ -431,8 +457,7 @@ check_route: */ { struct rtable *nrt; - if (ip_route_output(&nrt, rt->key.dst, rt->key.src, - rt->key.tos, NULL)) { + if (ip_route_output(&nrt, rt->key.dst, rt->key.src, rt->key.tos, sk?sk->bound_dev_if:0)) { kfree_skb(skb, 0); return; } @@ -500,14 +525,13 @@ int ip_build_xmit(struct sock *sk, int hh_len = rt->u.dst.dev->hard_header_len; int nfrags=0; struct ip_options *opt = ipc->opt; - struct device *dev = rt->u.dst.dev; int df = htons(IP_DF); #ifdef CONFIG_NET_SECURITY int fw_res; #endif if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - rt->rt_flags&RTF_NOPMTUDISC) + rt->rt_flags&RTCF_NOPMTUDISC) df = 0; @@ -546,7 +570,7 @@ int ip_build_xmit(struct sock *sk, iph->id=htons(ip_id_count++); iph->frag_off = df; iph->ttl=sk->ip_mc_ttl; - if (!(rt->rt_flags&RTF_MULTICAST)) + if (rt->rt_type != RTN_MULTICAST) iph->ttl=sk->ip_ttl; iph->protocol=sk->protocol; iph->saddr=rt->rt_src; @@ -695,14 +719,14 @@ int ip_build_xmit(struct sock *sk, if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, - ipc->addr, dev->pa_addr, offset); + ipc->addr, rt, offset); } iph->tos = sk->ip_tos; iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); iph->id = id; iph->frag_off = htons(offset>>3); iph->frag_off |= mf|df; - if (rt->rt_flags&RTF_MULTICAST) + if (rt->rt_type == RTN_MULTICAST) iph->ttl = sk->ip_mc_ttl; else iph->ttl = sk->ip_ttl; @@ -966,7 +990,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) if (ipc.opt->srr) daddr = replyopts.opt.faddr; - if 
(ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), NULL)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) return NULL; iphlen = sizeof(struct iphdr) + replyopts.opt.optlen; @@ -1000,7 +1024,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) iph->saddr = rt->rt_src; iph->protocol = skb->nh.iph->protocol; - ip_options_build(reply, &replyopts.opt, daddr, rt->u.dst.dev->pa_addr, 0); + ip_options_build(reply, &replyopts.opt, daddr, rt, 0); return reply; } @@ -1019,43 +1043,16 @@ static struct packet_type ip_packet_type = }; -/* - * Device notifier - */ - -static int ip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct device *dev=ptr; - - if (dev->family != AF_INET) - return NOTIFY_DONE; - - if(event==NETDEV_UP) - { - /* - * Join the initial group if multicast. - */ - ip_mc_allhost(dev); - } - if(event==NETDEV_DOWN) - ip_mc_drop_device(dev); - - return ip_rt_event(event, dev); -} - -struct notifier_block ip_netdev_notifier={ - ip_netdev_event, - NULL, - 0 -}; #ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST static struct proc_dir_entry proc_net_igmp = { PROC_NET_IGMP, 4, "igmp", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, ip_mc_procinfo }; +#endif #endif /* @@ -1068,11 +1065,10 @@ __initfunc(void ip_init(void)) ip_rt_init(); - /* So we flush routes and multicast lists when a device is downed */ - register_netdevice_notifier(&ip_netdev_notifier); - #ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST proc_net_register(&proc_net_igmp); +#endif #endif } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 366ce9fb93f7..080452dd3689 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,6 +5,8 @@ * * The IP to API glue. 
* + * Version: $Id: ip_sockglue.c,v 1.28 1997/11/17 17:36:08 kuznet Exp $ + * * Authors: see ip.c * * Fixes: @@ -27,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,34 +39,47 @@ #include +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 + /* * SOL_IP control messages. */ -static void ip_cmsg_recv_rxinfo(struct msghdr *msg, struct sk_buff *skb) +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info; struct rtable *rt = (struct rtable *)skb->dst; - info.ipi_ifindex = skb->dev->ifindex; info.ipi_addr.s_addr = skb->nh.iph->daddr; - info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } - put_cmsg(msg, SOL_IP, IP_RXINFO, sizeof(info), &info); + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } -static void ip_cmsg_recv_localaddr(struct msghdr *msg, struct sk_buff *skb, int local) +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { - struct in_addr addr; + if (IPCB(skb)->opt.optlen == 0) + return; - addr.s_addr = skb->nh.iph->daddr; + put_cmsg(msg, SOL_IP, IP_TTL, 1, &skb->nh.iph->ttl); +} - if (local) { - struct rtable *rt = (struct rtable *)skb->dst; - addr.s_addr = rt->rt_spec_dst; - } - put_cmsg(msg, SOL_IP, local ? 
IP_LOCALADDR : IP_RECVDSTADDR, - sizeof(addr), &addr); +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) @@ -99,26 +115,30 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) /* Ordered by supposed usage frequency */ if (flags & 1) - ip_cmsg_recv_rxinfo(msg, skb); + ip_cmsg_recv_pktinfo(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_localaddr(msg, skb, 1); + ip_cmsg_recv_ttl(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_opts(msg, skb); + ip_cmsg_recv_tos(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_retopts(msg, skb); + ip_cmsg_recv_opts(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_localaddr(msg, skb, 0); + ip_cmsg_recv_retopts(msg, skb); } -int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **devp) +int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) { int err; struct cmsghdr *cmsg; @@ -127,27 +147,19 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { - case IP_LOCALADDR: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) - return -EINVAL; - memcpy(&ipc->addr, CMSG_DATA(cmsg), sizeof(struct in_addr)); - break; case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? 
err : 40, 0); if (err) return err; break; - case IP_TXINFO: + case IP_PKTINFO: { struct in_pktinfo *info; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); - if (info->ipi_ifindex && !devp) - return -EINVAL; - if ((*devp = dev_get_by_index(info->ipi_ifindex)) == NULL) - return -ENODEV; + ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } @@ -158,6 +170,53 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de return 0; } + +/* Special input handler for packets catched by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ +struct ip_ra_chain *ip_ra_chain; + +int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra, **rap; + + if (sk->type != SOCK_RAW || sk->num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + *rap = ra->next; + if (ra->destructor) + ra->destructor(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) + return -ENOBUFS; + new_ra->sk = sk; + new_ra->destructor = destructor; + start_bh_atomic(); + new_ra->next = ra; + *rap = new_ra; + end_bh_atomic(); + return 0; +} + /* * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on * an IP socket. 
@@ -168,7 +227,6 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int val=0,err; - unsigned char ucval = 0; #if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) struct ip_fw tmp_fw; #endif @@ -177,9 +235,12 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt if(get_user(val, (int *) optval)) return -EFAULT; } else if(optlen>=sizeof(char)) { + unsigned char ucval; if(get_user(ucval, (unsigned char *) optval)) return -EFAULT; + val = (int)ucval; } + /* If optlen==0, it is equivalent to val == 0 */ if(level!=SOL_IP) return -ENOPROTOOPT; @@ -213,50 +274,38 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen); return 0; } - case IP_RXINFO: - if (optlen<4) - return -EINVAL; + case IP_PKTINFO: if (val) - sk->ip_cmsg_flags |= 1; + sk->ip_cmsg_flags |= IP_CMSG_PKTINFO; else - sk->ip_cmsg_flags &= ~1; + sk->ip_cmsg_flags &= ~IP_CMSG_PKTINFO; return 0; - case IP_LOCALADDR: - if (optlen<4) - return -EINVAL; + case IP_RECVTTL: if (val) - sk->ip_cmsg_flags |= 2; + sk->ip_cmsg_flags |= IP_CMSG_TTL; else - sk->ip_cmsg_flags &= ~2; + sk->ip_cmsg_flags &= ~IP_CMSG_TTL; return 0; - case IP_RECVOPTS: - if (optlen<4) - return -EINVAL; + case IP_RECVTOS: if (val) - sk->ip_cmsg_flags |= 4; + sk->ip_cmsg_flags |= IP_CMSG_TOS; else - sk->ip_cmsg_flags &= ~4; + sk->ip_cmsg_flags &= ~IP_CMSG_TOS; return 0; - case IP_RETOPTS: - if (optlen<4) - return -EINVAL; + case IP_RECVOPTS: if (val) - sk->ip_cmsg_flags |= 8; + sk->ip_cmsg_flags |= IP_CMSG_RECVOPTS; else - sk->ip_cmsg_flags &= ~8; + sk->ip_cmsg_flags &= ~IP_CMSG_RECVOPTS; return 0; - case IP_RECVDSTADDR: - if (optlen<4) - return -EINVAL; + case IP_RETOPTS: if (val) - sk->ip_cmsg_flags |= 0x10; + sk->ip_cmsg_flags |= IP_CMSG_RETOPTS; else - sk->ip_cmsg_flags &= ~0x10; + sk->ip_cmsg_flags &= 
~IP_CMSG_RETOPTS; return 0; case IP_TOS: /* This sets both TOS and Precedence */ /* Reject setting of unused bits */ - if (optlen<4) - return -EINVAL; if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK)) return -EINVAL; if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && !suser()) @@ -274,29 +323,25 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt sk->priority = rt_tos2priority(val); return 0; case IP_TTL: - if (optlen<4) + if (optlen<1) return -EINVAL; + if(val==-1) + val = ip_statistics.IpDefaultTTL; if(val<1||val>255) return -EINVAL; sk->ip_ttl=val; return 0; case IP_HDRINCL: - if (optlen<4) - return -EINVAL; if(sk->type!=SOCK_RAW) return -ENOPROTOOPT; sk->ip_hdrincl=val?1:0; return 0; case IP_PMTUDISC: - if (optlen<4) - return -EINVAL; if (val<0 || val>2) return -EINVAL; sk->ip_pmtudisc = val; return 0; case IP_RECVERR: - if (optlen<4) - return -EINVAL; if (sk->type==SOCK_STREAM) return -ENOPROTOOPT; lock_sock(sk); @@ -312,211 +357,81 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt case IP_MULTICAST_TTL: if (optlen<1) return -EINVAL; - sk->ip_mc_ttl=(int)ucval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + return -EINVAL; + sk->ip_mc_ttl=val; return 0; case IP_MULTICAST_LOOP: if (optlen<1) return -EINVAL; - if(ucval!=0 && ucval!=1) - return -EINVAL; - sk->ip_mc_loop=(int)ucval; + sk->ip_mc_loop = val ? 
1 : 0; return 0; case IP_MULTICAST_IF: { - struct in_addr addr; + struct ip_mreqn mreq; struct device *dev = NULL; /* * Check the arguments are allowable */ - if(optlenip_mc_index = 0; - return 0; - } - - /* - * Find the device - */ - - dev=ip_dev_find(addr.s_addr, NULL); - - /* - * Did we find one - */ - - if(dev) - { - sk->ip_mc_index = dev->ifindex; - return 0; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + return -EFAULT; } - return -EADDRNOTAVAIL; - } - - - case IP_ADD_MEMBERSHIP: - { - -/* - * FIXME: Add/Del membership should have a semaphore protecting them from re-entry - */ - struct ip_mreq mreq; - struct rtable *rt; - struct device *dev=NULL; - - /* - * Check the arguments. - */ - - if(optlenu.dst.dev; - ip_rt_put(rt); - } else - dev = ip_dev_find(mreq.imr_interface.s_addr, NULL); - - /* - * No device, no cookies. - */ - - if(!dev) - return -ENODEV; - - /* - * Join group. - */ - - return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_DROP_MEMBERSHIP: - { - struct ip_mreq mreq; - struct rtable *rt; - struct device *dev=NULL; - - /* - * Check the arguments - */ - - if(optlenu.dst.dev; - ip_rt_put(rt); - } else - dev = ip_dev_find(mreq.imr_interface.s_addr, NULL); - - /* - * Did we find a suitable device. 
- */ - - if(!dev) - return -ENODEV; - - /* - * Leave group - */ - - return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_MULTICAST_IFN: - { - struct ip_mreqn mreq; - struct device *dev = NULL; - - if(optlenip_mc_index = 0; sk->ip_mc_addr = 0; return 0; } - dev = ip_dev_find(mreq.imr_address.s_addr, NULL); + dev = ip_dev_find(mreq.imr_address.s_addr); } else dev = dev_get_by_index(mreq.imr_ifindex); if (!dev) - return -ENODEV; + return -EADDRNOTAVAIL; + + if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if) + return -EINVAL; sk->ip_mc_index = mreq.imr_ifindex; sk->ip_mc_addr = mreq.imr_address.s_addr; return 0; } - case IP_ADD_MEMBERSHIPN: - { - struct ip_mreqn mreq; - struct device *dev = NULL; - if(optlen= sizeof(struct ip_mreqn)) { + if(copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + return -EFAULT; + } - dev=dev_get_by_index(mreq.imr_ifindex); - if(!dev) - return -ENODEV; - - return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); + if (optname == IP_ADD_MEMBERSHIP) + return ip_mc_join_group(sk,&mreq); + else + return ip_mc_leave_group(sk,&mreq); } + case IP_ROUTER_ALERT: + return ip_ra_control(sk, val ? 
1 : 0, NULL); + #ifdef CONFIG_IP_FIREWALL case IP_FW_INSERT_IN: case IP_FW_INSERT_OUT: @@ -616,21 +531,21 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return -EFAULT; return 0; } - case IP_RXINFO: - val = (sk->ip_cmsg_flags & 1) != 0; - return 0; - case IP_LOCALADDR: - val = (sk->ip_cmsg_flags & 2) != 0; - return 0; + case IP_PKTINFO: + val = (sk->ip_cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (sk->ip_cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (sk->ip_cmsg_flags & IP_CMSG_TOS) != 0; + break; case IP_RECVOPTS: - val = (sk->ip_cmsg_flags & 4) != 0; - return 0; + val = (sk->ip_cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; case IP_RETOPTS: - val = (sk->ip_cmsg_flags & 8) != 0; - return 0; - case IP_RECVDSTADDR: - val = (sk->ip_cmsg_flags & 0x10) != 0; - return 0; + val = (sk->ip_cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; case IP_TOS: val=sk->ip_tos; break; @@ -642,17 +557,18 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op break; case IP_PMTUDISC: val=sk->ip_pmtudisc; - return 0; + break; case IP_RECVERR: val=sk->ip_recverr; - return 0; + break; case IP_MULTICAST_TTL: val=sk->ip_mc_ttl; break; case IP_MULTICAST_LOOP: val=sk->ip_mc_loop; break; - case IP_MULTICAST_IFN: +#if 0 + case IP_MULTICAST_IF: { struct ip_mreqn mreq; len = min(len,sizeof(struct ip_mreqn)); @@ -665,9 +581,13 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return -EFAULT; return 0; } +#endif case IP_MULTICAST_IF: { struct device *dev = dev_get_by_index(sk->ip_mc_index); + + printk(KERN_INFO "application %s uses old get IP_MULTICAST_IF. 
Please, report!\n", current->comm); + if (dev == NULL) { len = 0; @@ -689,11 +609,19 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return(-ENOPROTOOPT); } - len=min(sizeof(int),len); - - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval,&val,len)) - return -EFAULT; + if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&ucval,1)) + return -EFAULT; + } else { + len=min(sizeof(int),len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + } return 0; } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c new file mode 100644 index 000000000000..30df2360d57b --- /dev/null +++ b/net/ipv4/ipconfig.c @@ -0,0 +1,1160 @@ +/* + * $Id: ipconfig.c,v 1.5 1997/10/27 16:08:02 mj Exp $ + * + * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied + * information to configure own IP address and routes. + * + * Copyright (C) 1996, 1997 Martin Mares + * + * Derived from network configuration code in fs/nfs/nfsroot.c, + * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Define this to allow debugging output */ +#undef IPCONFIG_DEBUG + +#ifdef IPCONFIG_DEBUG +#define DBG(x) printk x +#else +#define DBG(x) do { } while(0) +#endif + +/* Define the timeout for waiting for a RARP/BOOTP reply */ +#define CONF_BASE_TIMEOUT (HZ*5) /* Initial timeout: 5 seconds */ +#define CONF_RETRIES 10 /* 10 retries */ +#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ +#define CONF_TIMEOUT_MULT *5/4 /* Rate of timeout growth */ +#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ + +/* IP configuration */ +static char user_dev_name[IFNAMSIZ] __initdata = { 0, };/* Name of user-selected boot device */ +u32 ic_myaddr __initdata = INADDR_NONE; /* My IP address */ +u32 ic_servaddr __initdata = INADDR_NONE; /* Server IP address */ +u32 ic_gateway __initdata = INADDR_NONE; /* Gateway IP address */ +u32 ic_netmask __initdata = INADDR_NONE; /* Netmask for local subnet */ +int ic_bootp_flag __initdata = 1; /* Use BOOTP */ +int ic_rarp_flag __initdata = 1; /* Use RARP */ +int ic_enable __initdata = 1; /* Automatic IP configuration enabled */ +int ic_host_name_set __initdata = 0; /* Host name configured manually */ +int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ + +u32 root_server_addr __initdata = INADDR_NONE; /* Address of boot server */ +u8 root_server_path[256] __initdata = { 0, }; /* Path to mount as root */ + +#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_RARP) + +#define CONFIG_IP_PNP_DYNAMIC + +static int ic_got_reply __initdata = 0; + +#define IC_GOT_BOOTP 1 +#define IC_GOT_RARP 2 + +#endif + +/* + * Network devices + */ + +struct ic_device { + struct ic_device *next; + struct device *dev; + unsigned short flags; 
+}; + +static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ +static struct device *ic_dev __initdata = NULL; /* Selected device */ +static int bootp_dev_count __initdata = 0; /* BOOTP capable devices */ +static int rarp_dev_count __initdata = 0; /* RARP capable devices */ + +__initfunc(int ic_open_devs(void)) +{ + struct ic_device *d, **last; + struct device *dev; + unsigned short oflags; + + last = &ic_first_dev; + for (dev = dev_base; dev; dev = dev->next) + if (dev->type < ARPHRD_SLIP && + !(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) && + strncmp(dev->name, "dummy", 5) && + (!user_dev_name[0] || !strcmp(dev->name, user_dev_name))) { + oflags = dev->flags; + if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + continue; + } + if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) + return -1; + d->dev = dev; + *last = d; + last = &d->next; + d->flags = oflags; + bootp_dev_count++; + if (!(dev->flags & IFF_NOARP)) + rarp_dev_count++; + DBG(("IP-Config: Opened %s\n", dev->name)); + } + *last = NULL; + + if (!bootp_dev_count) { + if (user_dev_name[0]) + printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); + else + printk(KERN_ERR "IP-Config: No network devices available.\n"); + return -1; + } + return 0; +} + +__initfunc(void ic_close_devs(void)) +{ + struct ic_device *d, *next; + struct device *dev; + + next = ic_first_dev; + while ((d = next)) { + next = d->next; + dev = d->dev; + if (dev != ic_dev) { + DBG(("IP-Config: Downing %s\n", dev->name)); + dev_change_flags(dev, d->flags); + } + kfree_s(d, sizeof(struct ic_device)); + } +} + +/* + * Interface to various network functions. 
+ */ + +static inline void +set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port) +{ + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = addr; + sin->sin_port = port; +} + +__initfunc(static int ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = devinet_ioctl(cmd, arg); + set_fs(oldfs); + return res; +} + +__initfunc(static int ic_route_ioctl(unsigned int cmd, struct rtentry *arg)) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = ip_rt_ioctl(cmd, arg); + set_fs(oldfs); + return res; +} + +/* + * Set up interface addresses and routes. + */ + +__initfunc(static int ic_setup_if(void)) +{ + struct ifreq ir; + struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; + int err; + + memset(&ir, 0, sizeof(ir)); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + set_sockaddr(sin, ic_myaddr, 0); + if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); + return -1; + } + return 0; +} + +__initfunc(int ic_setup_routes(void)) +{ + /* No need to setup device routes, only the default route... 
*/ + + if (ic_gateway != INADDR_NONE) { + struct rtentry rm; + int err; + + memset(&rm, 0, sizeof(rm)); + if ((ic_gateway ^ ic_myaddr) & ic_netmask) { + printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); + return -1; + } + set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); + rm.rt_flags = RTF_UP | RTF_GATEWAY; + if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); + return -1; + } + } + + return 0; +} + +/* + * Fill in default values for all missing parameters. + */ + +__initfunc(int ic_defaults(void)) +{ + if (!ic_host_name_set) + strcpy(system_utsname.nodename, in_ntoa(ic_myaddr)); + + if (root_server_addr == INADDR_NONE) + root_server_addr = ic_servaddr; + + if (ic_netmask == INADDR_NONE) { + if (IN_CLASSA(ic_myaddr)) + ic_netmask = IN_CLASSA_NET; + else if (IN_CLASSB(ic_myaddr)) + ic_netmask = IN_CLASSB_NET; + else if (IN_CLASSC(ic_myaddr)) + ic_netmask = IN_CLASSC_NET; + else { + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %08x\n", ic_myaddr); + return -1; + } + } + + return 0; +} + +/* + * RARP support. + */ + +#ifdef CONFIG_IP_PNP_RARP + +static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, + struct packet_type *pt); + +static struct packet_type rarp_packet_type __initdata = { + 0, /* Should be: __constant_htons(ETH_P_RARP) + * - but this _doesn't_ come out constant! */ + NULL, /* Listen to all devices */ + ic_rarp_recv, + NULL, + NULL +}; + +__initfunc(static void ic_rarp_init(void)) +{ + rarp_packet_type.type = htons(ETH_P_RARP); + dev_add_pack(&rarp_packet_type); +} + +__initfunc(static void ic_rarp_cleanup(void)) +{ + dev_remove_pack(&rarp_packet_type); +} + +/* + * Process received RARP packet. 
+ */ +__initfunc(static int +ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)) +{ + struct arphdr *rarp = (struct arphdr *)skb->h.raw; + unsigned char *rarp_ptr = (unsigned char *) (rarp + 1); + unsigned long sip, tip; + unsigned char *sha, *tha; /* s for "source", t for "target" */ + + /* If this test doesn't pass, it's not IP, or we should ignore it anyway */ + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) + goto drop; + + /* If it's not a RARP reply, delete it. */ + if (rarp->ar_op != htons(ARPOP_RREPLY)) + goto drop; + + /* If it's not ethernet, delete it. */ + if (rarp->ar_pro != htons(ETH_P_IP)) + goto drop; + + /* Extract variable-width fields */ + sha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&sip, rarp_ptr, 4); + rarp_ptr += 4; + tha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&tip, rarp_ptr, 4); + + /* Discard packets which are not meant for us. */ + if (memcmp(tha, dev->dev_addr, dev->addr_len)) + goto drop; + + /* Discard packets which are not from specified server. */ + if (ic_servaddr != INADDR_NONE && ic_servaddr != sip) + goto drop; + + /* Victory! The packet is what we were looking for! */ + if (!ic_got_reply) { + ic_got_reply = IC_GOT_RARP; + ic_dev = dev; + if (ic_myaddr == INADDR_NONE) + ic_myaddr = tip; + ic_servaddr = sip; + } + + /* And throw the packet out... */ +drop: + kfree_skb(skb, FREE_READ); + return 0; +} + + +/* + * Send RARP request packet over all devices which allow RARP. + */ +__initfunc(static void ic_rarp_send(void)) +{ + struct ic_device *d; + + for (d=ic_first_dev; d; d=d->next) { + struct device *dev = d->dev; + if (!(dev->flags & IFF_NOARP)) + arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, + dev->dev_addr, dev->dev_addr); + } +} + +#endif + +/* + * BOOTP support. 
+ */ + +#ifdef CONFIG_IP_PNP_BOOTP + +static struct socket *ic_bootp_xmit_sock __initdata = NULL; /* BOOTP send socket */ +static struct socket *ic_bootp_recv_sock __initdata = NULL; /* BOOTP receive socket */ + +struct bootp_pkt { /* BOOTP packet format */ + u8 op; /* 1=request, 2=reply */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + u8 hops; /* Used only by gateways */ + u32 xid; /* Transaction ID */ + u16 secs; /* Seconds since we started */ + u16 flags; /* Just what it says */ + u32 client_ip; /* Client's IP address if known */ + u32 your_ip; /* Assigned IP address */ + u32 server_ip; /* Server's IP address */ + u32 relay_ip; /* IP address of BOOTP relay */ + u8 hw_addr[16]; /* Client's HW address */ + u8 serv_name[64]; /* Server host name */ + u8 boot_file[128]; /* Name of boot file */ + u8 vendor_area[128]; /* Area for extensions */ +}; + +#define BOOTP_REQUEST 1 +#define BOOTP_REPLY 2 + +static struct bootp_pkt *ic_xmit_bootp __initdata = NULL; /* Packet being transmitted */ +static struct bootp_pkt *ic_recv_bootp __initdata = NULL; /* Packet being received */ + +/* + * Dirty tricks for BOOTP packet routing. We replace the standard lookup function + * for the local fib by our version which does fake lookups and returns our private + * fib entries. Ugly, but it seems to be the simplest way to do the job. 
+ */ + +static void *ic_old_local_lookup __initdata = NULL; /* Old local routing table lookup function */ +static struct fib_info *ic_bootp_tx_fib __initdata = NULL; /* Our fake fib entries */ +static struct fib_info *ic_bootp_rx_fib __initdata = NULL; + +__initfunc(static int ic_bootp_route_lookup(struct fib_table *tb, const struct rt_key *key, + struct fib_result *res)) +{ + static u32 ic_brl_zero = 0; + + DBG(("BOOTP: Route lookup: %d:%08x -> %d:%08x: ", key->iif, key->src, key->oif, key->dst)); + res->scope = RT_SCOPE_UNIVERSE; + res->prefix = &ic_brl_zero; + res->prefixlen = 0; + res->nh_sel = 0; + if (key->src == 0 && key->dst == 0xffffffff && key->iif == loopback_dev.ifindex) { /* Packet output */ + DBG(("Output\n")); + res->type = RTN_UNICAST; + res->fi = ic_bootp_tx_fib; + } else if (key->iif && key->iif != loopback_dev.ifindex && key->oif == 0) { /* Packet input */ + DBG(("Input\n")); + res->type = RTN_LOCAL; + res->fi = ic_bootp_rx_fib; + } else if (!key->iif && !key->oif && !key->src) { /* Address check by inet_addr_type() */ + DBG(("Check\n")); + res->type = RTN_UNICAST; + res->fi = ic_bootp_tx_fib; + } else { + DBG(("Drop\n")); + return -EINVAL; + } + return 0; +} + +__initfunc(static int ic_set_bootp_route(struct ic_device *d)) +{ + struct fib_info *f = ic_bootp_tx_fib; + struct fib_nh *n = &f->fib_nh[0]; + + n->nh_dev = d->dev; + n->nh_oif = n->nh_dev->ifindex; + rt_cache_flush(0); + return 0; +} + +__initfunc(static int ic_bootp_route_init(void)) +{ + int size = sizeof(struct fib_info) + sizeof(struct fib_nh); + struct fib_info *rf, *tf; + struct fib_nh *nh; + + if (!(rf = ic_bootp_rx_fib = kmalloc(size, GFP_KERNEL)) || + !(tf = ic_bootp_tx_fib = kmalloc(size, GFP_KERNEL))) + return -1; + + memset(rf, 0, size); + rf->fib_nhs = 1; + nh = &rf->fib_nh[0]; + nh->nh_scope = RT_SCOPE_UNIVERSE; + + memset(tf, 0, size); + rf->fib_nhs = 1; + nh = &rf->fib_nh[0]; + nh->nh_dev = ic_first_dev->dev; + nh->nh_scope = RT_SCOPE_UNIVERSE; + nh->nh_oif = 
nh->nh_dev->ifindex; + + /* Dirty trick: replace standard routing table lookup by our function */ + ic_old_local_lookup = local_table->tb_lookup; + local_table->tb_lookup = ic_bootp_route_lookup; + + return 0; +} + +__initfunc(static void ic_bootp_route_cleanup(void)) +{ + if (ic_old_local_lookup) + local_table->tb_lookup = ic_old_local_lookup; + if (ic_bootp_rx_fib) + kfree_s(ic_bootp_rx_fib, sizeof(struct fib_info) + sizeof(struct fib_nh)); + if (ic_bootp_tx_fib) + kfree_s(ic_bootp_tx_fib, sizeof(struct fib_info) + sizeof(struct fib_nh)); +} + + +/* + * Allocation and freeing of BOOTP packet buffers. + */ +__initfunc(static int ic_bootp_alloc(void)) +{ + if (!(ic_xmit_bootp = kmalloc(sizeof(struct bootp_pkt), GFP_KERNEL)) || + !(ic_recv_bootp = kmalloc(sizeof(struct bootp_pkt), GFP_KERNEL))) { + printk(KERN_ERR "BOOTP: Out of memory!\n"); + return -1; + } + return 0; +} + +__initfunc(static void ic_bootp_free(void)) +{ + if (ic_xmit_bootp) { + kfree_s(ic_xmit_bootp, sizeof(struct bootp_pkt)); + ic_xmit_bootp = NULL; + } + if (ic_recv_bootp) { + kfree_s(ic_recv_bootp, sizeof(struct bootp_pkt)); + ic_recv_bootp = NULL; + } +} + + +/* + * Add / Remove fake interface addresses for BOOTP packet sending. + */ +__initfunc(static int ic_bootp_addrs_add(void)) +{ + struct ic_device *d; + int err; + + for(d=ic_first_dev; d; d=d->next) + if ((err = inet_add_bootp_addr(d->dev)) < 0) { + printk(KERN_ERR "BOOTP: Unable to set interface address\n"); + return -1; + } + return 0; +} + +__initfunc(static void ic_bootp_addrs_del(void)) +{ + struct ic_device *d; + + for(d=ic_first_dev; d; d=d->next) + inet_del_bootp_addr(d->dev); +} + +/* + * UDP socket operations. 
+ */ +__initfunc(static int ic_udp_open(struct socket **sock)) +{ + int err; + + if ((err = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, sock)) < 0) + printk(KERN_ERR "BOOTP: Cannot open UDP socket!\n"); + return err; +} + +static inline void ic_udp_close(struct socket *sock) +{ + if (sock) + sock_release(sock); +} + +__initfunc(static int ic_udp_connect(struct socket *sock, u32 addr, u16 port)) +{ + struct sockaddr_in sa; + int err; + + set_sockaddr(&sa, htonl(addr), htons(port)); + err = sock->ops->connect(sock, (struct sockaddr *) &sa, sizeof(sa), 0); + if (err < 0) { + printk(KERN_ERR "BOOTP: connect() failed (%d)\n", err); + return -1; + } + return 0; +} + +__initfunc(static int ic_udp_bind(struct socket *sock, u32 addr, u16 port)) +{ + struct sockaddr_in sa; + int err; + + set_sockaddr(&sa, htonl(addr), htons(port)); + err = sock->ops->bind(sock, (struct sockaddr *) &sa, sizeof(sa)); + if (err < 0) { + printk(KERN_ERR "BOOTP: bind() failed (%d)\n", err); + return -1; + } + return 0; +} + +__initfunc(static int ic_udp_send(struct socket *sock, void *buf, int size)) +{ + mm_segment_t oldfs; + int result; + struct msghdr msg; + struct iovec iov; + + oldfs = get_fs(); + set_fs(get_ds()); + iov.iov_base = buf; + iov.iov_len = size; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + result = sock_sendmsg(sock, &msg, size); + set_fs(oldfs); + + return (result != size); +} + +__initfunc(static int ic_udp_recv(struct socket *sock, void *buf, int size)) +{ + mm_segment_t oldfs; + int result; + struct msghdr msg; + struct iovec iov; + + oldfs = get_fs(); + set_fs(get_ds()); + iov.iov_base = buf; + iov.iov_len = size; + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + result = sock_recvmsg(sock, &msg, size, MSG_DONTWAIT); + set_fs(oldfs); + return result; +} + + +/* + * Initialize BOOTP extension fields in the request. 
+ */ +__initfunc(static void ic_bootp_init_ext(u8 *e)) +{ + *e++ = 99; /* RFC1048 Magic Cookie */ + *e++ = 130; + *e++ = 83; + *e++ = 99; + *e++ = 1; /* Subnet mask request */ + *e++ = 4; + e += 4; + *e++ = 3; /* Default gateway request */ + *e++ = 4; + e += 4; + *e++ = 12; /* Host name request */ + *e++ = 32; + e += 32; + *e++ = 40; /* NIS Domain name request */ + *e++ = 32; + e += 32; + *e++ = 17; /* Boot path */ + *e++ = 32; + e += 32; + *e = 255; /* End of the list */ +} + + +/* + * Initialize the BOOTP mechanism. + */ +__initfunc(static int ic_bootp_init(void)) +{ + /* Allocate memory for BOOTP packets */ + if (ic_bootp_alloc() < 0) + return -1; + + /* Add fake zero addresses to all interfaces */ + if (ic_bootp_addrs_add() < 0) + return -1; + + /* Initialize BOOTP routing */ + if (ic_bootp_route_init() < 0) + return -1; + + /* Initialize common portion of BOOTP request */ + memset(ic_xmit_bootp, 0, sizeof(struct bootp_pkt)); + ic_xmit_bootp->op = BOOTP_REQUEST; + get_random_bytes(&ic_xmit_bootp->xid, sizeof(ic_xmit_bootp->xid)); + ic_bootp_init_ext(ic_xmit_bootp->vendor_area); + + DBG(("BOOTP: XID=%08x\n", ic_xmit_bootp->xid)); + + /* Open the sockets */ + if (ic_udp_open(&ic_bootp_xmit_sock) || + ic_udp_open(&ic_bootp_recv_sock)) + return -1; + + /* Bind/connect the sockets */ + ic_bootp_xmit_sock->sk->broadcast = 1; + ic_bootp_xmit_sock->sk->reuse = 1; + ic_bootp_recv_sock->sk->reuse = 1; + ic_set_bootp_route(ic_first_dev); + if (ic_udp_bind(ic_bootp_recv_sock, INADDR_ANY, 68) || + ic_udp_bind(ic_bootp_xmit_sock, INADDR_ANY, 68) || + ic_udp_connect(ic_bootp_xmit_sock, INADDR_BROADCAST, 67)) + return -1; + + return 0; +} + + +/* + * BOOTP cleanup. + */ +__initfunc(static void ic_bootp_cleanup(void)) +{ + ic_udp_close(ic_bootp_xmit_sock); + ic_udp_close(ic_bootp_recv_sock); + ic_bootp_addrs_del(); + ic_bootp_free(); + ic_bootp_route_cleanup(); +} + + +/* + * Send BOOTP request to single interface. 
+ */ +__initfunc(static int ic_bootp_send_if(struct ic_device *d, u32 jiffies)) +{ + struct device *dev = d->dev; + struct bootp_pkt *b = ic_xmit_bootp; + + b->htype = dev->type; + b->hlen = dev->addr_len; + memset(b->hw_addr, 0, sizeof(b->hw_addr)); + memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); + b->secs = htons(jiffies / HZ); + ic_set_bootp_route(d); + return ic_udp_send(ic_bootp_xmit_sock, b, sizeof(struct bootp_pkt)); +} + + +/* + * Send BOOTP requests to all interfaces. + */ +__initfunc(static int ic_bootp_send(u32 jiffies)) +{ + struct ic_device *d; + + for(d=ic_first_dev; d; d=d->next) + if (ic_bootp_send_if(d, jiffies) < 0) + return -1; + return 0; +} + + +/* + * Copy BOOTP-supplied string if not already set. + */ +__initfunc(static int ic_bootp_string(char *dest, char *src, int len, int max)) +{ + if (!len) + return 0; + if (len > max-1) + len = max-1; + strncpy(dest, src, len); + dest[len] = '\0'; + return 1; +} + + +/* + * Process BOOTP extension. + */ +__initfunc(static void ic_do_bootp_ext(u8 *ext)) +{ +#ifdef IPCONFIG_DEBUG + u8 *c; + + printk("BOOTP: Got extension %02x",*ext); + for(c=ext+2; cop != BOOTP_REPLY || + b->xid != ic_xmit_bootp->xid) { + printk("?"); + return; + } + + /* Find interface this arrived from */ + for(d=ic_first_dev; d; d=d->next) { + struct device *dev = d->dev; + if (b->htype == dev->type || + b->hlen == dev->addr_len || + !memcmp(b->hw_addr, dev->dev_addr, dev->addr_len)) + break; + } + if (!d) { /* Unknown device */ + printk("!"); + return; + } + + /* Record BOOTP packet arrival */ + cli(); + if (ic_got_reply) { + sti(); + return; + } + ic_got_reply = IC_GOT_BOOTP; + sti(); + ic_dev = d->dev; + + /* Extract basic fields */ + ic_myaddr = b->your_ip; + ic_servaddr = b->server_ip; + + /* Parse extensions */ + if (b->vendor_area[0] == 99 && /* Check magic cookie */ + b->vendor_area[1] == 130 && + b->vendor_area[2] == 83 && + b->vendor_area[3] == 99) { + ext = &b->vendor_area[4]; + end = (u8 *) b + len; + while (ext < end 
&& *ext != 0xff) { + if (*ext == 0) /* Padding */ + ext++; + else { + opt = ext; + ext += ext[1] + 2; + if (ext <= end) + ic_do_bootp_ext(opt); + } + } + } +} + +#endif + + +/* + * Dynamic IP configuration -- BOOTP and RARP. + */ + +#ifdef CONFIG_IP_PNP_DYNAMIC + +__initfunc(int ic_dynamic(void)) +{ + int retries; + unsigned long timeout, jiff; + unsigned long start_jiffies; + + /* + * If neither BOOTP nor RARP was selected, return with an error. This + * routine gets only called when some pieces of information are mis- + * sing, and without BOOTP and RARP we are not able to get that in- + * formation. + */ + if (!ic_bootp_flag && !ic_rarp_flag) { + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + return -1; + } + +#ifdef CONFIG_IP_PNP_BOOTP + if (ic_bootp_flag && !bootp_dev_count) { + printk(KERN_ERR "BOOTP: No suitable device found.\n"); + ic_bootp_flag = 0; + } +#else + ic_bootp_flag = 0; +#endif + +#ifdef CONFIG_IP_PNP_RARP + if (ic_rarp_flag && !rarp_dev_count) { + printk(KERN_ERR "RARP: No suitable device found.\n"); + ic_rarp_flag = 0; + } +#else + ic_rarp_flag = 0; +#endif + + if (!ic_bootp_flag && !ic_rarp_flag) + /* Error message already printed */ + return -1; + + /* + * Setup RARP and BOOTP protocols + */ +#ifdef CONFIG_IP_PNP_RARP + if (ic_rarp_flag) + ic_rarp_init(); +#endif +#ifdef CONFIG_IP_PNP_BOOTP + if (ic_bootp_flag && ic_bootp_init() < 0) { + ic_bootp_cleanup(); + return -1; + } +#endif + + /* + * Send requests and wait, until we get an answer. This loop + * seems to be a terrible waste of CPU time, but actually there is + * only one process running at all, so we don't need to use any + * scheduler functions. + * [Actually we could now, but the nothing else running note still + * applies.. - AC] + */ + printk(KERN_NOTICE "Sending %s%s%s requests...", + ic_bootp_flag ? "BOOTP" : "", + ic_bootp_flag && ic_rarp_flag ? " and " : "", + ic_rarp_flag ? 
"RARP" : ""); + start_jiffies = jiffies; + retries = CONF_RETRIES; + get_random_bytes(&timeout, sizeof(timeout)); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + for(;;) { +#ifdef CONFIG_IP_PNP_BOOTP + if (ic_bootp_flag && ic_bootp_send(jiffies - start_jiffies) < 0) { + printk(" BOOTP failed!\n"); + ic_bootp_cleanup(); + ic_bootp_flag = 0; + if (!ic_rarp_flag) + break; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + if (ic_rarp_flag) + ic_rarp_send(); +#endif + printk("."); + jiff = jiffies + timeout; + while (jiffies < jiff && !ic_got_reply) +#ifdef CONFIG_IP_PNP_BOOTP + if (ic_bootp_flag) + ic_bootp_recv(); +#else + ; +#endif + if (ic_got_reply) { + printk(" OK\n"); + break; + } + if (! --retries) { + printk(" timed out!\n"); + break; + } + timeout = timeout CONF_TIMEOUT_MULT; + if (timeout > CONF_TIMEOUT_MAX) + timeout = CONF_TIMEOUT_MAX; + } + +#ifdef CONFIG_IP_PNP_RARP + if (ic_rarp_flag) + ic_rarp_cleanup(); +#endif +#ifdef CONFIG_IP_PNP_BOOTP + if (ic_bootp_flag) + ic_bootp_cleanup(); +#endif + + if (!ic_got_reply) + return -1; + + printk("IP-Config: Got %s answer from %s, ", + (ic_got_reply == IC_GOT_BOOTP) ? "BOOTP" : "RARP", + in_ntoa(ic_servaddr)); + printk("my address is %s\n", in_ntoa(ic_myaddr)); + + return 0; +} + +#endif + +/* + * IP Autoconfig dispatcher. + */ + +__initfunc(int ip_auto_config(void)) +{ + if (!ic_enable) + return 0; + + DBG(("IP-Config: Entered.\n")); + + /* Setup all network devices */ + if (ic_open_devs() < 0) + return -1; + + /* + * If the config information is insufficient (e.g., our IP address or + * IP address of the boot server is missing or we have multiple network + * interfaces and no default was set), use BOOTP or RARP to get the + * missing values. 
+ */ + if (ic_myaddr == INADDR_NONE || +#ifdef CONFIG_ROOT_NFS + root_server_addr == INADDR_NONE || +#endif + (ic_first_dev && ic_first_dev->next)) { +#ifdef CONFIG_IP_PNP_DYNAMIC + if (ic_dynamic() < 0) { + printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); + ic_close_devs(); + return -1; + } +#else + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + ic_close_devs(); + return -1; +#endif + } else { + ic_dev = ic_first_dev->dev; /* Device selected manually or only one device -> use it */ + } + + /* + * Use defaults whereever applicable. + */ + if (ic_defaults() < 0) + return -1; + + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + ic_close_devs(); + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + return -1; + + DBG(("IP-Config: device=%s, local=%08x, server=%08x, boot=%08x, gw=%08x, mask=%08x\n", + ic_dev->name, ic_myaddr, ic_servaddr, root_server_addr, ic_gateway, ic_netmask)); + DBG(("IP-Config: host=%s, domain=%s, path=`%s'\n", system_utsname.nodename, + system_utsname.domainname, root_server_path)); + return 0; +} + +/* + * Decode any IP configuration options in the "ipconfig" kernel command + * line parameter. 
It consists of option fields separated by colons in + * the following order: + * + * :::::: + * + * Any of the fields can be empty which means to use a default value: + * - address given by BOOTP or RARP + * - address of host returning BOOTP or RARP packet + * - none, or the address returned by BOOTP + * - automatically determined from , or the + * one returned by BOOTP + * - in ASCII notation, or the name returned + * by BOOTP + * - use all available devices + * - use both protocols to determine my own address + */ +__initfunc(void ip_auto_config_setup(char *addrs, int *ints)) +{ + char *cp, *ip, *dp; + int num = 0; + + ic_set_manually = 1; + + if (!strcmp(addrs, "bootp")) { + ic_rarp_flag = 0; + return; + } else if (!strcmp(addrs, "rarp")) { + ic_bootp_flag = 0; + return; + } else if (!strcmp(addrs, "both")) { + return; + } else if (!strcmp(addrs, "off")) { + ic_enable = 0; + return; + } + + /* Parse the whole string */ + ip = addrs; + while (ip && *ip) { + if ((cp = strchr(ip, ':'))) + *cp++ = '\0'; + if (strlen(ip) > 0) { + DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + switch (num) { + case 0: + if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + ic_myaddr = INADDR_NONE; + break; + case 1: + if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + ic_servaddr = INADDR_NONE; + break; + case 2: + if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + ic_gateway = INADDR_NONE; + break; + case 3: + if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + ic_netmask = INADDR_NONE; + break; + case 4: + if ((dp = strchr(ip, '.'))) { + *dp++ = '\0'; + strncpy(system_utsname.domainname, dp, __NEW_UTS_LEN); + system_utsname.domainname[__NEW_UTS_LEN] = '\0'; + } + strncpy(system_utsname.nodename, ip, __NEW_UTS_LEN); + system_utsname.nodename[__NEW_UTS_LEN] = '\0'; + ic_host_name_set = 1; + break; + case 5: + strncpy(user_dev_name, ip, IFNAMSIZ); + user_dev_name[IFNAMSIZ-1] = '\0'; + break; + case 6: + if (!strcmp(ip, "rarp")) + ic_bootp_flag = 0; + else if (!strcmp(ip, "bootp")) + 
ic_rarp_flag = 0; + else if (strcmp(ip, "both")) + ic_bootp_flag = ic_rarp_flag = 0; + break; + } + } + ip = cp; + num++; + } +} diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 75346d6dc370..565116ffca54 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,6 +1,8 @@ /* * Linux NET3: IP/IP protocol decoder. * + * Version: $Id: ipip.c,v 1.19 1997/11/08 17:50:21 kuznet Exp $ + * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * @@ -11,6 +13,11 @@ * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,12 +25,80 @@ * 2 of the License, or (at your option) any later version. * */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (Alan.Cox@linux.org) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. 
+ + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. 
+ */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + -#include #include +#include #include #include #include +#include #include #include #include @@ -31,91 +106,673 @@ #include #include #include +#include -#include #include #include #include #include #include -void ipip_err(struct sk_buff *skb, unsigned char *dp) +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip_fb_tunnel_init(struct device *dev); +static int ipip_tunnel_init(struct device *dev); + +static struct device ipip_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init, +}; + +static struct ip_tunnel ipip_fb_tunnel = { + NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", } +}; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) { - /* NI */ - return; + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; } -/* - * The IPIP protocol driver. - * - * On entry here - * skb->data is the original IP header - * skb->nh points to the initial IP header. - * skb->h points at the new header. 
+struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipip_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "tunl%d", i); + if (dev_get(dev->name) == NULL) + break; + } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); + } + if (register_netdevice(dev) < 0) + goto failed; + + start_bh_atomic(); + nt->next = t; + *tp = nt; + end_bh_atomic(); + /* Do not decrement MOD_USE_COUNT here. 
*/ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; +} + +static void ipip_tunnel_destroy(struct device *dev) +{ + struct ip_tunnel *t, **tp; + struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv; + u32 remote = t0->parms.iph.daddr; + u32 local = t0->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (dev == &ipip_fb_tunnel_dev) { + tunnels_wc[0] = NULL; + return; + } + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (t == t0) { + *tp = t->next; + kfree(dev); + MOD_DEC_USE_COUNT; + break; + } + } +} + + +void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. */ + struct iphdr *iph = (struct iphdr*)dp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + if (len < sizeof(struct iphdr)) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. 
--ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + return; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct iphdr *eiph; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rtable *rt; + + if (len < hlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necesary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < hlen+68) + return; + rel_info -= hlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2, FREE_WRITE); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2, FREE_WRITE); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > skb2->dst->pmtu) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dst->pmtu = rel_info; + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2, FREE_WRITE); + return; +#endif +} int ipip_rcv(struct sk_buff *skb, unsigned short len) { - struct device *dev; struct iphdr *iph; + struct ip_tunnel *tunnel; -#ifdef TUNNEL_DEBUG - printk("ipip_rcv: got a packet!\n"); -#endif - /* - * Discard the original IP header - */ - - skb_pull(skb, skb->h.raw - skb->nh.raw); - - /* - * Adjust pointers - */ - iph = skb->nh.iph; - skb->nh.iph = skb->h.ipiph; + skb->mac.raw = skb->nh.raw; 
+ skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); - - /* - * If you want to add LZ compressed IP or things like that here, - * and in drivers/net/tunnel.c are the places to add. - */ - - skb->protocol = htons(ETH_P_IP); + skb->protocol = __constant_htons(ETH_P_IP); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; + if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; + } + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb, FREE_READ); + return 0; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + u16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != __constant_htons(ETH_P_IP)) + goto tx_error; + + if (tos&1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = (struct rtable*)skb->dst) == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { 
+ ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (skb->dst && mtu < skb->dst->pmtu) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + } else + tunnel->err_count = 0; + } + + skb->h.raw = skb->nh.raw; + /* - * Is it draconic? I do not think so. --ANK + * Okay, now see if we can stuff it in the buffer as-is. */ - dev = ip_dev_find_tunnel(iph->daddr, iph->saddr); - if (dev == NULL) { -#ifdef CONFIG_IP_MROUTE - int vif; - - if (!MULTICAST(skb->nh.iph->daddr) || - !ipv4_config.multicast_route || - LOCAL_MCAST(skb->nh.iph->daddr) || - (vif=ip_mr_find_tunnel(iph->daddr, iph->saddr)) < 0) - { -#endif - kfree_skb(skb, FREE_READ); - return -EINVAL; -#ifdef CONFIG_IP_MROUTE + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; } - IPCB(skb)->flags |= IPSKB_TUNNELED; - IPCB(skb)->vif = vif; - dev = skb->dev; -#endif + dev_kfree_skb(skb, FREE_WRITE); + skb = new_skb; } - skb->dev = dev; + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); - skb->dst = NULL; - netif_rx(skb); - return(0); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + stats->tx_bytes += skb->len; + stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; + return 0; + +tx_error_icmp: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; +} + +static int +ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipip_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + if (dev == &ipip_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipip_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipip_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipip_tunnel_destroy; + dev->hard_start_xmit = ipip_tunnel_xmit; + dev->get_stats = ipip_tunnel_get_stats; + dev->do_ioctl = ipip_tunnel_ioctl; + dev->change_mtu = ipip_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipip_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipip_tunnel_init_gen(dev); + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = 
tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; + + return 0; } #ifdef MODULE +static int ipip_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipip_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipip_fb_tunnel_init(struct device *dev)) +{ + struct iphdr *iph; + + ipip_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipip_fb_tunnel_open; + dev->stop = ipip_fb_tunnel_close; +#endif + + iph = &ipip_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + tunnels_wc[0] = &ipip_fb_tunnel; + return 0; +} static struct inet_protocol ipip_protocol = { ipip_rcv, /* IPIP handler */ @@ -127,21 +784,34 @@ static struct inet_protocol ipip_protocol = { "IPIP" /* name */ }; +#ifdef MODULE +int init_module(void) +#else +__initfunc(int ipip_init(void)) +#endif +{ + printk(KERN_INFO "IPv4 over IPv4 tunneling driver\n"); -/* - * And now the modules code and kernel interface. - */ + ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel; + ipip_fb_tunnel_dev.name = ipip_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipip_fb_tunnel_dev); +#else + register_netdevice(&ipip_fb_tunnel_dev); +#endif -int init_module( void) -{ inet_add_protocol(&ipip_protocol); return 0; } -void cleanup_module( void) +#ifdef MODULE + +void cleanup_module(void) { if ( inet_del_protocol(&ipip_protocol) < 0 ) printk(KERN_INFO "ipip close: can't remove protocol\n"); + + unregister_netdevice(&ipip_fb_tunnel_dev); } #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 20246148aa45..9909f32b0bc8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,6 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* + * Version: $Id: ipmr.c,v 1.28 1997/10/30 00:43:16 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -20,14 +21,8 @@ * Alexey Kuznetsov : Status, optimisations and more. * Brad Parker : Better behaviour on mrouted upcall * overflow. + * Carlos Picoto : PIMv1 Support * - * Status: - * Cache manager under test. Forwarding in vague test mode - * Todo: - * Flow control - * Finish Tunnels - * Debug cache ttl handling properly - * Resolve IFF_ALLMULTI for rest of cards */ #include @@ -45,6 +40,8 @@ #include #include #include +#include +#include #include #include #include @@ -54,9 +51,16 @@ #include #include #include +#include #include +#include +#include #include +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + /* * Multicast router control variables */ @@ -64,10 +68,133 @@ static struct vif_device vif_table[MAXVIFS]; /* Devices */ static unsigned long vifc_map; /* Active device map */ static int maxvif; -int mroute_do_pim = 0; /* Set in PIM assert */ +int mroute_do_assert = 0; /* Set in PIM assert */ +int mroute_do_pim = 0; static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ int cache_resolve_queue_len = 0; /* Size of unresolved */ +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); + +extern struct inet_protocol pim_protocol; + +static +struct device *ipmr_new_tunnel(struct vifctl *v) +{ + struct device *dev = NULL; + + rtnl_lock(); + dev = dev_get("tunl0"); + + if (dev) { + int err; + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + 
sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0 && (dev = dev_get(p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = dev->ip_ptr; + if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + goto failure; + + if (dev_open(dev)) + goto failure; + } + } + rtnl_unlock(); + return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static int reg_vif_num = -1; +static struct device * reg_dev; + +static int reg_vif_xmit(struct sk_buff *skb, struct device *dev) +{ + ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct net_device_stats *reg_vif_get_stats(struct device *dev) +{ + return (struct net_device_stats*)dev->priv; +} + +static +struct device *ipmr_reg_vif(struct vifctl *v) +{ + struct device *dev; + struct in_device *in_dev; + int size; + + size = sizeof(*dev) + IFNAMSIZ + sizeof(struct net_device_stats); + dev = kmalloc(size, GFP_KERNEL); + if (!dev) + return NULL; + + memset(dev, 0, size); + + dev->priv = dev + 1; + dev->name = dev->priv + sizeof(struct net_device_stats); + + strcpy(dev->name, "pimreg"); + + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->hard_start_xmit = reg_vif_xmit; + dev->get_stats = reg_vif_get_stats; + + rtnl_lock(); + + if (register_netdevice(dev)) { + rtnl_unlock(); + kfree(dev); + return NULL; + } + + if ((in_dev = inetdev_init(dev)) == NULL) + goto failure; + + if (dev_open(dev)) + goto failure; + + rtnl_unlock(); + reg_dev = dev; + return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + kfree(dev); + return NULL; +} +#endif + /* * Delete a VIF entry */ @@ -75,27 +202,35 @@ int cache_resolve_queue_len = 0; /* Size of unresolved */ static int vif_delete(int vifi) { struct 
vif_device *v; + struct device *dev; + struct in_device *in_dev; if (vifi < 0 || vifi >= maxvif || !(vifc_map&(1<dev; + v->dev = NULL; + vifc_map &= ~(1<flags&VIFF_TUNNEL)) { - v->u.dev->flags &= ~IFF_ALLMULTI; - dev_mc_upload(v->u.dev); - ip_rt_multicast_event(v->u.dev); - v->u.dev = NULL; - } else { - ip_rt_put(v->u.rt); - v->u.rt = NULL; - } + if ((in_dev = dev->ip_ptr) != NULL) + in_dev->flags &= ~IFF_IP_MFORWARD; - vifc_map&=~(1<flags&(VIFF_TUNNEL|VIFF_REGISTER)) { +#ifdef CONFIG_IP_PIMSM + if (vifi == reg_vif_num) { + reg_vif_num = -1; + reg_dev = NULL; + } +#endif + unregister_netdevice(dev); + if (v->flags&VIFF_REGISTER) + kfree(dev); + } if (vifi+1 == maxvif) { int tmp; @@ -108,21 +243,27 @@ static int vif_delete(int vifi) return 0; } -static void ipmr_set_bounds(struct mfc_cache *cache) +static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) { int vifi; + + start_bh_atomic(); + + cache->mfc_minvif = MAXVIFS; + cache->mfc_maxvif = 0; + memset(cache->mfc_ttls, 255, MAXVIFS); + for (vifi=0; vifimfc_ttls[vifi]) { - cache->mfc_minvif = vifi; - cache->mfc_maxvif = vifi+1; + if (vifc_map&(1<mfc_ttls[vifi] = ttls[vifi]; + if (cache->mfc_minvif > vifi) + cache->mfc_minvif = vifi; + if (cache->mfc_maxvif <= vifi) + cache->mfc_maxvif = vifi + 1; vifi++; - break; } } - for ( ; vifimfc_ttls[vifi]) - cache->mfc_maxvif = vifi+1; - } + end_bh_atomic(); } /* @@ -148,7 +289,7 @@ static void ipmr_cache_delete(struct mfc_cache *cache) /* * Unlink the buffer */ - + while(*cp!=NULL) { if(*cp==cache) @@ -158,7 +299,7 @@ static void ipmr_cache_delete(struct mfc_cache *cache) } cp=&((*cp)->next); } - + /* * Free the buffer. If it is a pending resolution * clean up the other resources. 
@@ -167,8 +308,19 @@ static void ipmr_cache_delete(struct mfc_cache *cache) if(cache->mfc_flags&MFC_QUEUED) { cache_resolve_queue_len--; - while((skb=skb_dequeue(&cache->mfc_unresolved))) + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + } else +#endif kfree_skb(skb, FREE_WRITE); + } } kfree_s(cache,sizeof(cache)); } @@ -222,14 +374,12 @@ static struct mfc_cache *ipmr_cache_alloc(int priority) struct mfc_cache *c=(struct mfc_cache *)kmalloc(sizeof(struct mfc_cache), priority); if(c==NULL) return NULL; - c->mfc_queuelen=0; + memset(c, 0, sizeof(*c)); skb_queue_head_init(&c->mfc_unresolved); init_timer(&c->mfc_timer); c->mfc_timer.data=(long)c; c->mfc_timer.function=ipmr_cache_timer; - c->mfc_last_assert=0; c->mfc_minvif = MAXVIFS; - c->mfc_maxvif = 0; return c; } @@ -259,8 +409,26 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) /* * Play the pending entries through our router */ - while((skb=skb_dequeue(&cache->mfc_unresolved))) - ip_mr_input(skb); + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (ipmr_fill_mroute(skb, cache, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb->tail - (u8*)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) printk(KERN_DEBUG "Err=%d", err); + } else 
+#endif + ip_mr_forward(skb, cache, 0); + } } /* @@ -270,15 +438,40 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) { - struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + struct sk_buff *skb; int ihl = pkt->nh.iph->ihl<<2; struct igmphdr *igmp; struct igmpmsg *msg; int ret; +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + if(!skb) - return -ENOMEM; - + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix ihl, length etc. + And all this only to mangle msg->im_msgtype and + to set msg->im_mbz to "mbz" :-) + */ + msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); + skb->nh.raw = skb->h.raw = (u8*)msg; + memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = reg_vif_num; + skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; + skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + } else { +#endif + /* * Copy the IP header */ @@ -287,33 +480,30 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) memcpy(skb->data,pkt->data,ihl); skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg*)skb->nh.iph; - if (assert) - msg->im_vif = vifi; - + msg->im_vif = vifi; + skb->dst = dst_clone(pkt->dst); + /* * Add our header */ - + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); igmp->type = - msg->im_msgtype = assert ? 
IGMPMSG_WRONGVIF : IGMPMSG_NOCACHE; + msg->im_msgtype = assert; igmp->code = 0; skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ skb->h.raw = skb->nh.raw; +#ifdef CONFIG_IP_PIMSM + } +#endif /* * Deliver to mrouted */ - if((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) - { - static unsigned long last_warn; - if(jiffies-last_warn>10*HZ) - { - last_warn=jiffies; - printk("mroute: pending queue full, dropping entries.\n"); - } + if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); kfree_skb(skb, FREE_READ); - return ret; } return ret; @@ -323,7 +513,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) * Queue a packet for resolution */ -static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb) +static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb) { if(cache==NULL) { @@ -333,12 +523,12 @@ static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL) { kfree_skb(skb, FREE_WRITE); - return; + return -ENOBUFS; } /* * Fill in the new cache entry */ - cache->mfc_parent=vifi; + cache->mfc_parent=ALL_VIFS; cache->mfc_origin=skb->nh.iph->saddr; cache->mfc_mcastgrp=skb->nh.iph->daddr; cache->mfc_flags=MFC_QUEUED; @@ -358,9 +548,16 @@ static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(mroute_socket) { /* If the report failed throw the cache entry - out - Brad Parker */ - if(ipmr_cache_report(skb, vifi, 0)<0) + out - Brad Parker + + OK, OK, Brad. 
Only do not forget to free skb + and return :-) --ANK + */ + if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) { ipmr_cache_delete(cache); + kfree_skb(skb, FREE_WRITE); + return -ENOBUFS; + } } } /* @@ -369,10 +566,11 @@ static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(cache->mfc_queuelen>3) { kfree_skb(skb, FREE_WRITE); - return; + return -ENOBUFS; } cache->mfc_queuelen++; skb_queue_tail(&cache->mfc_unresolved,skb); + return 0; } /* @@ -416,8 +614,7 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) cache->mfc_flags|=MFC_RESOLVED; cache->mfc_parent=mfc->mfcc_parent; - memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); - ipmr_set_bounds(cache); + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); /* * Check to see if we resolved a queued list. If so we @@ -445,13 +642,21 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) cache->mfc_origin=mfc->mfcc_origin.s_addr; cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; cache->mfc_parent=mfc->mfcc_parent; - memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); - ipmr_set_bounds(cache); + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); ipmr_cache_insert(cache); end_bh_atomic(); return 0; } - + +static void mrtsock_destruct(struct sock *sk) +{ + if (sk == mroute_socket) { + ipv4_config.multicast_route = 0; + mroute_socket=NULL; + mroute_close(sk); + } +} + /* * Socket options and virtual interface manipulation. 
The whole * virtual interface system is a complete heap, but unfortunately @@ -461,7 +666,6 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) { - int err; struct vifctl vif; struct mfcctl mfc; @@ -480,9 +684,8 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) return -ENOPROTOOPT; { int opt; - err = get_user(opt,(int *)optval); - if (err) - return err; + if (get_user(opt,(int *)optval)) + return -EFAULT; if (opt != 1) return -ENOPROTOOPT; } @@ -490,78 +693,101 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) return -EADDRINUSE; mroute_socket=sk; ipv4_config.multicast_route = 1; - /* Initialise state */ - return 0; + if (ip_ra_control(sk, 1, mrtsock_destruct) == 0) + return 0; + mrtsock_destruct(sk); + return -EADDRINUSE; case MRT_DONE: - ipv4_config.multicast_route = 0; - mroute_close(sk); - mroute_socket=NULL; + mrtsock_destruct(sk); return 0; case MRT_ADD_VIF: case MRT_DEL_VIF: if(optlen!=sizeof(vif)) return -EINVAL; - err = copy_from_user(&vif,optval,sizeof(vif)); - if (err) + if (copy_from_user(&vif,optval,sizeof(vif))) return -EFAULT; - if(vif.vifc_vifi > MAXVIFS) + if(vif.vifc_vifi >= MAXVIFS) return -ENFILE; if(optname==MRT_ADD_VIF) { struct vif_device *v=&vif_table[vif.vifc_vifi]; struct device *dev; - /* Empty vif ? */ - if(vifc_map&(1<flags&IFF_MULTICAST) - { - /* Most ethernet cards don't know - how to do this yet.. */ - dev->flags|=IFF_ALLMULTI; - dev_mc_upload(dev); - ip_rt_multicast_event(dev); - } - else - { - /* We are stuck.. 
*/ - return -EOPNOTSUPP; + + switch (vif.vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (reg_vif_num >= 0) + return -EADDRINUSE; + reg_vif_num = vif.vifc_vifi; + dev = ipmr_reg_vif(&vif); + if (!dev) { + reg_vif_num = -1; + return -ENOBUFS; } + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(&vif); + if (!dev) + return -ENOBUFS; + break; + case 0: + dev=ip_dev_find(vif.vifc_lcl_addr.s_addr); + if (!dev) + return -EADDRNOTAVAIL; + break; + default: + printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags); + return -EINVAL; } + + if ((in_dev = dev->ip_ptr) == NULL) + return -EADDRNOTAVAIL; + if (in_dev->flags & IFF_IP_MFORWARD) + return -EADDRINUSE; + in_dev->flags |= IFF_IP_MFORWARD; + dev_set_allmulti(dev, +1); + ip_rt_multicast_event(in_dev); + /* * Fill in the VIF structures */ - cli(); + start_bh_atomic(); v->rate_limit=vif.vifc_rate_limit; v->local=vif.vifc_lcl_addr.s_addr; v->remote=vif.vifc_rmt_addr.s_addr; v->flags=vif.vifc_flags; v->threshold=vif.vifc_threshold; - v->u.dev=NULL; - if (!(vif.vifc_flags&VIFF_TUNNEL)) - v->u.dev=dev; + v->dev=dev; v->bytes_in = 0; v->bytes_out = 0; v->pkt_in = 0; v->pkt_out = 0; + v->link = dev->ifindex; + if (vif.vifc_flags&(VIFF_TUNNEL|VIFF_REGISTER)) + v->link = dev->iflink; vifc_map|=(1< maxvif) maxvif = vif.vifc_vifi+1; - sti(); + end_bh_atomic(); return 0; - } else - return vif_delete(vif.vifc_vifi); + } else { + int ret; + rtnl_lock(); + ret = vif_delete(vif.vifc_vifi); + rtnl_unlock(); + return ret; + } /* * Manipulate the forwarding caches. These live @@ -571,8 +797,9 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) case MRT_DEL_MFC: if(optlen!=sizeof(mfc)) return -EINVAL; - err = copy_from_user(&mfc,optval, sizeof(mfc)); - return err ? 
-EFAULT : ipmr_mfc_modify(optname, &mfc); + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + return ipmr_mfc_modify(optname, &mfc); /* * Control PIM assert. */ @@ -581,9 +808,29 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) int v; if(get_user(v,(int *)optval)) return -EFAULT; - mroute_do_pim=(v)?1:0; + mroute_do_assert=(v)?1:0; return 0; } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v; + if(get_user(v,(int *)optval)) + return -EFAULT; + v = (v)?1:0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; +#ifdef CONFIG_IP_PIMSM_V2 + if (mroute_do_pim) + inet_add_protocol(&pim_protocol); + else + inet_del_protocol(&pim_protocol); +#endif + } + return 0; + } +#endif /* * Spurious command, or MRT_VERSION which you cannot * set. @@ -604,7 +851,11 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) if(sk!=mroute_socket) return -EACCES; - if(optname!=MRT_VERSION && optname!=MRT_ASSERT) + if(optname!=MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname!=MRT_PIM && +#endif + optname!=MRT_ASSERT) return -ENOPROTOOPT; if(get_user(olr, optlen)) @@ -615,8 +866,12 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) return -EFAULT; if(optname==MRT_VERSION) val=0x0305; - else +#ifdef CONFIG_IP_PIMSM + else if(optname==MRT_PIM) val=mroute_do_pim; +#endif + else + val=mroute_do_assert; if(copy_to_user(optval,&val,olr)) return -EFAULT; return 0; @@ -628,7 +883,6 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) { - int err; struct sioc_sg_req sr; struct sioc_vif_req vr; struct vif_device *vif; @@ -637,8 +891,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) switch(cmd) { case SIOCGETVIFCNT: - err = copy_from_user(&vr,(void *)arg,sizeof(vr)); - if (err) + if (copy_from_user(&vr,(void *)arg,sizeof(vr))) return -EFAULT; if(vr.vifi>=maxvif) return 
-EINVAL; @@ -649,16 +902,13 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) vr.ocount=vif->pkt_out; vr.ibytes=vif->bytes_in; vr.obytes=vif->bytes_out; - err = copy_to_user((void *)arg,&vr,sizeof(vr)); - if (err) - err = -EFAULT; - return err; + if (copy_to_user((void *)arg,&vr,sizeof(vr))) + return -EFAULT; return 0; } return -EADDRNOTAVAIL; case SIOCGETSGCNT: - err = copy_from_user(&sr,(void *)arg,sizeof(sr)); - if (err) + if (copy_from_user(&sr,(void *)arg,sizeof(sr))) return -EFAULT; for (c = mfc_cache_array[MFC_HASH(sr.grp.s_addr, sr.src.s_addr)]; c; c = c->next) { @@ -667,10 +917,8 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) sr.pktcnt = c->mfc_pkt; sr.bytecnt = c->mfc_bytes; sr.wrong_if = c->mfc_wrong_if; - err = copy_to_user((void *)arg,&sr,sizeof(sr)); - if (err) - err = -EFAULT; - return err; + if (copy_to_user((void *)arg,&sr,sizeof(sr))) + return -EFAULT; return 0; } } @@ -691,9 +939,10 @@ void mroute_close(struct sock *sk) /* * Shut down all active vif entries */ - + rtnl_lock(); for(i=0; iflags&VIFF_TUNNEL) && v->u.dev==ptr) + for(ct=0;ctdev==ptr) vif_delete(ct); v++; } @@ -769,26 +1017,24 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, struct rtable *rt; int encap = 0; struct sk_buff *skb2; - int err; - + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out+=skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + return; + } +#endif + if (vif->flags&VIFF_TUNNEL) { - rt = vif->u.rt; - if (!rt || rt->u.dst.obsolete) { - ip_rt_put(rt); - vif->u.rt = NULL; - err = ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), NULL); - if (err) - return; - vif->u.rt = rt; - } - dst_clone(&rt->u.dst); + if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link)) + return; encap = sizeof(struct iphdr); } 
else { - dev = vif->u.dev; - if (dev == NULL) - return; - err = ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), dev); - if (err) + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link)) return; } @@ -807,10 +1053,14 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, return; } - if (skb_headroom(skb) < encap || (encap && !last)) + if (skb_headroom(skb) < encap || skb_cloned(skb) || !last) skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); - else + else if (atomic_read(&skb->users) != 1) skb2 = skb_clone(skb, GFP_ATOMIC); + else { + atomic_inc(&skb->users); + skb2 = skb; + } if (skb2 == NULL) { ip_rt_put(rt); @@ -826,34 +1076,45 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, iph = skb2->nh.iph; ip_decrease_ttl(iph); - if (vif->flags & VIFF_TUNNEL) + if (vif->flags & VIFF_TUNNEL) { ip_encap(skb2, vif->local, vif->remote); + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len; + } + + IPCB(skb2)->flags |= IPSKB_FORWARDED; - ip_send(skb2); + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. 
+ */ + ip_ll_header(skb2); + skb2->dst->output(skb2); } -/* - * Multicast packets for forwarding arrive here - */ +int ipmr_find_vif(struct device *dev) +{ + int ct; + for (ct=0; ctflags&IPSKB_TUNNELED; - - cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); - - /* - * No usable cache entry - */ - - if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { - ipmr_cache_unresolved(cache, ALL_VIFS, skb); - return -EAGAIN; - } vif = cache->mfc_parent; cache->mfc_pkt++; @@ -862,75 +1123,290 @@ int ip_mr_input(struct sk_buff *skb) /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (vif >= maxvif || !(vifc_map&(1<vif != vif) || - (!tunneled && (vif_table[vif].flags&VIFF_TUNNEL || - vif_table[vif].u.dev != skb->dev))) { + if (vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (((struct rtable*)skb->dst)->key.iif == 0) { + /* It is our own packet, looped back. + Very complicated situation... + + The best workaround until routing daemons will be + fixed is not to redistribute packet, if it was + send through wrong interface. It means, that + multicast applications WILL NOT work for + (S,G), which have default multicast route pointing + to wrong oif. In any case, it is not a good + idea to use multicasting applications on router. + */ + goto dont_forward; + } + cache->mfc_wrong_if++; - if (vif < MAXVIFS && mroute_do_pim && - !(vif_table[vif].flags&VIFF_TUNNEL) && - skb->dev->flags&IFF_BROADCAST && + true_vifi = ipmr_find_vif(skb->dev); + + if (true_vifi < MAXVIFS && mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mroute_do_pim || cache->mfc_ttls[true_vifi] < 255) && jiffies - cache->mfc_last_assert > MFC_ASSERT_THRESH) { cache->mfc_last_assert = jiffies; - /* - * It is wrong! 
Routing daemon can - * determine vif itself, but it cannot - * determine REAL device. - * BSD bug. Fix it later, PIM does not - * work in any case 8) _ANK_ - */ - ipmr_cache_report(skb, vif, 1); + ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); } - kfree_skb(skb, FREE_WRITE); - return -EINVAL; + goto dont_forward; } vif_table[vif].pkt_in++; vif_table[vif].bytes_in+=skb->len; - if (IPCB(skb)->opt.router_alert || - ((struct rtable*)skb->dst)->rt_flags&RTF_LOCAL || - skb->nh.iph->protocol == IPPROTO_IGMP) - local = 1; - /* * Forward the frame */ - ct = cache->mfc_maxvif-1; - while (ct>=cache->mfc_minvif) { - /* - * 0 means don't do it. Silly idea, 255 as don't do it would be cleaner! - */ - if (skb->nh.iph->ttl > cache->mfc_ttls[ct] && cache->mfc_ttls[ct]>0) { + for (ct = cache->mfc_maxvif-1; ct >= cache->mfc_minvif; ct--) { + if (skb->nh.iph->ttl > cache->mfc_ttls[ct]) { if (psend != -1) ipmr_queue_xmit(skb, cache, psend, 0); psend=ct; } - ct--; } if (psend != -1) - ipmr_queue_xmit(skb, cache, psend, 1); + ipmr_queue_xmit(skb, cache, psend, !local); + +dont_forward: + if (!local) + kfree_skb(skb, FREE_WRITE); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + + /* Packet is looped back after forward, it should not be + forwarded second time, but still can be delivered locally. + */ + if (IPCB(skb)->flags&IPSKB_FORWARDED) + goto dont_forward; + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (skb->nh.iph->protocol == IPPROTO_IGMP && mroute_socket) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + Cisco IOS <= 11.2(8)) do not put router alert + option to IGMP packets destined to routable + groups. It is very bad, because it means + that we can forward NO IGMP messages. 
+ */ + raw_rcv(mroute_socket, skb); + return 0; + } + } + + cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + + /* + * No usable cache entry + */ + + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) + return -ENOBUFS; + skb = skb2; + } + + vif = ipmr_find_vif(skb->dev); + if (vif != ALL_VIFS) { + ipmr_cache_unresolved(cache, vif, skb); + return -EAGAIN; + } kfree_skb(skb, FREE_READ); return 0; } - return ip_local_deliver(skb); + + ip_mr_forward(skb, cache, local); + + if (local) + return ip_local_deliver(skb); + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb, FREE_READ); + return 0; +} + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff * skb, unsigned short len) +{ + struct igmphdr *pim = (struct igmphdr*)skb->h.raw; + struct iphdr *encap; + + if (!mroute_do_pim || + len < sizeof(*pim) + sizeof(*encap) || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER || + reg_dev == NULL) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + + encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + /* + Check that: + a. packet is really destinted to a multicast group + b. packet is not a NULL-REGISTER + c. 
packet is not truncated + */ + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + skb->dst = NULL; + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + netif_rx(skb); + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +int pim_rcv(struct sk_buff * skb, unsigned short len) +{ + struct pimreghdr *pim = (struct pimreghdr*)skb->h.raw; + struct iphdr *encap; + + if (len < sizeof(*pim) + sizeof(*encap) || + pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || + (pim->flags&PIM_NULL_REGISTER) || + reg_dev == NULL || + ip_compute_csum((void *)pim, len)) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + + /* check if the inner packet is destined to mcast group */ + encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + skb->dst = NULL; + netif_rx(skb); + return 0; } +#endif -int ip_mr_find_tunnel(u32 local, u32 remote) +#ifdef CONFIG_RTNETLINK + +static int +ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct 
rtmsg *rtm) { int ct; - struct vif_device *vif; + struct rtnexthop *nhp; + struct device *dev = vif_table[c->mfc_parent].dev; - for (ct=0; ctflags&VIFF_TUNNEL && - vif->local == local && vif->remote == remote) - return ct; + if (dev) { + u8 *o = skb->tail; + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + rtm->rtm_optlen += skb->tail - o; + } + + for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) { + if (c->mfc_ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_ttls[ct]; + nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + rtm->rtm_nhs++; + } } - return -1; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + return -EMSGSIZE; } +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) +{ + struct mfc_cache *cache; + struct rtable *rt = (struct rtable*)skb->dst; + + start_bh_atomic(); + cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + struct device *dev = skb->dev; + int vif; + int err; + + if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) { + end_bh_atomic(); + return -ENODEV; + } + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb->nh.iph->saddr = rt->rt_src; + skb->nh.iph->daddr = rt->rt_dst; + skb->nh.iph->version = 0; + err = ipmr_cache_unresolved(cache, vif, skb); + end_bh_atomic(); + return err; + } + /* Resolved cache entry is not changed by net bh, + so that we are allowed to enable it. 
+ */ + end_bh_atomic(); + + if (rtm->rtm_flags & RTM_F_NOTIFY) + cache->mfc_flags |= MFC_NOTIFY; + return ipmr_fill_mroute(skb, cache, rtm); +} +#endif + /* * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif */ @@ -945,16 +1421,19 @@ int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dumm int ct; len += sprintf(buffer, - "Interface Bytes In Pkts In Bytes Out Pkts Out Flags Local Remote\n"); + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); pos=len; for (ct=0;ctflags&VIFF_TUNNEL ? "Tunnel" : vif->u.dev->name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, + if (vif->dev) + name = vif->dev->name; + size = sprintf(buffer+len, "%2d %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + ct, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); len+=size; pos+=size; @@ -984,7 +1463,7 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm int ct; len += sprintf(buffer, - "Group Origin SrcIface Pkts Bytes Wrong VifTtls\n"); + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); pos=len; for (ct=0;ctmfc_parent < maxvif && vifc_map&(1<mfc_parent)) { - if (vif_table[mfc->mfc_parent].flags&VIFF_TUNNEL) - name="Tunnel"; - else - name=vif_table[mfc->mfc_parent].u.dev->name; - } + /* * Interface forwarding map */ - size = sprintf(buffer+len, "%08lX %08lX %-8s %8ld %8ld %8ld", + size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld", (unsigned long)mfc->mfc_mcastgrp, (unsigned long)mfc->mfc_origin, - name, + mfc->mfc_parent == ALL_VIFS ? -1 : mfc->mfc_parent, + (mfc->mfc_flags & MFC_QUEUED) ? 
mfc->mfc_unresolved.qlen : mfc->mfc_pkt, mfc->mfc_bytes, - mfc->mfc_pkt, mfc->mfc_wrong_if); - for(n=0;nmfc_minvif;nmfc_maxvif;n++) { - if(vifc_map&(1<mfc_ttls[n]); - else - size += sprintf(buffer+len+size, " --- "); + if(vifc_map&(1<mfc_ttls[n] < 255) + size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_ttls[n]); } size += sprintf(buffer+len+size, "\n"); len+=size; @@ -1043,6 +1511,10 @@ done: len-=(offset-begin); if(len>length) len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n"); + } return len; } @@ -1061,6 +1533,19 @@ static struct proc_dir_entry proc_net_ipmr_mfc = { }; #endif +#ifdef CONFIG_IP_PIMSM_V2 +struct inet_protocol pim_protocol = +{ + pim_rcv, /* PIM handler */ + NULL, /* PIM error control */ + NULL, /* next */ + IPPROTO_PIM, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "PIM" /* name */ +}; +#endif + /* * Setup for IP multicast routing @@ -1068,7 +1553,7 @@ static struct proc_dir_entry proc_net_ipmr_mfc = { __initfunc(void ip_mr_init(void)) { - printk(KERN_INFO "Linux IP multicast router 0.06.\n"); + printk(KERN_INFO "Linux IP multicast router 0.06 plus PIM-SM\n"); register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_ipmr_vif); diff --git a/net/ipv4/packet.c b/net/ipv4/packet.c deleted file mode 100644 index f69449e76d3e..000000000000 --- a/net/ipv4/packet.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * PACKET - implements raw packet sockets. - * - * Doesn't belong in IP but it's currently too hooked into ip - * to separate. - * - * Version: @(#)packet.c 1.0.6 05/25/93 - * - * Authors: Ross Biro, - * Fred N. 
van Kempen, - * Alan Cox, - * - * Fixes: - * Alan Cox : verify_area() now used correctly - * Alan Cox : new skbuff lists, look ma no backlogs! - * Alan Cox : tidied skbuff lists. - * Alan Cox : Now uses generic datagram routines I - * added. Also fixed the peek/read crash - * from all old Linux datagram code. - * Alan Cox : Uses the improved datagram code. - * Alan Cox : Added NULL's for socket options. - * Alan Cox : Re-commented the code. - * Alan Cox : Use new kernel side addressing - * Rob Janssen : Correct MTU usage. - * Dave Platt : Counter leaks caused by incorrect - * interrupt locking and some slightly - * dubious gcc output. Can you read - * compiler: it said _VOLATILE_ - * Richard Kooijman : Timestamp fixes. - * Alan Cox : New buffers. Use sk->mac.raw. - * Alan Cox : sendmsg/recvmsg support. - * Alan Cox : Protocol setting support - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * This should be the easiest of all, all we do is copy it into a buffer. - */ - -int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) -{ - struct sock *sk; - - /* - * When we registered the protocol we saved the socket in the data - * field for just this event. - */ - - sk = (struct sock *) pt->data; - - /* - * Yank back the headers [hope the device set this - * right or kerboom...] - */ - - skb_push(skb,skb->data-skb->mac.raw); - - /* - * The SOCK_PACKET socket receives _all_ frames. - */ - - skb->dev = dev; - - /* - * Charge the memory to the socket. This is done specifically - * to prevent sockets using all the memory up. 
- */ - - if(sock_queue_rcv_skb(sk,skb)<0) - { - kfree_skb(skb, FREE_READ); - return 0; - } - /* - * Processing complete. - */ - - return(0); -} - - -/* - * Output a raw packet to a device layer. This bypasses all the other - * protocol layers and you must therefore supply it with a complete frame - */ - -static int packet_sendmsg(struct sock *sk, struct msghdr *msg, int len) -{ - struct sk_buff *skb; - struct device *dev; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; - unsigned short proto=0; - int err; - - /* - * Check the flags. - */ - - if (msg->msg_flags&~MSG_DONTWAIT) - return(-EINVAL); - - /* - * Get and verify the address. - */ - - if (saddr) - { - if (msg->msg_namelen < sizeof(struct sockaddr)) - return(-EINVAL); - if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) - proto=saddr->spkt_protocol; - } - else - return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ - - /* - * Find the device first to size check it - */ - - saddr->spkt_device[13] = 0; - dev = dev_get(saddr->spkt_device); - if (dev == NULL) - { - return(-ENODEV); - } - - /* - * You may not queue a frame bigger than the mtu. This is the lowest level - * raw protocol and you must do your own fragmentation at this level. - */ - - if(len>dev->mtu+dev->hard_header_len) - return -EMSGSIZE; - - skb = sock_wmalloc(sk, len+dev->hard_header_len, 0, GFP_KERNEL); - - /* - * If the write buffer is full, then tough. At this level the user gets to - * deal with the problem - do your own algorithmic backoffs. That's far - * more flexible. - */ - - if (skb == NULL) - { - return(-ENOBUFS); - } - - /* - * Fill it in - */ - - /* FIXME: Save some space for broken drivers that write a - * hard header at transmission time by themselves. PPP is the - * notable one here. This should really be fixed at the driver level. 
- */ - skb_reserve(skb,dev->hard_header_len); - err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - skb->arp = 1; /* No ARP needs doing on this (complete) frame */ - skb->protocol = proto; - skb->dev = dev; - skb->priority = sk->priority; - - /* - * Now send it - */ - - if (err) - { - err = -EFAULT; - } - else - { - if (!(dev->flags & IFF_UP)) - { - err = -ENODEV; - } - } - - if (err) - { - kfree_skb(skb, FREE_WRITE); - return err; - } - - dev_queue_xmit(skb); - return(len); -} - -/* - * Close a SOCK_PACKET socket. This is fairly simple. We immediately go - * to 'closed' state and remove our protocol entry in the device list. - * The release_sock() will destroy the socket if a user has closed the - * file side of the object. - */ - -static void packet_close(struct sock *sk, unsigned long timeout) -{ - /* - * Stop more data and kill the socket off. - */ - - lock_sock(sk); - sk->state = TCP_CLOSE; - - /* - * Unhook the notifier - */ - - unregister_netdevice_notifier(&sk->protinfo.af_packet.notifier); - - if(sk->protinfo.af_packet.prot_hook) - { - /* - * Remove the protocol hook - */ - - dev_remove_pack((struct packet_type *)sk->protinfo.af_packet.prot_hook); - - /* - * Dispose of litter carefully. - */ - - kfree_s((void *)sk->protinfo.af_packet.prot_hook, sizeof(struct packet_type)); - sk->protinfo.af_packet.prot_hook = NULL; - } - - release_sock(sk); - sk->dead = 1; - destroy_sock(sk); -} - -/* - * Attach a packet hook to a device. - */ - -int packet_attach(struct sock *sk, struct device *dev) -{ - struct packet_type *p = (struct packet_type *) kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) - return(-ENOMEM); - - p->func = packet_rcv; - p->type = sk->num; - p->data = (void *)sk; - p->dev = dev; - dev_add_pack(p); - - /* - * We need to remember this somewhere. 
- */ - - sk->protinfo.af_packet.prot_hook = p; - sk->protinfo.af_packet.bound_dev = dev; - return 0; -} - -/* - * Bind a packet socket to a device - */ - -static int packet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - char name[15]; - struct device *dev; - - /* - * Check legality - */ - - if(addr_len!=sizeof(struct sockaddr)) - return -EINVAL; - strncpy(name,uaddr->sa_data,14); - name[14]=0; - - /* - * Lock the device chain while we sanity check - * the bind request. - */ - - dev_lock_list(); - dev=dev_get(name); - if(dev==NULL) - { - dev_unlock_list(); - return -ENODEV; - } - - if(!(dev->flags&IFF_UP)) - { - dev_unlock_list(); - return -ENETDOWN; - } - - /* - * Perform the request. - */ - - memcpy(sk->protinfo.af_packet.device_name,name,15); - - /* - * Rewrite an existing hook if present. - */ - - if(sk->protinfo.af_packet.prot_hook) - { - dev_remove_pack(sk->protinfo.af_packet.prot_hook); - sk->protinfo.af_packet.prot_hook->dev=dev; - sk->protinfo.af_packet.bound_dev=dev; - dev_add_pack(sk->protinfo.af_packet.prot_hook); - } - else - { - int err=packet_attach(sk, dev); - if(err) - { - dev_unlock_list(); - return err; - } - } - /* - * Now the notifier is set up right this lot is safe. - */ - dev_unlock_list(); - return 0; -} - -/* - * This hook is called when a device goes up or down so that - * SOCK_PACKET sockets can come unbound properly. - */ - -static int packet_unbind(struct notifier_block *this, unsigned long msg, void *data) -{ - struct inet_packet_opt *ipo=(struct inet_packet_opt *)this; - if(msg==NETDEV_DOWN && data==ipo->bound_dev) - { - /* - * Our device has gone down. - */ - ipo->bound_dev=NULL; - dev_remove_pack(ipo->prot_hook); - kfree(ipo->prot_hook); - ipo->prot_hook=NULL; - } - return NOTIFY_DONE; -} - - -/* - * Create a packet of type SOCK_PACKET. 
- */ - -static int packet_init(struct sock *sk) -{ - /* - * Attach a protocol block - */ - - int err=packet_attach(sk, NULL); - if(err) - return err; - - /* - * Set up the per socket notifier. - */ - - sk->protinfo.af_packet.notifier.notifier_call=packet_unbind; - sk->protinfo.af_packet.notifier.priority=0; - - register_netdevice_notifier(&sk->protinfo.af_packet.notifier); - - return(0); -} - - -/* - * Pull a packet from our receive queue and hand it to the user. - * If necessary we block. - */ - -int packet_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) -{ - int copied=0; - struct sk_buff *skb; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; - int err; - - /* - * If there is no protocol hook then the device is down. - */ - - if(sk->protinfo.af_packet.prot_hook==NULL) - return -ENETDOWN; - - /* - * If the address length field is there to be filled in, we fill - * it in now. - */ - - if (addr_len) - *addr_len=sizeof(*saddr); - - /* - * Call the generic datagram receiver. This handles all sorts - * of horrible races and re-entrancy so we can forget about it - * in the protocol layers. - */ - - skb=skb_recv_datagram(sk,flags,noblock,&err); - - /* - * An error occurred so return it. Because skb_recv_datagram() - * handles the blocking we don't see and worry about blocking - * retries. - */ - - if(skb==NULL) - return err; - - /* - * You lose any data beyond the buffer you gave. If it worries a - * user program they can ask the device for its MTU anyway. - */ - - copied = skb->len; - if(copied>len) - { - copied=len; - msg->msg_flags|=MSG_TRUNC; - } - - /* We can't use skb_copy_datagram here */ - err = memcpy_toiovec(msg->msg_iov, skb->data, copied); - if (err) - { - return -EFAULT; - } - - sk->stamp=skb->stamp; - - /* - * Copy the address. 
- */ - - if (saddr) - { - saddr->spkt_family = skb->dev->type; - strncpy(saddr->spkt_device,skb->dev->name, 15); - saddr->spkt_protocol = skb->protocol; - } - - /* - * Free or return the buffer as appropriate. Again this hides all the - * races and re-entrancy issues from us. - */ - - skb_free_datagram(sk, skb); - - return(copied); -} - -/* - * This structure declares to the lower layer socket subsystem currently - * incorrectly embedded in the IP code how to behave. This interface needs - * a lot of work and will change. - */ - -struct proto packet_prot = -{ - (struct sock *)&packet_prot, /* sklist_next */ - (struct sock *)&packet_prot, /* sklist_prev */ - packet_close, /* close */ - NULL, /* connect */ - NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ - NULL, /* ioctl */ - packet_init, /* init */ - NULL, /* destroy */ - NULL, /* shutdown */ - NULL, /* setsockopt */ - NULL, /* getsockopt */ - packet_sendmsg, /* Sendmsg */ - packet_recvmsg, /* Recvmsg */ - packet_bind, /* bind */ - NULL, /* backlog_rcv */ - NULL, /* hash */ - NULL, /* unhash */ - NULL, /* rehash */ - NULL, /* good_socknum */ - NULL, /* verify_bind */ - 128, /* max_header */ - 0, /* retransmits */ - "PACKET", /* name */ - 0, /* inuse */ - 0 /* highestinuse */ -}; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0ce80fec4ef9..7f3b5f9bbb9b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: @(#)proc.c 1.0.5 05/27/93 + * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $ * * Authors: Fred N. van Kempen, * Gerald J. 
Heim, @@ -221,7 +221,6 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length, int du { /* From net/socket.c */ extern int socket_get_info(char *, char **, off_t, int); - extern struct proto packet_prot; int len = socket_get_info(buffer,start,offset,length); @@ -231,8 +230,6 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length, int du udp_prot.inuse, udp_prot.highestinuse); len += sprintf(buffer+len,"RAW: inuse %d highest %d\n", raw_prot.inuse, raw_prot.highestinuse); - len += sprintf(buffer+len,"PAC: inuse %d highest %d\n", - packet_prot.inuse, packet_prot.highestinuse); if (offset >= len) { *start = buffer; @@ -291,14 +288,15 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dumm icmp_statistics.IcmpOutAddrMasks, icmp_statistics.IcmpOutAddrMaskReps); len += sprintf (buffer + len, - "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs\n" - "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts\n" + "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", tcp_statistics.TcpRtoAlgorithm, tcp_statistics.TcpRtoMin, tcp_statistics.TcpRtoMax, tcp_statistics.TcpMaxConn, tcp_statistics.TcpActiveOpens, tcp_statistics.TcpPassiveOpens, tcp_statistics.TcpAttemptFails, tcp_statistics.TcpEstabResets, tcp_statistics.TcpCurrEstab, tcp_statistics.TcpInSegs, - tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs); + tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs, + tcp_statistics.TcpInErrs, tcp_statistics.TcpOutRsts); len += sprintf (buffer + len, "Udp: InDatagrams NoPorts InErrors OutDatagrams\nUdp: %lu %lu %lu %lu\n", diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 5c7d6ca759cb..b47480be5fcd 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ 
-5,7 +5,7 @@ * * INET protocol dispatch tables. * - * Version: @(#)protocol.c 1.0.5 05/25/93 + * Version: $Id: protocol.c,v 1.9 1997/10/29 20:27:34 kuznet Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -45,20 +45,23 @@ #include #include +#define IPPROTO_PREVIOUS NULL -#ifdef CONFIG_NET_IPIP +#ifdef CONFIG_IP_MULTICAST -static struct inet_protocol ipip_protocol = +static struct inet_protocol igmp_protocol = { - ipip_rcv, /* IPIP handler */ - ipip_err, /* TUNNEL error control */ - 0, /* next */ - IPPROTO_IPIP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "IPIP" /* name */ + igmp_rcv, /* IGMP handler */ + NULL, /* IGMP error control */ + IPPROTO_PREVIOUS, /* next */ + IPPROTO_IGMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IGMP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &igmp_protocol #endif @@ -66,52 +69,47 @@ static struct inet_protocol tcp_protocol = { tcp_v4_rcv, /* TCP handler */ tcp_v4_err, /* TCP error control */ -#ifdef CONFIG_NET_IPIP - &ipip_protocol, -#else - NULL, /* next */ -#endif + IPPROTO_PREVIOUS, IPPROTO_TCP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "TCP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &tcp_protocol + static struct inet_protocol udp_protocol = { udp_rcv, /* UDP handler */ udp_err, /* UDP error control */ - &tcp_protocol, /* next */ + IPPROTO_PREVIOUS, /* next */ IPPROTO_UDP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "UDP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &udp_protocol + static struct inet_protocol icmp_protocol = { icmp_rcv, /* ICMP handler */ NULL, /* ICMP error control */ - &udp_protocol, /* next */ + IPPROTO_PREVIOUS, /* next */ IPPROTO_ICMP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "ICMP" /* name */ }; -static struct inet_protocol igmp_protocol = -{ - igmp_rcv, /* IGMP handler */ - NULL, /* IGMP error control */ - &icmp_protocol, /* next */ - IPPROTO_IGMP, /* protocol ID */ - 0, /* copy */ - NULL, /* data 
*/ - "IGMP" /* name */ -}; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &icmp_protocol + -struct inet_protocol *inet_protocol_base = &igmp_protocol; +struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS; struct inet_protocol *inet_protos[MAX_INET_PROTOS] = { diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index d2e6ad5c4873..f7ab4ddc3e55 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -3,6 +3,8 @@ * Copyright (C) 1994 by Ross Martin * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche * + * $Id: rarp.c,v 1.21 1997/10/27 09:13:16 geert Exp $ + * * This module implements the Reverse Address Resolution Protocol * (RARP, RFC 903), which is used to convert low level addresses such * as ethernet addresses into high level addresses such as IP addresses. @@ -119,20 +121,20 @@ static void rarp_destroy(unsigned long ip_addr) struct rarp_table *entry; struct rarp_table **pentry; - cli(); + start_bh_atomic(); pentry = &rarp_tables; while ((entry = *pentry) != NULL) { if (entry->ip == ip_addr) { *pentry = entry->next; - sti(); + end_bh_atomic(); rarp_release_entry(entry); return; } pentry = &entry->next; } - sti(); + end_bh_atomic(); } /* @@ -144,7 +146,7 @@ static void rarp_destroy_dev(struct device *dev) struct rarp_table *entry; struct rarp_table **pentry; - cli(); + start_bh_atomic(); pentry = &rarp_tables; while ((entry = *pentry) != NULL) { @@ -156,7 +158,7 @@ static void rarp_destroy_dev(struct device *dev) else pentry = &entry->next; } - sti(); + end_bh_atomic(); } static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -176,6 +178,8 @@ static struct notifier_block rarp_dev_notifier={ NULL, 0 }; + +static int rarp_pkt_inited=0; static void rarp_init_pkt (void) { @@ -183,8 +187,19 @@ static void rarp_init_pkt (void) rarp_packet_type.type=htons(ETH_P_RARP); dev_add_pack(&rarp_packet_type); register_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=1; } +static void rarp_end_pkt(void) +{ + 
if(!rarp_pkt_inited) + return; + dev_remove_pack(&rarp_packet_type); + unregister_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=0; +} + + /* * Receive an arp request by the device layer. Maybe it should be * rewritten to use the incoming packet for the reply. The current @@ -199,6 +214,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type struct arphdr *rarp = (struct arphdr *) skb->data; unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr)); struct rarp_table *entry; + struct in_device *in_dev = dev->ip_ptr; long sip,tip; unsigned char *sha,*tha; /* s for "source", t for "target" */ @@ -207,7 +223,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd) - || dev->flags&IFF_NOARP) + || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list) { kfree_skb(skb, FREE_READ); return 0; @@ -256,7 +272,6 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type * Process entry. Use tha for table lookup according to RFC903. */ - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (!memcmp(entry->ha, tha, rarp->ar_hln)) break; @@ -264,13 +279,10 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if (entry != NULL) { sip=entry->ip; - sti(); - arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, dev->pa_addr, sha, + arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, in_dev->ifa_list->ifa_address, sha, dev->dev_addr, sha); } - else - sti(); kfree_skb(skb, FREE_READ); return 0; @@ -331,10 +343,10 @@ static int rarp_req_set(struct arpreq *req) * Is it reachable directly ? 
*/ - err = ip_route_output(&rt, ip, 0, 1, NULL); + err = ip_route_output(&rt, ip, 0, 1, 0); if (err) return err; - if (rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) { + if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { ip_rt_put(rt); return -EINVAL; } @@ -344,7 +356,6 @@ static int rarp_req_set(struct arpreq *req) * Is there an existing entry for this address? Find out... */ - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (entry->ip == ip) break; @@ -359,7 +370,6 @@ static int rarp_req_set(struct arpreq *req) GFP_ATOMIC); if (entry == NULL) { - sti(); return -ENOMEM; } if (initflag) @@ -368,21 +378,23 @@ static int rarp_req_set(struct arpreq *req) initflag=0; } + /* Block interrupts until table modification is finished */ + + cli(); entry->next = rarp_tables; rarp_tables = entry; } - + cli(); entry->ip = ip; entry->hlen = hlen; entry->htype = htype; memcpy(&entry->ha, &r.arp_ha.sa_data, hlen); entry->dev = dev; + sti(); /* Don't unlink if we have entries to serve. 
*/ MOD_INC_USE_COUNT; - sti(); - return 0; } @@ -417,14 +429,12 @@ static int rarp_req_get(struct arpreq *req) si = (struct sockaddr_in *) &r.arp_pa; ip = si->sin_addr.s_addr; - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (entry->ip == ip) break; if (entry == NULL) { - sti(); return -ENXIO; } @@ -434,7 +444,6 @@ static int rarp_req_get(struct arpreq *req) memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen); r.arp_ha.sa_family = entry->htype; - sti(); /* * Copy the information back @@ -483,6 +492,7 @@ int rarp_ioctl(unsigned int cmd, void *arg) return 0; } +#ifdef CONFIG_PROC_FS int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { int len=0; @@ -505,7 +515,6 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dumm pos+=size; len+=size; - cli(); for(entry=rarp_tables; entry!=NULL; entry=entry->next) { netip=htonl(entry->ip); /* switch to network order */ @@ -537,7 +546,6 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dumm if(pos>offset+length) break; } - sti(); } *start = buffer+(offset-begin); /* Start of wanted data */ @@ -553,11 +561,14 @@ struct proc_dir_entry proc_net_rarp = { 0, &proc_net_inode_operations, rarp_get_info }; +#endif __initfunc(void rarp_init(void)) { +#ifdef CONFIG_PROC_FS proc_net_register(&proc_net_rarp); +#endif rarp_ioctl_hook = rarp_ioctl; } @@ -572,7 +583,9 @@ int init_module(void) void cleanup_module(void) { struct rarp_table *rt, *rt_next; +#ifdef CONFIG_PROC_FS proc_net_unregister(PROC_NET_RARP); +#endif rarp_ioctl_hook = NULL; cli(); /* Destroy the RARP-table */ @@ -584,5 +597,6 @@ void cleanup_module(void) rt_next = rt->next; rarp_release_entry(rt); } + rarp_end_pkt(); } #endif diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0d51af255d7d..2f4de9fbda18 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. 
* - * Version: @(#)raw.c 1.0.4 05/25/93 + * Version: $Id: raw.c,v 1.32 1997/10/24 17:16:00 kuznet Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -126,7 +126,7 @@ static void raw_v4_rehash(struct sock *sk) /* Grumble... icmp and ip_input want to get at this... */ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr) + unsigned long raddr, unsigned long laddr, int dif) { struct sock *s = sk; @@ -135,7 +135,8 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, if((s->num == num) && !(s->dead && (s->state == TCP_CLOSE)) && !(s->daddr && s->daddr != raddr) && - !(s->rcv_saddr && s->rcv_saddr != laddr)) + !(s->rcv_saddr && s->rcv_saddr != laddr) && + !(s->bound_dev_if && s->bound_dev_if != dif)) break; /* gotcha */ } SOCKHASH_UNLOCK(); @@ -203,7 +204,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) struct rawfakehdr { - const unsigned char *from; + struct iovec *iov; u32 saddr; }; @@ -218,7 +219,7 @@ struct rawfakehdr static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen) { struct rawfakehdr *rfh = (struct rawfakehdr *) p; - return copy_from_user(to, rfh->from + offset, fraglen); + return memcpy_fromiovecend(to, rfh->iov, offset, fraglen); } /* @@ -229,8 +230,9 @@ static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned { struct rawfakehdr *rfh = (struct rawfakehdr *) p; - if (copy_from_user(to, rfh->from + offset, fraglen)) + if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen)) return -EFAULT; + if (offset==0) { struct iphdr *iph = (struct iphdr *)to; if (!iph->saddr) @@ -249,10 +251,8 @@ static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned return 0; } -static int raw_sendto(struct sock *sk, const unsigned char *from, - int len, struct msghdr *msg) +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) { - struct device *dev = NULL; struct ipcm_cookie ipc; struct rawfakehdr rfh; 
struct rtable *rt; @@ -302,9 +302,10 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, ipc.addr = sk->saddr; ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { - int tmp = ip_cmsg_send(msg, &ipc, &dev); + int tmp = ip_cmsg_send(msg, &ipc); if (tmp) return tmp; if (ipc.opt && sk->ip_hdrincl) { @@ -327,23 +328,27 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, } tos = RT_TOS(sk->ip_tos) | (sk->localroute || (msg->msg_flags&MSG_DONTROUTE)); - if (MULTICAST(daddr) && sk->ip_mc_index && dev==NULL) - err = ip_route_output_dev(&rt, daddr, rfh.saddr, tos, sk->ip_mc_index); - else - err = ip_route_output(&rt, daddr, rfh.saddr, tos, dev); + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!rfh.saddr) + rfh.saddr = sk->ip_mc_addr; + } + + err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); if (err) { if (free) kfree(ipc.opt); return err; } - if (rt->rt_flags&RTF_BROADCAST && !sk->broadcast) { + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) { if (free) kfree(ipc.opt); ip_rt_put(rt); return -EACCES; } - rfh.from = from; + rfh.iov = msg->msg_iov; rfh.saddr = rt->rt_src; if (!ipc.addr) ipc.addr = rt->rt_dst; @@ -363,56 +368,10 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, return err<0 ? err : len; } -/* - * Temporary - */ - -static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) -{ - if (msg->msg_iovlen==1) - return raw_sendto(sk, msg->msg_iov[0].iov_base,len, msg); - else { - /* - * For awkward cases we linearise the buffer first. In theory this is only frames - * whose iovec's don't split on 4 byte boundaries, and soon encrypted stuff (to keep - * skip happy). We are a bit more general about it. 
- */ - - unsigned char *buf; - int err; - if(len>65515) - return -EMSGSIZE; - buf=kmalloc(len, GFP_KERNEL); - if(buf==NULL) - return -ENOBUFS; - err = memcpy_fromiovec(buf, msg->msg_iov, len); - if (!err) - { - unsigned long fs; - fs=get_fs(); - set_fs(get_ds()); - err=raw_sendto(sk,buf,len, msg); - set_fs(fs); - } - else - err = -EFAULT; - - kfree_s(buf,len); - return err; - } -} - static void raw_close(struct sock *sk, unsigned long timeout) { sk->state = TCP_CLOSE; -#ifdef CONFIG_IP_MROUTE - if(sk==mroute_socket) - { - ipv4_config.multicast_route = 0; - mroute_close(sk); - mroute_socket=NULL; - } -#endif + ip_ra_control(sk, 0, NULL); sk->dead=1; destroy_sock(sk); } @@ -425,17 +384,17 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in))) return -EINVAL; - chk_addr_ret = __ip_chk_addr(addr->sin_addr.s_addr); - if(addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && - chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) { + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { #ifdef CONFIG_IP_TRANSPARENT_PROXY /* Superuser may bind to any address to allow transparent proxying. 
*/ - if(!suser()) + if(chk_addr_ret != RTN_UNICAST || !suser()) #endif return -EADDRNOTAVAIL; } sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; - if(chk_addr_ret == IS_MULTICAST || chk_addr_ret == IS_BROADCAST) + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ dst_release(sk->dst_cache); sk->dst_cache = NULL; @@ -448,7 +407,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) */ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) + int noblock, int flags,int *addr_len) { int copied=0; struct sk_buff *skb; @@ -500,6 +459,75 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, return err ? err : (copied); } +static int raw_init(struct sock *sk) +{ + struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4); + if (sk->num == IPPROTO_ICMP) { + memset(&tp->filter, 0, sizeof(tp->filter)); + + /* By default block ECHO and TIMESTAMP requests */ + + set_bit(ICMP_ECHO, &tp->filter); + set_bit(ICMP_TIMESTAMP, &tp->filter); + } + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen) +{ + int len; + + if (get_user(len,optlen)) + return -EFAULT; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len)) + return -EFAULT; + return 0; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return 
raw_seticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return raw_geticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} struct proto raw_prot = { (struct sock *)&raw_prot, /* sklist_next */ @@ -516,11 +544,11 @@ struct proto raw_prot = { #else NULL, /* ioctl */ #endif - NULL, /* init */ + raw_init, /* init */ NULL, /* destroy */ NULL, /* shutdown */ - ip_setsockopt, /* setsockopt */ - ip_getsockopt, /* getsockopt */ + raw_setsockopt, /* setsockopt */ + raw_getsockopt, /* getsockopt */ raw_sendmsg, /* sendmsg */ raw_recvmsg, /* recvmsg */ raw_bind, /* bind */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b55fb766608f..046c60beb338 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: @(#)route.c 1.0.14 05/31/93 + * Version: $Id: route.c,v 1.33 1997/10/24 17:16:08 kuznet Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -68,27 +68,27 @@ #include #include #include -#include #include #include -#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include #include #include -#include -#include #include -#include - -/* Compile time configuretion flags */ -#define CONFIG_IP_LOCAL_RT_POLICY 1 +#define RTprint(a...) printk(KERN_DEBUG a) -static void rt_run_flush(unsigned long); - static struct timer_list rt_flush_timer = - { NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush }; + { NULL, NULL, RT_FLUSH_DELAY, 0L, NULL }; /* * Interface to generic destination cache. 
@@ -108,6 +108,24 @@ struct dst_ops ipv4_dst_ops = ipv4_dst_destroy }; +__u8 ip_tos2prio[16] = { + TC_PRIO_FILLER, + TC_PRIO_BESTEFFORT, + TC_PRIO_FILLER, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER +}; /* * Route cache. @@ -162,8 +180,10 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt r->u.dst.dev ? r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.refcnt), - atomic_read(&r->u.dst.use), 0, + r->rt_flags, + atomic_read(&r->u.dst.use), + atomic_read(&r->u.dst.refcnt), + 0, (unsigned long)r->rt_src, (int)r->u.dst.pmtu, r->u.dst.window, (int)r->u.dst.rtt, r->key.tos, @@ -202,8 +222,6 @@ void ip_rt_check_expire() struct rtable *rth, **rthp; unsigned long now = jiffies; - start_bh_atomic(); - for (i=0; iu.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD || (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 && - atomic_read(&rth->u.dst.use) < atomic_read(&rth_next->u.dst.use))) { + atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) { #if RT_CACHE_DEBUG >= 2 printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst); #endif *rthp = rth_next; rth->u.rt_next = rth_next->u.rt_next; rth_next->u.rt_next = rth; - sti(); rthp = &rth_next->u.rt_next; continue; } rthp = &rth->u.rt_next; } } - - end_bh_atomic(); -} - - -void rt_cache_flush(int how) -{ - start_bh_atomic(); - if (rt_flush_timer.expires) { - if (jiffies - rt_flush_timer.expires > 0 || - rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2) - how = 1; - } - if (how) { - if (rt_flush_timer.expires) - del_timer(&rt_flush_timer); - rt_flush_timer.expires = 0; - end_bh_atomic(); - rt_run_flush(0); - return; - } - if (rt_flush_timer.expires) { - 
end_bh_atomic(); - return; - } - del_timer(&rt_flush_timer); - rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY; - add_timer(&rt_flush_timer); - end_bh_atomic(); } - -void rt_run_flush(unsigned long dummy) + +static void rt_run_flush(unsigned long dummy) { int i; struct rtable * rth, * next; @@ -313,6 +294,30 @@ void rt_run_flush(unsigned long dummy) #endif } } + +void rt_cache_flush(int delay) +{ + start_bh_atomic(); + if (delay && rt_flush_timer.function && + rt_flush_timer.expires - jiffies < delay) { + end_bh_atomic(); + return; + } + if (rt_flush_timer.function) { + del_timer(&rt_flush_timer); + rt_flush_timer.function = NULL; + } + if (delay == 0) { + end_bh_atomic(); + rt_run_flush(0); + return; + } + rt_flush_timer.function = rt_run_flush; + rt_flush_timer.expires = jiffies + delay; + add_timer(&rt_flush_timer); + end_bh_atomic(); +} + static void rt_garbage_collect(void) { @@ -327,7 +332,7 @@ static void rt_garbage_collect(void) /* * Garbage collection is pretty expensive, - * do not make it too frequently. + * do not make it too frequently, but just increase expire strength. 
*/ if (now - last_gc < 1*HZ) { expire >>= 1; @@ -342,7 +347,7 @@ static void rt_garbage_collect(void) continue; for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) { if (atomic_read(&rth->u.dst.use) || - (now - rth->u.dst.lastuse > expire)) + now - rth->u.dst.lastuse < expire) continue; atomic_dec(&rt_cache_size); *rthp = rth->u.rt_next; @@ -465,115 +470,94 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 prot void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct device *dev) { - int i; - int off_link = 0; - struct fib_info *fi; + int i, k; + struct in_device *in_dev = dev->ip_ptr; struct rtable *rth, **rthp; - u32 skeys[2] = { saddr, 0, }; - struct device *pdev = net_alias_main_dev(dev); + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; tos &= IPTOS_TOS_MASK; - if (new_gw == old_gw || !ipv4_config.accept_redirects + if (!in_dev || new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) goto reject_redirect; - if ((new_gw^dev->pa_addr)&dev->pa_mask) - off_link = 1; - - if (!ipv4_config.rfc1620_redirects) { - if (off_link) + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (ip_fib_check_default(new_gw, dev)) goto reject_redirect; - if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev)) + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) goto reject_redirect; } - fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL); - if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT)) - goto reject_redirect; - for (i=0; i<2; i++) { - unsigned hash = rt_hash_code(daddr, skeys[i], tos); + for (k=0; k<2; k++) { + unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); - rthp=&rt_hash_table[hash]; + rthp=&rt_hash_table[hash]; - while ( (rth = *rthp) != NULL) { - struct rtable *rt; + while ( (rth = *rthp) != NULL) { + struct rtable *rt; - if (rth->key.dst != daddr || - rth->key.src != skeys[i] || - 
rth->key.tos != tos || - rth->key.dst_dev != NULL || - rth->key.src_dev != NULL) { - rthp = &rth->u.rt_next; - continue; - } + if (rth->key.dst != daddr || + rth->key.src != skeys[i] || + rth->key.tos != tos || + rth->key.oif != ikeys[k] || + rth->key.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->rt_flags&RTF_REJECT || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; - rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); - if (rt == NULL) - return; + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (rt == NULL) + return; - /* - * Copy all the information. - */ - atomic_set(&rt->u.dst.refcnt, 1); - rt->u.dst.dev = dev; - rt->u.dst.input = rth->u.dst.input; - rt->u.dst.output = rth->u.dst.output; - rt->u.dst.pmtu = dev->mtu; - rt->u.dst.rtt = TCP_TIMEOUT_INIT; - rt->u.dst.window = 0; - atomic_set(&rt->u.dst.use, 1); - rt->u.dst.lastuse = jiffies; - - rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED; - rt->rt_flags &= ~RTF_GATEWAY; - if (new_gw != daddr) - rt->rt_flags |= RTF_GATEWAY; - - rt->rt_src = rth->rt_src; - rt->rt_dst = rth->rt_dst; - rt->rt_src_dev = rth->rt_src_dev; - rt->rt_spec_dst = rth->rt_spec_dst; - rt->key = rth->key; - - /* But gateway is different ... */ - rt->rt_gateway = new_gw; - - if (off_link) { - if (fi->fib_dev != dev && - net_alias_main_dev(fi->fib_dev) == pdev) - rt->u.dst.dev = fi->fib_dev; - } + /* + * Copy all the information. + */ + *rt = *rth; + atomic_set(&rt->u.dst.refcnt, 1); + atomic_set(&rt->u.dst.use, 1); + rt->u.dst.lastuse = jiffies; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... 
*/ + rt->rt_gateway = new_gw; + + if (!rt_ll_bind(rt)) { + ip_rt_put(rt); + rt_free(rt); + break; + } - if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) { + *rthp = rth->u.rt_next; + rt_free(rth); + rt = rt_intern_hash(hash, rt, ETH_P_IP); ip_rt_put(rt); - rt_free(rt); break; } - - *rthp = rth->u.rt_next; - rt_free(rth); - rt = rt_intern_hash(hash, rt, ETH_P_IP); - ip_rt_put(rt); - break; } } return; reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." "Path = %lX -> %lX, tos %02x\n", ntohl(old_gw), dev->name, ntohl(new_gw), ntohl(saddr), ntohl(daddr), tos); +#endif } @@ -585,7 +569,7 @@ void ip_rt_advice(struct rtable **rp, int advice) return; start_bh_atomic(); - if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) { + if ((rt = *rp) != NULL && (rt->rt_flags&RTCF_REDIRECTED)) { #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); #endif @@ -602,7 +586,7 @@ void ip_rt_advice(struct rtable **rp, int advice) * 1. The first RT_REDIRECT_NUMBER redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. - * 2. If we did not see a packets requiring redirects + * 2. If we did not see packets requiring redirects * during RT_REDIRECT_SILENCE, we assume that the host * forgot redirected route and start to send redirects again. 
* @@ -637,9 +621,12 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<errors)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); rt->last_error = jiffies; - if (ipv4_config.log_martians && ++rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) - printk(KERN_WARNING "host %08x/%s ignores redirects for %08x to %08x.\n", - rt->rt_src, rt->rt_src_dev->name, rt->rt_dst, rt->rt_gateway); + ++rt->errors; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) + printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n", + rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway); +#endif } } @@ -653,6 +640,9 @@ static int ip_error(struct sk_buff *skb) default: kfree_skb(skb, FREE_READ); return 0; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; case ENETUNREACH: code = ICMP_NET_UNREACH; break; @@ -668,37 +658,24 @@ static int ip_error(struct sk_buff *skb) return 0; } +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) { - if (old_mtu > 32000) - return 32000; - else if (old_mtu > 17914) - return 17914; - else if (old_mtu > 8166) - return 8166; - else if (old_mtu > 4352) - return 4352; - else if (old_mtu > 2002) - return 2002; - else if (old_mtu > 1492) - return 1492; - else if (old_mtu > 576) - return 576; - else if (old_mtu > 296) - return 296; - /* - * These two are not from the RFC but - * are needed for AMPRnet AX.25 paths. 
- */ - else if (old_mtu > 216) - return 216; - else if (old_mtu > 128) - return 128; + int i; + + for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; return 68; } - unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { int i; @@ -721,8 +698,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->key.tos == tos && - !rth->key.src_dev && - !(rth->rt_flags&RTF_NOPMTUDISC)) { + rth->key.iif == 0 && + !(rth->rt_flags&RTCF_NOPMTUDISC)) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -770,177 +747,227 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, return NULL; } -int -ip_check_mc(struct device *dev, u32 mc_addr) +static int ip_rt_bug(struct sk_buff *skb) { - struct ip_mc_list *ip_mc; + printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + kfree_skb(skb, FREE_WRITE); + return 0; +} - if (mc_addr==htonl(INADDR_ALLHOSTS_GROUP)) - return 1; +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. - for (ip_mc=dev->ip_mc_list; ip_mc; ip_mc=ip_mc->next) - if (ip_mc->multiaddr == mc_addr) - return 1; - return 0; + BTW remember: "addr" is allowed to be not aligned + in IP options! 
+ */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->key.iif == 0) { + memcpy(addr, &rt->rt_src, 4); + return; + } + if (fib_lookup(&rt->key, &res) == 0) { + src = FIB_RES_PREFSRC(res); + memcpy(addr, &src, 4); + return; + } + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); } -static int ip_rt_bug(struct sk_buff *skb) +static int +ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev, int our) { - kfree_skb(skb, FREE_WRITE); - printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = dev->ip_ptr; + + /* Primary sanity checks. */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) + return -EINVAL; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + return -EINVAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0) + return -EINVAL; + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = daddr; + rth->rt_src_map = saddr; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + + 
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); return 0; } /* - * This function is called ONLY FROM NET BH. No locking! - * * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * * Such approach solves two big problems: - * 1. Not simplex devices (if they exist 8)) are handled properly. + * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. */ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, - u8 tos, struct device *pdev) + u8 tos, struct device *dev) { - struct device * dev = pdev; - struct fib_info *fi = NULL; - struct fib_info *src_fi = NULL; + struct rt_key key; + struct fib_result res; + struct in_device *in_dev = dev->ip_ptr; + struct in_device *out_dev; unsigned flags = 0; - struct device *devout; struct rtable * rth; unsigned hash; - struct fib_result res; - u32 src_key = saddr; - u32 dst_key = daddr; - int err = -EINVAL; - int log = 0; + u32 spec_dst; + int err = -EINVAL; + + /* + * IP on this device is disabled. + */ - hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos); + if (!in_dev) + return -EINVAL; - /* Check for martians... */ + key.dst = daddr; + key.src = saddr; + key.tos = tos; + key.iif = dev->ifindex; + key.oif = 0; + key.scope = RT_SCOPE_UNIVERSE; + + hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) goto martian_source; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) - goto mc_input; - /* Accept zero addresses only to limited broadcast/multicasts; - * I even do not know to fix it or not. 
+ if (daddr == 0xFFFFFFFF) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) */ if (ZERONET(saddr)) goto martian_source; + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) goto martian_destination; /* - * Device is not yet initialized, accept all addresses as ours. + * Now we are ready to route packet. */ - if (ZERONET(dev->pa_addr)) - goto promisc_ip; - - /* - * Now we are able to route packet. - */ - if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) { - if (!IS_ROUTER) + if ((err = fib_lookup(&key, &res))) { + if (!IN_DEV_FORWARD(in_dev)) return -EINVAL; goto no_route; } - fi = res.f->fib_info; - flags = fi->fib_flags; - devout = fi->fib_dev; - - if (flags&RTF_NAT) { - daddr = htonl((ntohl(daddr)&((1<fib_gateway; - fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL); - if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST)) - return -EINVAL; - devout = fi->fib_dev; - flags = fi->fib_flags|RTCF_NAT|RTF_NAT; - } +#ifdef CONFIG_IP_ROUTE_NAT + /* Policy is applied before mapping destination, + but rerouting after map should be made with old source. 
+ */ - switch (res.fr->cl_action) { - case RTP_NAT: - /* Packet is from translated source; remember it */ - saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap; - flags |= RTCF_NAT; - break; - case RTP_MASQUERADE: - /* Packet is from masqueraded source; remember it */ - flags |= RTCF_MASQ; - break; - default: - } - log = res.fr->cl_flags&RTRF_LOG; + if (1) { + u32 src_map = saddr; + if (res.r) + src_map = fib_rules_policy(saddr, &res, &flags); - if (!(flags & RTF_LOCAL)) { - if (!IS_ROUTER || flags&RTF_NOFORWARD) - return -EINVAL; - } else { - fi = NULL; - devout = &loopback_dev; - if (flags&RTF_BROADCAST) - goto mc_input; + if (res.type == RTN_NAT) { + key.dst = fib_rules_map_destination(daddr, &res); + if (fib_lookup(&key, &res) || res.type != RTN_UNICAST) + return -EINVAL; + flags |= RTCF_DNAT; + } + key.src = src_map; } - -#ifndef CONFIG_IP_LOCAL_RT_POLICY - if (flags&RTF_LOCAL) - src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL); - else #endif - if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) { - src_fi = res.f->fib_info; - /* Destination is on masqueraded network: - * if it is real incoming frame, ip_forward will drop it. - */ - if (res.fr->cl_flags&RTRF_VALVE) - flags |= RTCF_VALVE; - } - if (src_fi) { - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + spec_dst = daddr; + if (inet_addr_type(saddr) != RTN_UNICAST) goto martian_source; + goto local_input; + } - if (!(src_fi->fib_flags&RTF_GATEWAY)) - flags |= RTCF_DIRECTSRC; + if (!IN_DEV_FORWARD(in_dev)) + return -EINVAL; + if (res.type != RTN_UNICAST) + goto martian_destination; - if (net_alias_main_dev(src_fi->fib_dev) == pdev) - skb->dev = dev = src_fi->fib_dev; - else { - /* Route to packet source goes via - different interface; rfc1812 proposes - to drop them. - It is dangerous on not-stub/transit networks - because of path asymmetry. 
- */ - if (ipv4_config.rfc1812_filter >= 2) - goto martian_source; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); +#endif + out_dev = FIB_RES_DEV(res)->ip_ptr; - /* Weaker form of rfc1812 filtering. - If source is on directly connected network, - it can mean either local network configuration error - (the most probable case) or real IP spoofing attempt. - */ - if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC)) - goto martian_source; - } - } else if (ipv4_config.rfc1812_filter >= 1) + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst); + if (err < 0) goto martian_source; -make_route: + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags&RTCF_NAT) && + (IN_DEV_SHARED_MEDIA(out_dev) + || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) + flags |= RTCF_DOREDIRECT; + if (skb->protocol != __constant_htons(ETH_P_IP)) { - /* ARP request. Do not make route for invalid destination or - * if it is redirected. + /* Not IP (i.e. ARP). Do not make route for invalid + * destination or if it is redirected. 
*/ - if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) || - skb->pkt_type == PACKET_OTHERHOST || - (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT)))) + if (out_dev == in_dev && flags&RTCF_DOREDIRECT) return -EINVAL; } @@ -948,147 +975,105 @@ make_route: if (!rth) return -ENOBUFS; - rth->u.dst.output= ip_rt_bug; - atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; - rth->rt_dst = dst_key; - rth->rt_dst_map = daddr; + rth->key.dst = daddr; + rth->rt_dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->rt_src = src_key; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev; - rth->key.src_dev= pdev; - rth->u.dst.dev = devout; - rth->key.dst_dev= NULL; + rth->key.src = saddr; + rth->rt_src = saddr; rth->rt_gateway = daddr; - rth->rt_spec_dst= daddr; - - if (!(flags&RTF_REJECT)) { - if (flags&RTF_LOCAL) - rth->u.dst.input= ip_local_deliver; - if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) { - if (flags&RTF_MULTICAST) { -#ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_output; - } +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_src_map = key.src; + rth->rt_dst_map = key.dst; + if (flags&RTCF_DNAT) + rth->rt_gateway = key.dst; #endif - } else if (!(flags&RTF_LOCAL)) { - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - } - } - } else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; - } - - if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL)) - rth->rt_spec_dst= dev->pa_addr; + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = out_dev->dev; + rth->key.oif = 0; + rth->rt_spec_dst= spec_dst; - if (fi) { - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; - if (flags & RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - } else { - rth->u.dst.pmtu = devout->mtu; - rth->u.dst.window=0; - rth->u.dst.rtt = TCP_TIMEOUT_INIT; - } 
+ rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; - if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) && - flags&RTCF_DIRECTSRC && - (devout == dev || (ipv4_config.rfc1620_redirects && - net_alias_main_dev(devout) == pdev))) - flags |= RTCF_DOREDIRECT; + rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); rth->rt_flags = flags; + rth->rt_type = res.type; - if (log) - printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst)); - - if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) { - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); - return 0; - } - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol)); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); return 0; -mc_input: +brd_input: if (skb->protocol != __constant_htons(ETH_P_IP)) return -EINVAL; if (ZERONET(saddr)) { - if (!ipv4_config.bootp_agent) - goto martian_source; - flags |= RTF_NOFORWARD|RTF_LOCAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL); - if (!src_fi) - goto martian_source; - - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst); + if (err < 0) goto martian_source; - - if (!(src_fi->fib_flags&RTF_GATEWAY)) + if (err) flags |= RTCF_DIRECTSRC; - - if (!MULTICAST(daddr) || !ipv4_config.multicast_route || - LOCAL_MCAST(daddr)) { - if (net_alias_main_dev(src_fi->fib_dev) == pdev) { - skb->dev = dev = src_fi->fib_dev; - } else { - /* Fascist not-unicast filtering 8) */ - goto martian_source; - } - } - } - - if (!MULTICAST(daddr)) { - flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD; - 
devout = dev; - goto make_route; } + flags |= RTCF_BROADCAST; - flags |= RTF_MULTICAST|RTF_LOCAL; +local_input: + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; - if (ip_check_mc(dev, daddr) == 0) { - flags &= ~RTF_LOCAL; + rth->u.dst.output= ip_rt_bug; - if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI)) - goto no_route; + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + if (res.type == RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= err; } - devout = dev; - goto make_route; - -promisc_ip: - flags |= RTF_LOCAL|RTF_NOFORWARD; - if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else - flags |= RTF_BROADCAST; - devout = dev; - goto make_route; + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + return 0; no_route: - flags |= RTF_REJECT; - devout = dev; - goto make_route; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name); +#endif return -EINVAL; martian_source: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) { /* * RFC1812 recommenadtion, if source is martian, @@ -1104,6 +1089,7 @@ martian_source: printk("\n"); } } +#endif return -EINVAL; } @@ -1112,224 +1098,298 @@ int 
ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, { struct rtable * rth; unsigned hash; - - if (skb->dst) - return 0; - -#if RT_CACHE_DEBUG >= 1 - if (dev->flags & IFF_LOOPBACK) { - printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n"); - return -EINVAL; - } - if (net_alias_main_dev(dev) != dev) - printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name); -#endif + int iif = dev->ifindex; tos &= IPTOS_TOS_MASK; - hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos); - skb->dev = dev; + hash = rt_hash_code(daddr, saddr^(iif<<5), tos); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == dev && - rth->key.dst_dev == NULL && + rth->key.iif == iif && + rth->key.oif == 0 && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); skb->dst = (struct dst_entry*)rth; - skb->dev = rth->rt_src_dev; return 0; } } + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. 
+ */ + if (MULTICAST(daddr)) { + int our = ip_check_mc(dev, daddr); + if (!our +#ifdef CONFIG_IP_MROUTE + && (LOCAL_MCAST(daddr) || !dev->ip_ptr || + !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr)) +#endif + ) return -EINVAL; + return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); + } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } - /* * Major route resolver routine. */ -int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, - struct device *dev_out) +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { - u32 src_key = saddr; - u32 dst_key = daddr; - u32 dst_map; - struct device *dst_dev_key = dev_out; + struct rt_key key; + struct fib_result res; unsigned flags = 0; - struct fib_info *fi = NULL; struct rtable *rth; -#ifdef CONFIG_IP_LOCAL_RT_POLICY - struct fib_result res; -#endif + struct device *dev_out = NULL; unsigned hash; tos &= IPTOS_TOS_MASK|1; + key.dst = daddr; + key.src = saddr; + key.tos = tos&IPTOS_TOS_MASK; + key.iif = loopback_dev.ifindex; + key.oif = oif; + key.scope = (tos&1) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + res.fi = NULL; if (saddr) { - if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) || - __ip_chk_addr(saddr) != IS_MYADDR) + if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) return -EINVAL; - if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) - dev_out = ip_dev_find(saddr, NULL); + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(saddr); + if (dev_out == NULL) + return -EINVAL; + + /* I removed check for oif == dev_out->oif here. + It was wrong by three reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. 
--ANK + */ + + if (oif == 0 && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_TXINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + key.oif = dev_out->ifindex; + goto make_route; + } + dev_out = NULL; } - if (!daddr) - daddr = saddr; - - if (dev_out) { - if (!saddr) { - saddr = dev_out->pa_addr; - if (!daddr) - daddr = saddr; + if (oif) { + dev_out = dev_get_by_index(oif); + if (dev_out == NULL) + return -ENODEV; + if (dev_out->ip_ptr == NULL) + return -ENODEV; /* Wrong error code */ + + if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) { + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; } - dst_map = daddr; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) + if (MULTICAST(daddr)) { + key.src = inet_select_addr(dev_out, 0, key.scope); goto make_route; + } + if (!daddr) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (!daddr) - daddr = htonl(INADDR_LOOPBACK); + if (!key.dst) { + key.dst = key.src; + if (!key.dst) + key.dst = key.src = htonl(INADDR_LOOPBACK); + dev_out = &loopback_dev; + key.oif = loopback_dev.ifindex; + flags |= RTCF_LOCAL; + goto make_route; + } -#ifdef CONFIG_IP_LOCAL_RT_POLICY - if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out)) + if (fib_lookup(&key, &res)) { + res.fi = NULL; + if (oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. 
+ Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, no matter of routing tables + of ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. + */ + + printk(KERN_DEBUG "Dest not on link. Forcing...\n"); + if (key.src == 0) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; + } return -ENETUNREACH; - fi = res.f->fib_info; - dst_map = daddr; + } - if (fi->fib_flags&RTF_NAT) + if (res.type == RTN_NAT) return -EINVAL; - if (!saddr) { - saddr = fi->fib_dev->pa_addr; + if (!key.src) { + key.src = FIB_RES_PREFSRC(res); + +#ifdef CONFIG_IP_MULTIPLE_TABLES /* * "Stabilization" of route. * This step is necessary, if locally originated packets - * are subjected to source routing, else we could get + * are subjected to policy routing, otherwise we could get * route flapping. 
*/ - fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out); - if (!fi) + if (fib_lookup(&key, &res)) return -ENETUNREACH; +#endif } -#else - fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out); - if (!fi) - return -ENETUNREACH; - - if (fi->fib_flags&RTF_NAT) - return -EINVAL; - dst_map = daddr; - if (!saddr) - saddr = fi->fib_dev->pa_addr; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); #endif - flags |= fi->fib_flags; - dev_out = fi->fib_dev; + dev_out = FIB_RES_DEV(res); - if (RT_LOCALADDR(flags)) { + if (res.type == RTN_LOCAL) { dev_out = &loopback_dev; - fi = NULL; + key.oif = dev_out->ifindex; + res.fi = NULL; + flags |= RTCF_LOCAL; } - if (dst_dev_key && dev_out != dst_dev_key) - return -EINVAL; + key.oif = dev_out->ifindex; make_route: - if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) { - printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr); + if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) { + printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst); return -EINVAL; } - if (daddr == 0xFFFFFFFF) - flags |= RTF_BROADCAST; - else if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else if (BADCLASS(daddr) || ZERONET(daddr)) + if (key.dst == 0xFFFFFFFF) + res.type = RTN_BROADCAST; + else if (MULTICAST(key.dst)) + res.type = RTN_MULTICAST; + else if (BADCLASS(key.dst) || ZERONET(key.dst)) return -EINVAL; - if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK || - !(dev_out->flags&IFF_BROADCAST))) - flags &= ~RTF_LOCAL; - else if (flags&RTF_MULTICAST) { + if (res.type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST; + if (!(dev_out->flags&IFF_LOOPBACK) && dev_out->flags&IFF_BROADCAST) + flags |= RTCF_LOCAL; + } else if (res.type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST; if (ip_check_mc(dev_out, daddr)) - flags |= RTF_LOCAL; + flags |= RTCF_LOCAL; } - + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); if (!rth) return -ENOBUFS; 
atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; + rth->key.dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->key.src_dev= NULL; - rth->key.dst_dev= dst_dev_key; - rth->rt_dst = daddr; - rth->rt_dst_map = dst_map; - rth->rt_src = saddr; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev_out; + rth->key.src = saddr; + rth->key.iif = 0; + rth->key.oif = oif; + rth->rt_dst = key.dst; + rth->rt_src = key.src; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = dev_out->ifindex; rth->u.dst.dev = dev_out; - rth->rt_gateway = dst_map; - rth->rt_spec_dst= dev_out->pa_addr; + rth->rt_gateway = key.dst; + rth->rt_spec_dst= key.src; rth->u.dst.output=ip_output; - if (flags&RTF_LOCAL) { + if (flags&RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = daddr; + rth->rt_spec_dst = key.dst; } - if (flags&(RTF_BROADCAST|RTF_MULTICAST)) { - rth->rt_spec_dst = dev_out->pa_addr; - flags &= ~RTF_GATEWAY; - if (flags&RTF_LOCAL) + if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + rth->rt_spec_dst = key.src; + if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) rth->u.dst.output = ip_mc_output; - if (flags&RTF_MULTICAST) { - if (dev_out->flags&IFF_ALLMULTI) - rth->u.dst.output = ip_mc_output; #ifdef CONFIG_IP_MROUTE - if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr)) + if (res.type == RTN_MULTICAST && dev_out->ip_ptr) { + struct in_device *in_dev = dev_out->ip_ptr; + if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) { rth->u.dst.input = ip_mr_input; -#endif + rth->u.dst.output = ip_mc_output; + } } +#endif } - if (fi) { - if (flags&RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; + if (res.fi) { + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); + rth->u.dst.pmtu = res.fi->fib_mtu ? 
: dev_out->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; } else { rth->u.dst.pmtu = dev_out->mtu; rth->u.dst.window=0; rth->u.dst.rtt = TCP_TIMEOUT_INIT; } rth->rt_flags = flags; - hash = rt_hash_code(dst_key, dst_dev_key ? src_key^(dst_dev_key->ifindex<<5) : src_key, tos); + rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); *rp = rt_intern_hash(hash, rth, ETH_P_IP); return 0; } -int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out) +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { unsigned hash; struct rtable *rth; - hash = rt_hash_code(daddr, dev_out ? saddr^(dev_out->ifindex<<5) - : saddr, tos); + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); start_bh_atomic(); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.dst_dev == dev_out && + rth->key.iif == 0 && + rth->key.oif == oif && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); @@ -1341,48 +1401,126 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct dev } end_bh_atomic(); - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); + return ip_route_output_slow(rp, daddr, saddr, tos, oif); } -int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int ifindex) +#ifdef CONFIG_RTNETLINK + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { - unsigned hash; - struct rtable *rth; - struct device *dev_out; + struct kern_rta *rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int err; + struct sk_buff *skb; + u8 *o; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; - hash = rt_hash_code(daddr, saddr^(ifindex<<5), tos); + /* Reserve room for dummy headers, 
this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta->rta_dst) + memcpy(&dst, rta->rta_dst, 4); + if (rta->rta_src) + memcpy(&src, rta->rta_src, 4); + + if (rta->rta_iif) { + struct device *dev; + dev = dev_get_by_index(*rta->rta_iif); + if (!dev) + return -ENODEV; + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = dev; + start_bh_atomic(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + end_bh_atomic(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = rt->u.dst.error; + } else { + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, + rta->rta_oif ? *rta->rta_oif : 0); + } + if (err) { + kfree_skb(skb, FREE_WRITE); + return err; + } - start_bh_atomic(); - for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { - if (rth->key.dst == daddr && - rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.tos == tos && - rth->key.dst_dev && - rth->key.dst_dev->ifindex == ifindex) { - rth->u.dst.lastuse = jiffies; - atomic_inc(&rth->u.dst.use); - atomic_inc(&rth->u.dst.refcnt); - end_bh_atomic(); - *rp = rth; - return 0; + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = rt->key.tos; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_type = rt->rt_type; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; + rtm->rtm_nhs = 0; + + o = skb->tail; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 
4, &rt->rt_gateway); + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + rtm->rtm_optlen = skb->tail - o; + if (rta->rta_iif) { +#ifdef CONFIG_IP_MROUTE + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_config.multicast_route) { + NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; + err = ipmr_get_route(skb, rtm); + if (err <= 0) + return err; + } else +#endif + { + RTA_PUT(skb, RTA_IIF, 4, rta->rta_iif); + rtm->rtm_optlen = skb->tail - o; } } - end_bh_atomic(); + nlh->nlmsg_len = skb->tail - (u8*)nlh; + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err < 0) + return err; + return 0; - dev_out = dev_get_by_index(ifindex); - if (!dev_out) - return -ENODEV; - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); +nlmsg_failure: +rtattr_failure: + kfree_skb(skb, FREE_WRITE); + return -EMSGSIZE; } -void ip_rt_multicast_event(struct device *dev) +#endif /* CONFIG_RTNETLINK */ + +void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(1*HZ); } __initfunc(void ip_rt_init(void)) { + devinet_init(); ip_fib_init(); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c175f30f3372..d3e018be8154 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.2 1997/08/22 19:15:08 freitag Exp $ + * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $ * * Missing: IPv6 support. * Some counter so that the Administrator can see when the machine @@ -200,9 +200,11 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) * no easy way to do this. */ if (ip_route_output(&rt, - opt && opt->srr ? 
opt->faddr : - req->af.v4_req.rmt_addr,req->af.v4_req.loc_addr, - sk->ip_tos, NULL)) { + opt && + opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, + sk->ip_tos, + 0)) { tcp_openreq_free(req); return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e710235a103f..f49514171c8e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,6 +1,8 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * + * $Id: sysctl_net_ipv4.c,v 1.21 1997/10/17 01:21:18 davem Exp $ + * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ @@ -36,16 +38,15 @@ extern int sysctl_arp_confirm_interval; extern int sysctl_arp_confirm_timeout; extern int sysctl_arp_max_pings; +/* From icmp.c */ +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; + /* From ip_fragment.c */ extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_time; -/* From igmp.c */ -extern int sysctl_igmp_max_host_report_delay; -extern int sysctl_igmp_timer_scale; -extern int sysctl_igmp_age_threshold; - extern int sysctl_tcp_cong_avoidance; extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_sack; @@ -65,6 +66,13 @@ extern int sysctl_tcp_stdurg; extern int sysctl_tcp_syn_taildrop; extern int sysctl_max_syn_backlog; +/* From icmp.c */ +extern int sysctl_icmp_sourcequench_time; +extern int sysctl_icmp_destunreach_time; +extern int sysctl_icmp_timeexceed_time; +extern int sysctl_icmp_paramprob_time; +extern int sysctl_icmp_echoreply_time; + int tcp_retr1_max = 255; extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, @@ -77,6 +85,7 @@ struct ipv4_config ipv4_config = { 1, 1, 1, 0, }; struct ipv4_config ipv4_def_router_config = { 0, 1, 1, 1, 1, 1, 1, }; struct ipv4_config ipv4_def_host_config = { 1, 1, 1, 0, }; +static int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct 
file * filp, void *buffer, size_t *lenp) { @@ -95,6 +104,15 @@ int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp, return ret; } +static +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + if (write) + rt_cache_flush(0); + return 0; +} + ctl_table ipv4_table[] = { {NET_IPV4_ARP_RES_TIME, "arp_res_time", &sysctl_arp_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -147,17 +165,17 @@ ctl_table ipv4_table[] = { {NET_IPV4_SOURCE_ROUTE, "ip_source_route", &ipv4_config.source_route, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ADDRMASK_AGENT, "ip_addrmask_agent", - &ipv4_config.addrmask_agent, sizeof(int), 0644, NULL, + {NET_IPV4_SEND_REDIRECTS, "ip_send_redirects", + &ipv4_config.send_redirects, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_BOOTP_AGENT, "ip_bootp_agent", - &ipv4_config.bootp_agent, sizeof(int), 0644, NULL, + {NET_IPV4_AUTOCONFIG, "ip_autoconfig", + &ipv4_config.autoconfig, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_BOOTP_RELAY, "ip_bootp_relay", &ipv4_config.bootp_relay, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_FIB_MODEL, "ip_fib_model", - &ipv4_config.fib_model, sizeof(int), 0644, NULL, + {NET_IPV4_PROXY_ARP, "ip_proxy_arp", + &ipv4_config.proxy_arp, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc", &ipv4_config.no_pmtu_disc, sizeof(int), 0644, NULL, @@ -171,6 +189,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_RFC1620_REDIRECTS, "ip_rfc1620_redirects", &ipv4_config.rfc1620_redirects, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_RTCACHE_FLUSH, "ip_rtcache_flush", + NULL, sizeof(int), 0644, NULL, + &ipv4_sysctl_rtcache_flush}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", @@ -197,17 +218,6 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout", 
&sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, - {NET_IPV4_IGMP_MAX_HOST_REPORT_DELAY, "igmp_max_host_report_delay", - &sysctl_igmp_max_host_report_delay, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_IGMP_TIMER_SCALE, "igmp_timer_scale", - &sysctl_igmp_timer_scale, sizeof(int), 0644, NULL, &proc_dointvec}, -#if 0 - /* This one shouldn't be exposed to the user (too implementation - specific): */ - {NET_IPV4_IGMP_AGE_THRESHOLD, "igmp_age_threshold", - &sysctl_igmp_age_threshold, sizeof(int), 0644, NULL, &proc_dointvec}, -#endif #ifdef CONFIG_SYN_COOKIES {NET_TCP_SYNCOOKIES, "tcp_syncookies", &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -218,6 +228,25 @@ ctl_table ipv4_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range", + &sysctl_local_port_range, sizeof(sysctl_local_port_range), 0644, + NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all", + &sysctl_icmp_echo_ignore_all, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts", + &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_SOURCEQUENCH_RATE, "icmp_sourcequench_rate", + &sysctl_icmp_sourcequench_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate", + &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate", + &sysctl_icmp_timeexceed_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_PARAMPROB_RATE, "icmp_paramprob_rate", + &sysctl_icmp_paramprob_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate", + &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, 
&proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b62035e3bb9c..eff309bcf8d9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.71 1997/09/06 05:11:45 davem Exp $ + * Version: $Id: tcp.c,v 1.75 1997/10/16 02:57:34 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -437,8 +437,8 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp, struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; while(req) { if (req->sk && - (req->sk->state == TCP_ESTABLISHED || - req->sk->state >= TCP_FIN_WAIT1)) + ((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) break; prev = req; req = req->dl_next; @@ -603,7 +603,7 @@ unsigned int tcp_poll(struct socket *sock, poll_table *wait) if (sk->err) mask = POLLERR; /* Connected? */ - if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { + if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; @@ -653,7 +653,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { unsigned long amount; - if (sk->state == TCP_LISTEN) return(-EINVAL); + if (sk->state == TCP_LISTEN) + return(-EINVAL); amount = sock_wspace(sk); return put_user(amount, (int *)arg); } @@ -701,7 +702,8 @@ static void wait_for_tcp_connect(struct sock * sk) { release_sock(sk); cli(); - if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0) + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && + sk->err == 0) interruptible_sleep_on(sk->sleep); sti(); lock_sock(sk); @@ -779,11 +781,11 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* Wait for a connection to finish. 
*/ - while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) { + while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { if (sk->err) return sock_error(sk); - if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { + if ((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (sk->keepopen) send_sig(SIGPIPE, current, 0); return -EPIPE; @@ -982,7 +984,7 @@ void tcp_read_wakeup(struct sock *sk) /* If we're closed, don't send an ack, or we'll get a RST * from the closed destination. */ - if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT)) + if ((1 << sk->state) & (TCPF_CLOSE|TCPF_TIME_WAIT)) return; tcp_send_ack(sk); @@ -1400,10 +1402,8 @@ void tcp_shutdown(struct sock *sk, int how) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (sk->state == TCP_ESTABLISHED || - sk->state == TCP_SYN_SENT || - sk->state == TCP_SYN_RECV || - sk->state == TCP_CLOSE_WAIT) { + if ((1 << sk->state) & + (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { lock_sock(sk); /* Flag that the sender has shutdown. */ @@ -1424,9 +1424,7 @@ void tcp_shutdown(struct sock *sk, int how) static inline int closing(struct sock * sk) { - return ((1 << sk->state) & ((1 << TCP_FIN_WAIT1)| - (1 << TCP_CLOSING)| - (1 << TCP_LAST_ACK))); + return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7c6fbec56d86..e9f936f82ec4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.56 1997/08/31 08:24:54 freitag Exp $ + * Version: $Id: tcp_input.c,v 1.64 1997/10/30 23:52:24 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -64,6 +64,8 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, #define SYNC_INIT 1 #endif +extern int sysctl_tcp_fin_timeout; + int sysctl_tcp_cong_avoidance; int sysctl_tcp_hoe_retransmits; int sysctl_tcp_sack; @@ -249,7 +251,7 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) * really. */ -static int tcp_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; @@ -285,8 +287,6 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) #endif if (!sk->dead) sk->state_change(sk); - - return(0); } /* @@ -345,15 +345,16 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) /* Cheaper to set again then to * test syn. Optimize this? */ - if (sysctl_tcp_timestamps && !no_fancy) + if (sysctl_tcp_timestamps && !no_fancy) { tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } } break; case TCPOPT_SACK: - if (no_fancy) + if (no_fancy || !sysctl_tcp_sack) break; tp->sacks = (opsize-2)>>3; if (tp->sacks<<3 == opsize-2) { @@ -486,8 +487,10 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) #define FLAG_WIN_UPDATE 0x02 #define FLAG_DATA_ACKED 0x04 -static __inline__ void clear_fast_retransmit(struct sock *sk) { +static __inline__ void clear_fast_retransmit(struct sock *sk) +{ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if (tp->dup_acks > 3) { tp->retrans_head = NULL; tp->snd_cwnd = max(tp->snd_ssthresh, 1); @@ -857,8 +860,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, tcp_ack_probe(sk, ack); /* See if we can take anything off of the retransmit queue. 
*/ - if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt)) - flag |= FLAG_DATA_ACKED; + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { @@ -879,7 +881,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } } else { tcp_set_rto(tp); - if (flag && FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } /* NOTE: safe here so long as cong_ctl doesn't use rto */ @@ -973,6 +975,11 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if(sk->state == TCP_SYN_SENT) { + /* RFC793 says to drop the segment and return. */ + return 1; + } + /* XXX This fin_seq thing should disappear... -DaveM */ tp->fin_seq = skb->end_seq; @@ -985,7 +992,6 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) switch(sk->state) { case TCP_SYN_RECV: - case TCP_SYN_SENT: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); @@ -999,12 +1005,16 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * nothing. */ break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; case TCP_TIME_WAIT: /* Received a retransmission of the FIN, * restart the TIME_WAIT timer. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); + break; + case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and @@ -1028,15 +1038,13 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* Already in CLOSE. */ break; default: - /* FIXME: Document whats happening in this case. -DaveM */ - tcp_set_state(sk,TCP_LAST_ACK); - - /* Start the timers. */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); + /* Only TCP_LISTEN is left, in that case we should never + * reach this piece of code. 
+ */ + printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); + break; }; - - return(0); + return 0; } /* This one checks to see if we can put data from the @@ -1337,8 +1345,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - tp = &(sk->tp_pinfo.af_tcp); - /* * RFC1323: H1. Apply PAWS check first. */ @@ -1373,6 +1379,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); } + tcp_statistics.TcpInErrs++; kfree_skb(skb, FREE_READ); return 0; } else if (skb->ack_seq == tp->snd_una) { @@ -1409,6 +1416,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if(th->syn && skb->seq != sk->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); + tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); return 1; } @@ -1430,7 +1438,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 8: check the FIN bit */ if (th->fin) - tcp_fin(skb, sk, th); + (void) tcp_fin(skb, sk, th); tcp_data_snd_check(sk); tcp_ack_snd_check(sk); @@ -1449,82 +1457,67 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Shared between IPv4 and IPv6 now. */ struct sock * -tcp_check_req(struct sock *sk, struct sk_buff *skb, void *opt) +tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *dummy, *req; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ - req = tp->af_specific->search_open_req(tp, (void *)skb->nh.raw, skb->h.th, - &dummy); - if (req) { - if (req->sk) { - /* socket already created but not - * yet accepted()... - */ - sk = req->sk; - } else { - u32 flg; - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); + if (req->sk) { + /* socket already created but not + * yet accepted()... 
+ */ + sk = req->sk; + } else { + u32 flg; - flg &= __constant_htonl(0x00170000); - if ((flg == __constant_htonl(0x00020000)) && - (!after(skb->seq, req->rcv_isn))) { + /* Check for syn retransmission */ + flg = *(((u32 *)skb->h.th) + 3); + + flg &= __constant_htonl(0x00170000); + /* Only SYN set? */ + if (flg == __constant_htonl(0x00020000)) { + if (!after(skb->seq, req->rcv_isn)) { /* retransmited syn. */ req->class->rtx_syn_ack(sk, req); return NULL; + } else { + return sk; /* New SYN */ } - - /* In theory the packet could be for a cookie, but - * TIME_WAIT should guard us against this. - * XXX: Nevertheless check for cookies? - */ - if (skb->ack_seq != req->snt_isn+1) { - tp->af_specific->send_reset(skb); - return NULL; - } - - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - if (sk == NULL) - return NULL; + } - req->expires = 0UL; - req->sk = sk; + /* We know it's an ACK here */ + /* In theory the packet could be for a cookie, but + * TIME_WAIT should guard us against this. + * XXX: Nevertheless check for cookies? + * This sequence number check is done again later, + * but we do it here to prevent syn flood attackers + * from creating big SYN_RECV sockets. 
+ */ + if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) || + !between(skb->seq, req->rcv_isn, + req->rcv_isn+1+req->rcv_wnd)) { + req->class->send_reset(skb); + return NULL; } - } -#ifdef CONFIG_SYNCOOKIES - else { - sk = tp->af_specific->cookie_check(sk, skb, opt); + + sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + tcp_dec_slow_timer(TCP_SLT_SYNACK); if (sk == NULL) - return NULL; + return NULL; + + req->expires = 0UL; + req->sk = sk; } -#endif skb_orphan(skb); skb_set_owner_r(skb, sk); return sk; } - -static void tcp_rst_req(struct tcp_opt *tp, struct sk_buff *skb) -{ - struct open_request *req, *prev; - - req = tp->af_specific->search_open_req(tp,skb->nh.iph,skb->h.th,&prev); - if (!req) - return; - /* Sequence number check required by RFC793 */ - if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) - return; - tcp_synq_unlink(tp, req, prev); -} - /* * This function implements the receiving procedure of RFC 793. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be @@ -1540,16 +1533,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { case TCP_LISTEN: - if (th->rst) { - tcp_rst_req(tp, skb); - goto discard; - } - /* These use the socket TOS.. 
* might want to be the received TOS */ - if(th->ack) - return 1; + if(th->ack) + return 1; if(th->syn) { if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0) @@ -1812,6 +1800,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) sk->state_change(sk); + else + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); } break; @@ -1870,8 +1860,10 @@ step6: } /* step 8: check the FIN bit */ - if (th->fin) - tcp_fin(skb, sk, th); + if (th->fin) { + if(tcp_fin(skb, sk, th) != 0) + goto discard; + } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8cb368947f6..6f62306ffc1c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.62 1997/09/04 22:34:59 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.74 1997/10/30 23:52:27 davem Exp $ * * IPv4 specific functions * @@ -88,6 +88,13 @@ struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; */ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) { @@ -116,6 +123,13 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. 
+ */ + + if(sk->bound_dev_if != sk2->bound_dev_if) + continue; + if(!sk2->rcv_saddr || !sk->rcv_saddr) { if((!sk2_reuse) || (!sk_reuse) || @@ -161,13 +175,15 @@ static __inline__ int tcp_lport_inuse(int num) */ unsigned short tcp_good_socknum(void) { - static int start = PROT_SOCK; + static int start = 0; static int binding_contour = 0; int best = 0; int size = 32767; /* a big num. */ int retval = 0, i, end, bc; SOCKHASH_LOCK(); + if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) + start = sysctl_local_port_range[0]; i = tcp_bhashfn(start); end = i + TCP_BHTABLE_SIZE; bc = binding_contour; @@ -207,8 +223,8 @@ verify: best = retval; /* mark the starting point to avoid infinite loops */ while(tcp_lport_inuse(retval)) { retval = tcp_bhashnext(retval,i); - if (retval > 32767) /* Upper bound */ - retval = tcp_bhashnext(PROT_SOCK,i); + if (retval > sysctl_local_port_range[1]) /* Upper bound */ + retval = tcp_bhashnext(sysctl_local_port_range[0],i); if (retval == best) { /* This hash chain is full. No answer. */ retval = 0; @@ -218,8 +234,6 @@ verify: done: start = (retval + 1); - if (start > 32767 || start < PROT_SOCK) - start = PROT_SOCK; SOCKHASH_UNLOCK(); return retval; @@ -301,20 +315,34 @@ static void tcp_v4_rehash(struct sock *sk) * connection. So always assume those are both wildcarded * during the search since they can never be otherwise. */ -static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum) +static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif) { struct sock *sk; struct sock *result = NULL; + int score, hiscore; + hiscore=0; for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) { if(sk->num == hnum) { __u32 rcv_saddr = sk->rcv_saddr; + score = 1; if(rcv_saddr) { - if(rcv_saddr == daddr) - return sk; /* Best possible match. 
*/ - } else if(!result) + if (rcv_saddr != daddr) + continue; + score++; + } + if (sk->bound_dev_if) { + if (sk->bound_dev_if != dif) + continue; + score++; + } + if (score == 3) + return sk; + if (score > hiscore) { + hiscore = score; result = sk; + } } } return result; @@ -324,7 +352,7 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum) * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM */ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, u32 daddr, u16 dport) + u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { unsigned short hnum = ntohs(dport); struct sock *sk; @@ -338,7 +366,8 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr) /* local address */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; /* You sunk my battleship! */ /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ @@ -346,17 +375,18 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr) /* local address */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; - sk = tcp_v4_lookup_listener(daddr, hnum); + sk = tcp_v4_lookup_listener(daddr, hnum, dif); hit: return sk; } -__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport) +__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { - return __tcp_v4_lookup(0, saddr, sport, daddr, dport); + return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif); } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -374,16 +404,25 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport #define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ secondlist((hpnum),(sk)->bind_next,(fpass)) -struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, - unsigned short rnum, unsigned long laddr, - unsigned long paddr, unsigned short pnum) +static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) { struct sock *s, *result = NULL; int badness = -1; + u32 paddr = 0; unsigned short hnum = ntohs(num); unsigned short hpnum = ntohs(pnum); int firstpass = 1; + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + /* This code must run only from NET_BH. 
*/ for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; @@ -408,7 +447,12 @@ struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(score == 3 && s->num == hnum) { + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { result = s; break; } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { @@ -486,7 +530,6 @@ out: int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sk_buff *buff; - struct sk_buff *skb1; int tmp; struct tcphdr *th; struct rtable *rt; @@ -517,11 +560,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - RT_TOS(sk->ip_tos)|(sk->localroute || 0)); + RT_TOS(sk->ip_tos)|(sk->localroute || 0), sk->bound_dev_if); if (tmp < 0) return tmp; - if (rt->rt_flags&(RTF_MULTICAST|RTF_BROADCAST)) { + if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } @@ -533,13 +576,22 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } lock_sock(sk); + + /* Do this early, so there is less state to unwind on failure. */ + buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); + if (buff == NULL) { + release_sock(sk); + ip_rt_put(rt); + return(-ENOBUFS); + } + sk->dst_cache = &rt->u.dst; sk->daddr = rt->rt_dst; if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (sk->priority == SOPRI_NORMAL) + if (sk->priority == 0) sk->priority = rt->u.dst.priority; sk->dummy_th.dest = usin->sin_port; @@ -557,20 +609,23 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->err = 0; - buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); - if (buff == NULL) { - release_sock(sk); - return(-ENOBUFS); - } - /* Put in the IP header and routing stuff. 
*/ tmp = ip_build_header(buff, sk); if (tmp < 0) { + /* Caller has done ip_rt_put(rt) and set sk->dst_cache + * to NULL. We must unwind the half built TCP socket + * state so that this failure does not create a "stillborn" + * sock (ie. future re-tries of connect() would fail). + */ + sk->daddr = 0; + sk->saddr = sk->rcv_saddr = 0; kfree_skb(buff, FREE_WRITE); release_sock(sk); return(-ENETUNREACH); } + /* No failure conditions can result past this point. */ + th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); buff->h.th = th; @@ -582,11 +637,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) th->ack = 0; th->syn = 1; - sk->mtu = rt->u.dst.pmtu; if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - rt->rt_flags&RTF_NOPMTUDISC)) && + rt->rt_flags&RTCF_NOPMTUDISC)) && rt->u.dst.pmtu > 576) sk->mtu = 576; @@ -639,8 +693,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->packets_out++; buff->when = jiffies; - skb1 = skb_clone(buff, GFP_KERNEL); - ip_queue_xmit(skb1); + ip_queue_xmit(skb_clone(buff, GFP_KERNEL)); /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); @@ -691,11 +744,10 @@ out: * This should be replaced with a global hash table. */ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, - void *header, - struct tcphdr *th, - struct open_request **prevp) + struct iphdr *iph, + struct tcphdr *th, + struct open_request **prevp) { - struct iphdr *iph = header; struct open_request *req, *prev; __u16 rport = th->source; @@ -750,7 +802,8 @@ static inline void do_pmtu_discovery(struct sock *sk, * dropped. This is the new "fast" path mtu * discovery. */ - tcp_simple_retransmit(sk); + if (!sk->sock_readers) + tcp_simple_retransmit(sk); } } } @@ -764,7 +817,7 @@ static inline void do_pmtu_discovery(struct sock *sk, * to find the appropriate port. 
*/ -void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) +void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) { struct iphdr *iph = (struct iphdr*)dp; struct tcphdr *th; @@ -773,18 +826,16 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) int code = skb->h.icmph->code; struct sock *sk; __u32 seq; + int opening; -#if 0 - /* check wrong - icmp.c should pass in len */ - if (skb->len < 8+(iph->ihl << 2)+sizeof(struct tcphdr)) { + if (len < (iph->ihl << 2)+sizeof(struct tcphdr)) { icmp_statistics.IcmpInErrors++; return; } -#endif th = (struct tcphdr*)(dp+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); if (sk == NULL) { icmp_statistics.IcmpInErrors++; return; @@ -793,19 +844,38 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) /* pointless, because we have no way to retry when sk is locked. But the socket should be really locked here for better interaction with the socket layer. This needs to be solved for SMP - (I would prefer an "ICMP backlog"). */ - /* lock_sock(sk); */ + (I would prefer an "ICMP backlog"). + + tcp_v4_err is called only from bh, so that lock_sock is pointless, + even in commented form :-) --ANK + + Note "for SMP" ;) -AK + + Couple of notes about backlogging: + - error_queue could be used for it. + - could, but MUST NOT :-), because: + a) it is not clear, + who will process deferred messages. + b) ICMP is not reliable by design, so that you can safely + drop ICMP messages. Besides that, if ICMP really arrived + it is very unlikely, that socket is locked. --ANK + + I don't think it's unlikely that sk is locked. With the + open_request stuff there is much more stress on the main + LISTEN socket. 
I just want to make sure that all ICMP unreachables + destroy unneeded open_requests as reliable as possible (for + syn flood protection) -AK + */ tp = &sk->tp_pinfo.af_tcp; - - seq = ntohl(th->seq); - #ifdef ICMP_PARANOIA - if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && + !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) { if (net_ratelimit()) printk(KERN_DEBUG "icmp packet outside the tcp window:" " s:%d %u,%u,%u\n", (int)sk->state, seq, tp->snd_una, tp->snd_nxt); - goto out; + return; } #endif @@ -814,7 +884,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh; tp->high_seq = tp->snd_nxt; - goto out; + return; case ICMP_PARAMETERPROB: sk->err=EPROTO; sk->error_report(sk); @@ -822,7 +892,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ do_pmtu_discovery(sk, iph, th); - goto out; + return; } break; } @@ -830,62 +900,62 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) /* If we've already connected we will keep trying * until we time out, or the user gives up. */ - if (code <= NR_ICMP_UNREACH) { - int fatal = 0; - - if (sk->state == TCP_LISTEN) { - struct open_request *req, *prev; - - /* Prevent race conditions with accept() - * icmp is unreliable. - * This is the easiest solution for now - for - * very big servers it might prove inadequate. - */ - if (sk->sock_readers) { - /* XXX: add a counter here to profile this. - * If too many ICMPs get dropped on busy - * servers this needs to be solved differently. 
- */ - goto out; - } + if (code > NR_ICMP_UNREACH) + return; - req = tcp_v4_search_req(tp, iph, th, &prev); - if (!req) - goto out; + opening = 0; + switch (sk->state) { + struct open_request *req, *prev; + case TCP_LISTEN: + /* Prevent race conditions with accept() - + * ICMP is unreliable. + */ + if (sk->sock_readers) { + /* XXX: add a counter here to profile this. + * If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + return; + } + + if (!th->syn && !th->ack) + return; + req = tcp_v4_search_req(tp, iph, th, &prev); + if (!req) + return; #ifdef ICMP_PARANOIA - if (seq != req->snt_isn) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp packet for openreq " - "with wrong seq number:%d:%d\n", - seq, req->snt_isn); - goto out; - } + if (seq != req->snt_isn) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet for openreq " + "with wrong seq number:%d:%d\n", + seq, req->snt_isn); + return; + } #endif - if (req->sk) { /* not yet accept()ed */ - sk = req->sk; - } else { - tcp_synq_unlink(tp, req, prev); - tcp_openreq_free(req); - fatal = 1; - } - } else if (sk->state == TCP_SYN_SENT - || sk->state == TCP_SYN_RECV) - fatal = 1; - - if(icmp_err_convert[code].fatal || fatal) { - sk->err = icmp_err_convert[code].errno; - if (fatal) { - tcp_statistics.TcpAttemptFails++; - if (sk->state != TCP_LISTEN) - tcp_set_state(sk,TCP_CLOSE); - sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - } - } else /* Only an error on timeout */ - sk->err_soft = icmp_err_convert[code].errno; + if (req->sk) { /* not yet accept()ed */ + sk = req->sk; /* report error in accept */ + } else { + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); + } + /* FALL THOUGH */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + opening = 1; + break; } - -out: - /* release_sock(sk); */ + + if(icmp_err_convert[code].fatal || opening) { + sk->err = icmp_err_convert[code].errno; + if (opening) { + 
tcp_statistics.TcpAttemptFails++; + if (sk->state != TCP_LISTEN) + tcp_set_state(sk,TCP_CLOSE); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + } + } else /* Only an error on timeout */ + sk->err_soft = icmp_err_convert[code].errno; } /* This routine computes an IPv4 TCP checksum. */ @@ -948,6 +1018,7 @@ static void tcp_v4_send_reset(struct sk_buff *skb) /* FIXME: should this carry an options packet? */ ip_queue_xmit(skb1); tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -962,7 +1033,7 @@ int tcp_chkaddr(struct sk_buff *skb) struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); struct sock *sk; - sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest); + sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest, skb->dev->ifindex); if (!sk) return 0; @@ -992,7 +1063,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) kfree_skb(skb, FREE_WRITE); return; } - + mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); if (sk->user_mss) mss = min(mss, sk->user_mss); @@ -1077,7 +1148,8 @@ int sysctl_tcp_syn_taildrop = 1; struct or_calltable or_ipv4 = { tcp_v4_send_synack, - tcp_v4_or_free + tcp_v4_or_free, + tcp_v4_send_reset }; #ifdef NEW_LISTEN @@ -1304,7 +1376,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (ip_route_output(&rt, newsk->opt && newsk->opt->srr ? 
newsk->opt->faddr : newsk->daddr, - newsk->saddr, newsk->ip_tos, NULL)) { + newsk->saddr, newsk->ip_tos, 0)) { sk_free(newsk); return NULL; } @@ -1359,6 +1431,57 @@ exit: return NULL; } +static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + + req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev); + if (!req) + return; + /* Sequence number check required by RFC793 */ + if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + return; + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); +} + +/* Check for embryonic sockets (open_requests) We check packets with + * only the SYN bit set against the open_request queue too: This + * increases connection latency a bit, but is required to detect + * retransmitted SYNs. + */ +static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + u32 flg = ((u32 *)th)[3]; + + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v4_rst_req(sk, skb); + return NULL; + } + + /* Check for SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *dummy; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Find possible connection requests. */ + req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy); + if (req) { + sk = tcp_check_req(sk, skb, req); + } +#ifdef CONFIG_SYN_COOKIES + else { + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + } +#endif + } + return sk; +} + int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { skb_set_owner_r(skb, sk); @@ -1368,49 +1491,42 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * is currently called with bh processing disabled. 
*/ lock_sock(sk); - + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; - } else { - /* Check for embryonic sockets (open_requests) - * We check packets with only the SYN bit set - * against the open_request queue too: This - * increases connection latency a bit, but is - * required to detect retransmitted SYNs. - */ - /* FIXME: need to check for multicast syns - * here to satisfy RFC1122 4.2.3.10, p. 104: - * discard bcast/mcast SYN. I'm not sure if - * they're filtered out at the IP layer (I - * think not) - */ - if (sk->state == TCP_LISTEN && - ((u32 *)skb->h.th)[3] & __constant_htonl(0x00120000)) { - struct sock *nsk; - - /* Find possible connection requests. */ - nsk = tcp_check_req(sk, skb, &(IPCB(skb)->opt)); - if (nsk == NULL) - goto discard; - - release_sock(sk); - lock_sock(nsk); - sk = nsk; - } + release_sock(sk); + return 0; + } - if (tcp_rcv_state_process(sk, skb, skb->h.th, - &(IPCB(skb)->opt), skb->len)) - goto reset; + + if (sk->state == TCP_LISTEN) { + struct sock *nsk; + + nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + lock_sock(nsk); + release_sock(sk); + sk = nsk; } + + if (tcp_rcv_state_process(sk, skb, skb->h.th, + &(IPCB(skb)->opt), skb->len)) + goto reset; release_sock(sk); return 0; reset: tcp_v4_send_reset(skb); discard: - kfree_skb(skb, FREE_READ); - release_sock(sk); + kfree_skb(skb, FREE_READ); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + release_sock(sk); return 0; } @@ -1422,42 +1538,43 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) { struct tcphdr *th; struct sock *sk; - u32 saddr = skb->nh.iph->saddr; - u32 daddr = skb->nh.iph->daddr; - - th = skb->h.th; if (skb->pkt_type!=PACKET_HOST) goto discard_it; + th = skb->h.th; + /* Pull up the IP header. 
*/ - skb_pull(skb, skb->h.raw-skb->data); + __skb_pull(skb, skb->h.raw - skb->data); + + /* Count it even if it's bad */ + tcp_statistics.TcpInSegs++; /* Try to use the device checksum if provided. */ switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = csum_partial((char *)th, len, 0); case CHECKSUM_HW: - if (tcp_v4_check(th,len,saddr,daddr,skb->csum)) { - struct iphdr * iph = skb->nh.iph; + if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n", - NIPQUAD(saddr), ntohs(th->source), NIPQUAD(daddr), - ntohs(th->dest), len, skb->len, ntohs(iph->tot_len)); - goto discard_it; + NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), NIPQUAD(skb->nh.iph->daddr), + ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len)); + tcp_statistics.TcpInErrs++; + goto discard_it; } default: /* CHECKSUM_UNNECESSARY */ } - tcp_statistics.TcpInSegs++; - #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) - sk = tcp_v4_proxy_lookup(th->dest, saddr, th->source, daddr, - skb->dev->pa_addr, IPCB(skb)->redirport); + sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, skb->dev, + IPCB(skb)->redirport, skb->dev->ifindex); else #endif - sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest); + sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); if (!sk) goto no_tcp_socket; if(!ipsec_sk_policy(sk,skb)) @@ -1501,7 +1618,7 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) rt = (struct rtable*)skb->dst; if (rt->u.dst.obsolete) { int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.dst_dev); + err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.oif); if (err) { sk->err_soft=-err; sk->error_report(skb->sk); @@ -1524,7 +1641,7 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) static struct 
sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th) { return tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, th->dest); + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); } static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) @@ -1547,13 +1664,6 @@ struct tcp_func ipv4_specific = { ip_setsockopt, ip_getsockopt, v4_addr2sockaddr, - tcp_v4_send_reset, - tcp_v4_search_req, -#ifdef CONFIG_SYNCOOKIES - cookie_v4_check, -#else - NULL, -#endif sizeof(struct sockaddr_in) }; @@ -1592,8 +1702,6 @@ static int tcp_v4_init_sock(struct sock *sk) sk->priority = 1; sk->state = TCP_CLOSE; - /* This is how many unacked bytes we will accept for this socket. */ - sk->max_unacked = 2048; /* needs to be at most 2 full packets. */ sk->max_ack_backlog = SOMAXCONN; sk->mtu = 576; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e60f1a509c9..f9ffb1517777 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.46 1997/08/24 16:22:28 freitag Exp $ + * Version: $Id: tcp_output.c,v 1.50 1997/10/15 19:13:02 freitag Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -74,9 +74,12 @@ static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) * (part of SWS is done on packetization) * c) We are retransmiting [Nagle] * d) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data. 
*/ len = skb->end_seq - skb->seq; - if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out) + if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out && + !skb->h.th->urg) nagle_check = 0; return (nagle_check && tp->packets_out < tp->snd_cwnd && @@ -471,8 +474,12 @@ unsigned short tcp_select_window(struct sock *sk) if (tp->window_clamp) { free_space = min(tp->window_clamp, free_space); mss = min(tp->window_clamp, mss); - } else + } +#ifdef NO_ANK_FIX + /* I am tired of this message */ + else printk(KERN_DEBUG "Clamp failure. Water leaking.\n"); +#endif if (mss < 1) { mss = 1; @@ -487,8 +494,11 @@ unsigned short tcp_select_window(struct sock *sk) if (cur_win < 0) { cur_win = 0; +#ifdef NO_ANK_FIX + /* And this too. */ printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); +#endif } if (free_space < sk->rcvbuf/4 && free_space < mss/2) @@ -610,9 +620,8 @@ static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) th1->urg = 1; th1->urg_ptr = th2->urg_ptr + size1; } - if (th2->fin) { + if (th2->fin) th1->fin = 1; - } /* ... and off you go. */ kfree_skb(buff, FREE_WRITE); @@ -1007,11 +1016,8 @@ void tcp_write_wakeup(struct sock *sk) * following states. If any other state is encountered, return. * [listen/close will never occur here anyway] */ - if (sk->state != TCP_ESTABLISHED && - sk->state != TCP_CLOSE_WAIT && - sk->state != TCP_FIN_WAIT1 && - sk->state != TCP_LAST_ACK && - sk->state != TCP_CLOSING) + if ((1 << sk->state) & + ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|TCPF_LAST_ACK|TCPF_CLOSING)) return; if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && (skb=tp->send_head)) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cf6fcfbe7dc9..5cb05d55b9b5 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: @(#)tcp.c 1.0.16 05/25/93 + * Version: $Id: tcp_timer.c,v 1.31 1997/11/05 08:14:01 freitag Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -212,7 +212,7 @@ static int tcp_write_timeout(struct sock *sk) tcp_clear_xmit_timers(sk); /* Time wait the socket. */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) { + if ((1<state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { tcp_set_state(sk,TCP_TIME_WAIT); tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { @@ -263,8 +263,7 @@ void tcp_probe_timer(unsigned long data) { sk->error_report(sk); /* Time wait the socket. */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 - || sk->state == TCP_CLOSING) { + if ((1<state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { tcp_set_state(sk, TCP_TIME_WAIT); tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { @@ -280,8 +279,7 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) { int res = 0; - if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT || - sk->state == TCP_FIN_WAIT2) { + if ((1<state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; __u32 elapsed = jiffies - tp->rcv_tstamp; @@ -382,6 +380,11 @@ void tcp_retransmit_timer(unsigned long data) return; } + if (sk->sock_readers) { + /* Try again in a second. */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ); + return; + } lock_sock(sk); /* Clear delay ack timer. */ diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 3a2927528dc2..fe02b3f4c2bc 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: @(#)timer.c 1.0.7 05/25/93 + * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ed84d5b0f4e5..42a3df7cac1e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: @(#)udp.c 1.0.13 06/02/93 + * Version: $Id: udp.c,v 1.44 1997/10/15 19:56:35 freitag Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -81,8 +81,7 @@ when application doesn't choose (NOT YET - doesn't seem to be in the BSD API) [Does opening a SOCK_PACKET and snooping your output count 8)] 4.1.3.6 (Invalid Addresses) - MUST discard invalid source addresses (NOT YET -- will be implemented - in IP, so UDP will eventually be OK. Right now it's a violation.) + MUST discard invalid source addresses (OK -- done in the new routing code) MUST only send datagrams with one of our addresses (NOT YET - ought to be OK ) 950728 -- MS */ @@ -133,6 +132,13 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. 
+ */ + + if(sk2->bound_dev_if != sk->bound_dev_if) + continue; + if(!sk2->rcv_saddr || !sk->rcv_saddr) { if((!sk2_reuse) || (!sk_reuse) || @@ -173,20 +179,24 @@ unsigned short udp_good_socknum(void) int i, best, best_size_so_far; SOCKHASH_LOCK(); + if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) + start = sysctl_local_port_range[0]; - /* Select initial not-so-random "best" */ - best = PROT_SOCK + 1 + (start & 1023); best_size_so_far = 32767; /* "big" num */ - result = best; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + best = result = start; + + for(i = 0; i < UDP_HTABLE_SIZE; i++, result++) { struct sock *sk; int size; sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)]; - /* No clashes - take it */ - if (!sk) + if(!sk) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); goto out; + } /* Is this one better than our best so far? */ size = 0; @@ -196,12 +206,19 @@ unsigned short udp_good_socknum(void) } while((sk = sk->next) != NULL); best_size_so_far = size; best = result; -next: + next: } - while (udp_lport_inuse(best)) - best += UDP_HTABLE_SIZE; result = best; + + for(;; result += UDP_HTABLE_SIZE) { + /* Get into range (but preserve hash bin)... */ + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } out: start = result; SOCKHASH_UNLOCK(); @@ -277,7 +294,7 @@ static void udp_v4_rehash(struct sock *sk) /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this here plus the last hit cache. 
-DaveM */ -struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) +struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk, *result = NULL; unsigned short hnum = ntohs(dport); @@ -301,7 +318,12 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) continue; score++; } - if(score == 3) { + if(sk->bound_dev_if) { + if(sk->bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { result = sk; break; } else if(score > badness) { @@ -313,23 +335,25 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) return result; } -__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport) +__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk; - if(uh_cache_sk && + if(!dif && uh_cache_sk && uh_cache_saddr == saddr && uh_cache_sport == sport && uh_cache_dport == dport && uh_cache_daddr == daddr) return uh_cache_sk; - sk = udp_v4_lookup_longway(saddr, sport, daddr, dport); - uh_cache_sk = sk; - uh_cache_saddr = saddr; - uh_cache_daddr = daddr; - uh_cache_sport = sport; - uh_cache_dport = dport; + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + if(!dif) { + uh_cache_sk = sk; + uh_cache_saddr = saddr; + uh_cache_daddr = daddr; + uh_cache_sport = sport; + uh_cache_dport = dport; + } return sk; } @@ -348,16 +372,25 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport #define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ secondlist((hpnum),(sk)->next,(fpass)) -struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, - unsigned short rnum, unsigned long laddr, - unsigned long paddr, unsigned short pnum) +static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) { struct sock *s, *result = 
NULL; int badness = -1; + u32 paddr = 0; unsigned short hnum = ntohs(num); unsigned short hpnum = ntohs(pnum); int firstpass = 1; + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + SOCKHASH_LOCK(); for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; @@ -382,7 +415,12 @@ struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(score == 3 && s->num == hnum) { + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { result = s; break; } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { @@ -434,7 +472,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, * to find the appropriate port. */ -void udp_err(struct sk_buff *skb, unsigned char *dp) +void udp_err(struct sk_buff *skb, unsigned char *dp, int len) { struct iphdr *iph = (struct iphdr*)dp; struct udphdr *uh = (struct udphdr*)(dp+(iph->ihl<<2)); @@ -442,9 +480,16 @@ void udp_err(struct sk_buff *skb, unsigned char *dp) int code = skb->h.icmph->code; struct sock *sk; - sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source); - if (sk == NULL) - return; /* No socket for error */ + if (len < (iph->ihl<<2)+sizeof(struct udphdr)) { + icmp_statistics.IcmpInErrors++; + return; + } + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (sk == NULL) { + icmp_statistics.IcmpInErrors++; + return; /* No socket for error */ + } if (sk->ip_recverr && !sk->sock_readers) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -519,7 +564,6 @@ struct udpfakehdr u32 daddr; u32 other; struct iovec *iov; - int nriov; u32 wcheck; }; @@ -533,46 +577,23 @@ struct udpfakehdr static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - struct iovec *iov; - char *src; - char *dst = to; - 
unsigned int len; - - if (offset == 0) { - fraglen -= sizeof(struct udphdr); - dst += sizeof(struct udphdr); - } - - iov = ufh->iov; - do { - if ((len = iov->iov_len) > fraglen) - len = fraglen; - src = (char *) iov->iov_base + iov->iov_len - len; - ufh->wcheck = csum_partial_copy_fromuser(src, - dst + fraglen - len, len, - ufh->wcheck); - if ((iov->iov_len -= len) == 0) { - if (--(ufh->nriov) < 0) { - printk(KERN_NOTICE "udp_getfrag: nriov = %d\n", - ufh->nriov); - return -EINVAL; - } - iov--; - } - fraglen -= len; - } while (fraglen); - ufh->iov = iov; - - if (offset == 0) { + if (offset==0) { + if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr), &ufh->wcheck)) + return -EFAULT; ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr), - ufh->wcheck); + ufh->wcheck); ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr, ntohs(ufh->uh.len), IPPROTO_UDP, ufh->wcheck); if (ufh->uh.check == 0) ufh->uh.check = -1; memcpy(to, ufh, sizeof(struct udphdr)); + return 0; } + if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen, &ufh->wcheck)) + return -EFAULT; return 0; } @@ -586,45 +607,19 @@ static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned i static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - struct iovec *iov; - char *src; - char *dst = to; - int err; - unsigned int len; - - if (offset == 0) { - fraglen -= sizeof(struct udphdr); - dst += sizeof(struct udphdr); - } - - iov = ufh->iov; - do { - if ((len = iov->iov_len) > fraglen) - len = fraglen; - src = (char *) iov->iov_base + iov->iov_len - len; - err = copy_from_user(dst + fraglen - len, src, len); - fraglen -= len; - if ((iov->iov_len -= len) == 0) { - if (--(ufh->nriov) < 0) { - printk(KERN_NOTICE "udp_getfrag: nriov = %d\n", - ufh->nriov); - return -EINVAL; - } - iov--; - } - } while 
(fraglen && err >= 0); - ufh->iov = iov; - if (offset == 0) + if (offset==0) { memcpy(to, ufh, sizeof(struct udphdr)); - return err; + return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr)); + } + return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen); } - int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) { int ulen = len + sizeof(struct udphdr); - struct device *dev = NULL; struct ipcm_cookie ipc; struct udpfakehdr ufh; struct rtable *rt; @@ -674,8 +669,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ipc.addr = sk->saddr; ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(msg, &ipc, &dev); + err = ip_cmsg_send(msg, &ipc); if (err) return err; if (ipc.opt) @@ -695,17 +691,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) tos = RT_TOS(sk->ip_tos) | (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || (ipc.opt && ipc.opt->is_strictroute)); - if (MULTICAST(daddr) && sk->ip_mc_index && dev == NULL) - err = ip_route_output_dev(&rt, daddr, ufh.saddr, tos, sk->ip_mc_index); - else - err = ip_route_output(&rt, daddr, ufh.saddr, tos, dev); + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!ufh.saddr) + ufh.saddr = sk->ip_mc_addr; + } + + err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif); if (err) { if (free) kfree(ipc.opt); return err; } - if (rt->rt_flags&RTF_BROADCAST && !sk->broadcast) { + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) { if (free) kfree(ipc.opt); ip_rt_put(rt); return -EACCES; @@ -718,8 +718,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.uh.len = htons(ulen); ufh.uh.check = 0; ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256; - ufh.iov = msg->msg_iov + msg->msg_iovlen - 1; - ufh.nriov = msg->msg_iovlen; + ufh.iov = msg->msg_iov; ufh.wcheck = 0; /* RFC1122: OK. 
Provides the checksumming facility (MUST) as per */ @@ -907,10 +906,10 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return(-EAFNOSUPPORT); err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - sk->ip_tos|sk->localroute); + sk->ip_tos|sk->localroute, sk->bound_dev_if); if (err) return err; - if ((rt->rt_flags&RTF_BROADCAST) && !sk->broadcast) { + if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) { ip_rt_put(rt); return -EACCES; } @@ -1024,7 +1023,7 @@ int udp_chkaddr(struct sk_buff *skb) struct udphdr *uh = (struct udphdr *)(skb->nh.raw + iph->ihl*4); struct sock *sk; - sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest); + sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest, skb->dev->ifindex); if (!sk) return 0; @@ -1113,17 +1112,17 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) skb_trim(skb,len); - if(rt->rt_flags & (RTF_BROADCAST|RTF_MULTICAST)) + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) sk = udp_v4_proxy_lookup(uh->dest, saddr, uh->source, - daddr, skb->dev->pa_addr, - IPCB(skb)->redirport); + daddr, skb->dev, IPCB(skb)->redirport, + skb->dev->ifindex); else #endif - sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest); + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk == NULL) { udp_statistics.UdpNoPorts++; diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index d2b8e00894e5..0f463d0eec51 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -6,7 +6,7 @@ * Various kernel-resident INET utility functions; mainly * for format conversion and debugging output. * - * Version: @(#)utils.c 1.0.7 05/18/93 + * Version: $Id: utils.c,v 1.5 1997/09/17 18:50:31 freitag Exp $ * * Author: Fred N. 
van Kempen, * diff --git a/net/ipv6/Config.in b/net/ipv6/Config.in new file mode 100644 index 000000000000..f4c84e6402bc --- /dev/null +++ b/net/ipv6/Config.in @@ -0,0 +1,7 @@ +# +# IPv6 configuration +# +bool 'IPv6: enable EUI-64 token format' CONFIG_IPV6_EUI64 +bool 'IPv6: disable provided based addresses' CONFIG_IPV6_NO_PB +#bool 'IPv6: flow policy support' CONFIG_RT6_POLICY +#bool 'IPv6: firewall support' CONFIG_IPV6_FIREWALL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 359de74f058d..6e69b8813a44 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: addrconf.c,v 1.21 1997/08/09 03:44:24 davem Exp $ + * $Id: addrconf.c,v 1.28 1997/11/05 20:20:43 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -42,7 +43,8 @@ #include #include #include -#include +#include +#include #include @@ -92,12 +94,11 @@ int ipv6_addr_type(struct in6_addr *addr) st = addr->s6_addr32[0]; - /* - * UCast Provider Based Address - * 0x4/3 + /* Consider all addresses with the first three bits different of + 000 and 111 as unicasts. */ - - if ((st & __constant_htonl(0xE0000000)) == __constant_htonl(0x40000000)) + if ((st & __constant_htonl(0xE0000000)) != __constant_htonl(0x00000000) && + (st & __constant_htonl(0xE0000000)) != __constant_htonl(0xE0000000)) return IPV6_ADDR_UNICAST; if ((st & __constant_htonl(0xFF000000)) == __constant_htonl(0xFF000000)) { @@ -184,6 +185,8 @@ void addrconf_forwarding_on(void) printk(KERN_DEBUG "joining all-routers\n"); #endif idev->router = 1; + + /* Wrong. It is user level function. 
*/ ipv6_addr_all_routers(&maddr); ipv6_dev_mc_inc(idev->dev, &maddr); } @@ -222,6 +225,7 @@ struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, memcpy(&ifa->addr, addr, sizeof(struct in6_addr)); init_timer(&ifa->timer); + ifa->timer.data = (unsigned long) ifa; ifa->scope = scope; ifa->idev = idev; @@ -361,7 +365,7 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst, } out: - if (ifp == NULL && match) + if (ifp == NULL) ifp = match; atomic_dec(&addr_list_lock); return ifp; @@ -410,6 +414,157 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr) return ifp; } +/* Join to solicited addr multicast group. */ + +static void addrconf_join_solict(struct device *dev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +#ifdef CONFIG_IPV6_EUI64 +static int ipv6_generate_eui64(u8 *eui, struct device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr+3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; + } + return -1; +} +#endif + +/* + * Add prefix route. + */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, + unsigned long info) +{ + struct in6_rtmsg rtmsg; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr)); + rtmsg.rtmsg_dst_len = plen; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_info = info; + rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. 
+ */ + if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) + rtmsg.rtmsg_flags |= RTF_NONEXTHOP; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + ip6_route_add(&rtmsg, &err); + + if (err) + printk(KERN_DEBUG "IPv6: error %d adding prefix route\n", err); +} + +/* Create "default" multicast route to the interface */ + +static void addrconf_add_mroute(struct device *dev) +{ + struct in6_rtmsg rtmsg; + struct rt6_info *rt; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_set(&rtmsg.rtmsg_dst, + __constant_htonl(0xFF000000), 0, 0, 0); + rtmsg.rtmsg_dst_len = 8; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + rt = ip6_route_add(&rtmsg, &err); + + /* + * Pedro makes interesting thing here, he attached + * fake nexthop to multicast route. + * It is trick to avoid cloning, ugly, but efficient. --ANK + */ + + if (err) + printk(KERN_DEBUG "IPv6: error %d adding mroute\n", err); + else + rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); +} + +static void sit_route_add(struct device *dev) +{ + struct in6_rtmsg rtmsg; + struct rt6_info *rt; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + + /* prefix length - 96 bytes "::d.d.d.d" */ + rtmsg.rtmsg_dst_len = 96; + rtmsg.rtmsg_flags = RTF_UP; + rtmsg.rtmsg_ifindex = dev->ifindex; + + rt = ip6_route_add(&rtmsg, &err); + + /* See comment in addrconf_add_mroute. + * It is the same trick, but to avoid cloning for direct + * sit routes i.e. IPv4 comaptible destinations. 
+ */ + if (err) + printk(KERN_DEBUG "sit_route_add: error %d in route_add\n", err); + else + rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); +} + +static void addrconf_add_lroute(struct device *dev) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, __constant_htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 10, dev, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct device *dev) +{ + struct in6_addr maddr; + struct inet6_dev *idev; + + if ((idev = ipv6_get_idev(dev)) == NULL) { + idev = ipv6_add_dev(dev); + if (idev == NULL) + return NULL; + } + + /* Add default multicast route */ + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + + /* Join to all nodes multicast group. */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(dev, &maddr); + return idev; +} + void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) { struct prefix_info *pinfo; @@ -432,7 +587,7 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) addr_type = ipv6_addr_type(&pinfo->prefix); - if (addr_type & IPV6_ADDR_LINKLOCAL) + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) return; valid_lft = ntohl(pinfo->valid); @@ -470,23 +625,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) rt->rt6i_expires = rt_expires; } } else if (pinfo->onlink && valid_lft) { - struct in6_rtmsg rtmsg; - int err; - - memset(&rtmsg, 0, sizeof(rtmsg)); - - printk(KERN_DEBUG "adding on link route\n"); - - ipv6_addr_copy(&rtmsg.rtmsg_dst, &pinfo->prefix); - rtmsg.rtmsg_dst_len = pinfo->prefix_len; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; - rtmsg.rtmsg_flags = RTF_UP | RTF_ADDRCONF; - rtmsg.rtmsg_info = rt_expires; - - ip6_route_add(&rtmsg, &err); + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, rt_expires); } + /* Try to figure out our local address for this prefix */ + if (pinfo->autoconf && ipv6_config.autoconf) { struct inet6_ifaddr * ifp; struct 
in6_addr addr; @@ -494,33 +638,41 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) plen = pinfo->prefix_len >> 3; - if (plen + dev->addr_len == sizeof(struct in6_addr)) { +#ifdef CONFIG_IPV6_EUI64 + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev)) + return; + goto ok; + } +#endif +#ifndef CONFIG_IPV6_NO_PB + if (pinfo->prefix_len == ((sizeof(struct in6_addr) - dev->addr_len)<<3)) { memcpy(&addr, &pinfo->prefix, plen); memcpy(addr.s6_addr + plen, dev->dev_addr, dev->addr_len); - } else { - ADBG(("addrconf: prefix_len invalid\n")); - return; + goto ok; } +#endif + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", pinfo->prefix_len); + return; +ok: ifp = ipv6_chk_addr(&addr); if (ifp == NULL && valid_lft) { struct inet6_dev *in6_dev = ipv6_get_idev(dev); - if (in6_dev == NULL) - ADBG(("addrconf: device not configured\n")); - + if (in6_dev == NULL) { + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + ifp = ipv6_add_addr(in6_dev, &addr, addr_type & IPV6_ADDR_SCOPE_MASK); - if (dev->flags & IFF_MULTICAST) { - struct in6_addr maddr; - - /* Join to solicited addr multicast group. 
*/ - addrconf_addr_solict_mult(&addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - } + if (ifp == NULL) + return; ifp->prefix_len = pinfo->prefix_len; @@ -564,17 +716,32 @@ int addrconf_set_dstaddr(void *arg) } if (dev->type == ARPHRD_SIT) { - struct device *dev; - + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) return -EADDRNOTAVAIL; - - dev = sit_add_tunnel(ireq.ifr6_addr.s6_addr32[3]); - - if (dev == NULL) - err = -ENODEV; - else - err = 0; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0) { + err = -ENOBUFS; + if ((dev = dev_get(p.name)) == NULL) + goto err_exit; + err = dev_open(dev); + } } err_exit: @@ -595,38 +762,27 @@ int addrconf_add_ifaddr(void *arg) if (!suser()) return -EPERM; - if(copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - if((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) - return -EINVAL; + if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) + return -ENODEV; + + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; - if ((idev = ipv6_get_idev(dev)) == NULL) - return -EINVAL; + if ((idev = addrconf_add_dev(dev)) == NULL) + return -ENOBUFS; scope = ipv6_addr_scope(&ireq.ifr6_addr); if((ifp = ipv6_add_addr(idev, &ireq.ifr6_addr, scope)) == NULL) return -ENOMEM; - ifp->prefix_len = 128; - - if (dev->flags & IFF_MULTICAST) { - struct in6_addr maddr; - - /* Join to solicited addr multicast group. 
*/ - addrconf_addr_solict_mult(&ireq.ifr6_addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - } - ifp->prefix_len = ireq.ifr6_prefixlen; ifp->flags |= ADDR_PERMANENT; - if (!(dev->flags & (IFF_NOARP|IFF_LOOPBACK))) - addrconf_dad_start(ifp); - else - ip6_rt_addr_add(&ifp->addr, dev); - + addrconf_dad_start(ifp); return 0; } @@ -645,90 +801,22 @@ int addrconf_del_ifaddr(void *arg) return -EFAULT; if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) - return -EINVAL; + return -ENODEV; if ((idev = ipv6_get_idev(dev)) == NULL) - return -EINVAL; + return -ENXIO; scope = ipv6_addr_scope(&ireq.ifr6_addr); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope && - (!memcmp(&ireq.ifr6_addr, &ifp->addr, sizeof(struct in6_addr)))) { - ipv6_del_addr(ifp); - break; - } - } - - return 0; -} - -static void sit_route_add(struct device *dev) -{ - struct in6_rtmsg rtmsg; - struct rt6_info *rt; - int err; - - ADBG(("sit_route_add(%s): ", dev->name)); - memset(&rtmsg, 0, sizeof(rtmsg)); - - rtmsg.rtmsg_type = RTMSG_NEWROUTE; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - - if (dev->pa_dstaddr == 0) { - ADBG(("pa_dstaddr=0, ")); - /* prefix length - 96 bytes "::d.d.d.d" */ - rtmsg.rtmsg_dst_len = 96; - rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_UP; - } else { - ADBG(("pa_dstaddr=%08x, ", dev->pa_dstaddr)); - rtmsg.rtmsg_dst_len = 10; - rtmsg.rtmsg_dst.s6_addr32[0] = __constant_htonl(0xfe800000); - rtmsg.rtmsg_dst.s6_addr32[3] = dev->pa_dstaddr; - rtmsg.rtmsg_gateway.s6_addr32[3]= dev->pa_dstaddr; - rtmsg.rtmsg_flags = RTF_UP; - } - - rtmsg.rtmsg_ifindex = dev->ifindex; - ADBG(("doing ip6_route_add()\n")); - rt = ip6_route_add(&rtmsg, &err); - - if (err) { -#if ACONF_DEBUG >= 1 - printk(KERN_DEBUG "sit_route_add: error %d in route_add\n", err); -#endif - } - - ADBG(("sit_route_add(cont): ")); - if (dev->pa_dstaddr) { - struct rt6_info *mrt; - - ADBG(("pa_dstaddr != 0, ")); - rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_gateway); - if (rt->rt6i_nexthop == NULL) 
{ - ADBG(("can't get neighbour\n")); - printk(KERN_DEBUG "sit_route: get_neigh failed\n"); + if (ifp->scope == scope && + (!memcmp(&ireq.ifr6_addr, &ifp->addr, sizeof(struct in6_addr)))) { + ipv6_del_addr(ifp); + break; } - - /* - * Add multicast route. - */ - ADBG(("add MULT, ")); - ipv6_addr_set(&rtmsg.rtmsg_dst, __constant_htonl(0xFF000000), 0, 0, 0); - - rtmsg.rtmsg_dst_len = 8; - rtmsg.rtmsg_flags = RTF_UP; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - - memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); - ADBG(("doing ip6_route_add()\n")); - mrt = ip6_route_add(&rtmsg, &err); - - if (mrt) - mrt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); - } else { - ADBG(("pa_dstaddr==0\n")); } + + return 0; } static void sit_add_v4_addrs(struct inet6_dev *idev) @@ -739,34 +827,55 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) int scope; memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); - if (idev->dev->pa_dstaddr) { + if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = __constant_htonl(0xfe800000); scope = IFA_LINK; } else { scope = IPV6_ADDR_COMPATv4; } + if (addr.s6_addr32[3]) { + ifp = ipv6_add_addr(idev, &addr, scope); + if (ifp) { + ifp->flags |= ADDR_PERMANENT; + ifp->prefix_len = 128; + ip6_rt_addr_add(&ifp->addr, idev->dev); + } + return; + } + for (dev = dev_base; dev != NULL; dev = dev->next) { - if (dev->family == AF_INET && (dev->flags & IFF_UP)) { + if (dev->ip_ptr && (dev->flags & IFF_UP)) { + struct in_device * in_dev = dev->ip_ptr; + struct in_ifaddr * ifa; + int flag = scope; - - addr.s6_addr32[3] = dev->pa_addr; - if (dev->flags & IFF_LOOPBACK) { - if (idev->dev->pa_dstaddr) - continue; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + addr.s6_addr32[3] = ifa->ifa_local; - flag |= IFA_HOST; - } - - ifp = ipv6_add_addr(idev, &addr, flag); + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if 
(idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } - if (ifp == NULL) - continue; + ifp = ipv6_add_addr(idev, &addr, flag); + + if (ifp == NULL) + continue; - ifp->flags |= ADDR_PERMANENT; - ip6_rt_addr_add(&ifp->addr, dev); + if (idev->dev->flags&IFF_POINTOPOINT) + ifp->prefix_len = 10; + else + ifp->prefix_len = 96; + ifp->flags |= ADDR_PERMANENT; + ip6_rt_addr_add(&ifp->addr, dev); + } } } } @@ -804,56 +913,98 @@ static void init_loopback(struct device *dev) printk(KERN_DEBUG "init_loopback: error in route_add\n"); } -static void addrconf_eth_config(struct device *dev) +static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + + ifp = ipv6_add_addr(idev, addr, IFA_LINK); + if (ifp == NULL) + return; + + ifp->flags = ADDR_PERMANENT; + ifp->prefix_len = 10; + + addrconf_dad_start(ifp); +} + +static void addrconf_dev_config(struct device *dev) { struct in6_addr addr; struct in6_addr maddr; - struct inet6_ifaddr * ifp; struct inet6_dev * idev; + if (dev->type != ARPHRD_ETHER) { + /* Alas, we support only ethernet autoconfiguration. */ + return; + } + + idev = addrconf_add_dev(dev); + if (idev == NULL) + return; + +#ifdef CONFIG_IPV6_EUI64 memset(&addr, 0, sizeof(struct in6_addr)); - /* Generate link local address. */ addr.s6_addr[0] = 0xFE; addr.s6_addr[1] = 0x80; - memcpy(addr.s6_addr + (sizeof(struct in6_addr) - dev->addr_len), - dev->dev_addr, dev->addr_len); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) + addrconf_add_linklocal(idev, &addr); +#endif - idev = ipv6_add_dev(dev); - if (idev == NULL) - return; - - ifp = ipv6_add_addr(idev, &addr, IFA_LINK); - if (ifp == NULL) - return; +#ifndef CONFIG_IPV6_NO_PB + memset(&addr, 0, sizeof(struct in6_addr)); - ifp->flags = ADDR_PERMANENT; - ifp->prefix_len = 10; + addr.s6_addr[0] = 0xFE; + addr.s6_addr[1] = 0x80; - /* Join to all nodes multicast group. 
*/ - ipv6_addr_all_nodes(&maddr); - ipv6_dev_mc_inc(dev, &maddr); + memcpy(addr.s6_addr + (sizeof(struct in6_addr) - dev->addr_len), + dev->dev_addr, dev->addr_len); + addrconf_add_linklocal(idev, &addr); +#endif if (ipv6_config.forwarding) { idev->router = 1; + + /* It is wrong. + It is routing daemon or radvd that must make it, + rather than kernel. + */ ipv6_addr_all_routers(&maddr); ipv6_dev_mc_inc(dev, &maddr); } +} - /* Join to solicited addr multicast group. */ - addrconf_addr_solict_mult(&addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); +static void addrconf_sit_config(struct device *dev) +{ + struct inet6_dev *idev; - /* Start duplicate address detection. */ - addrconf_dad_start(ifp); + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + idev = ipv6_add_dev(dev); + if (idev == NULL) { + printk(KERN_DEBUG "init sit: add_dev failed\n"); + return; + } + + sit_add_v4_addrs(idev); + + if (dev->flags&IFF_POINTOPOINT) { + addrconf_add_mroute(dev); + addrconf_add_lroute(dev); + } else + sit_route_add(dev); } + int addrconf_notify(struct notifier_block *this, unsigned long event, void * data) { struct device *dev; - struct inet6_dev * idev; dev = (struct device *) data; @@ -861,34 +1012,15 @@ int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_UP: switch(dev->type) { case ARPHRD_SIT: - - printk(KERN_DEBUG "sit device up: %s\n", dev->name); - - /* - * Configure the tunnel with one of our IPv4 - * addresses... we should configure all of - * our v4 addrs in the tunnel - */ - - idev = ipv6_add_dev(dev); - - sit_add_v4_addrs(idev); - - /* - * we do an hack for now to configure the tunnel - * route. 
- */ - - sit_route_add(dev); + addrconf_sit_config(dev); break; case ARPHRD_LOOPBACK: init_loopback(dev); break; - case ARPHRD_ETHER: - printk(KERN_DEBUG "Configuring eth interface\n"); - addrconf_eth_config(dev); + default: + addrconf_dev_config(dev); break; }; @@ -934,7 +1066,6 @@ static int addrconf_ifdown(struct device *dev) } if (idev == NULL) { - printk(KERN_DEBUG "addrconf_ifdown: device not found\n"); end_bh_atomic(); return -ENODEV; } @@ -958,8 +1089,8 @@ static int addrconf_ifdown(struct device *dev) ifa = *bifa; continue; } - ifa = ifa->lst_next; bifa = &ifa->lst_next; + ifa = *bifa; } } @@ -968,6 +1099,7 @@ static int addrconf_ifdown(struct device *dev) return 0; } + static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp; @@ -1003,10 +1135,8 @@ static void addrconf_rs_timer(unsigned long data) struct in6_rtmsg rtmsg; int err; -#if ACONF_DEBUG >= 2 printk(KERN_DEBUG "%s: no IPv6 routers present\n", ifp->idev->dev->name); -#endif memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); rtmsg.rtmsg_type = RTMSG_NEWROUTE; @@ -1031,27 +1161,17 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) dev = ifp->idev->dev; - if (dev->flags & IFF_MULTICAST) { - struct in6_rtmsg rtmsg; - struct rt6_info *mrt; - int err; - - memset(&rtmsg, 0, sizeof(rtmsg)); - ipv6_addr_set(&rtmsg.rtmsg_dst, - __constant_htonl(0xFF000000), 0, 0, 0); - - rtmsg.rtmsg_dst_len = 8; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; - - rtmsg.rtmsg_flags = RTF_UP; + addrconf_join_solict(dev, &ifp->addr); - mrt = ip6_route_add(&rtmsg, &err); + if (ifp->prefix_len != 128) + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0); - if (err) - printk(KERN_DEBUG "dad_start: mcast route add failed\n"); - else - mrt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + start_bh_atomic(); + ifp->flags &= ~DAD_INCOMPLETE; + addrconf_dad_completed(ifp); + end_bh_atomic(); + return; } if (rand_seed) 
{ @@ -1059,15 +1179,12 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) nd_rand_seed = ifp->addr.s6_addr32[3]; } - init_timer(&ifp->timer); - ifp->probes = ipv6_config.dad_transmits; ifp->flags |= DAD_INCOMPLETE; rand_num = ipv6_random() % ipv6_config.rtr_solicit_delay; ifp->timer.function = addrconf_dad_timer; - ifp->timer.data = (unsigned long) ifp; ifp->timer.expires = jiffies + rand_num; add_timer(&ifp->timer); @@ -1105,62 +1222,41 @@ static void addrconf_dad_timer(unsigned long data) static void addrconf_dad_completed(struct inet6_ifaddr *ifp) { - struct device *dev; - int err; + struct device * dev = ifp->idev->dev; - dev = ifp->idev->dev; + /* + * Configure the address for reception. Now it is valid. + */ - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) { - struct in6_rtmsg rtmsg; - struct in6_addr all_routers; + ip6_rt_addr_add(&ifp->addr, dev); - /* - * 1) configure a link route for this interface - * 2) send a (delayed) router solicitation - */ + /* If added prefix is link local and forwarding is off, + start sending router solicitations. + */ - memset(&rtmsg, 0, sizeof(rtmsg)); - - memcpy(&rtmsg.rtmsg_dst, &ifp->addr, sizeof(struct in6_addr)); + if (ipv6_config.forwarding == 0 && + (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) == 0 && + (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + struct in6_addr all_routers; - rtmsg.rtmsg_dst_len = ifp->prefix_len; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; + ipv6_addr_set(&all_routers, + __constant_htonl(0xff020000U), 0, 0, + __constant_htonl(0x2U)); - rtmsg.rtmsg_flags = RTF_UP; + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] 
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); - ip6_route_add(&rtmsg, &err); - - if (err) - printk(KERN_DEBUG "dad_complete: error in route_add\n"); - - if (ipv6_config.forwarding == 0) { - ipv6_addr_set(&all_routers, - __constant_htonl(0xff020000U), 0, 0, - __constant_htonl(0x2U)); - - /* - * If a host as already performed a random delay - * [...] as part of DAD [...] there is no need - * to delay again before sending the first RS - */ - ndisc_send_rs(ifp->idev->dev, &ifp->addr, - &all_routers); - - ifp->probes = 1; - ifp->timer.function = addrconf_rs_timer; - ifp->timer.expires = (jiffies + - ipv6_config.rtr_solicit_interval); - ifp->idev->if_flags |= IF_RS_SENT; - add_timer(&ifp->timer); - } + ifp->probes = 1; + ifp->timer.function = addrconf_rs_timer; + ifp->timer.expires = (jiffies + + ipv6_config.rtr_solicit_interval); + ifp->idev->if_flags |= IF_RS_SENT; + add_timer(&ifp->timer); } - - /* - * configure the address for reception - */ - - ip6_rt_addr_add(&ifp->addr, dev); } #ifdef CONFIG_PROC_FS @@ -1251,7 +1347,9 @@ void addrconf_verify(unsigned long foo) __initfunc(void addrconf_init(void)) { +#ifdef MODULE struct device *dev; +#endif /* * init address and device hash lists @@ -1263,24 +1361,25 @@ __initfunc(void addrconf_init(void)) memset(inet6_dev_lst, 0, IN6_ADDR_HSIZE * sizeof(struct inet6_dev *)); - /* - * Init loopback device - */ - - dev = dev_get("lo"); - - if (dev && (dev->flags & IFF_UP)) - init_loopback(dev); - - /* - * and maybe: - * search availiable AF_INET devs and try to configure them - */ +#ifdef MODULE + /* This takes sense only during module load. 
*/ - dev = dev_get("eth0"); + for (dev = dev_base; dev; dev = dev->next) { + if (!(dev->flags&IFF_UP)) + continue; - if (dev && (dev->flags & IFF_UP)) - addrconf_eth_config(dev); + switch (dev->type) { + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + case ARPHRD_ETHER: + addrconf_dev_config(dev); + break; + default: + /* Ignore all other */ + } + } +#endif #ifdef CONFIG_PROC_FS proc_net_register(&iface_proc_entry); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8d2755b0952e..9f707272fa53 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.21 1997/08/20 11:25:00 alan Exp $ + * $Id: af_inet6.c,v 1.23 1997/10/29 20:27:52 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include @@ -200,7 +200,7 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. 
*/ if (addr_type == IPV6_ADDR_MAPPED) { v4addr = addr->sin6_addr.s6_addr32[3]; - if (__ip_chk_addr(v4addr) != IS_MYADDR) + if (inet_addr_type(v4addr) != RTN_LOCAL) return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { @@ -354,8 +354,8 @@ static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFMAP: case SIOCSIFSLAVE: case SIOCGIFSLAVE: - case SIOGIFINDEX: - case SIOGIFNAME: + case SIOCGIFINDEX: + case SIOCGIFNAME: case SIOCGIFCOUNT: return(dev_ioctl(cmd,(void *) arg)); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36eb01ddc49d..28d9af57eca9 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: icmp.c,v 1.10 1997/06/05 11:07:20 schenk Exp $ + * $Id: icmp.c,v 1.11 1997/09/20 20:48:26 davem Exp $ * * Based on net/ipv4/icmp.c * diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0ad79f21119a..6c9f24492ac8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fib.c,v 1.7 1997/04/12 04:32:46 davem Exp $ + * $Id: ip6_fib.c,v 1.9 1997/09/20 20:48:27 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 5a47cc251c17..ddce1ccfaeba 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fw.c,v 1.5 1997/04/29 09:38:44 mj Exp $ + * $Id: ip6_fw.c,v 1.7 1997/10/06 23:09:54 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include #include #include #include @@ -368,12 +369,16 @@ static void ip6_fw_destroy(struct flow_rule *rl) __initfunc(void ip6_fw_init(void)) { +#ifdef CONFIG_NETLINK netlink_attach(NETLINK_IP6_FW, ip6_fw_msgrcv); +#endif } #ifdef MODULE void module_cleanup(void) { +#ifdef CONFIG_NETLINK netlink_detach(NETLINK_IP6_FW); +#endif } #endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 534ebc66a8a1..72ce290ae876 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque * Ian P. Morris * - * $Id: ip6_input.c,v 1.6 1997/05/11 16:06:52 davem Exp $ + * $Id: ip6_input.c,v 1.7 1997/09/20 20:48:27 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7a865296f1b1..e0b20e066a9e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_output.c,v 1.3 1997/03/18 18:24:37 davem Exp $ + * $Id: ip6_output.c,v 1.5 1997/09/21 18:33:14 kuznet Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -540,6 +540,11 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = skb->nh.ipv6h; int size; + if (ipv6_config.forwarding == 0) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + /* * check hop-by-hop options present */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 64cfb00d54ac..98d8339b2fbd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.13 1997/05/15 18:55:10 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.15 1997/10/29 20:27:54 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -43,7 +43,6 @@ #include #include #include -#include #include #include @@ -111,6 +110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, sk->prot = &tcp_prot; tp->af_specific = 
&ipv4_specific; sk->socket->ops = &inet_stream_ops; + sk->family = AF_INET; } else { sk->prot = &udp_prot; sk->socket->ops = &inet_dgram_ops; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 637f434d4378..eae3efed6161 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: mcast.c,v 1.10 1997/05/07 09:40:22 davem Exp $ + * $Id: mcast.c,v 1.11 1997/10/29 20:27:50 kuznet Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -417,7 +417,10 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) skb_reserve(skb, (dev->hard_header_len + 15) & ~15); if (dev->hard_header) { unsigned char ha[MAX_ADDR_LEN]; - ipv6_mc_map(addr, ha); + if (dev->type == ARPHRD_ETHER) + ipv6_mc_map(addr, ha); + else + memcpy(ha, dev->broadcast, dev->addr_len); dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); skb->arp = 1; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 83b5cf3bcc7f..04d92b6b9ca0 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -6,8 +6,6 @@ * Pedro Roque * Mike Shaver * - * $Id: ndisc.c,v 1.15 1997/04/29 09:38:48 mj Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -24,7 +22,7 @@ */ /* Set to 3 to get tracing... 
*/ -#define ND_DEBUG 2 +#define ND_DEBUG 1 #if ND_DEBUG >= 3 #define NDBG(x) printk x @@ -396,7 +394,10 @@ int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb) struct in6_addr *daddr; daddr = &skb->nh.ipv6h->daddr; - ipv6_mc_map(daddr, h_dest); + if (skb->dev->type == ARPHRD_ETHER) + ipv6_mc_map(daddr, h_dest); + else + memcpy(h_dest, skb->dev->broadcast, skb->dev->addr_len); return 0; } @@ -434,6 +435,54 @@ int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb) return 1; } +static int +ndisc_build_ll_hdr(struct sk_buff *skb, struct device *dev, + struct in6_addr *daddr, struct neighbour *neigh, int len) +{ + unsigned char ha[MAX_ADDR_LEN]; + unsigned char *h_dest = NULL; + + skb->arp = 1; + if (dev->hard_header_len) { + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + + if (dev->hard_header) { + if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) { + nd_stats.snt_probes_mcast++; + if (dev->type == ARPHRD_ETHER) + ipv6_mc_map(daddr, ha); + else + memcpy(ha, dev->broadcast, dev->addr_len); + h_dest = ha; + } else if (neigh) { + h_dest = neigh->ha; + nd_stats.snt_probes_ucast++; + } else { + struct nd_neigh *ndn; + + neigh_table_lock(&nd_tbl); + + neigh = neigh_lookup(&nd_tbl, (void *) daddr, + sizeof(struct in6_addr), dev); + if (neigh) { + ndn = (struct nd_neigh*)neigh; + if (ndn->ndn_flags&NTF_COMPLETE) { + memcpy(ha, ndn->ndn_ha, dev->addr_len); + h_dest = ha; + } + } + neigh_table_unlock(&nd_tbl); + } + + if (dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, len) < 0) + skb->arp = 0; + } + } + + return skb->arp; +} + + /* * Send a Neighbour Advertisement */ @@ -486,17 +535,10 @@ void ndisc_send_na(struct device *dev, struct nd_neigh *ndn, printk(KERN_DEBUG "send_na: alloc skb failed\n"); return; } - /* - * build the MAC header - */ - - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(skb, dev, ETH_P_IPV6, ndn->ndn_ha, - NULL, len); - skb->arp = 1; - } + + 
if (ndisc_build_ll_hdr(skb, dev, daddr, (struct neighbour*)ndn, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, solicited_addr, daddr, IPPROTO_ICMPV6, len); @@ -540,12 +582,10 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, struct in6_addr *solicit, struct in6_addr *daddr, struct in6_addr *saddr) { - unsigned char ha[MAX_ADDR_LEN]; struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct nd_msg *msg; int len, opt_len; - void *h_dest; int err; NDBG(("ndisc_send_ns(%s,%p): ", (dev ? dev->name : "[NULL]"), neigh)); @@ -581,7 +621,11 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, return; } +#if 0 + /* Why Pedro did it? Is it remnant of early + attempts to avoid looping back? I have no idea. --ANK */ skb->pkt_type = PACKET_NDISC; +#endif if (saddr == NULL) { struct inet6_ifaddr *ifa; @@ -593,29 +637,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, saddr = &ifa->addr; } - if ((ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST)) { - nd_stats.snt_probes_mcast++; - ipv6_mc_map(daddr, ha); - h_dest = ha; - } else { - if (neigh == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "send_ns: ucast destination " - "with null neighbour\n"); -#endif - return; - } - h_dest = neigh->ha; - nd_stats.snt_probes_ucast++; - } - - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, - len); - skb->arp = 1; - } + if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); @@ -684,15 +708,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, return; } - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - unsigned char ha[MAX_ADDR_LEN]; - - ipv6_mc_map(daddr, ha); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, len); - 
skb->arp = 1; - } + if (ndisc_build_ll_hdr(skb, dev, daddr, NULL, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); @@ -783,15 +801,19 @@ static void ndisc_timer_handler(unsigned long arg) ntimer = min(ntimer, time); } ndn = (struct nd_neigh *) ndn->neigh.next; - } while (ndn != head); } if (ntimer != (~0UL)) { - ndisc_timer.expires = now + ntimer; + unsigned long tval = jiffies + ntimer; + if (del_timer(&ndisc_timer)) { + if (ndisc_timer.expires - tval < 0) + tval = ndisc_timer.expires; + } + ndisc_timer.expires = tval; add_timer(&ndisc_timer); } - + neigh_table_unlock(&nd_tbl); } @@ -1238,14 +1260,12 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) NDBG(("ndisc_redirect_rcv(%p)\n", skb)); if (skb->nh.ipv6h->hop_limit != 255) { - printk(KERN_WARNING - "NDISC: fake ICMP redirect received\n"); + printk(KERN_WARNING "NDISC: fake ICMP redirect received\n"); return; } if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - printk(KERN_WARNING - "ICMP redirect: source address is not linklocal\n"); + printk(KERN_WARNING "ICMP redirect: source address is not linklocal\n"); return; } @@ -1269,19 +1289,15 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (ipv6_addr_cmp(dest, target) == 0) { on_link = 1; } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { - printk(KERN_WARNING - "ICMP redirect: target address is not linklocal\n"); + printk(KERN_WARNING "ICMP redirect: target address is not linklocal\n"); return; } /* passed validation tests */ - rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, - on_link); + rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - if (rt == NULL) { - printk(KERN_WARNING "ICMP redirect: no route to host\n"); + if (rt == NULL) return; - } ndn = (struct nd_neigh *) rt->rt6i_nexthop; @@ -1365,13 +1381,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, hlen = 0; - if 
(dev->hard_header_len) { - skb_reserve(buff, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(buff, dev, ETH_P_IPV6, ndn->ndn_ha, - NULL, len); - buff->arp = 1; - } + if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { + kfree_skb(buff, FREE_WRITE); + return; } ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr, @@ -1471,25 +1483,32 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: NDBG(("NS ")); - if ((ifp = ipv6_chk_addr(&msg->target))) { - int addr_type; + if ((ifp = ipv6_chk_addr(&msg->target)) != NULL) { + int addr_type = ipv6_addr_type(saddr); if (ifp->flags & DAD_INCOMPLETE) { - /* - * DAD failed + /* Address is tentative. If the source + is unspecified address, it is someone + does DAD, otherwise we ignore solicitations + until DAD timer expires. */ + if (addr_type == IPV6_ADDR_ANY) { + printk(KERN_INFO "%s: duplicate address detected!\n", + ifp->idev->dev->name); + del_timer(&ifp->timer); + } + return 0; + } - /* XXX Check if this came in over same interface - * XXX we just sent an NS from! That is valid! 
-DaveM - */ + if (addr_type == IPV6_ADDR_ANY) { + struct in6_addr maddr; - printk(KERN_DEBUG "%s: duplicate address\n", - ifp->idev->dev->name); - del_timer(&ifp->timer); + ipv6_addr_all_nodes(&maddr); + ndisc_send_na(dev, NULL, &maddr, &ifp->addr, + ifp->idev->router, 0, 1, 1); return 0; } - addr_type = ipv6_addr_type(saddr); if (addr_type & IPV6_ADDR_UNICAST) { int inc; @@ -1512,7 +1531,6 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, ifp->idev->router, 1, inc, inc); } else { #if ND_DEBUG >= 1 - /* FIXME */ printk(KERN_DEBUG "ns: non unicast saddr\n"); #endif } @@ -1521,6 +1539,28 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, case NDISC_NEIGHBOUR_ADVERTISEMENT: NDBG(("NA ")); + if ((ipv6_addr_type(saddr)&IPV6_ADDR_MULTICAST) && + msg->icmph.icmp6_solicited) { + printk(KERN_DEBUG "NDISC: solicited NA is multicasted\n"); + return 0; + } + if ((ifp = ipv6_chk_addr(&msg->target))) { + if (ifp->flags & DAD_INCOMPLETE) { + /* Address is duplicate. */ + printk(KERN_INFO "%s: duplicate address detected!\n", + ifp->idev->dev->name); + del_timer(&ifp->timer); + return 0; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. 
It could be misconfiguration, or + an smart proxy agent tries to help us :-) + */ + printk(KERN_DEBUG "%s: someone avertise our address!\n", + ifp->idev->dev->name); + return 0; + } neigh_table_lock(&nd_tbl); ndn = (struct nd_neigh *) neigh_lookup(&nd_tbl, (void *) &msg->target, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 3036497051ff..17af36fe6218 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.12 1997/04/01 02:23:34 davem Exp $ + * $Id: raw.c,v 1.13 1997/09/14 08:32:14 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -156,7 +156,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { v4addr = addr->sin6_addr.s6_addr32[3]; - if (__ip_chk_addr(v4addr) != IS_MYADDR) + if (inet_addr_type(v4addr) != RTN_LOCAL) return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { @@ -307,8 +307,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, { struct rawv6_fakehdr *hdr = (struct rawv6_fakehdr *) data; - hdr->cksum = csum_partial_copy_fromiovecend(buff, hdr->iov, offset, - len, hdr->cksum); + if (csum_partial_copy_fromiovecend(buff, hdr->iov, offset, + len, &hdr->cksum)) + return -EFAULT; if (offset == 0) { struct sock *sk; @@ -461,28 +462,49 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, char *optval, int optlen) { - struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; - int err = 0; + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&sk->tp_pinfo.tp_raw.filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + +static int 
rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + int len; switch (optname) { - case ICMPV6_FILTER: - err = copy_from_user(&opt->filter, optval, - sizeof(struct icmp6_filter)); - if (err) - err = -EFAULT; - break; - default: - err = -ENOPROTOOPT; + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sk->tp_pinfo.tp_raw.filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; }; - return err; + return 0; } + static int rawv6_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; - int val, err; + int val; switch(level) { case SOL_RAW: @@ -501,12 +523,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, optlen); }; - if (optval == NULL) - return(-EINVAL); - - err = get_user(val, (int *)optval); - if(err) - return err; + if (get_user(val, (int *)optval)) + return -EFAULT; switch (optname) { case IPV6_CHECKSUM: @@ -525,6 +543,53 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, } } +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; + int val, len; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (sk->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(len,optlen)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (opt->checksum == 0) + val = -1; + else + val = opt->offset; + + default: + return -ENOPROTOOPT; + } + + len=min(sizeof(int),len); + + if (put_user(len, optlen)) + return 
-EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + + static void rawv6_close(struct sock *sk, unsigned long timeout) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; @@ -558,7 +623,7 @@ struct proto rawv6_prot = { NULL, /* destroy */ NULL, /* shutdown */ rawv6_setsockopt, /* setsockopt */ - ipv6_getsockopt, /* getsockopt - FIXME */ + rawv6_getsockopt, /* getsockopt */ rawv6_sendmsg, /* sendmsg */ rawv6_recvmsg, /* recvmsg */ rawv6_bind, /* bind */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 90a8caf09acf..6a412d423c99 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: route.c,v 1.13 1997/07/19 11:11:35 davem Exp $ + * $Id: route.c,v 1.18 1997/10/17 00:15:05 freitag Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,6 +23,8 @@ #include #include #include +#include +#include #ifdef CONFIG_PROC_FS #include @@ -34,7 +36,7 @@ #include #include #include -#include +#include #include @@ -64,7 +66,7 @@ struct dst_ops ip6_dst_ops = { struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL, - 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0UL, 0, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} @@ -297,7 +299,7 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, rt6_lock(); fn = fib6_lookup(&ip6_routing_table, daddr, saddr); - rt = rt6_device_match(fn->leaf, dev, 0); + rt = rt6_device_match(fn->leaf, dev, flags&RTF_LINKRT); rt6_unlock(); return rt; } @@ -314,6 +316,9 @@ static struct rt6_info *rt6_cow(struct rt6_info *rt, struct in6_addr *daddr, if (rt) { ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + if (!(rt->rt6i_flags&RTF_GATEWAY)) + ipv6_addr_copy(&rt->rt6i_gateway, daddr); + rt->rt6i_dst.plen = 
128; rt->rt6i_flags |= RTF_CACHE; @@ -322,7 +327,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *rt, struct in6_addr *daddr, rt->rt6i_src.plen = 128; } - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, daddr); + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); rtreq_add(rt, RT_OPER_ADD); } else { @@ -556,6 +561,23 @@ struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) return NULL; } +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! + */ + +static void ipv6_wash_prefix(struct in6_addr *pfx, int plen) +{ + int b = plen&0x7; + int o = (plen + 7)>>3; + + if (o < 16) + memset(pfx->s6_addr + o, 0, 16 - o); + if (b != 0) + pfx->s6_addr[plen>>3] &= (0xFF<<(8-b)); +} + /* * */ @@ -566,7 +588,11 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) struct device *dev = NULL; int addr_type; - RDBG(("ip6_route_add(%p)[%p] ", rtmsg, __builtin_return_address(0))); + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) { + *err = -EINVAL; + return NULL; + } + *err = 0; rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); @@ -577,29 +603,6 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } - /* - * default... this should be chosen according to route flags - */ - -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = &rtmsg->rtmsg_dst; - int i; - - RDBG(("daddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - addr = &rtmsg->rtmsg_src; - RDBG(("saddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? 
']' : ':')); - } - } -#endif - addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); if (addr_type & IPV6_ADDR_MULTICAST) { @@ -609,71 +612,58 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) RDBG(("!MCAST ")); rt->u.dst.input = ip6_forward; } - + rt->u.dst.output = dev_queue_xmit; - - if (rtmsg->rtmsg_ifindex) + + if (rtmsg->rtmsg_ifindex) { dev = dev_get_by_index(rtmsg->rtmsg_ifindex); - if(dev) - RDBG(("d[%s] ", dev->name)); + if (dev == NULL) { + *err = -ENODEV; + goto out; + } + } ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst); rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; + ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen); - /* XXX Figure out what really is supposed to be happening here -DaveM */ ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); rt->rt6i_src.plen = rtmsg->rtmsg_src_len; - - if ((rt->rt6i_src.plen = rtmsg->rtmsg_src_len)) { - RDBG(("splen, ")); - ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); - } else { - RDBG(("!splen, ")); - } - /* XXX */ + ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); - if (rtmsg->rtmsg_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - struct rt6_info *grt; + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { struct in6_addr *gw_addr; - u32 flags = 0; - - RDBG(("RTF_GATEWAY, ")); - /* - * 1. gateway route lookup - * 2. ndisc_get_neigh - */ + int gwa_type; gw_addr = &rtmsg->rtmsg_gateway; - -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = gw_addr; - int i; - - RDBG(("gwaddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); + ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway); + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly inhibits using not link-local + addresses as nexthop address. + It is very good, but in some (rare!) curcumstances + (SIT, NBMA NOARP links) it is handy to allow + some exceptions. 
+ */ + if (!(gwa_type&IPV6_ADDR_UNICAST)) { + *err = -EINVAL; + goto out; } - } -#endif - if ((rtmsg->rtmsg_flags & RTF_GATEWAY) && - (rtmsg->rtmsg_flags & RTF_ADDRCONF) == 0) { - RDBG(("RTF_GATEWAY && !RTF_ADDRCONF, ")); - if (dev) - flags |= RTF_LINKRT; + grt = rt6_lookup(gw_addr, NULL, dev, RTF_LINKRT); - grt = rt6_lookup(gw_addr, NULL, dev, flags); - - if (grt == NULL) - { - RDBG(("!grt, ")); + if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { *err = -EHOSTUNREACH; goto out; } dev = grt->rt6i_dev; - RDBG(("grt(d=%s), ", dev ? dev->name : "NULL")); + } + if (dev == NULL) { + *err = -EINVAL; + goto out; } rt->rt6i_nexthop = ndisc_get_neigh(dev, gw_addr); @@ -739,20 +729,26 @@ int ip6_route_del(struct in6_rtmsg *rtmsg) /* * Find device */ - if(rtmsg->rtmsg_ifindex) + if(rtmsg->rtmsg_ifindex) { dev=dev_get_by_index(rtmsg->rtmsg_ifindex); + if (dev == NULL) + return -ENODEV; + } /* * Find route */ - rt=rt6_lookup(&rtmsg->rtmsg_dst, &rtmsg->rtmsg_src, dev, rtmsg->rtmsg_flags); - + rt=rt6_lookup(&rtmsg->rtmsg_dst, &rtmsg->rtmsg_src, dev, dev ? 
RTF_LINKRT : 0); + /* * Blow it away */ - if(rt) + if(rt && rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len && + rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { ip6_del_rt(rt); + return 0; + } - return 0; + return -ESRCH; } @@ -777,6 +773,7 @@ void __rt6_run_bh(void) rt6_bh_mask = 0; } +#ifdef CONFIG_NETLINK /* * NETLINK interface * routing socket moral equivalent @@ -815,6 +812,7 @@ out: kfree_skb(skb, FREE_READ); return count; } +#endif /* CONFIG_NETLINK */ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) { @@ -827,7 +825,9 @@ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) memcpy(skb_put(skb, sizeof(struct in6_rtmsg)), &rtmsg, sizeof(struct in6_rtmsg)); +#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) +#endif kfree_skb(skb, FREE_WRITE); } @@ -867,7 +867,9 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, msg->rtmsg_flags = flags; +#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) +#endif kfree_skb(skb, FREE_WRITE); } @@ -878,54 +880,28 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, struct in6_addr *target, struct device *dev, int on_link) { - struct rt6_info *rt, *tgtr, *nrt; + struct rt6_info *rt, *nrt; - RDBG(("rt6_redirect(%s)[%p]: ", - dev ? dev->name : "NULL", - __builtin_return_address(0))); + /* Locate old route to this destination. */ rt = rt6_lookup(dest, NULL, dev, 0); - if (rt == NULL || rt->u.dst.error) { - RDBG(("!rt\n")); - printk(KERN_DEBUG "rt6_redirect: no route to destination\n"); + if (rt == NULL || rt->u.dst.error) return NULL; - } - if (rt->rt6i_flags & RTF_GATEWAY) { - /* - * This can happen due to misconfiguration - * if we are dealing with an "on link" redirect. - */ - RDBG(("RTF_GATEWAY\n")); - printk(KERN_DEBUG "rt6_redirect: destination not directly " - "connected\n"); + /* Duplicate redirect: silently ignore. 
*/ + if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0) return NULL; - } - RDBG(("tgt_lkup, ")); - tgtr = rt6_lookup(target, NULL, dev, 0); - if (tgtr == NULL || tgtr->u.dst.error) { - /* - * duh?! no route to redirect target. - * How where we talking to it in the first place ? - */ - RDBG(("!tgtr||dsterr\n")); - printk(KERN_DEBUG "rt6_redirect: no route to target\n"); + /* Current route is on-link; redirect is always invalid. */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) return NULL; - } - - if ((tgtr->rt6i_flags & RTF_GATEWAY) && - ipv6_addr_cmp(dest, &tgtr->rt6i_gateway) == 0) { - RDBG(("tgt RTF_GATEWAY && dstmatch, dup\n")); - /* - * Check if we already have the right route. - */ -#if RT6_DEBUG >= 1 - printk(KERN_DEBUG "rt6_redirect: duplicate\n"); -#endif - return NULL; - } +#if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB) + /* + * During transition gateways have more than + * one link local address. Certainly, it is violation + * of basic principles, but it is temparary. + */ /* * RFC 1970 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -934,62 +910,57 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, * routers. */ - if (ipv6_addr_cmp(saddr, &tgtr->rt6i_gateway)) { - RDBG(("saddr/tgt->gway match, ")); - if (tgtr->rt6i_flags & RTF_DEFAULT) { - tgtr = ip6_routing_table.leaf; + if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) { + if (rt->rt6i_flags & RTF_DEFAULT) { + rt = ip6_routing_table.leaf; - for (; tgtr; tgtr = tgtr->u.next) { - if (!ipv6_addr_cmp(saddr, &tgtr->rt6i_gateway)) { - RDBG(("found srcok, ")); + for (; rt; rt = rt->u.next) { + if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) goto source_ok; - } } } - RDBG(("!dflt||!srcok, ")); printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " - "for redirect target\n"); + "for redirect target\n"); + return NULL; } source_ok: +#endif /* * We have finally decided to accept it. 
*/ - RDBG(("srcok: ")); - if ((tgtr->rt6i_flags & RTF_HOST)) { + if (rt->rt6i_dst.plen == 128) { /* * Already a host route. * */ - RDBG(("hralready, ")); - if (tgtr->rt6i_nexthop) { - RDBG(("nrel(nxthop) ")); - neigh_release(tgtr->rt6i_nexthop); - } + if (rt->rt6i_nexthop) + neigh_release(rt->rt6i_nexthop); /* * purge hh_cache */ - tgtr->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; - ipv6_addr_copy(&tgtr->rt6i_gateway, dest); - tgtr->rt6i_nexthop = ndisc_get_neigh(tgtr->rt6i_dev, dest); - RDBG(("hhpurge, getnewneigh, ret(%p)\n", tgtr)); - return tgtr; + rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; + if (on_link) + rt->rt6i_flags &= ~RTF_GATEWAY; + ipv6_addr_copy(&rt->rt6i_gateway, target); + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target); + return rt; } - nrt = ip6_rt_copy(tgtr); - nrt->rt6i_flags = RTF_GATEWAY|RTF_HOST|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + nrt = ip6_rt_copy(rt); + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&nrt->rt6i_dst.addr, target); + ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; - ipv6_addr_copy(&nrt->rt6i_gateway, dest); - nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, dest); + ipv6_addr_copy(&nrt->rt6i_gateway, target); + nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target); nrt->rt6i_dev = dev; nrt->u.dst.pmtu = dev->mtu; - RDBG(("rt6_ins(%p)\n", nrt)); - rt6_lock(); rt6_ins(nrt); rt6_unlock(); @@ -1023,7 +994,15 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) return; } - if (rt->rt6i_flags & RTF_HOST) { + /* It is wrong, but I plugged the hole here. 
+ On-link routes are cloned differently, + look at rt6_redirect --ANK + */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + return; + } + + if (rt->rt6i_dst.plen == 128) { /* * host route */ @@ -1037,7 +1016,7 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - rt->rt6i_flags |= (RTF_HOST | RTF_DYNAMIC | RTF_CACHE); + rt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); rt6_lock(); rt6_ins(rt); @@ -1065,7 +1044,7 @@ struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->rt6i_keylen = ort->rt6i_keylen; rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = ort->rt6i_metric; - + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); } @@ -1257,7 +1236,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) rt->rt6i_dev = dev_get("lo"); rt->u.dst.pmtu = rt->rt6i_dev->mtu; - rt->rt6i_flags = RTF_HOST | RTF_LOCAL | RTF_UP | RTF_NONEXTHOP; + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; @@ -1600,7 +1579,9 @@ __initfunc(void ip6_route_init(void)) proc_net_register(&proc_rt6_stats); proc_net_register(&proc_rt6_tree); #endif +#ifdef CONFIG_NETLINK netlink_attach(NETLINK_ROUTE6, rt6_msgrcv); +#endif } #ifdef MODULE @@ -1611,7 +1592,9 @@ void ip6_route_cleanup(void) proc_net_unregister(PROC_NET_RT6_TREE); proc_net_unregister(PROC_NET_RT6_STATS); #endif +#ifdef CONFIG_NETLINK netlink_detach(NETLINK_ROUTE6); +#endif #if 0 fib6_flush(); #endif diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d818bc777c50..4ff6e28d8bd4 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -4,8 +4,9 @@ * * Authors: * Pedro Roque + * Alexey Kuznetsov * - * $Id: sit.c,v 1.14 1997/04/29 09:38:52 mj Exp $ + * $Id: sit.c,v 1.23 1997/11/08 18:15:49 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public 
License @@ -13,6 +14,9 @@ * 2 of the License, or (at your option) any later version. */ +#include +#define __NO_VERSION__ +#include #include #include #include @@ -23,6 +27,7 @@ #include #include #include +#include #include #include @@ -31,385 +36,363 @@ #include #include #include +#include +#include #include #include #include #include -#include +#include +#include +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c -static int sit_init_dev(struct device *dev); - -static struct device sit_device = { - "sit0", - 0, 0, 0, 0, - 0x0, 0, - 0, 0, 0, NULL, sit_init_dev -}; - -static unsigned long sit_gc_last_run; -static void sit_mtu_cache_gc(void); - -static int sit_xmit(struct sk_buff *skb, - struct device *dev); -static int sit_rcv(struct sk_buff *skb, unsigned short len); -static void sit_err(struct sk_buff *skb, unsigned char *dp); - -static int sit_open(struct device *dev); -static int sit_close(struct device *dev); + For comments look at net/ipv4/ip_gre.c --ANK + */ -static struct net_device_stats *sit_get_stats(struct device *dev); +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) -extern void udp_err(struct sk_buff *, unsigned char *); +static int ipip6_fb_tunnel_init(struct device *dev); +static int ipip6_tunnel_init(struct device *dev); -static struct inet_protocol sit_protocol = { - sit_rcv, - sit_err, - 0, - IPPROTO_IPV6, - 0, - NULL, - "IPv6" +static struct device ipip6_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init, }; -#define SIT_NUM_BUCKETS 16 - -struct sit_mtu_info *sit_mtu_cache[SIT_NUM_BUCKETS]; - -static int vif_num = 0; -static struct sit_vif *vif_list = NULL; +static struct ip_tunnel ipip6_fb_tunnel = { + NULL, &ipip6_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"sit0", } +}; -static __inline__ __u32 sit_addr_hash(__u32 addr) -{ - - __u32 hash_val; - - hash_val = addr; - - hash_val ^= hash_val >> 16; - hash_val ^= hash_val >> 8; - - return (hash_val & (SIT_NUM_BUCKETS - 1)); -} 
+static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; -static void sit_cache_insert(__u32 addr, int mtu) +static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local) { - struct sit_mtu_info *minfo; - int hash; - - minfo = kmalloc(sizeof(struct sit_mtu_info), GFP_ATOMIC); - - if (minfo == NULL) - return; - - minfo->addr = addr; - minfo->tstamp = jiffies; - minfo->mtu = mtu; - - hash = sit_addr_hash(addr); - - minfo->next = sit_mtu_cache[hash]; - sit_mtu_cache[hash] = minfo; + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; } -static struct sit_mtu_info * sit_mtu_lookup(__u32 addr) +struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) { - struct sit_mtu_info *iter; - int hash; - - hash = sit_addr_hash(addr); + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = 0; + int prio = 0; - for(iter = sit_mtu_cache[hash]; iter; iter=iter->next) { - if (iter->addr == addr) { - iter->tstamp = jiffies; - break; - } + if (remote) { + prio |= 2; + h ^= HASH(remote); } - - /* - * run garbage collector - */ - - if (jiffies - sit_gc_last_run > SIT_GC_FREQUENCY) { - sit_mtu_cache_gc(); - sit_gc_last_run = jiffies; + if (local) { + 
prio |= 1; + h ^= HASH(local); } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; - return iter; -} - -static void sit_mtu_cache_gc(void) -{ - struct sit_mtu_info *iter, *back; - unsigned long now = jiffies; - int i; - - for (i=0; i < SIT_NUM_BUCKETS; i++) { - back = NULL; - for (iter = sit_mtu_cache[i]; iter;) { - if (now - iter->tstamp > SIT_GC_TIMEOUT) { - struct sit_mtu_info *old; - - old = iter; - iter = iter->next; - - if (back) - back->next = iter; - else - sit_mtu_cache[i] = iter; - - kfree(old); - continue; - } - back = iter; - iter = iter->next; + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipip6_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "sit%d", i); + if (dev_get(dev->name) == NULL) + break; } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); } + if (register_netdevice(dev) < 0) + goto failed; + + start_bh_atomic(); + nt->next = t; + *tp = nt; + end_bh_atomic(); + /* Do not decrement MOD_USE_COUNT here. 
*/ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; } -static int sit_init_dev(struct device *dev) -{ - int i; - - dev->open = sit_open; - dev->stop = sit_close; - - dev->hard_start_xmit = sit_xmit; - dev->get_stats = sit_get_stats; - - dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); - - if (dev->priv == NULL) - return -ENOMEM; - - memset(dev->priv, 0, sizeof(struct net_device_stats)); - - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - dev->hard_header = NULL; - dev->rebuild_header = NULL; - dev->set_mac_address = NULL; - dev->hard_header_cache = NULL; - dev->header_cache_update= NULL; - - dev->type = ARPHRD_SIT; - - dev->hard_header_len = MAX_HEADER; - dev->mtu = 1500 - sizeof(struct iphdr); - dev->addr_len = 0; - dev->tx_queue_len = 0; - - memset(dev->broadcast, 0, MAX_ADDR_LEN); - memset(dev->dev_addr, 0, MAX_ADDR_LEN); - - dev->flags = IFF_NOARP; - - dev->family = AF_INET6; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_dstaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 4; - - return 0; -} - -static int sit_init_vif(struct device *dev) +static void ipip6_tunnel_destroy(struct device *dev) { - int i; - - dev->flags = IFF_NOARP|IFF_POINTOPOINT|IFF_MULTICAST; - dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); - - if (dev->priv == NULL) - return -ENOMEM; - - memset(dev->priv, 0, sizeof(struct net_device_stats)); - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - return 0; -} - -static int sit_open(struct device *dev) -{ - return 0; -} + struct ip_tunnel *t, **tp; + struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv; + u32 remote = t0->parms.iph.daddr; + u32 local = t0->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (dev == &ipip6_fb_tunnel_dev) { + tunnels_wc[0] = NULL; + return; + } -static int sit_close(struct device *dev) -{ - return 0; + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= 
HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (t == t0) { + *tp = t->next; + kfree(dev); + MOD_DEC_USE_COUNT; + break; + } + } } -__initfunc(int sit_init(void)) -{ - int i; - - /* register device */ - - if (register_netdev(&sit_device) != 0) - return -EIO; - - inet_add_protocol(&sit_protocol); - - for (i=0; i < SIT_NUM_BUCKETS; i++) - sit_mtu_cache[i] = NULL; - - sit_gc_last_run = jiffies; - return 0; -} - -struct device *sit_add_tunnel(__u32 dstaddr) +void ipip6_err(struct sk_buff *skb, unsigned char *dp, int len) { - struct sit_vif *vif; - struct device *dev; +#ifndef I_WISH_WORLD_WERE_PERFECT - if ((sit_device.flags & IFF_UP) == 0) - return NULL; - - vif = kmalloc(sizeof(struct sit_vif), GFP_KERNEL); - if (vif == NULL) - return NULL; - - /* - * Create PtoP configured tunnel - */ - - dev = kmalloc(sizeof(struct device), GFP_KERNEL); - if (dev == NULL) - return NULL; - - memcpy(dev, &sit_device, sizeof(struct device)); - dev->init = sit_init_vif; - dev->pa_dstaddr = dstaddr; - - dev->name = vif->name; - sprintf(vif->name, "sit%d", ++vif_num); - - register_netdev(dev); +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)dp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; - vif->dev = dev; - vif->next = vif_list; - vif_list = vif; + if (len < sizeof(struct iphdr)) + return; - return dev; -} + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; -void sit_cleanup(void) -{ - struct sit_vif *vif; - - for (vif = vif_list; vif;) { - struct device *dev = vif->dev; - struct sit_vif *cur; - - unregister_netdev(dev); - kfree(dev->priv); - kfree(dev); - - cur = vif; - vif = vif->next; + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. 
*/ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; } - vif_list = NULL; - - unregister_netdev(&sit_device); - inet_del_protocol(&sit_protocol); - -} - -/* - * receive IPv4 ICMP messages - */ + t = ipip6_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + return; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; -static void sit_err(struct sk_buff *skb, unsigned char *dp) -{ + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct ipv6hdr *iph6; int type = skb->h.icmph->type; int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rt6_info *rt6i; - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - struct sit_mtu_info *minfo; - unsigned short info = skb->h.icmph->un.frag.mtu - sizeof(struct iphdr); - - minfo = sit_mtu_lookup(iph->daddr); - - printk(KERN_DEBUG "sit: %08lx pmtu = %ul\n", ntohl(iph->saddr), - info); - - if (minfo == NULL) { - minfo = kmalloc(sizeof(struct sit_mtu_info), - GFP_ATOMIC); + if (len < hlen + sizeof(struct ipv6hdr)) + return; + iph6 = (struct ipv6hdr*)(dp + hlen); - if (minfo == NULL) - return; + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? 
+ */ + rel_type = ICMPV6_PARAMPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Too complicated case ... */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + rel_type = ICMPV6_TIME_EXCEED; + rel_code = ICMPV6_EXC_HOPLIMIT; + break; + } - start_bh_atomic(); - sit_cache_insert(iph->daddr, info); - end_bh_atomic(); - } else { - minfo->mtu = info; + /* Prepare fake skb to feed it to icmpv6_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)iph6); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); + if (rt6i && rt6i->rt6i_dev) { + skb2->dev = rt6i->rt6i_dev; + + rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); + + if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { + struct ip_tunnel * t = (struct ip_tunnel*)rt6i->rt6i_dev->priv; + if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + } + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); } } + kfree_skb(skb2, FREE_WRITE); + return; +#endif } -static int sit_rcv(struct sk_buff *skb, unsigned short len) +int ipip6_rcv(struct sk_buff *skb, unsigned short len) { - struct net_device_stats *stats; - struct device *dev = NULL; - struct sit_vif *vif; - __u32 saddr = skb->nh.iph->saddr; - - skb->h.raw = skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); - - skb->protocol = __constant_htons(ETH_P_IPV6); - - 
for (vif = vif_list; vif; vif = vif->next) { - if (saddr == vif->dev->pa_dstaddr) { - dev = vif->dev; - break; - } + struct iphdr *iph; + struct ip_tunnel *tunnel; + + iph = skb->nh.iph; + + if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; } - if (dev == NULL) - dev = &sit_device; - - skb->dev = dev; - skb->ip_summed = CHECKSUM_NONE; - - stats = (struct net_device_stats *)dev->priv; - stats->rx_bytes += len; - stats->rx_packets++; - - ipv6_rcv(skb, dev, NULL); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb, FREE_READ); return 0; } -static int sit_xmit(struct sk_buff *skb, struct device *dev) +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
+ */ + +static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) { - struct net_device_stats *stats; - struct sit_mtu_info *minfo; + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + struct ipv6hdr *iph6 = skb->nh.ipv6h; + u8 tos = tunnel->parms.iph.tos; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; struct in6_addr *addr6; - struct rtable *rt; - struct iphdr *iph; - __u32 saddr; - __u32 daddr; int addr_type; - int mtu; - int headroom; - /* - * Make sure we are not busy (check lock variable) - */ + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } - stats = (struct net_device_stats *)dev->priv; + if (skb->protocol != __constant_htons(ETH_P_IPV6)) + goto tx_error; - daddr = dev->pa_dstaddr; - if (daddr == 0) { + if (!dst) { struct nd_neigh *neigh = NULL; if (skb->dst) @@ -417,9 +400,9 @@ static int sit_xmit(struct sk_buff *skb, struct device *dev) if (neigh == NULL) { printk(KERN_DEBUG "sit: nexthop == NULL\n"); - goto on_error; + goto tx_error; } - + addr6 = &neigh->ndn_addr; addr_type = ipv6_addr_type(addr6); @@ -428,88 +411,329 @@ static int sit_xmit(struct sk_buff *skb, struct device *dev) addr_type = ipv6_addr_type(addr6); } - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) { - printk(KERN_DEBUG "sit_xmit: non v4 address\n"); - goto on_error; - } - daddr = addr6->s6_addr32[3]; - } + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; - if (ip_route_output(&rt, daddr, 0, 0, NULL)) { - printk(KERN_DEBUG "sit: no route to host\n"); - goto on_error; + dst = addr6->s6_addr32[3]; } - minfo = sit_mtu_lookup(daddr); + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + 
goto tx_error_icmp; + } + tdev = rt->u.dst.dev; - /* IP should calculate pmtu correctly, - * let's check it... - */ -#if 0 - if (minfo) - mtu = minfo->mtu; - else -#endif - mtu = rt->u.dst.pmtu; + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } - if (mtu > 576 && skb->tail - (skb->data + sizeof(struct ipv6hdr)) > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + tunnel->stat.collisions++; ip_rt_put(rt); - goto on_error; + goto tx_error; + } + if (mtu >= 576) { + if (skb->dst && mtu < skb->dst->pmtu) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + if (mtu < rt6->u.dst.pmtu) { + if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + rt6->u.dst.pmtu = mtu; + } + } + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } } - headroom = ((rt->u.dst.dev->hard_header_len+15)&~15)+sizeof(struct iphdr); + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + } else + tunnel->err_count = 0; + } - if (skb_headroom(skb) < headroom || skb_shared(skb)) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, headroom); + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. 
+ */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - goto on_error; + stats->tx_dropped++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; } dev_kfree_skb(skb, FREE_WRITE); skb = new_skb; } - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr)); - skb->nh.iph = iph; - saddr = rt->rt_src; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); skb->dst = &rt->u.dst; - iph->version = 4; - iph->ihl = 5; - iph->tos = 0; /* tos set to 0... */ + /* + * Push down and install the IPIP header. + */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; if (mtu > 576) - iph->frag_off = htons(IP_DF); + iph->frag_off = __constant_htons(IP_DF); else - iph->frag_off = 0; - - iph->ttl = 64; - iph->saddr = saddr; - iph->daddr = daddr; - iph->protocol = IPPROTO_IPV6; - iph->tot_len = htons(skb->len); - iph->id = htons(ip_id_count++); - ip_send_check(iph); + iph->frag_off = 0; - ip_send(skb); + iph->protocol = IPPROTO_IPV6; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); stats->tx_bytes += skb->len; stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; return 0; -on_error: - dev_kfree_skb(skb, FREE_WRITE); +tx_error_icmp: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); +tx_error: stats->tx_errors++; - return 0; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; +} + +static int +ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + 
struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipip6_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + if (dev == &ipip6_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipip6_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); } -static struct net_device_stats *sit_get_stats(struct device *dev) +static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu) { - return((struct net_device_stats *) dev->priv); + if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip6_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipip6_tunnel_destroy; + dev->hard_start_xmit = ipip6_tunnel_xmit; + dev->get_stats = ipip6_tunnel_get_stats; + dev->do_ioctl = ipip6_tunnel_ioctl; + dev->change_mtu = ipip6_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipip6_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipip6_tunnel_init_gen(dev); + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + 
+ if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < 576) + dev->mtu = 576; + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +#ifdef MODULE +static int ipip6_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipip6_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipip6_fb_tunnel_init(struct device *dev)) +{ + struct iphdr *iph; + + ipip6_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipip6_fb_tunnel_open; + dev->stop = ipip6_fb_tunnel_close; +#endif + + iph = &ipip6_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + tunnels_wc[0] = &ipip6_fb_tunnel; + return 0; +} + +static struct inet_protocol sit_protocol = { + ipip6_rcv, + ipip6_err, + 0, + IPPROTO_IPV6, + 0, + NULL, + "IPv6" +}; + +#ifdef MODULE +void sit_cleanup(void) +{ + inet_del_protocol(&sit_protocol); + unregister_netdevice(&ipip6_fb_tunnel_dev); +} +#endif + +__initfunc(int sit_init(void)) +{ + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + ipip6_fb_tunnel_dev.priv = (void*)&ipip6_fb_tunnel; + ipip6_fb_tunnel_dev.name = ipip6_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipip6_fb_tunnel_dev); +#else + register_netdevice(&ipip6_fb_tunnel_dev); +#endif + inet_add_protocol(&sit_protocol); + return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7fba7c526548..b6559565b148 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: tcp_ipv6.c,v 1.37 1997/08/22 19:15:40 freitag Exp $ + * $Id: tcp_ipv6.c,v 1.43 1997/10/30 23:52:34 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -42,22 +42,23 @@ #include +#define ICMP_PARANOIA + extern int sysctl_tcp_sack; extern int 
sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; -static void tcp_v6_send_reset(struct in6_addr *saddr, - struct in6_addr *daddr, - struct tcphdr *th, struct proto *prot, - struct ipv6_options *opt, - struct device *dev, int pri, int hop_limit); - +static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb); static void tcp_v6_xmit(struct sk_buff *skb); +static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, + struct ipv6hdr *ip6h, + struct tcphdr *th, + struct open_request **prevp); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; @@ -536,7 +537,6 @@ out: return retval; } -/* XXX: this functions needs to be updated like tcp_v4_err. */ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, struct in6_addr *saddr, struct in6_addr *daddr, struct inet6_protocol *protocol) @@ -546,14 +546,34 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, struct sock *sk; int err; int opening; + struct tcp_opt *tp; +#ifdef ICMP_PARANOIA + __u32 seq; +#endif + + /* XXX: length check for tcphdr missing here */ sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source); - if (sk == NULL) + if (sk == NULL) { + /* XXX: Update ICMP error count */ return; + } + + tp = &sk->tp_pinfo.af_tcp; +#ifdef ICMP_PARANOIA + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet outside the tcp window:" + " s:%d %u,%u,%u\n", + (int)sk->state, seq, tp->snd_una, tp->snd_nxt); + return; + } +#endif - np = &sk->net_pinfo.af_inet6; + np = &sk->net_pinfo.af_inet6; if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { /* icmp should have updated the destination cache entry */ @@ -580,12 +600,52 @@ 
void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, else sk->mtu = np->dst->pmtu; - release_sock(sk); + if (sk->sock_readers) { /* remove later */ + printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); + return; + } + tcp_simple_retransmit(sk); return; } - /* FIXME: This is wrong. Need to check for open_requests here. */ - opening = (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV); + opening = 0; + /* Might be for an open_request */ + switch (sk->state) { + struct open_request *req, *prev; + struct ipv6hdr hd; + case TCP_LISTEN: + if (sk->sock_readers) + return; + + /* Grrrr - fix this later. */ + ipv6_addr_copy(&hd.saddr, saddr); + ipv6_addr_copy(&hd.daddr, daddr); + req = tcp_v6_search_req(tp, &hd,th, &prev); + if (!req) + return; +#ifdef ICMP_PARANOIA + if (seq != req->snt_isn) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet for openreq " + "with wrong seq number:%d:%d\n", + seq, req->snt_isn); + return; + } +#endif + if (req->sk) { + sk = req->sk; /* report error in accept */ + } else { + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); + } + /* FALL THROUGH */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + opening = 1; + break; + } + if (icmpv6_err_convert(type, code, &err) || opening) { sk->err = err; @@ -692,7 +752,8 @@ static void tcp_v6_or_free(struct open_request *req) static struct or_calltable or_ipv6 = { tcp_v6_send_synack, - tcp_v6_or_free + tcp_v6_or_free, + tcp_v6_send_reset }; /* FIXME: this is substantially similar to the ipv4 code. 
@@ -864,8 +925,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, atomic_set(&newsk->rmem_alloc, 0); newsk->localroute = sk->localroute; - newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; - newsk->err = 0; newsk->shutdown = 0; newsk->ack_backlog = 0; @@ -957,17 +1016,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } -static void tcp_v6_reply_reset(struct sk_buff *skb) -{ -} - -static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, - struct tcphdr *th, struct proto *prot, - struct ipv6_options *opt, - struct device *dev, int pri, int hop_limit) +static void tcp_v6_send_reset(struct sk_buff *skb) { + struct tcphdr *th = skb->h.th, *t1; struct sk_buff *buff; - struct tcphdr *t1; struct flowi fl; if(th->rst) @@ -982,7 +1034,7 @@ static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, if (buff == NULL) return; - buff->dev = dev; + buff->dev = skb->dev; tcp_v6_build_header(NULL, buff); @@ -1009,29 +1061,32 @@ static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, } buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - - t1->check = csum_ipv6_magic(saddr, daddr, sizeof(*t1), IPPROTO_TCP, + + fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->daddr; + fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->saddr; + + t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, + fl.nl_u.ip6_u.daddr, + sizeof(*t1), IPPROTO_TCP, buff->csum); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = daddr; - fl.nl_u.ip6_u.saddr = saddr; - fl.dev = dev; + fl.dev = skb->dev; fl.uli_u.ports.dport = th->dest; fl.uli_u.ports.sport = th->source; ip6_xmit(NULL, buff, &fl, NULL); tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, - void *header, + struct ipv6hdr *ip6h, struct tcphdr *th, struct open_request **prevp) { - struct ipv6hdr *ip6h = header; struct open_request *req, *prev; - __u16 rport = 
th->source; + __u16 rport = th->source; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're @@ -1050,6 +1105,22 @@ static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, return NULL; } +static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + + req = tcp_v6_search_req(tp,skb->nh.ipv6h,skb->h.th,&prev); + if (!req) + return; + /* Sequence number check required by RFC793 */ + if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + return; + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); +} + int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, struct in6_addr *saddr, struct in6_addr *daddr, struct ipv6_options *opt, unsigned short len, @@ -1077,7 +1148,13 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, * Pull up the IP header. */ - skb_pull(skb, skb->h.raw - skb->data); + __skb_pull(skb, skb->h.raw - skb->data); + + /* + * Count it even if it's bad. + */ + + tcp_statistics.TcpInSegs++; /* * Try to use the device checksum if provided. @@ -1089,14 +1166,13 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, case CHECKSUM_HW: if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { printk(KERN_DEBUG "tcp csum failed\n"); + tcp_statistics.TcpInErrs++; goto discard_it; } default: /* CHECKSUM_UNNECESSARY */ }; - tcp_statistics.TcpInSegs++; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest); if (!sk) { @@ -1137,28 +1213,35 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, } } - if (!sk->prot) { - printk(KERN_DEBUG "tcp_rcv: sk->prot == NULL\n"); - return(0); - } - skb_set_owner_r(skb, sk); - /* I don't understand why lock_sock()/release_sock() is not - * called here. IPv4 does this. It looks like a bug to me. 
-AK - */ if (sk->state == TCP_ESTABLISHED) { if (tcp_rcv_established(sk, skb, th, len)) goto no_tcp_socket; return 0; } + if (sk->state == TCP_LISTEN) { + __u32 flg = ((u32 *)th)[3]; - if (sk->state == TCP_LISTEN && - ((u32 *)th)[3] & __constant_htonl(0x00120000)) { - sk = tcp_check_req(sk, skb, opt); - if (sk == NULL) - goto discard_it; + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v6_rst_req(sk, skb); + } + + /* Check SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *prev; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + req = tcp_v6_search_req(tp, skb->nh.ipv6h,th,&prev); + if (req) { + sk = tcp_check_req(sk, skb, req); + } + /* else do syncookies (add them here) */ + if (sk == NULL) + goto discard_it; + } } if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0) @@ -1168,11 +1251,10 @@ no_tcp_socket: /* * No such TCB. If th->rst is 0 send a reset - * (checked in tcp_send_reset) + * (checked in tcp_v6_send_reset) */ - tcp_v6_send_reset(daddr, saddr, th, &tcpv6_prot, opt, dev, - skb->nh.ipv6h->priority, 255); + tcp_v6_send_reset(skb); discard_it: @@ -1285,12 +1367,6 @@ static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_port = sk->dummy_th.dest; } -static struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb, - void *opt) -{ - return sk; /* dummy */ -} - static struct tcp_func ipv6_specific = { tcp_v6_build_header, tcp_v6_xmit, @@ -1302,9 +1378,6 @@ static struct tcp_func ipv6_specific = { ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, - tcp_v6_reply_reset, - tcp_v6_search_req, - /* not implemented yet: */ cookie_v6_check, sizeof(struct sockaddr_in6) }; @@ -1323,9 +1396,6 @@ static struct tcp_func ipv6_mapped = { ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, - tcp_v6_reply_reset, - tcp_v6_search_req, - cookie_v6_check, /* not implemented yet. 
*/ sizeof(struct sockaddr_in6) }; @@ -1364,8 +1434,6 @@ static int tcp_v6_init_sock(struct sock *sk) sk->priority = 1; sk->state = TCP_CLOSE; - /* this is how many unacked bytes we will accept for this socket. */ - sk->max_unacked = 2048; /* needs to be at most 2 full packets. */ sk->max_ack_backlog = SOMAXCONN; sk->mtu = 576; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f18f5a6f8d32..aed22f964375 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.17 1997/04/29 09:38:55 mj Exp $ + * $Id: udp.c,v 1.18 1997/09/14 08:32:24 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -601,8 +601,9 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, clen -= sizeof(struct udphdr); } - udh->wcheck = csum_partial_copy_fromiovecend(dst, udh->iov, offset, - clen, udh->wcheck); + if (csum_partial_copy_fromiovecend(dst, udh->iov, offset, + clen, &udh->wcheck)) + return -EFAULT; if (final) { struct in6_addr *daddr; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 13d9528e60ae..2a46c5270c6f 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -694,7 +694,6 @@ static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) /* * Send it out */ - skb->priority = SOPRI_NORMAL; dev_queue_xmit(skb); return 0; } diff --git a/net/netlink.c b/net/netlink.c deleted file mode 100644 index d2128c1802dd..000000000000 --- a/net/netlink.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * NETLINK An implementation of a loadable kernel mode driver providing - * multiple kernel/user space bidirectional communications links. - * - * Author: Alan Cox - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -static int (*netlink_handler[MAX_LINKS])(int minor, struct sk_buff *skb); -static struct sk_buff_head skb_queue_rd[MAX_LINKS]; -static int rdq_size[MAX_LINKS]; -static struct wait_queue *read_space_wait[MAX_LINKS]; - -static unsigned long active_map = 0; -static unsigned long open_map = 0; - -/* - * Device operations - */ - -/* - * Default write handler. - */ - -static int netlink_err(int minor, struct sk_buff *skb) -{ - kfree_skb(skb, FREE_READ); - return -EUNATCH; -} - -/* - * Exported do nothing receiver for one way - * interfaces. - */ - -int netlink_donothing(int minor, struct sk_buff *skb) -{ - kfree_skb(skb, FREE_READ); - return -EINVAL; -} - -static unsigned int netlink_poll(struct file *file, poll_table * wait) -{ - unsigned int mask; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - - poll_wait(&read_space_wait[minor], wait); - mask = POLLOUT | POLLWRNORM; - if (skb_peek(&skb_queue_rd[minor])) - mask |= POLLIN | POLLRDNORM; - return mask; -} - -/* - * Write a message to the kernel side of a communication link - */ - -static ssize_t netlink_write(struct file * file, const char * buf, - size_t count,loff_t *ppos) -{ - int err; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - struct sk_buff *skb; - skb=alloc_skb(count, GFP_KERNEL); - err = copy_from_user(skb_put(skb,count),buf, count); - return err ? 
-EFAULT : (netlink_handler[minor])(minor,skb); -} - -/* - * Read a message from the kernel side of the communication link - */ - -static ssize_t netlink_read(struct file * file, char * buf, - size_t count,loff_t *ppos) -{ - int err; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - struct sk_buff *skb; - cli(); - while((skb=skb_dequeue(&skb_queue_rd[minor]))==NULL) - { - if(file->f_flags&O_NONBLOCK) - { - sti(); - return -EAGAIN; - } - interruptible_sleep_on(&read_space_wait[minor]); - if(signal_pending(current)) - { - sti(); - return -ERESTARTSYS; - } - } - rdq_size[minor]-=skb->len; - sti(); - if(skb->lenlen; - err = copy_to_user(buf,skb->data,count); - kfree_skb(skb, FREE_READ); - return err ? -EFAULT : count; -} - -static long long netlink_lseek(struct file * file, long long offset, int origin) -{ - return -ESPIPE; -} - -static int netlink_open(struct inode * inode, struct file * file) -{ - unsigned int minor = MINOR(inode->i_rdev); - - if(minor>=MAX_LINKS) - return -ENODEV; - if(active_map&(1<f_mode & FMODE_READ) - { - if (open_map&(1<i_rdev); - if (file->f_mode & FMODE_READ) - open_map&=~(1<i_rdev); - int retval = 0; - - if (minor >= MAX_LINKS) - return -ENODEV; - switch ( cmd ) { - default: - retval = -EINVAL; - } - return retval; -} - - -static struct file_operations netlink_fops = { - netlink_lseek, - netlink_read, - netlink_write, - NULL, /* netlink_readdir */ - netlink_poll, - netlink_ioctl, - NULL, /* netlink_mmap */ - netlink_open, - netlink_release -}; - -/* - * We export these functions to other modules. They provide a - * complete set of kernel non-blocking support for message - * queueing. 
- */ - -int netlink_attach(int unit, int (*function)(int minor, struct sk_buff *skb)) -{ - if(unit>=MAX_LINKS) - return -ENODEV; - if(active_map&(1<len>MAX_QBYTES) - ret=-EAGAIN; - else - { - skb_queue_tail(&skb_queue_rd[unit], skb); - rdq_size[unit]+=skb->len; - ret=0; - wake_up_interruptible(&read_space_wait[unit]); - } - restore_flags(flags); - } - return ret; -} - - -/* - * "High" level netlink interface. (ANK) - * - * Features: - * - standard message format. - * - pseudo-reliable delivery. Messages can be still lost, but - * user level will know that they were lost and can - * recover (f.e. gated could reread FIB and device list) - * - messages are batched. - * - if user is not attached, we do not make useless work. - * - * Examples: - * - netlink_post equivalent (but with pseudo-reliable delivery) - * ctl.nlmsg_delay = 0; - * ctl.nlmsg_maxsize = ; - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) { - * ... make it ... - * nlmsg_transmit(&ctl); - * } - * - * - batched messages. - * if nlmsg_delay==0, messages are delivered only - * by nlmsg_transmit, or when batch is completed, - * otherwise nlmsg_transmit is noop (only starts - * timer) - * - * ctl.nlmsg_delay = ...; - * ctl.nlmsg_maxsize = ; - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) - * ... make it ... - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) - * ... make it ... - * .... - * if (ctl.nlmsg_skb) - * nlmsg_transmit(&ctl); - * - */ - -/* - * Try to deliver queued messages. - * If the delivery fails (netlink is not attached or congested), - * do not free skb to avoid useless new message creation. - * - * Notes: - * - timer should be already stopped. - * - NET SPL. 
- */ - -void nlmsg_flush(struct nlmsg_ctl *ctl) -{ - if (ctl->nlmsg_skb == NULL) - return; - - if (netlink_post(ctl->nlmsg_unit, ctl->nlmsg_skb) == 0) - { - ctl->nlmsg_skb = NULL; - return; - } - - ctl->nlmsg_timer.expires = jiffies + NLMSG_RECOVERY_TIMEO; - ctl->nlmsg_timer.data = (unsigned long)ctl; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - add_timer(&ctl->nlmsg_timer); - return; -} - - -/* - * Allocate room for new message. If it is impossible, - * start "overrun" mode and return NULL. - * - * Notes: - * - NET SPL. - */ - -void* nlmsg_send(struct nlmsg_ctl *ctl, unsigned long type, int len, - unsigned long seq, unsigned long pid) -{ - struct nlmsghdr *nlh; - struct sk_buff *skb; - int rlen; - - static __inline__ void nlmsg_lost(struct nlmsg_ctl *ctl, - unsigned long seq) - { - if (!ctl->nlmsg_overrun) - { - ctl->nlmsg_overrun_start = seq; - ctl->nlmsg_overrun_end = seq; - ctl->nlmsg_overrun = 1; - return; - } - if (!ctl->nlmsg_overrun_start) - ctl->nlmsg_overrun_start = seq; - if (seq) - ctl->nlmsg_overrun_end = seq; - } - - if (!(open_map&(1<nlmsg_unit))) - { - nlmsg_lost(ctl, seq); - return NULL; - } - - rlen = NLMSG_ALIGN(len + sizeof(struct nlmsghdr)); - - if (rlen > ctl->nlmsg_maxsize) - { - printk(KERN_ERR "nlmsg_send: too big message\n"); - return NULL; - } - - if ((skb=ctl->nlmsg_skb) == NULL || skb_tailroom(skb) < rlen) - { - if (skb) - { - ctl->nlmsg_force++; - nlmsg_flush(ctl); - ctl->nlmsg_force--; - } - - if (ctl->nlmsg_skb || - (skb=alloc_skb(ctl->nlmsg_maxsize, GFP_ATOMIC)) == NULL) - { - printk (KERN_WARNING "nlmsg at unit %d overrunned\n", ctl->nlmsg_unit); - nlmsg_lost(ctl, seq); - return NULL; - } - - ctl->nlmsg_skb = skb; - - if (ctl->nlmsg_overrun) - { - int *seqp; - nlh = (struct nlmsghdr*)skb_put(skb, sizeof(struct nlmsghdr) + 2*sizeof(unsigned long)); - nlh->nlmsg_type = NLMSG_OVERRUN; - nlh->nlmsg_len = sizeof(struct nlmsghdr) + 2*sizeof(unsigned long); - nlh->nlmsg_seq = 0; - nlh->nlmsg_pid = 0; - seqp = 
(int*)nlh->nlmsg_data; - seqp[0] = ctl->nlmsg_overrun_start; - seqp[1] = ctl->nlmsg_overrun_end; - ctl->nlmsg_overrun = 0; - } - if (ctl->nlmsg_timer.function) - { - del_timer(&ctl->nlmsg_timer); - ctl->nlmsg_timer.function = NULL; - } - if (ctl->nlmsg_delay) - { - ctl->nlmsg_timer.expires = jiffies + ctl->nlmsg_delay; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - ctl->nlmsg_timer.data = (unsigned long)ctl; - add_timer(&ctl->nlmsg_timer); - } - } - - nlh = (struct nlmsghdr*)skb_put(skb, rlen); - nlh->nlmsg_type = type; - nlh->nlmsg_len = sizeof(struct nlmsghdr) + len; - nlh->nlmsg_seq = seq; - nlh->nlmsg_pid = pid; - return nlh->nlmsg_data; -} - -/* - * Kick message queue. - * Two modes: - * - synchronous (delay==0). Messages are delivered immediately. - * - delayed. Do not deliver, but start delivery timer. - */ - -void nlmsg_transmit(struct nlmsg_ctl *ctl) -{ - start_bh_atomic(); - - if (!ctl->nlmsg_delay) - { - if (ctl->nlmsg_timer.function) - { - del_timer(&ctl->nlmsg_timer); - ctl->nlmsg_timer.function = NULL; - } - ctl->nlmsg_force++; - nlmsg_flush(ctl); - ctl->nlmsg_force--; - end_bh_atomic(); - return; - } - if (!ctl->nlmsg_timer.function) - { - ctl->nlmsg_timer.expires = jiffies + ctl->nlmsg_delay; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - ctl->nlmsg_timer.data = (unsigned long)ctl; - add_timer(&ctl->nlmsg_timer); - } - - end_bh_atomic(); -} - - -__initfunc(int init_netlink(void)) -{ - int ct; - - if(register_chrdev(NETLINK_MAJOR,"netlink", &netlink_fops)) { - printk(KERN_ERR "netlink: unable to get major %d\n", NETLINK_MAJOR); - return -EIO; - } - for(ct=0;ct + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Nprintk(a...) + +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) +#define NL_EMULATE_DEV +#endif + +static struct sock *nl_table[MAX_LINKS]; +static atomic_t nl_table_lock[MAX_LINKS]; +static struct wait_queue *nl_table_wait; + +#ifdef NL_EMULATE_DEV +static struct socket *netlink_kernel[MAX_LINKS]; +#endif + +static int netlink_dump(struct sock *sk); +static void netlink_destroy_callback(struct netlink_callback *cb); + +extern __inline__ void +netlink_wait_on_table(int protocol) +{ + while (atomic_read(&nl_table_lock[protocol])) + sleep_on(&nl_table_wait); +} + +extern __inline__ void +netlink_lock_table(int protocol) +{ + atomic_inc(&nl_table_lock[protocol]); +} + +extern __inline__ void +netlink_unlock_table(int protocol, int wakeup) +{ +#if 0 + /* F...g gcc does not eat it! 
*/ + + if (atomic_dec_and_test(&nl_table_lock[protocol]) && wakeup) + wake_up(&nl_table_wait); +#else + atomic_dec(&nl_table_lock[protocol]); + if (atomic_read(&nl_table_lock[protocol]) && wakeup) + wake_up(&nl_table_wait); +#endif +} + +static __inline__ void netlink_lock(struct sock *sk) +{ + atomic_inc(&sk->protinfo.af_netlink.locks); +} + +static __inline__ void netlink_unlock(struct sock *sk) +{ + atomic_dec(&sk->protinfo.af_netlink.locks); +} + +static __inline__ int netlink_locked(struct sock *sk) +{ + return atomic_read(&sk->protinfo.af_netlink.locks); +} + +static __inline__ struct sock *netlink_lookup(int protocol, pid_t pid) +{ + struct sock *sk; + + for (sk=nl_table[protocol]; sk; sk=sk->next) { + if (sk->protinfo.af_netlink.pid == pid) { + netlink_lock(sk); + return sk; + } + } + + return NULL; +} + +extern struct proto_ops netlink_ops; + +static void netlink_insert(struct sock *sk) +{ + cli(); + sk->next = nl_table[sk->protocol]; + nl_table[sk->protocol] = sk; + sti(); +} + +static void netlink_remove(struct sock *sk) +{ + struct sock **skp; + for (skp = &nl_table[sk->protocol]; *skp; skp = &((*skp)->next)) { + if (*skp == sk) { + *skp = sk->next; + return; + } + } +} + +static int netlink_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol<0 || protocol >= MAX_LINKS) + return -EPROTONOSUPPORT; + + sock->ops = &netlink_ops; + + sk = sk_alloc(AF_NETLINK, GFP_KERNEL); + if (!sk) + return -ENOMEM; + + sock_init_data(sock,sk); + sk->destruct = NULL; + + sk->mtu=4096; + sk->protocol=protocol; + return 0; +} + +static void netlink_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!netlink_locked(sk) && !atomic_read(&sk->wmem_alloc) + && !atomic_read(&sk->rmem_alloc)) { + sk_free(sk); + return; + } + + sk->timer.expires=jiffies+10*HZ; + add_timer(&sk->timer); + 
printk(KERN_DEBUG "netlink sk destroy delayed\n"); +} + +static int netlink_release(struct socket *sock, struct socket *peer) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + /* Wait on table before removing socket */ + netlink_wait_on_table(sk->protocol); + netlink_remove(sk); + + if (sk->protinfo.af_netlink.cb) { + netlink_unlock(sk); + sk->protinfo.af_netlink.cb->done(sk->protinfo.af_netlink.cb); + netlink_destroy_callback(sk->protinfo.af_netlink.cb); + sk->protinfo.af_netlink.cb = NULL; + } + + /* OK. Socket is unlinked, and, therefore, + no new packets will arrive */ + sk->state_change(sk); + sk->dead = 1; + + skb_queue_purge(&sk->receive_queue); + skb_queue_purge(&sk->write_queue); + + /* IMPORTANT! It is the major unpleasant feature of this + transport (and AF_UNIX datagram, when it will be repaired). + + Someone could wait on our sock->wait now. + We cannot release socket until waiter will remove yourself + from wait queue. I choose the most conservetive way of solving + the problem. + + We waked up this queue above, so that we need only to wait + when the readers release us. + */ + + while (netlink_locked(sk)) { + current->counter = 0; + schedule(); + } + + if (sk->socket) { + sk->socket = NULL; + sock->sk = NULL; + } + + if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) { + sk->timer.data=(unsigned long)sk; + sk->timer.expires=jiffies+HZ; + sk->timer.function=netlink_destroy_timer; + add_timer(&sk->timer); + printk(KERN_DEBUG "impossible 333\n"); + return 0; + } + + sk_free(sk); + return 0; +} + +static int netlink_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct sock *osk; + + netlink_wait_on_table(sk->protocol); + + sk->protinfo.af_netlink.groups = 0; + sk->protinfo.af_netlink.pid = current->pid; + +retry: + for (osk=nl_table[sk->protocol]; osk; osk=osk->next) { + if (osk->protinfo.af_netlink.pid == sk->protinfo.af_netlink.pid) { + /* Bind collision, search negative pid values. 
*/ + if (sk->protinfo.af_netlink.pid > 0) + sk->protinfo.af_netlink.pid = -4096; + sk->protinfo.af_netlink.pid--; + goto retry; + } + } + + netlink_insert(sk); + return 0; +} + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sock *osk; + struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr; + + if (nladdr->nl_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to listen multicasts */ + if (nladdr->nl_groups && !suser()) + return -EPERM; + + if (sk->protinfo.af_netlink.pid) { + if (nladdr->nl_pid != sk->protinfo.af_netlink.pid) + return -EINVAL; + sk->protinfo.af_netlink.groups = nladdr->nl_groups; + return 0; + } + + if (nladdr->nl_pid == 0) { + netlink_autobind(sock); + sk->protinfo.af_netlink.groups = nladdr->nl_groups; + return 0; + } + + netlink_wait_on_table(sk->protocol); + + for (osk=nl_table[sk->protocol]; osk; osk=osk->next) { + if (osk->protinfo.af_netlink.pid == nladdr->nl_pid) + return -EADDRINUSE; + } + + sk->protinfo.af_netlink.pid = nladdr->nl_pid; + sk->protinfo.af_netlink.groups = nladdr->nl_groups; + netlink_insert(sk); + return 0; +} + +static int netlink_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr; + + if (addr->sa_family == AF_UNSPEC) + { + sk->protinfo.af_netlink.dst_pid = 0; + sk->protinfo.af_netlink.dst_groups = 0; + return 0; + } + if (addr->sa_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to send multicasts */ + if (!suser() && nladdr->nl_groups) + return -EPERM; + + sk->protinfo.af_netlink.dst_pid = nladdr->nl_pid; + sk->protinfo.af_netlink.dst_groups = nladdr->nl_groups; + + if (!sk->protinfo.af_netlink.pid) + netlink_autobind(sock); + return 0; +} + +static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) +{ + struct sock *sk = sock->sk; + struct 
sockaddr_nl *nladdr=(struct sockaddr_nl *)addr; + + nladdr->nl_family = AF_NETLINK; + *addr_len = sizeof(*nladdr); + + if (peer) { + nladdr->nl_pid = sk->protinfo.af_netlink.dst_pid; + nladdr->nl_groups = sk->protinfo.af_netlink.dst_groups; + } else { + nladdr->nl_pid = sk->protinfo.af_netlink.pid; + nladdr->nl_groups = sk->protinfo.af_netlink.groups; + } + return 0; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, pid_t pid, int nonblock) +{ + struct sock *sk; + int len = skb->len; + int protocol = ssk->protocol; + +retry: + for (sk = nl_table[protocol]; sk; sk = sk->next) { + if (sk->protinfo.af_netlink.pid != pid) + continue; + + netlink_lock(sk); + +#ifdef NL_EMULATE_DEV + if (sk->protinfo.af_netlink.handler) { + len = sk->protinfo.af_netlink.handler(protocol, skb); + netlink_unlock(sk); + return len; + } +#endif + + cli(); + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + if (nonblock) { + sti(); + netlink_unlock(sk); + kfree_skb(skb, 0); + return -EAGAIN; + } + interruptible_sleep_on(sk->sleep); + netlink_unlock(sk); + sti(); + + if (current->signal & ~current->blocked) { + kfree_skb(skb, 0); + return -ERESTARTSYS; + } + goto retry; + } + sti(); +Nprintk("unicast_deliver %d\n", skb->len); + skb_orphan(skb); + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->receive_queue, skb); + sk->data_ready(sk, len); + netlink_unlock(sk); + return len; + } + kfree_skb(skb, 0); + return -ECONNREFUSED; +} + +static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) +{ +#ifdef NL_EMULATE_DEV + if (sk->protinfo.af_netlink.handler) { + sk->protinfo.af_netlink.handler(sk->protocol, skb); + return 0; + } else +#endif + if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { +Nprintk("broadcast_deliver %d\n", skb->len); + skb_orphan(skb); + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->receive_queue, skb); + sk->data_ready(sk, skb->len); + return 0; + } + return -1; +} + +void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t 
pid, + unsigned group, int allocation) +{ + struct sock *sk; + struct sk_buff *skb2 = NULL; + int protocol = ssk->protocol; + int failure = 0; + + /* While we sleep in clone, do not allow to change socket list */ + + netlink_lock_table(protocol); + + for (sk = nl_table[protocol]; sk; sk = sk->next) { + if (ssk == sk) + continue; + + if (sk->protinfo.af_netlink.pid == pid || + !(sk->protinfo.af_netlink.groups&group)) + continue; + + if (failure) { + sk->err = -ENOBUFS; + sk->state_change(sk); + continue; + } + + netlink_lock(sk); + if (skb2 == NULL) { + if (atomic_read(&skb->users) != 1) { + skb2 = skb_clone(skb, allocation); + } else { + skb2 = skb; + atomic_inc(&skb->users); + } + } + if (skb2 == NULL) { + sk->err = -ENOBUFS; + sk->state_change(sk); + /* Clone failed. Notify ALL listeners. */ + failure = 1; + } else if (netlink_broadcast_deliver(sk, skb2)) { + sk->err = -ENOBUFS; + sk->state_change(sk); + } else + skb2 = NULL; + netlink_unlock(sk); + } + + netlink_unlock_table(protocol, allocation == GFP_KERNEL); + + if (skb2) + kfree_skb(skb2, 0); + kfree_skb(skb, 0); +} + +void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code) +{ + struct sock *sk; + int protocol = ssk->protocol; + +Nprintk("seterr"); + for (sk = nl_table[protocol]; sk; sk = sk->next) { + if (ssk == sk) + continue; + + if (sk->protinfo.af_netlink.pid == pid || + !(sk->protinfo.af_netlink.groups&group)) + continue; + + sk->err = -code; + sk->state_change(sk); + } +} + +static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sockaddr_nl *addr=msg->msg_name; + pid_t dst_pid; + unsigned dst_groups; + struct sk_buff *skb; + int err; + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; + + if (msg->msg_flags&~MSG_DONTWAIT) { + printk("1 %08x\n", msg->msg_flags); + return -EINVAL; + } + + if (msg->msg_namelen) { + if (addr->nl_family != AF_NETLINK) { + printk("2 %08x\n", addr->nl_family); + 
return -EINVAL; + } + dst_pid = addr->nl_pid; + dst_groups = addr->nl_groups; + if (dst_groups && !suser()) + return -EPERM; + } else { + dst_pid = sk->protinfo.af_netlink.dst_pid; + dst_groups = sk->protinfo.af_netlink.dst_groups; + } + + + if (!sk->protinfo.af_netlink.pid) + netlink_autobind(sock); + + skb = sock_wmalloc(sk, len, 0, GFP_KERNEL); + if (skb==NULL) + return -ENOBUFS; + + NETLINK_CB(skb).pid = sk->protinfo.af_netlink.pid; + NETLINK_CB(skb).groups = sk->protinfo.af_netlink.groups; + NETLINK_CB(skb).dst_pid = dst_pid; + NETLINK_CB(skb).dst_groups = dst_groups; + memcpy(NETLINK_CREDS(skb), &scm->creds, sizeof(struct ucred)); + memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + + if (dst_groups) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + if (err < 0) { + printk("3\n"); + } + return err; +} + +static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + int noblock = flags&MSG_DONTWAIT; + int copied; + struct sk_buff *skb; + int err; + + if (flags&(MSG_OOB|MSG_PEEK)) + return -EOPNOTSUPP; + + err = -sock_error(sk); + if (err) + return err; + + skb = skb_recv_datagram(sk,flags,noblock,&err); + if (skb==NULL) + return err; + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (msg->msg_name) { + struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name; + addr->nl_family = AF_NETLINK; + addr->nl_pid = NETLINK_CB(skb).pid; + addr->nl_groups = NETLINK_CB(skb).dst_groups; + msg->msg_namelen = sizeof(*addr); + } + + scm->creds = *NETLINK_CREDS(skb); + skb_free_datagram(sk, skb); + + if (sk->protinfo.af_netlink.cb + && atomic_read(&sk->rmem_alloc) <= sk->rcvbuf/2) + 
netlink_dump(sk); + return err ? err : copied; +} + +/* + * We export these functions to other modules. They provide a + * complete set of kernel non-blocking support for message + * queueing. + */ + +struct sock * +netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +{ + struct socket *sock; + struct sock *sk; + + if (unit<0 || unit>=MAX_LINKS) + return NULL; + + if (!(sock = sock_alloc())) + return NULL; + + sock->type = SOCK_RAW; + + if (netlink_create(sock, unit) < 0) { + sock_release(sock); + return NULL; + } + sk = sock->sk; + if (input) + sk->data_ready = input; + + netlink_insert(sk); + return sk; +} + +static void netlink_destroy_callback(struct netlink_callback *cb) +{ + if (cb->skb) + kfree_skb(cb->skb, 0); + kfree(cb); +} + +/* + * It looks a bit ugly. + * It would be better to create kernel thread. + */ + +static int netlink_dump(struct sock *sk) +{ + struct netlink_callback *cb; + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len; + + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + cb = sk->protinfo.af_netlink.cb; + + len = cb->dump(skb, cb); + + if (len > 0) { + skb_queue_tail(&sk->receive_queue, skb); + sk->data_ready(sk, len); + return 0; + } + + nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int)); + nlh->nlmsg_flags |= NLM_F_MULTI; + memcpy(NLMSG_DATA(nlh), &len, sizeof(len)); + skb_queue_tail(&sk->receive_queue, skb); + sk->data_ready(sk, skb->len); + + cb->done(cb); + sk->protinfo.af_netlink.cb = NULL; + netlink_destroy_callback(cb); + netlink_unlock(sk); + return 0; +} + +int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + struct nlmsghdr *nlh, + int (*dump)(struct sk_buff *skb, struct netlink_callback*), + int (*done)(struct netlink_callback*)) +{ + struct netlink_callback *cb; + struct sock *sk; + + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (cb == NULL) + return -ENOBUFS; + + memset(cb, 0, sizeof(*cb)); + cb->dump = dump; 
+ cb->done = done; + cb->nlh = nlh; + atomic_inc(&skb->users); + cb->skb = skb; + + sk = netlink_lookup(ssk->protocol, NETLINK_CB(skb).pid); + if (sk == NULL) { + netlink_destroy_callback(cb); + return -ECONNREFUSED; + } + /* A dump is in progress... */ + if (sk->protinfo.af_netlink.cb) { + netlink_destroy_callback(cb); + netlink_unlock(sk); + return -EBUSY; + } + sk->protinfo.af_netlink.cb = cb; + netlink_dump(sk); + return 0; +} + +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +{ + struct sk_buff *skb; + struct nlmsghdr *rep; + struct nlmsgerr *errmsg; + int size; + + if (err == 0) + size = NLMSG_SPACE(sizeof(struct nlmsgerr)); + else + size = NLMSG_SPACE(4 + nlh->nlmsg_len); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NLMSG_ERROR, sizeof(struct nlmsgerr)); + errmsg = NLMSG_DATA(rep); + errmsg->error = err; + memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr)); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); +} + + +#ifdef NL_EMULATE_DEV +/* + * Backward compatibility. + */ + +int netlink_attach(int unit, int (*function)(int, struct sk_buff *skb)) +{ + struct sock *sk = netlink_kernel_create(unit, NULL); + if (sk == NULL) + return -ENOBUFS; + sk->protinfo.af_netlink.handler = function; + netlink_kernel[unit] = sk->socket; + return 0; +} + +void netlink_detach(int unit) +{ + struct socket *sock = netlink_kernel[unit]; + netlink_kernel[unit] = NULL; + sock_release(sock); +} + +int netlink_post(int unit, struct sk_buff *skb) +{ + if (netlink_kernel[unit]) { + netlink_broadcast(netlink_kernel[unit]->sk, skb, 0, ~0, GFP_ATOMIC); + return 0; + } + return -EUNATCH;; +} + +EXPORT_SYMBOL(netlink_attach); +EXPORT_SYMBOL(netlink_detach); +EXPORT_SYMBOL(netlink_post); + +#endif + +#if 0 + +/* What a pity... It was good code, but at the moment it + results in unnecessary complications. 
+ */ + +/* + * "High" level netlink interface. (ANK) + * + * Features: + * - standard message format. + * - pseudo-reliable delivery. Messages can be still lost, but + * user level will know that they were lost and can + * recover (f.e. gated could reread FIB and device list) + * - messages are batched. + */ + +/* + * Try to deliver queued messages. + */ + +static void nlmsg_delayed_flush(struct sock *sk) +{ + nlmsg_flush(sk, GFP_ATOMIC); +} + +static void nlmsg_flush(struct sock *sk, int allocation) +{ + struct sk_buff *skb; + unsigned long flags; + + save_flags(flags); + cli(); + while ((skb=skb_dequeue(&sk->write_queue)) != NULL) { + if (skb->users != 1) { + skb_queue_head(&sk->write_queue, skb); + break; + } + restore_flags(flags); + netlink_broadcast(sk, skb, 0, NETLINK_CB(skb).dst_groups, allocation); + cli(); + } + start_bh_atomic(); + restore_flags(flags); + if (skb) { + if (sk->timer.function) + del_timer(&sk->timer) + sk->timer.expires = jiffies + (sk->protinfo.af_netlink.delay ? : HZ/2); + sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush; + sk->timer.data = (unsigned long)sk; + add_timer(&sk->timer); + } + end_bh_atomic(); +} + +/* + * Allocate room for new message. If it is impossible, return NULL. + */ + +void *nlmsg_broadcast(struct sock *sk, struct sk_buff **skbp, + unsigned long type, int len, + unsigned groups, int allocation) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + int rlen; + unsigned long flags; + + rlen = NLMSG_SPACE(len); + + save_flags(flags); + cli(); + skb = sk->write_queue.tail; + if (skb == sk->write_queue.head) + skb = NULL; + if (skb == NULL || skb_tailroom(skb) < rlen || NETLINK_CB(skb).dst_groups != groups) { + restore_flags(flags); + + if (skb) + nlmsg_flush(sk, allocation); + + skb = sock_wmalloc(rlen > NLMSG_GOODSIZE ? 
rlen : NLMSG_GOODSIZE, + sk, 0, allocation); + + if (skb==NULL) { + printk (KERN_WARNING "nlmsg at unit %d overrunned\n", sk->protocol); + return NULL; + } + + NETLINK_CB(skb).dst_groups = groups; + cli(); + skb_queue_tail(&sk->write_queue, skb); + } + atomic_inc(&skb->users); + restore_flags(flags); + + nlh = (struct nlmsghdr*)skb_put(skb, rlen); + nlh->nlmsg_type = type; + nlh->nlmsg_len = NLMSG_LENGTH(len); + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + *skbp = skb; + return nlh->nlmsg_data; +} + +struct sk_buff* nlmsg_alloc(unsigned long type, int len, + unsigned long seq, unsigned long pid, int allocation) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + int rlen; + + rlen = NLMSG_SPACE(len); + + skb = alloc_skb(rlen, allocation); + if (skb==NULL) + return NULL; + + nlh = (struct nlmsghdr*)skb_put(skb, rlen); + nlh->nlmsg_type = type; + nlh->nlmsg_len = NLMSG_LENGTH(len); + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = pid; + return skb; +} + +void nlmsg_release(struct sk_buff *skb) +{ + atomic_dec(skb->users); +} + + +/* + * Kick message queue. + * Two modes: + * - synchronous (delay==0). Messages are delivered immediately. + * - delayed. Do not deliver, but start delivery timer. 
+ */ + +void __nlmsg_transmit(struct sock *sk, int allocation) +{ + start_bh_atomic(); + if (!sk->protinfo.af_netlink.delay) { + if (sk->timer.function) { + del_timer(&sk->timer); + sk->timer.function = NULL; + } + end_bh_atomic(); + nlmsg_flush(sk, allocation); + return; + } + if (!sk->timer.function) { + sk->timer.expires = jiffies + sk->protinfo.af_netlink.delay; + sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush; + sk->timer.data = (unsigned long)sk; + add_timer(&sk->timer); + } + end_bh_atomic(); +} + +#endif + +#ifdef CONFIG_PROC_FS +static int netlink_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + int i; + struct sock *s; + + len+= sprintf(buffer,"sk Eth Pid Groups " + "Rmem Wmem Dump Locks\n"); + + for (i=0; inext) { + len+=sprintf(buffer+len,"%p %-3d %-6d %08x %-8d %-8d %p %d", + s, + s->protocol, + s->protinfo.af_netlink.pid, + s->protinfo.af_netlink.groups, + atomic_read(&s->rmem_alloc), + atomic_read(&s->wmem_alloc), + s->protinfo.af_netlink.cb, + atomic_read(&s->protinfo.af_netlink.locks) + ); + + buffer[len++]='\n'; + + pos=begin+len; + if(posoffset+length) + goto done; + } + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + +struct proto_ops netlink_ops = { + AF_NETLINK, + + sock_no_dup, + netlink_release, + netlink_bind, + netlink_connect, + NULL, + NULL, + netlink_getname, + datagram_poll, + sock_no_ioctl, + sock_no_listen, + sock_no_shutdown, + NULL, + NULL, + sock_no_fcntl, + netlink_sendmsg, + netlink_recvmsg +}; + +struct net_proto_family netlink_family_ops = { + AF_NETLINK, + netlink_create +}; + +void netlink_proto_init(struct net_proto *pro) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; +#endif + struct sk_buff *dummy_skb; + + if (sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "netlink_proto_init: panic\n"); + 
return; + } + sock_register(&netlink_family_ops); +#ifdef CONFIG_PROC_FS + ent = create_proc_entry("net/netlink", 0, 0); + ent->read_proc = netlink_read_proc; +#endif +} diff --git a/net/netlink/netlink_dev.c b/net/netlink/netlink_dev.c new file mode 100644 index 000000000000..cbd48c1c0ed8 --- /dev/null +++ b/net/netlink/netlink_dev.c @@ -0,0 +1,213 @@ +/* + * NETLINK An implementation of a loadable kernel mode driver providing + * multiple kernel/user space bidirectional communications links. + * + * Author: Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Now netlink devices are emulated on the top of netlink sockets + * by compatibility reasons. Remove this file after a period. --ANK + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static unsigned open_map = 0; +static struct socket *netlink_user[MAX_LINKS]; + +/* + * Device operations + */ + +static unsigned int netlink_poll(struct file *file, poll_table * wait) +{ + struct socket *sock = netlink_user[MINOR(file->f_dentry->d_inode->i_rdev)]; + + if (sock->ops->poll==NULL) + return 0; + return sock->ops->poll(sock, wait); +} + +/* + * Write a message to the kernel side of a communication link + */ + +static ssize_t netlink_write(struct file * file, const char * buf, + size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock = netlink_user[MINOR(inode->i_rdev)]; + struct msghdr msg; + struct iovec iov; + + iov.iov_base = (void*)buf; + iov.iov_len = count; + msg.msg_name=NULL; + msg.msg_namelen=0; + msg.msg_controllen=0; + msg.msg_flags=0; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + + return sock_sendmsg(sock, &msg, count); +} + +/* + * Read a message from the 
kernel side of the communication link + */ + +static ssize_t netlink_read(struct file * file, char * buf, + size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock = netlink_user[MINOR(inode->i_rdev)]; + struct msghdr msg; + struct iovec iov; + + iov.iov_base = buf; + iov.iov_len = count; + msg.msg_name=NULL; + msg.msg_namelen=0; + msg.msg_controllen=0; + msg.msg_flags=0; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + if (file->f_flags&O_NONBLOCK) + msg.msg_flags=MSG_DONTWAIT; + + return sock_recvmsg(sock, &msg, count, msg.msg_flags); +} + +static loff_t netlink_lseek(struct file * file, loff_t offset, int origin) +{ + return -ESPIPE; +} + +static int netlink_open(struct inode * inode, struct file * file) +{ + unsigned int minor = MINOR(inode->i_rdev); + struct socket *sock; + struct sockaddr_nl nladdr; + int err; + + if (minor>=MAX_LINKS) + return -ENODEV; + if (open_map&(1<type = SOCK_RAW; + + if ((err = net_families[AF_NETLINK]->create(sock, minor)) < 0) + { + sock_release(sock); + goto out; + } + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_groups = ~0; + if ((err = sock->ops->bind(sock, (struct sockaddr*)&nladdr, sizeof(nladdr))) < 0) { + sock_release(sock); + goto out; + } + + netlink_user[minor] = sock; + return 0; + +out: + open_map &= ~(1<i_rdev); + struct socket *sock = netlink_user[minor]; + + netlink_user[minor] = NULL; + open_map &= ~(1<i_rdev); + int retval = 0; + + if (minor >= MAX_LINKS) + return -ENODEV; + switch ( cmd ) { + default: + retval = -EINVAL; + } + return retval; +} + + +static struct file_operations netlink_fops = { + netlink_lseek, + netlink_read, + netlink_write, + NULL, /* netlink_readdir */ + netlink_poll, + netlink_ioctl, + NULL, /* netlink_mmap */ + netlink_open, + netlink_release +}; + +__initfunc(int init_netlink(void)) +{ + if (register_chrdev(NETLINK_MAJOR,"netlink", &netlink_fops)) { + printk(KERN_ERR "netlink: unable to get major %d\n", 
NETLINK_MAJOR); + return -EIO; + } + return 0; +} + +#ifdef MODULE + +int init_module(void) +{ + printk(KERN_INFO "Network Kernel/User communications module 0.04\n"); + return init_netlink(); +} + +void cleanup_module(void) +{ + unregister_chrdev(NET_MAJOR,"netlink"); +} + +#endif diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index f7b617dccb14..380ec8eccffa 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -249,14 +249,6 @@ int nr_init(struct device *dev) /* New-style flags. */ dev->flags = 0; - dev->family = AF_INET; - -#ifdef CONFIG_INET - dev->pa_addr = in_aton("192.168.0.1"); - dev->pa_brdaddr = in_aton("192.168.0.255"); - dev->pa_mask = in_aton("255.255.255.0"); - dev->pa_alen = 4; -#endif if ((dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL)) == NULL) return -ENOMEM; diff --git a/net/netsyms.c b/net/netsyms.c index 089d6ebc1b6c..1852be345a0f 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -19,6 +19,7 @@ #ifdef CONFIG_INET #include #include +#include #include #include #include @@ -28,8 +29,8 @@ #include #include #include +#include #include -#include #include extern struct net_proto_family inet_family_ops; @@ -43,13 +44,7 @@ extern struct net_proto_family inet_family_ops; #endif -#ifdef CONFIG_NETLINK -#include -#endif - -#ifdef CONFIG_NET_ALIAS -#include -#endif +#include #include @@ -121,6 +116,7 @@ EXPORT_SYMBOL(skb_copy_datagram_iovec); EXPORT_SYMBOL(skb_realloc_headroom); EXPORT_SYMBOL(datagram_poll); EXPORT_SYMBOL(put_cmsg); +EXPORT_SYMBOL(net_families); EXPORT_SYMBOL(neigh_table_init); /* Declared in but not defined? 
@@ -144,6 +140,12 @@ EXPORT_SYMBOL(dst_total); EXPORT_SYMBOL(__scm_destroy); EXPORT_SYMBOL(__scm_send); +/* Needed by unix.o */ +EXPORT_SYMBOL(scm_fp_dup); +EXPORT_SYMBOL(max_files); +EXPORT_SYMBOL(do_mknod); +EXPORT_SYMBOL(memcpy_toiovec); + #ifdef CONFIG_IPX_MODULE EXPORT_SYMBOL(make_8023_client); EXPORT_SYMBOL(destroy_8023_client); @@ -153,6 +155,9 @@ EXPORT_SYMBOL(destroy_EII_client); #ifdef CONFIG_ATALK_MODULE EXPORT_SYMBOL(sklist_destroy_socket); +#endif + +#if defined(CONFIG_ATALK_MODULE) || defined(CONFIG_PACKET_MODULE) EXPORT_SYMBOL(sklist_insert_socket); #endif @@ -169,15 +174,14 @@ EXPORT_SYMBOL(init_etherdev); EXPORT_SYMBOL(ip_route_output); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_rt_put); EXPORT_SYMBOL(arp_send); EXPORT_SYMBOL(ip_id_count); EXPORT_SYMBOL(ip_send_check); EXPORT_SYMBOL(ip_fragment); -EXPORT_SYMBOL(ip_dev_find_tunnel); EXPORT_SYMBOL(inet_family_ops); EXPORT_SYMBOL(in_aton); EXPORT_SYMBOL(in_ntoa); +EXPORT_SYMBOL(net_ratelimit); #ifdef CONFIG_IPV6_MODULE /* inet functions common to v4 and v6 */ @@ -206,7 +210,6 @@ EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(destroy_sock); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(csum_partial); -EXPORT_SYMBOL(dev_lockct); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(__release_sock); @@ -231,7 +234,6 @@ EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_send_synack); EXPORT_SYMBOL(tcp_check_req); -EXPORT_SYMBOL(sock_wmalloc); EXPORT_SYMBOL(tcp_reset_xmit_timer); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); @@ -249,13 +251,35 @@ EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_v4_do_rcv); EXPORT_SYMBOL(tcp_v4_connect); -EXPORT_SYMBOL(__ip_chk_addr); +EXPORT_SYMBOL(inet_addr_type); EXPORT_SYMBOL(net_reset_timer); EXPORT_SYMBOL(net_delete_timer); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_openreq_cachep); 
EXPORT_SYMBOL(ipv4_specific); +EXPORT_SYMBOL(tcp_simple_retransmit); + +EXPORT_SYMBOL(xrlim_allow); +#endif + +#ifdef CONFIG_PACKET_MODULE +EXPORT_SYMBOL(memcpy_toiovec); +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_mc_delete); +EXPORT_SYMBOL(sklist_remove_socket); +EXPORT_SYMBOL(rtnl_wait); +EXPORT_SYMBOL(rtnl_rlockct); +#ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(rtnl_wlockct); +#endif +#endif + +#if defined(CONFIG_IPV6_MODULE) || defined(CONFIG_PACKET_MODULE) +EXPORT_SYMBOL(dev_lockct); +EXPORT_SYMBOL(sock_wmalloc); #endif #if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ @@ -282,15 +306,9 @@ EXPORT_SYMBOL(tr_freedev); EXPORT_SYMBOL(tr_reformat); #endif -#ifdef CONFIG_NET_ALIAS -#include -#endif - /* Used by at least ipip.c. */ EXPORT_SYMBOL(ipv4_config); -#ifdef CONFIG_IP_MROUTE -EXPORT_SYMBOL(ip_mr_find_tunnel); -#endif +EXPORT_SYMBOL(dev_open); #endif /* CONFIG_INET */ @@ -298,19 +316,19 @@ EXPORT_SYMBOL(ip_mr_find_tunnel); EXPORT_SYMBOL(register_netdevice_notifier); EXPORT_SYMBOL(unregister_netdevice_notifier); -#ifdef CONFIG_NET_ALIAS -EXPORT_SYMBOL(register_net_alias_type); -EXPORT_SYMBOL(unregister_net_alias_type); -#endif - /* support for loadable net drivers */ #ifdef CONFIG_NET +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(unregister_netdevice); EXPORT_SYMBOL(register_netdev); EXPORT_SYMBOL(unregister_netdev); EXPORT_SYMBOL(ether_setup); EXPORT_SYMBOL(dev_new_index); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(eth_type_trans); +#ifdef CONFIG_FDDI +EXPORT_SYMBOL(fddi_type_trans); +#endif /* CONFIG_FDDI */ EXPORT_SYMBOL(eth_copy_and_sum); EXPORT_SYMBOL(alloc_skb); EXPORT_SYMBOL(__kfree_skb); @@ -318,7 +336,6 @@ EXPORT_SYMBOL(skb_clone); EXPORT_SYMBOL(skb_copy); EXPORT_SYMBOL(dev_alloc_skb); EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(dev_tint); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_remove_pack); EXPORT_SYMBOL(dev_get); @@ -340,6 +357,9 @@ EXPORT_SYMBOL(kill_fasync); 
EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_unlock); + EXPORT_SYMBOL(if_port_text); #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) @@ -352,11 +372,13 @@ extern int (*dlci_ioctl_hook)(unsigned int, void *); EXPORT_SYMBOL(dlci_ioctl_hook); #endif -#endif /* CONFIG_NET */ +/* Packet scheduler modules want these. */ +EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_reset); +EXPORT_SYMBOL(qdisc_restart); +EXPORT_SYMBOL(qdisc_head); +EXPORT_SYMBOL(register_qdisc); +EXPORT_SYMBOL(unregister_qdisc); +EXPORT_SYMBOL(noop_qdisc); -#ifdef CONFIG_NETLINK -EXPORT_SYMBOL(netlink_attach); -EXPORT_SYMBOL(netlink_detach); -EXPORT_SYMBOL(netlink_donothing); -EXPORT_SYMBOL(netlink_post); -#endif /* CONFIG_NETLINK */ +#endif /* CONFIG_NET */ diff --git a/net/packet/Makefile b/net/packet/Makefile new file mode 100644 index 000000000000..cce16721d093 --- /dev/null +++ b/net/packet/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for the packet AF. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := packet.o +MOD_LIST_NAME := NET_MISC_MODULES + +O_OBJS := +M_OBJS := + +ifeq ($(CONFIG_PACKET),y) + O_OBJS += af_packet.o +else + ifeq ($(CONFIG_PACKET), m) + M_OBJS += af_packet.o + endif +endif + +include $(TOPDIR)/Rules.make diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c new file mode 100644 index 000000000000..ff7fef1310e6 --- /dev/null +++ b/net/packet/af_packet.c @@ -0,0 +1,1251 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PACKET - implements raw packet sockets. 
+ * + * Doesn't belong in IP but it's currently too hooked into ip + * to separate. + * + * Version: @(#)packet.c 1.0.6 05/25/93 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Alan Cox, + * + * Fixes: + * Alan Cox : verify_area() now used correctly + * Alan Cox : new skbuff lists, look ma no backlogs! + * Alan Cox : tidied skbuff lists. + * Alan Cox : Now uses generic datagram routines I + * added. Also fixed the peek/read crash + * from all old Linux datagram code. + * Alan Cox : Uses the improved datagram code. + * Alan Cox : Added NULL's for socket options. + * Alan Cox : Re-commented the code. + * Alan Cox : Use new kernel side addressing + * Rob Janssen : Correct MTU usage. + * Dave Platt : Counter leaks caused by incorrect + * interrupt locking and some slightly + * dubious gcc output. Can you read + * compiler: it said _VOLATILE_ + * Richard Kooijman : Timestamp fixes. + * Alan Cox : New buffers. Use sk->mac.raw. + * Alan Cox : sendmsg/recvmsg support. + * Alan Cox : Protocol setting support + * Alexey Kuznetsov : Untied from IPv4 stack. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE) +#include +#endif + +/* + Old SOCK_PACKET. Do exist programs, which use it? + (not counting tcpdump) - lots of them yes - AC. + + */ +#define CONFIG_SOCK_PACKET 1 + +/* + Proposed replacement for SIOC{ADD,DEL}MULTI and + IFF_PROMISC, IFF_ALLMULTI flags. + + It is more expensive, but I believe, + it is really correct solution: reentereble, safe and fault tolerant. 
+ + Differences: + - Changing IFF_ALLMULTI from user level is disabled. + It could only confused multicast routing daemons, not more. + - IFF_PROMISC is faked by keeping reference count and + global flag, so that real IFF_PROMISC == (gflag|(count != 0)) + I'd remove it too, but it would require recompilation tcpdump + and another applications, using promiscuous mode. + - SIOC{ADD/DEL}MULTI are moved to deprecated state, + they work, but complain. I do know who uses them. + + +*************FIXME*************** + Alexey : This doesnt cook Im afraid. We need the low level SIOCADD/DELMULTI + and also IFF_ALLMULTI for DECNET, Appletalk and other stuff as well as + BSD compatibility issues. + + */ +#define CONFIG_PACKET_MULTICAST 1 + +/* + Assumptions: + - if device has no dev->hard_header routine, it adds and removes ll header + inside itself. In this case ll header is invisible outside of device, + but higher levels still should reserve dev->hard_header_len. + Some devices are enough clever to reallocate skb, when header + will not fit to reserved space (tunnel), another ones are silly + (PPP). + - packet socket receives packets with pulled ll header, + so that SOCK_RAW should push it back. + +On receive: +----------- + +Incoming, dev->hard_header!=NULL + mac.raw -> ll header + data -> data + +Outgoing, dev->hard_header!=NULL + mac.raw -> ll header + data -> ll header + +Incoming, dev->hard_header==NULL + mac.raw -> UNKNOWN position. It is very likely, that it points to ll header. + PPP makes it, that is wrong, because introduce assymetry + between rx and tx paths. + data -> data + +Outgoing, dev->hard_header==NULL + mac.raw -> data. ll header is still not built! + data -> data + +Resume + If dev->hard_header==NULL we are unlikely to restore sensible ll header. 
+ + +On transmit: +------------ + +dev->hard_header != NULL + mac.raw -> ll header + data -> ll header + +dev->hard_header == NULL (ll header is added by device, we cannot control it) + mac.raw -> data + data -> data + + We should set nh.raw on output to correct posistion, + packet classifier depends on it. + */ + +/* List of all packet sockets. */ +struct sock * packet_sklist = NULL; + +/* Private packet socket structures. */ + +#ifdef CONFIG_PACKET_MULTICAST +struct packet_mclist +{ + struct packet_mclist *next; + int ifindex; + int count; + unsigned short type; + unsigned short alen; + unsigned char addr[8]; +}; +#endif + +static void packet_flush_mclist(struct sock *sk); + +struct packet_opt +{ + struct packet_type prot_hook; + char running; /* prot_hook is attached*/ + int ifindex; /* bound device */ +#ifdef CONFIG_PACKET_MULTICAST + struct packet_mclist *mclist; +#endif +}; + +extern struct proto_ops packet_ops; + +#ifdef CONFIG_SOCK_PACKET +extern struct proto_ops packet_ops_spkt; + +static int packet_rcv_spkt(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct sock *sk; + struct sockaddr_pkt *spkt = (struct sockaddr_pkt*)skb->cb; + + /* + * When we registered the protocol we saved the socket in the data + * field for just this event. + */ + + sk = (struct sock *) pt->data; + + /* + * Yank back the headers [hope the device set this + * right or kerboom...] + * + * Incoming packets have ll header pulled, + * push it back. + * + * For outgoing ones skb->data == skb->mac.raw + * so that this procedure is noop. + */ + + skb_push(skb, skb->data-skb->mac.raw); + + /* + * The SOCK_PACKET socket receives _all_ frames. + */ + + spkt->spkt_family = dev->type; + strncpy(spkt->spkt_device, dev->name, 15); + spkt->spkt_protocol = skb->protocol; + + /* + * Charge the memory to the socket. This is done specifically + * to prevent sockets using all the memory up. 
+ */ + + if (sock_queue_rcv_skb(sk,skb)<0) + { + kfree_skb(skb, FREE_READ); + return 0; + } + + /* + * Processing complete. + */ + return(0); +} + + +/* + * Output a raw packet to a device layer. This bypasses all the other + * protocol layers and you must therefore supply it with a complete frame + */ + +static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct device *dev; + struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; + unsigned short proto=0; + int err; + + /* + * Check the flags. + */ + + if (msg->msg_flags&~MSG_DONTWAIT) + return(-EINVAL); + + /* + * Get and verify the address. + */ + + if (saddr) + { + if (msg->msg_namelen < sizeof(struct sockaddr)) + return(-EINVAL); + if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) + proto=saddr->spkt_protocol; + } + else + return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ + + /* + * Find the device first to size check it + */ + + saddr->spkt_device[13] = 0; + dev = dev_get(saddr->spkt_device); + if (dev == NULL) + { + return(-ENODEV); + } + + /* + * You may not queue a frame bigger than the mtu. This is the lowest level + * raw protocol and you must do your own fragmentation at this level. + */ + + if(len>dev->mtu+dev->hard_header_len) + return -EMSGSIZE; + + dev_lock_list(); + skb = sock_wmalloc(sk, len+dev->hard_header_len+15, 0, GFP_KERNEL); + + /* + * If the write buffer is full, then tough. At this level the user gets to + * deal with the problem - do your own algorithmic backoffs. That's far + * more flexible. + */ + + if (skb == NULL) + { + dev_unlock_list(); + return(-ENOBUFS); + } + + /* + * Fill it in + */ + + /* FIXME: Save some space for broken drivers that write a + * hard header at transmission time by themselves. PPP is the + * notable one here. This should really be fixed at the driver level. 
+ */ + skb_reserve(skb,(dev->hard_header_len+15)&~15); + skb->mac.raw = skb->nh.raw = skb->data; + + /* Try to align data part correctly */ + if (dev->hard_header) { + skb->data -= dev->hard_header_len; + skb->tail -= dev->hard_header_len; + skb->mac.raw = skb->data; + } + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->arp = 1; /* No ARP needs doing on this (complete) frame */ + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->priority; + dev_unlock_list(); + + /* + * Now send it + */ + + if (err) + { + err = -EFAULT; + } + else + { + if (!(dev->flags & IFF_UP)) + { + err = -ENETDOWN; + } + } + + if (err) + { + kfree_skb(skb, FREE_WRITE); + return err; + } + + dev_queue_xmit(skb); + return(len); +} +#endif + +static int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct sock *sk; + struct sockaddr_ll *sll = (struct sockaddr_ll*)skb->cb; + + /* + * When we registered the protocol we saved the socket in the data + * field for just this event. + */ + + sk = (struct sock *) pt->data; + + /* + * The SOCK_PACKET socket receives _all_ frames. + */ + + skb->dev = dev; + + sll->sll_family = AF_PACKET; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + sll->sll_ifindex = dev->ifindex; + sll->sll_halen = 0; + + if (dev->hard_header_parse) + sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); + + if (dev->hard_header) { + /* The device has an explicit notion of ll header, + exported to higher levels. + + Otherwise, the device hides datails of it frame + structure, so that corresponding packet head + never delivered to user. + */ + if (sk->type != SOCK_DGRAM) + skb_push(skb, skb->data - skb->mac.raw); + else if (skb->pkt_type == PACKET_OUTGOING) { + /* Special case: outgoing packets have ll header at head */ + skb_pull(skb, skb->nh.raw - skb->data); + } + } + + /* + * Charge the memory to the socket. 
This is done specifically + * to prevent sockets using all the memory up. + */ + + if (sock_queue_rcv_skb(sk,skb)<0) + { + kfree_skb(skb, FREE_READ); + return 0; + } + return(0); +} + +static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct device *dev; + struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; + unsigned short proto; + int ifindex; + int err; + int reserve = 0; + + /* + * Check the flags. + */ + + if (msg->msg_flags&~MSG_DONTWAIT) + return(-EINVAL); + + /* + * Get and verify the address. + */ + + if (saddr == NULL) { + ifindex = sk->protinfo.af_packet->ifindex; + proto = sk->num; + } else { + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) + return -EINVAL; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + } + + dev = dev_get_by_index(ifindex); + if (dev == NULL) + return -ENXIO; + if (sock->type == SOCK_RAW) + reserve = dev->hard_header_len; + + if (len > dev->mtu+reserve) + return -EMSGSIZE; + + dev_lock_list(); + + skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, msg->msg_flags&MSG_DONTWAIT, &err); + + if (skb==NULL) { + dev_unlock_list(); + return err; + } + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->mac.raw = skb->nh.raw = skb->data; + + if (dev->hard_header) { + if (dev->hard_header(skb, dev, ntohs(proto), + saddr ? 
saddr->sll_addr : NULL, + NULL, len) < 0 + && sock->type == SOCK_DGRAM) { + kfree_skb(skb, FREE_WRITE); + dev_unlock_list(); + return -EINVAL; + } + skb->mac.raw = skb->data; + if (sock->type != SOCK_DGRAM) { + skb->tail = skb->data; + skb->len = 0; + } + } + + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->arp = 1; /* No ARP needs doing on this (complete) frame */ + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->priority; + dev_unlock_list(); + + /* + * Now send it + */ + + if (err) { + err = -EFAULT; + } else { + if (!(dev->flags & IFF_UP)) + err = -ENETDOWN; + } + + if (err) { + kfree_skb(skb, FREE_WRITE); + return err; + } + + dev_queue_xmit(skb); + return(len); +} + +static void packet_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!atomic_read(&sk->wmem_alloc) && !atomic_read(&sk->rmem_alloc)) { + sk_free(sk); + MOD_DEC_USE_COUNT; + return; + } + + sk->timer.expires=jiffies+10*HZ; + add_timer(&sk->timer); + printk(KERN_DEBUG "packet sk destroy delayed\n"); +} + +/* + * Close a PACKET socket. This is fairly simple. We immediately go + * to 'closed' state and remove our protocol entry in the device list. + */ + +static int packet_release(struct socket *sock, struct socket *peersock) +{ + struct sk_buff *skb; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + sklist_remove_socket(&packet_sklist, sk); + + /* + * Unhook packet receive handler. + */ + + if (sk->protinfo.af_packet->running) + { + /* + * Remove the protocol hook + */ + + dev_remove_pack(&sk->protinfo.af_packet->prot_hook); + sk->protinfo.af_packet->running = 0; + } + +#ifdef CONFIG_PACKET_MULTICAST + packet_flush_mclist(sk); +#endif + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->state_change(sk); /* It is useless. Just for sanity. 
*/ + + sock->sk = NULL; + sk->socket = NULL; + sk->dead = 1; + + /* Purge queues */ + + while ((skb=skb_dequeue(&sk->receive_queue))!=NULL) + kfree_skb(skb,FREE_READ); + + if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) { + sk->timer.data=(unsigned long)sk; + sk->timer.expires=jiffies+HZ; + sk->timer.function=packet_destroy_timer; + add_timer(&sk->timer); + return 0; + } + + sk_free(sk); + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * Attach a packet hook. + */ + +static int packet_do_bind(struct sock *sk, struct device *dev, int protocol) +{ + /* + * Detach an existing hook if present. + */ + + if (sk->protinfo.af_packet->running) { + dev_remove_pack(&sk->protinfo.af_packet->prot_hook); + sk->protinfo.af_packet->running = 0; + } + + sk->num = protocol; + sk->protinfo.af_packet->prot_hook.type = protocol; + sk->protinfo.af_packet->prot_hook.dev = dev; + + if (protocol == 0) + return 0; + + if (dev) { + sk->protinfo.af_packet->ifindex = dev->ifindex; + if (dev->flags&IFF_UP) { + dev_add_pack(&sk->protinfo.af_packet->prot_hook); + sk->protinfo.af_packet->running = 1; + } else { + sk->err = ENETDOWN; + sk->error_report(sk); + } + } else { + sk->protinfo.af_packet->ifindex = 0; + dev_add_pack(&sk->protinfo.af_packet->prot_hook); + sk->protinfo.af_packet->running = 1; + } + return 0; +} + +/* + * Bind a packet socket to a device + */ + +#ifdef CONFIG_SOCK_PACKET + +static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk=sock->sk; + char name[15]; + struct device *dev; + + /* + * Check legality + */ + + if(addr_len!=sizeof(struct sockaddr)) + return -EINVAL; + strncpy(name,uaddr->sa_data,14); + name[14]=0; + + dev = dev_get(name); + if (dev) + return packet_do_bind(sk, dev, sk->num); + return -ENODEV; +} +#endif + +static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; + struct sock *sk=sock->sk; + struct device *dev = 
NULL; + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ll)) + return -EINVAL; + if (sll->sll_family != AF_PACKET) + return -EINVAL; + + if (sll->sll_ifindex) { + dev = dev_get_by_index(sll->sll_ifindex); + if (dev == NULL) + return -ENODEV; + } + return packet_do_bind(sk, dev, sll->sll_protocol ? : sk->num); +} + + +/* + * Create a packet of type SOCK_PACKET. + */ + +static int packet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + if (!suser()) + return -EPERM; + if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW +#ifdef CONFIG_SOCK_PACKET + && sock->type != SOCK_PACKET +#endif + ) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + MOD_INC_USE_COUNT; + sk = sk_alloc(AF_PACKET, GFP_KERNEL); + if (sk == NULL) { + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + + sk->reuse = 1; +#ifdef CONFIG_SOCK_PACKET + if (sock->type == SOCK_PACKET) + sock->ops = &packet_ops_spkt; + else +#endif + sock->ops = &packet_ops; + sock_init_data(sock,sk); + + sk->protinfo.af_packet = kmalloc(sizeof(struct packet_opt), GFP_KERNEL); + if (sk->protinfo.af_packet == NULL) { + sk_free(sk); + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(sk->protinfo.af_packet, 0, sizeof(struct packet_opt)); + sk->zapped=0; + sk->family = AF_PACKET; + sk->num = protocol; + + /* + * Attach a protocol block + */ + +#ifdef CONFIG_SOCK_PACKET + if (sock->type == SOCK_PACKET) + sk->protinfo.af_packet->prot_hook.func = packet_rcv_spkt; + else +#endif + sk->protinfo.af_packet->prot_hook.func = packet_rcv; + + sk->protinfo.af_packet->prot_hook.data = (void *)sk; + + if (protocol) { + sk->protinfo.af_packet->prot_hook.type = protocol; + dev_add_pack(&sk->protinfo.af_packet->prot_hook); + sk->protinfo.af_packet->running = 1; + } + + sklist_insert_socket(&packet_sklist, sk); + return(0); +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +/* + * NOTE about lock_* & release_* primitives. 
+ * I do not understand why skb_recv_datagram locks socket. + * My analysis shows that it is useless for datagram services: + * i.e. here, udp, raw and netlink. FIX ME if I am wrong, + * but lock&release are necessary only for SOCK_STREAM + * and, maybe, SOCK_SEQPACKET. + * --ANK + */ + +static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + int copied=0; + struct sk_buff *skb; + int err; + +#if 0 + /* What error should we return now? EUNATTACH? */ + if (sk->protinfo.af_packet->ifindex < 0) + return -ENODEV; +#endif + + /* + * If the address length field is there to be filled in, we fill + * it in now. + */ + + if (sock->type == SOCK_PACKET) + msg->msg_namelen = sizeof(struct sockaddr_pkt); + else + msg->msg_namelen = sizeof(struct sockaddr_ll); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + return err; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if(copied>len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + { + return -EFAULT; + } + + sk->stamp=skb->stamp; + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this hides all the + * races and re-entrancy issues from us. 
+ */ + + skb_free_datagram(sk, skb); + + return(copied); +} + +#ifdef CONFIG_SOCK_PACKET +static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct device *dev; + struct sock *sk = sock->sk; + + if (peer) + return -EOPNOTSUPP; + + uaddr->sa_family = AF_PACKET; + dev = dev_get_by_index(sk->protinfo.af_packet->ifindex); + if (dev) + strncpy(uaddr->sa_data, dev->name, 15); + else + memset(uaddr->sa_data, 0, 14); + *uaddr_len = sizeof(*uaddr); + + return 0; +} +#endif + +static int packet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct device *dev; + struct sock *sk = sock->sk; + struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; + + if (peer) + return -EOPNOTSUPP; + + sll->sll_family = AF_PACKET; + sll->sll_ifindex = sk->protinfo.af_packet->ifindex; + sll->sll_protocol = sk->num; + dev = dev_get_by_index(sk->protinfo.af_packet->ifindex); + if (dev) { + sll->sll_hatype = dev->type; + sll->sll_halen = dev->addr_len; + memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + } else { + sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ + sll->sll_halen = 0; + } + *uaddr_len = sizeof(*sll); + + return 0; +} + +#ifdef CONFIG_PACKET_MULTICAST +static void packet_dev_mc(struct device *dev, struct packet_mclist *i, int what) +{ + switch (i->type) { + case PACKET_MR_MULTICAST: + if (what > 0) + dev_mc_add(dev, i->addr, i->alen, 0); + else + dev_mc_delete(dev, i->addr, i->alen, 0); + break; + case PACKET_MR_PROMISC: + dev_set_promiscuity(dev, what); + break; + case PACKET_MR_ALLMULTI: + dev_set_allmulti(dev, what); + break; + default: + } +} + +static void packet_dev_mclist(struct device *dev, struct packet_mclist *i, int what) +{ + for ( ; i; i=i->next) { + if (i->ifindex == dev->ifindex) + packet_dev_mc(dev, i, what); + } +} + +static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq) +{ + int err; + struct packet_mclist *ml, *i; + struct device *dev; + + 
rtnl_shlock(); + + dev = dev_get_by_index(mreq->mr_ifindex); + + i = NULL; + err = -ENODEV; + if (!dev) + goto done; + err = -EINVAL; + if (mreq->mr_alen > dev->addr_len) + goto done; + + i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL); + + for (ml=sk->protinfo.af_packet->mclist; ml; ml=ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + ml->count++; + err = 0; + goto done; + } + } + err = -ENOBUFS; + if (i == NULL) + goto done; + i->type = mreq->mr_type; + i->ifindex = mreq->mr_ifindex; + i->alen = mreq->mr_alen; + memcpy(i->addr, mreq->mr_address, i->alen); + i->count = 1; + i->next = sk->protinfo.af_packet->mclist; + sk->protinfo.af_packet->mclist = i; + packet_dev_mc(dev, i, +1); + i = NULL; + err = 0; + +done: + rtnl_shunlock(); + if (i) + kfree(i); + return err; +} + +static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq) +{ + struct packet_mclist *ml, **mlp; + + for (mlp=&sk->protinfo.af_packet->mclist; (ml=*mlp)!=NULL; mlp=&ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + if (--ml->count == 0) { + struct device *dev; + *mlp = ml->next; + dev = dev_get_by_index(ml->ifindex); + if (dev) + packet_dev_mc(dev, ml, -1); + kfree_s(ml, sizeof(*ml)); + } + return 0; + } + } + return -EADDRNOTAVAIL; +} + +static void packet_flush_mclist(struct sock *sk) +{ + struct packet_mclist *ml; + + while ((ml=sk->protinfo.af_packet->mclist) != NULL) { + struct device *dev; + sk->protinfo.af_packet->mclist = ml->next; + if ((dev = dev_get_by_index(ml->ifindex)) != NULL) + packet_dev_mc(dev, ml, -1); + kfree_s(ml, sizeof(*ml)); + } +} + +static int +packet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct packet_mreq mreq; + + if (level != 
SOL_PACKET) + return -ENOPROTOOPT; + + switch(optname) { + case PACKET_ADD_MEMBERSHIP: + case PACKET_DROP_MEMBERSHIP: + + if (optlennext) { + po = sk->protinfo.af_packet; + + switch (msg) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + if (dev->ifindex == po->ifindex) { + if (po->running) { + dev_remove_pack(&po->prot_hook); + po->running = 0; + sk->err = ENETDOWN; + sk->error_report(sk); + } + if (msg == NETDEV_UNREGISTER) { + po->ifindex = -1; + po->prot_hook.dev = NULL; + } + } +#ifdef CONFIG_PACKET_MULTICAST + if (po->mclist) + packet_dev_mclist(dev, po->mclist, -1); +#endif + break; + case NETDEV_UP: + if (dev->ifindex == po->ifindex && sk->num && po->running==0) { + dev_add_pack(&po->prot_hook); + po->running = 1; + } +#ifdef CONFIG_PACKET_MULTICAST + if (po->mclist) + packet_dev_mclist(dev, po->mclist, +1); +#endif + break; + } + } + return NOTIFY_DONE; +} + + +static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + int pid; + + switch(cmd) + { + case FIOSETOWN: + case SIOCSPGRP: + err = get_user(pid, (int *) arg); + if (err) + return err; + if (current->pid != pid && current->pgrp != -pid && !suser()) + return -EPERM; + sk->proc = pid; + return(0); + case FIOGETOWN: + case SIOCGPGRP: + return put_user(sk->proc, (int *)arg); + return(0); + case SIOCGSTAMP: + if(sk->stamp.tv_sec==0) + return -ENOENT; + err = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)); + if (err) + err = -EFAULT; + return err; + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCGIFCONF: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMEM: + case SIOCSIFMEM: + case SIOCGIFMTU: + case SIOCSIFMTU: + case SIOCSIFLINK: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCSIFMAP: + case SIOCGIFMAP: + case SIOCSIFSLAVE: + case SIOCGIFSLAVE: + case SIOCGIFINDEX: + case SIOCGIFNAME: + case SIOCGIFCOUNT: + case SIOCSIFHWBROADCAST: + return(dev_ioctl(cmd,(void *) arg)); + + case SIOCGIFBR: + case 
SIOCSIFBR: +#ifdef CONFIG_BRIDGE + return(br_ioctl(cmd,(void *) arg)); +#else + return -ENOPKG; +#endif + + case SIOCADDDLCI: + case SIOCDELDLCI: +#ifdef CONFIG_DLCI + return(dlci_ioctl(cmd, (void *) arg)); +#endif + +#ifdef CONFIG_DLCI_MODULE + +#ifdef CONFIG_KERNELD + if (dlci_ioctl_hook == NULL) + request_module("dlci"); +#endif + + if (dlci_ioctl_hook) + return((*dlci_ioctl_hook)(cmd, (void *) arg)); +#endif + return -ENOPKG; + + default: + if ((cmd >= SIOCDEVPRIVATE) && + (cmd <= (SIOCDEVPRIVATE + 15))) + return(dev_ioctl(cmd,(void *) arg)); + +#ifdef CONFIG_NET_RADIO + if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) + return(dev_ioctl(cmd,(void *) arg)); +#endif + return -EOPNOTSUPP; + } + /*NOTREACHED*/ + return(0); +} + +#ifdef CONFIG_SOCK_PACKET +struct proto_ops packet_ops_spkt = { + AF_PACKET, + + sock_no_dup, + packet_release, + packet_bind_spkt, + sock_no_connect, + NULL, + NULL, + packet_getname_spkt, + datagram_poll, + packet_ioctl, + sock_no_listen, + sock_no_shutdown, + sock_no_setsockopt, + sock_no_getsockopt, + sock_no_fcntl, + packet_sendmsg_spkt, + packet_recvmsg +}; +#endif + +struct proto_ops packet_ops = { + AF_PACKET, + + sock_no_dup, + packet_release, + packet_bind, + sock_no_connect, + NULL, + NULL, + packet_getname, + datagram_poll, + packet_ioctl, + sock_no_listen, + sock_no_shutdown, +#ifdef CONFIG_PACKET_MULTICAST + packet_setsockopt, +#else + sock_no_setsockopt, +#endif + sock_no_getsockopt, + sock_no_fcntl, + packet_sendmsg, + packet_recvmsg +}; + +static struct net_proto_family packet_family_ops = { + AF_PACKET, + packet_create +}; + +struct notifier_block packet_netdev_notifier={ + packet_notifier, + NULL, + 0 +}; + + +#ifdef MODULE +void cleanup_module(void) +{ + unregister_netdevice_notifier(&packet_netdev_notifier); + sock_unregister(packet_family_ops.family); + return; +} + + +int init_module(void) +#else +__initfunc(void packet_proto_init(struct net_proto *pro)) +#endif +{ + sock_register(&packet_family_ops); + 
register_netdevice_notifier(&packet_netdev_notifier); +#ifdef MODULE + return 0; +#endif +} diff --git a/net/protocols.c b/net/protocols.c index a0bb0a6b8259..cba2a3ac42a0 100644 --- a/net/protocols.c +++ b/net/protocols.c @@ -10,8 +10,6 @@ #include #include -#define CONFIG_UNIX /* always present... */ - #ifdef CONFIG_UNIX #include #include @@ -24,6 +22,14 @@ extern void inet6_proto_init(struct net_proto *pro); #endif #endif /* INET */ +#ifdef CONFIG_NETLINK +extern void netlink_proto_init(struct net_proto *pro); +#endif + +#ifdef CONFIG_PACKET +extern void packet_proto_init(struct net_proto *pro); +#endif + #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) #define NEED_802 #include @@ -61,6 +67,10 @@ extern void inet6_proto_init(struct net_proto *pro); #include #endif +#if defined(CONFIG_LLC) +#define NEED_LLC +#endif + #include #ifdef CONFIG_TR @@ -84,6 +94,14 @@ extern void rif_init(struct net_proto *); */ struct net_proto protocols[] = { +#ifdef CONFIG_NETLINK + { "NETLINK", netlink_proto_init }, +#endif + +#ifdef CONFIG_PACKET + { "PACKET", packet_proto_init }, +#endif + #ifdef CONFIG_UNIX { "UNIX", unix_proto_init }, /* Unix domain socket family */ #endif diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 69b77a9f2478..5ae64334dab3 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -548,6 +548,8 @@ static int rose_create(struct socket *sock, int protocol) sock_init_data(sock, sk); + skb_queue_head_init(&rose->ack_queue); + sock->ops = &rose_proto_ops; sk->protocol = protocol; sk->mtu = ROSE_MTU; /* 253 */ @@ -555,8 +557,6 @@ static int rose_create(struct socket *sock, int protocol) init_timer(&rose->timer); init_timer(&rose->idletimer); - skb_queue_head_init(&rose->frag_queue); - rose->t1 = sysctl_rose_call_request_timeout; rose->t2 = sysctl_rose_reset_request_timeout; rose->t3 = sysctl_rose_clear_request_timeout; @@ -583,6 +583,8 @@ static struct sock *rose_make_new(struct sock *osk) sock_init_data(NULL, sk); + 
skb_queue_head_init(&rose->ack_queue); + sk->type = osk->type; sk->socket = osk->socket; sk->priority = osk->priority; @@ -598,8 +600,6 @@ static struct sock *rose_make_new(struct sock *osk) init_timer(&rose->timer); init_timer(&rose->idletimer); - skb_queue_head_init(&rose->frag_queue); - rose->t1 = osk->protinfo.rose->t1; rose->t2 = osk->protinfo.rose->t2; rose->t3 = osk->protinfo.rose->t3; @@ -1068,7 +1068,9 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, return -ENOTCONN; } - rose_output(sk, skb); /* Shove it onto the queue */ + skb_queue_tail(&sk->write_queue, skb); /* Shove it onto the queue */ + + rose_kick(sk); return len; } @@ -1210,7 +1212,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } - case SIOCRSL2CALL: + case SIOCRSSL2CALL: if (!suser()) return -EPERM; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); @@ -1220,6 +1222,11 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ax25_listen_register(&rose_callsign, NULL); return 0; + case SIOCRSGL2CALL: + if (copy_to_user((void *)arg, &rose_callsign, sizeof(ax25_address))) + return -EFAULT; + return 0; + case SIOCRSACCEPT: if (sk->protinfo.rose->state == ROSE_STATE_5) { rose_write_internal(sk, ROSE_CALL_ACCEPTED); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 7861220eede8..bc2097cda3eb 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -221,14 +221,6 @@ int rose_init(struct device *dev) /* New-style flags. 
*/ dev->flags = 0; - dev->family = AF_INET; - -#ifdef CONFIG_INET - dev->pa_addr = in_aton("192.168.0.1"); - dev->pa_brdaddr = in_aton("192.168.0.255"); - dev->pa_mask = in_aton("255.255.255.0"); - dev->pa_alen = 4; -#endif if ((dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL)) == NULL) return -ENOMEM; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 1ac11528dc97..de412d3c453d 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -19,6 +19,7 @@ * ROSE 001 Jonathan(G4KLX) Cloned from nr_in.c * ROSE 002 Jonathan(G4KLX) Return cause and diagnostic codes from Clear Requests. * ROSE 003 Jonathan(G4KLX) New timer architecture. + * Removed M bit processing. */ #include @@ -46,43 +47,6 @@ #include #include -static int rose_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) -{ - struct sk_buff *skbo, *skbn = skb; - - rose_start_idletimer(sk); - - if (more) { - sk->protinfo.rose->fraglen += skb->len; - skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); - return 0; - } - - if (!more && sk->protinfo.rose->fraglen > 0) { /* End of fragment */ - sk->protinfo.rose->fraglen += skb->len; - skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); - - if ((skbn = alloc_skb(sk->protinfo.rose->fraglen, GFP_ATOMIC)) == NULL) - return 1; - - skbn->h.raw = skbn->data; - - skbo = skb_dequeue(&sk->protinfo.rose->frag_queue); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); - - while ((skbo = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) { - skb_pull(skbo, ROSE_MIN_LEN); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); - } - - sk->protinfo.rose->fraglen = 0; - } - - return sock_queue_rcv_skb(sk, skbn); -} - /* * State machine for state 1, Awaiting Call Accepted State. * The handling of the timer(s) is in file rose_timer.c. 
@@ -166,6 +130,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety sk->protinfo.rose->vr = 0; sk->protinfo.rose->va = 0; sk->protinfo.rose->vl = 0; + rose_requeue_frames(sk); break; case ROSE_CLEAR_REQUEST: @@ -191,11 +156,9 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_start_t2timer(sk); rose_stop_idletimer(sk); } else { - if (sk->protinfo.rose->condition & ROSE_COND_PEER_RX_BUSY) { - sk->protinfo.rose->va = nr; - } else { - rose_check_iframes_acked(sk, nr); - } + rose_frames_acked(sk, nr); + if (frametype == ROSE_RNR) + rose_requeue_frames(sk); } break; @@ -213,15 +176,12 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_stop_idletimer(sk); break; } - if (sk->protinfo.rose->condition & ROSE_COND_PEER_RX_BUSY) { - sk->protinfo.rose->va = nr; - } else { - rose_check_iframes_acked(sk, nr); - } + rose_frames_acked(sk, nr); if (sk->protinfo.rose->condition & ROSE_COND_OWN_RX_BUSY) break; if (ns == sk->protinfo.rose->vr) { - if (rose_queue_rx_frame(sk, skb, m) == 0) { + rose_start_idletimer(sk); + if (sock_queue_rcv_skb(sk, skb) == 0) { sk->protinfo.rose->vr = (sk->protinfo.rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { @@ -270,6 +230,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety sk->protinfo.rose->vs = 0; sk->protinfo.rose->vl = 0; sk->protinfo.rose->state = ROSE_STATE_3; + rose_requeue_frames(sk); break; case ROSE_CLEAR_REQUEST: diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index b481e485feaf..8ee27147a598 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -113,7 +113,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) else rose_call = &rose_callsign; - neigh->ax25 = ax25_send_frame(skb, 256, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + neigh->ax25 = ax25_send_frame(skb, 0, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); return 
(neigh->ax25 != NULL); } diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c index 0ed9f7480a23..aea1d9f68efa 100644 --- a/net/rose/rose_out.c +++ b/net/rose/rose_out.c @@ -12,6 +12,7 @@ * History * ROSE 001 Jonathan(G4KLX) Cloned from nr_out.c * ROSE 003 Jonathan(G4KLX) New timer architecture. + * Removed M bit processing. */ #include @@ -38,52 +39,6 @@ #include #include -/* - * This is where all ROSE frames pass; - */ -void rose_output(struct sock *sk, struct sk_buff *skb) -{ - struct sk_buff *skbn; - unsigned char header[ROSE_MIN_LEN]; - int err, frontlen, len; - - if (skb->len - ROSE_MIN_LEN > ROSE_MAX_PACKET_SIZE) { - /* Save a copy of the Header */ - memcpy(header, skb->data, ROSE_MIN_LEN); - skb_pull(skb, ROSE_MIN_LEN); - - frontlen = skb_headroom(skb); - - while (skb->len > 0) { - if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_MAX_PACKET_SIZE, 0, 0, &err)) == NULL) - return; - - skb_reserve(skbn, frontlen); - - len = (ROSE_MAX_PACKET_SIZE > skb->len) ? skb->len : ROSE_MAX_PACKET_SIZE; - - /* Copy the user data */ - memcpy(skb_put(skbn, len), skb->data, len); - skb_pull(skb, len); - - /* Duplicate the Header */ - skb_push(skbn, ROSE_MIN_LEN); - memcpy(skbn->data, header, ROSE_MIN_LEN); - - if (skb->len > 0) - skbn->data[2] |= ROSE_M_BIT; - - skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ - } - - kfree_skb(skb, FREE_WRITE); - } else { - skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ - } - - rose_kick(sk); -} - /* * This procedure is passed a buffer descriptor for an iframe. It builds * the rest of the control part of the frame and then writes it out. 
@@ -103,8 +58,8 @@ static void rose_send_iframe(struct sock *sk, struct sk_buff *skb) void rose_kick(struct sock *sk) { - struct sk_buff *skb; - unsigned short end; + struct sk_buff *skb, *skbn; + unsigned short start, end; if (sk->protinfo.rose->state != ROSE_STATE_3) return; @@ -115,11 +70,14 @@ void rose_kick(struct sock *sk) if (skb_peek(&sk->write_queue) == NULL) return; - end = (sk->protinfo.rose->va + sysctl_rose_window_size) % ROSE_MODULUS; + start = (skb_peek(&sk->protinfo.rose->ack_queue) == NULL) ? sk->protinfo.rose->va : sk->protinfo.rose->vs; + end = (sk->protinfo.rose->va + sysctl_rose_window_size) % ROSE_MODULUS; - if (sk->protinfo.rose->vs == end) + if (start == end) return; + sk->protinfo.rose->vs = start; + /* * Transmit data until either we're out of data to send or * the window is full. @@ -128,13 +86,25 @@ void rose_kick(struct sock *sk) skb = skb_dequeue(&sk->write_queue); do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + /* - * Transmit the frame. + * Transmit the frame copy. */ - rose_send_iframe(sk, skb); + rose_send_iframe(sk, skbn); sk->protinfo.rose->vs = (sk->protinfo.rose->vs + 1) % ROSE_MODULUS; + /* + * Requeue the original data frame. 
+ */ + skb_queue_tail(&sk->protinfo.rose->ack_queue, skb); + } while (sk->protinfo.rose->vs != end && (skb = skb_dequeue(&sk->write_queue)) != NULL); sk->protinfo.rose->vl = sk->protinfo.rose->vr; @@ -161,14 +131,4 @@ void rose_enquiry_response(struct sock *sk) rose_stop_timer(sk); } -void rose_check_iframes_acked(struct sock *sk, unsigned short nr) -{ - if (sk->protinfo.rose->vs == nr) { - sk->protinfo.rose->va = nr; - } else { - if (sk->protinfo.rose->va != nr) - sk->protinfo.rose->va = nr; - } -} - #endif diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 43358644c4aa..d9145cdea616 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -63,7 +63,7 @@ static void rose_remove_neigh(struct rose_neigh *); */ static int rose_add_node(struct rose_route_struct *rose_route, struct device *dev) { - struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; + struct rose_node *rose_node; struct rose_neigh *rose_neigh; unsigned long flags; int i; @@ -116,55 +116,18 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de restore_flags(flags); } - /* - * This is a new node to be inserted into the list. Find where it needs - * to be inserted into the list, and insert it. We want to be sure - * to order the list in descending order of mask size to ensure that - * later when we are searching this list the first match will be the - * best match. 
- */ if (rose_node == NULL) { - rose_tmpn = rose_node_list; - rose_tmpp = NULL; - - while (rose_tmpn != NULL) { - if (rose_tmpn->mask > rose_route->mask) { - rose_tmpp = rose_tmpn; - rose_tmpn = rose_tmpn->next; - } else { - break; - } - } - - /* create new node */ if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) return -ENOMEM; - rose_node->address = rose_route->address; - rose_node->mask = rose_route->mask; - rose_node->count = 1; + rose_node->address = rose_route->address; + rose_node->mask = rose_route->mask; + rose_node->count = 1; rose_node->neighbour[0] = rose_neigh; save_flags(flags); cli(); - - if (rose_tmpn == NULL) { - if (rose_tmpp == NULL) { /* Empty list */ - rose_node_list = rose_node; - rose_node->next = NULL; - } else { - rose_tmpp->next = rose_node; - rose_node->next = NULL; - } - } else { - if (rose_tmpp == NULL) { /* 1st node */ - rose_node->next = rose_node_list; - rose_node_list = rose_node; - } else { - rose_tmpp->next = rose_node; - rose_node->next = rose_tmpn; - } - } - + rose_node->next = rose_node_list; + rose_node_list = rose_node; restore_flags(flags); rose_neigh->count++; @@ -487,20 +450,29 @@ struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neig struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, unsigned char *diagnostic) { struct rose_node *node; + struct rose_neigh *neigh; int failed = 0; + int mask = 0; int i; - for (node = rose_node_list; node != NULL; node = node->next) { + for (neigh = NULL, node = rose_node_list; node != NULL; node = node->next) { if (rosecmpm(addr, &node->address, node->mask) == 0) { - for (i = 0; i < node->count; i++) { - if (!rose_ftimer_running(node->neighbour[i])) - return node->neighbour[i]; - else - failed = 1; + if (node->mask > mask) { + mask = node->mask; + + for (i = 0; i < node->count; i++) { + if (!rose_ftimer_running(node->neighbour[i])) + neigh = node->neighbour[i]; + else + failed = 1; + } } } } + if (neigh != NULL) + return 
neigh; + if (failed) { *cause = ROSE_OUT_OF_ORDER; *diagnostic = 0; diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index ee710bd6e353..e7709726cdfe 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -49,8 +49,47 @@ void rose_clear_queues(struct sock *sk) while ((skb = skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb, FREE_WRITE); - while ((skb = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) - kfree_skb(skb, FREE_READ); + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) + kfree_skb(skb, FREE_WRITE); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void rose_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (sk->protinfo.rose->va != nr) { + while (skb_peek(&sk->protinfo.rose->ack_queue) != NULL && sk->protinfo.rose->va != nr) { + skb = skb_dequeue(&sk->protinfo.rose->ack_queue); + kfree_skb(skb, FREE_WRITE); + sk->protinfo.rose->va = (sk->protinfo.rose->va + 1) % ROSE_MODULUS; + } + } +} + +void rose_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by rose_kick. This arrangement handles the possibility of an + * empty output queue. + */ + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } } /* diff --git a/net/sched/Makefile b/net/sched/Makefile new file mode 100644 index 000000000000..cbb6704c1669 --- /dev/null +++ b/net/sched/Makefile @@ -0,0 +1,71 @@ +# +# Makefile for the Linux Traffic Control Unit. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. 
DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := sched.o + +O_OBJS := sch_generic.o + +ifeq ($(CONFIG_NET_SCH_CBQ), y) +O_OBJS += sch_cbq.o +else + ifeq ($(CONFIG_NET_SCH_CBQ), m) + M_OBJS += sch_cbq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_CSZ), y) +O_OBJS += sch_csz.o +else + ifeq ($(CONFIG_NET_SCH_CSZ), m) + M_OBJS += sch_csz.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_SFQ), y) +O_OBJS += sch_sfq.o +else + ifeq ($(CONFIG_NET_SCH_SFQ), m) + M_OBJS += sch_sfq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_RED), y) +O_OBJS += sch_red.o +else + ifeq ($(CONFIG_NET_SCH_RED), m) + M_OBJS += sch_red.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_TBF), y) +O_OBJS += sch_tbf.o +else + ifeq ($(CONFIG_NET_SCH_TBF), m) + M_OBJS += sch_tbf.o + endif +endif + + +ifeq ($(CONFIG_NET_SCH_PFIFO), y) +O_OBJS += sch_fifo.o +else + ifeq ($(CONFIG_NET_SCH_PFIFO), m) + M_OBJS += sch_fifo.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_PRIO), y) +O_OBJS += sch_prio.o +else + ifeq ($(CONFIG_NET_SCH_PRIO), m) + M_OBJS += sch_prio.o + endif +endif + +include $(TOPDIR)/Rules.make diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 000000000000..626afe555345 --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,839 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Class-Based Queueing (CBQ) algorithm. 
+ ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranted Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + Algorithm skeleton is taken from from NS simulator cbq.cc. + + ----------------------------------------------------------------------- + + Differences from NS version. + + --- WRR algorith is different. Our version looks more reasonable :-) + and fair when quanta are allowed to be less than MTU. + + --- cl->aveidle is REALLY limited from below by cl->minidle. + Seems, it was bug in NS. + + --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel". + When depth increases we expect, that the thing becomes lower, does not it? :-) + Besides that, "depth" word is semantically overloaded --- + "token bucket depth", "sfq depth"... Besides that, the algorithm + was called "top-LEVEL sharing". + + PROBLEM. + + --- Linux has no EOI event at the moment, so that we cannot + estimate true class idle time. Three workarounds are possible, + all of them have drawbacks: + + 1. (as now) Consider the next dequeue event as sign that + previous packet is finished. It is wrong because of ping-pong + buffers, but on permanently loaded link it is true. + 2. (NS approach) Use as link busy time estimate skb->leb/"physical + bandwidth". Even more wrong f.e. on ethernet real busy time much + higher because of collisions. + 3. (seems, the most clever) Split net bh to two parts: + NETRX_BH (for received packets) and preserve NET_BH for transmitter. + It will not require driver changes (NETRX_BH flag will be set + in netif_rx), but will allow to trace EOIs more precisely + and will save useless checks in net_bh. 
Besides that we will + have to eliminate random calling hard_start_xmit with dev->tbusy flag + (done) and to drop failure_q --- i.e. if !dev->tbusy hard_start_xmit + MUST succeed; failed packets will be dropped on the floor. +*/ + +#define CBQ_TOPLEVEL_SHARING +/* #define CBQ_NO_TRICKERY */ + +#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class) + +struct cbq_class +{ +/* Parameters */ + int priority; /* priority */ +#ifdef CBQ_TOPLEVEL_SHARING + int level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of childrens + 1 for nodes. + */ +#endif + + long maxidle; /* Class paramters: see below. */ + long minidle; + int filter_log; +#ifndef CBQ_NO_TRICKERY + long extradelay; +#endif + + long quantum; /* Allotment per WRR round */ + long rquantum; /* Relative allotment: see below */ + + int cell_log; + unsigned long L_tab[256]; + + struct Qdisc *qdisc; /* ptr to CBQ discipline */ + struct cbq_class *root; /* Ptr to root class; + root can be not unique. + */ + struct cbq_class *parent; /* Ptr to parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + + struct Qdisc *q; /* Elementary queueing discipline */ + struct cbq_class *next; /* next class in this priority band */ + + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Variables */ + psched_time_t last; + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + char awake; /* Class is in alive list */ + +#if 0 + void (*overlimit)(struct cbq_class *cl); +#endif +}; + +#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log]) + +struct cbq_sched_data +{ + struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */ + int nclasses[CBQ_MAXPRIO]; + unsigned quanta[CBQ_MAXPRIO]; + unsigned mtu; + int cell_log; + unsigned long L_tab[256]; + struct cbq_class *fallback_class; + + unsigned activemask; + struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes 
+ with backlog */ + struct cbq_class *last_sent; + int last_sent_len; + + psched_time_t now; /* Cached timestamp */ + + struct timer_list wd_timer; /* Wathchdog timer, that + started when CBQ has + backlog, but cannot + transmit just now */ + unsigned long wd_expires; +#ifdef CBQ_TOPLEVEL_SHARING + struct cbq_class *borrowed; + int toplevel; +#endif +}; + +/* + WRR quanta + ---------- + + cl->quantum is number added to class allotment on every round. + cl->rquantum is "relative" quantum. + + For real-time classes: + + cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio] + + where q->quanta[prio] is sum of all rquanta for given priority. + cl->rquantum can be identified with absolute rate of the class + in arbitrary units (f.e. bytes/sec) + + In this case, delay introduced by round-robin was estimated by + Sally Floyd [2] as: + + D = q->nclasses*q->mtu/(bandwidth/2) + + Note, that D does not depend on class rate (it is very bad), + but not much worse than Gallager-Parekh estimate for CSZ + C/R = q->mtu/rate, when real-time classes have close rates. + + For not real-time classes this folmula is not necessary, + so that cl->quantum can be set to any reasonable not zero value. + Apparently, it should be proportional to class rate, if the + rate is not zero. +*/ + +/* + maxidle, minidle, extradelay + ---------------------------- + + CBQ estimator calculates smoothed class idle time cl->aveidle, + considering class as virtual interface with corresponding bandwidth. + When cl->aveidle wants to be less than zero, class is overlimit. + When it is positive, class is underlimit. + + * maxidle bounds aveidle from above. + It controls maximal length of burst in this class after + long period of idle time. Burstness of active class + is controlled by filter constant cl->filter_log, + but this number is related to burst length only indirectly. + + * minidle is a negative number, normally set to zero. 
+ Setting it to not zero value allows avgidle to drop + below zero, effectively penalizing class, when it is overlimit. + When the class load will decrease, it will take a time to + raise negative avgidle to put the class at limit. + It should be set to zero for leaf classes. + + * extradelay is penalty in delay, when a class goes overlimit. + I believe this parameter is useless and confusing. + Setting it to not zero forces class to accumulate + its "idleness" for extradelay and then send BURST of packets + until going to overlimit again. Non-sense. + + For details see [1] and [3]. + + Really, minidle and extradelay are irrelevant to real scheduling + task. As I understand, SF&VJ introduced them to experiment + with CBQ simulator in attempts to fix erratic behaviour + of ancestor-only (and, partially, top-level) algorithm. + + WARNING. + + User passes them measured in usecs, but cl->minidle, + cl->maxidle and cl->aveidle are scaled with cl->filter_log + in the text of the scheduler. +*/ + +/* + A packet has just been enqueued on the empty class. + cbq_wakeup_class adds it to the tail of active class list + of its priority band. 
+ */ + +static __inline__ void cbq_wakeup_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + int prio = cl->priority; + struct cbq_class *cl_tail; + + cl->awake = 1; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl->deficit = 0; + } else { + cl->next_alive = cl; + q->activemask |= (1<deficit = cl->quantum; + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = CBQ_CLASSIFIER(skb, q); + + if (cl->q->enqueue(skb, cl->q) == 1) { + sch->q.qlen++; + +#ifdef CBQ_TOPLEVEL_SHARING + if (q->toplevel > 0) { + psched_time_t now; + PSCHED_GET_TIME(now); + if (PSCHED_TLESS(cl->undertime, now)) + q->toplevel = 0; + else if (q->toplevel > 1 && cl->borrow && + PSCHED_TLESS(cl->borrow->undertime, now)) + q->toplevel = 1; + } +#endif + if (!cl->awake) + cbq_wakeup_class(cl); + return 1; + } + return 0; +} + +static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl) +{ + long delay; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + if (q->wd_expires == 0 || q->wd_expires - delay > 0) + q->wd_expires = delay; +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + qdisc_wakeup(sch->dev); +} + +static __inline__ void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *cl; + + for (cl = q->last_sent; cl; cl = cl->parent) { + long avgidle = cl->avgidle; + long idle; + + /* + (now - last) is total time between packet right edges. 
+ (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last) + - L2T(cl, q->last_sent_len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-filter_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->filter_log); + + if (avgidle <= 0) { + /* Overlimit or at-limit */ +#ifdef CBQ_NO_TRICKERY + avgidle = 0; +#else + if (avgidle < cl->minidle) + avgidle = cl->minidle; +#endif + + /* This line was missing in NS. */ + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + + That is not all. + We want to set undertime to the moment, when + the class is allowed to start next transmission i.e. + (undertime + next_pktlen/phys_bandwidth) + - now - next_pktlen/rate = idle + or + undertime = now + idle + next_pktlen/rate + - next_pktlen/phys_bandwidth + + We do not know next packet length, but can + estimate it with average packet length + or current packet_length. 
+ */ + + idle = (-avgidle) - ((-avgidle) >> cl->filter_log); + idle += L2T(q, q->last_sent_len); + idle -= L2T(cl, q->last_sent_len); + PSCHED_TADD2(q->now, idle, cl->undertime); +#ifndef CBQ_NO_TRICKERY + /* Do not forget extra delay :-) */ + PSCHED_TADD(cl->undertime, cl->extradelay); +#endif + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + +#ifdef CBQ_TOPLEVEL_SHARING + cl = q->last_sent; + + if (q->borrowed && q->toplevel >= q->borrowed->level) { + if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime)) + q->toplevel = CBQ_MAXLEVEL; + else if (q->borrowed != cl) + q->toplevel = q->borrowed->level; + } +#endif + + q->last_sent = NULL; +} + +static __inline__ int +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *this_cl = cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || cl->parent == NULL) + return 1; + + if (PSCHED_TLESS(cl->undertime, q->now)) { + q->borrowed = cl; + return 1; + } + + while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)) { + cl = cl->borrow; + if (cl == NULL +#ifdef CBQ_TOPLEVEL_SHARING + || cl->level > q->toplevel +#endif + ) { +#if 0 + this_cl->overlimit(this_cl); +#else + cbq_delay(q, this_cl); +#endif + return 0; + } + } + q->borrowed = cl; + return 1; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + /* Class is empty */ + if (cl->q->q.qlen == 0) + goto skip_class; + + if (fallback) { + /* Fallback pass: all classes are overlimit; + we send from 
the first class that is allowed + to borrow. + */ + + if (cl->borrow == NULL) + goto skip_class; + } else { + /* Normal pass: check that class is under limit */ + if (!cbq_under_limit(cl)) + goto skip_class; + } + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per this + round. + */ + deficit = 1; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->last_sent = cl; + q->last_sent_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + cl->deficit = 0; + + if (cl->q->q.qlen == 0) { + /* Class is empty, declare it dead */ + cl_prev->next_alive = cl->next_alive; + cl->awake = 0; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! */ + q->active[prio] = NULL; + q->activemask &= ~(1<next_alive; + cl->deficit += cl->quantum; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch, int fallback) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<data; + + PSCHED_GET_TIME(q->now); + + if (q->last_sent) + cbq_update(q); + + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch, 0); + if (skb) + return skb; + + /* All the classes are overlimit. + Search for overlimit class, which is allowed to borrow + and use it as fallback case. 
+ */ + +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + skb = cbq_dequeue_1(sch, 1); + if (skb) + return skb; + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen && q->wd_expires) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = cbq_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + add_timer(&q->wd_timer); + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + struct cbq_class *cl; + + for (cl = this->parent; cl; cl = cl->parent) { + if (cl->level > this->level) + return; + cl->level = this->level + 1; + this = cl; + } +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + + if (q->quanta[prio] == 0) + return; + + for (cl = q->classes[prio]; cl; cl = cl->next) { + if (cl->rquantum) + cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/ + q->quanta[prio]; + } +} + +static __inline__ int cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + for (clp = &q->classes[this->priority]; (cl = *clp) != NULL; + clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + return 0; + } + } + return -ENOENT; +} + +static int cbq_prune(struct cbq_class *this) +{ + struct cbq_class *cl; + int prio = this->priority; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + qdisc_reset(this->q); + + if (cbq_unlink_class(this)) + return -ENOENT; + + if (this->awake) { + struct cbq_class *cl_prev = q->active[prio]; + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + + if (cl == q->active[prio]) { + q->active[prio] = cl; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<next_alive; + cl->deficit += cl->quantum; 
+ break; + } + } while ((cl_prev = cl) != q->active[prio]); + } + + --q->nclasses[prio]; + if (this->rquantum) { + q->quanta[prio] -= this->rquantum; + cbq_normalize_quanta(q, prio); + } + + if (q->fallback_class == this) + q->fallback_class = NULL; + + this->parent = NULL; + this->borrow = NULL; + this->root = this; + this->qdisc = NULL; + return 0; +} + +static int cbq_graft(struct cbq_class *this, struct cbq_class *parent) +{ + struct cbq_class *cl, **clp; + int prio = this->priority; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + qdisc_reset(this->q); + + + for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) + return -EBUSY; + } + + cl->next = NULL; + *clp = cl; + + cl->parent = parent; + cl->borrow = parent; + cl->root = parent ? parent->root : cl; + + ++q->nclasses[prio]; + if (this->rquantum) { + q->quanta[prio] += this->rquantum; + cbq_normalize_quanta(q, prio); + } + + cbq_adjust_levels(this); + + return 0; +} + + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int prio; + + q->activemask = 0; + q->last_sent = NULL; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + } +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + q->active[prio] = NULL; + + for (cl = q->classes[prio]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = 0; + cl->deficit = 0; + cl->awake = 0; + } + } +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl, **clp; + int prio; + + for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + struct cbq_class *cl_head = q->classes[prio]; + + for (clp = &cl_head; (cl=*clp) != NULL; clp = &cl->next) { + 
qdisc_destroy(cl->q); + kfree(cl); + } + } +} + +static int cbq_control(struct Qdisc *sch, void *arg) +{ + struct cbq_sched_data *q; + + q = (struct cbq_sched_data *)sch->data; + + /* Do attachment here. It is the last thing to do. */ + + return -EINVAL; +} + +static int cbq_init(struct Qdisc *sch, void *arg) +{ + struct cbq_sched_data *q; + struct cbqctl *ctl = (struct cbqctl*)arg; + + q = (struct cbq_sched_data *)sch->data; + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; +#ifdef CBQ_TOPLEVEL_SHARING + q->toplevel = CBQ_MAXLEVEL; +#endif + + return 0; +} + + +struct Qdisc_ops cbq_ops = +{ + NULL, + "cbq", + 0, + sizeof(struct cbq_sched_data), + cbq_enqueue, + cbq_dequeue, + cbq_reset, + cbq_destroy, + cbq_init, + cbq_control, +}; + +#ifdef MODULE +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&cbq_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c new file mode 100644 index 000000000000..dbc05d31bf03 --- /dev/null +++ b/net/sched/sch_csz.c @@ -0,0 +1,832 @@ +/* + * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Clark-Shenker-Zhang algorithm. + ======================================= + + SOURCE. + + David D. 
Clark, Scott Shenker and Lixia Zhang + "Supporting Real-Time Applications in an Integrated Services Packet + Network: Architecture and Mechanism". + + CBQ presents a flexible universal algorithm for packet scheduling, + but it has pretty poor delay characteristics. + Round-robin scheduling and link-sharing goals + apparently contradict to minimization of network delay and jitter. + Moreover, correct handling of predicted flows seems to be + impossible in CBQ. + + CSZ presents more precise but less flexible and less efficient + approach. As I understand, the main idea is to create + WFQ flows for each guaranteed service and to allocate + the rest of bandwith to dummy flow-0. Flow-0 comprises + the predicted services and the best effort traffic; + it is handled by a priority scheduler with the highest + priority band allocated for predicted services, and the rest --- + to the best effort packets. + + Note, that in CSZ flows are NOT limited to their bandwidth. + It is supposed, that flow passed admission control at the edge + of QoS network and it more need no shaping. Any attempt to improve + the flow or to shape it to a token bucket at intermediate hops + will introduce undesired delays and raise jitter. + + At the moment CSZ is the only scheduler that provides + real guaranteed service. Another schemes (including CBQ) + do not provide guaranteed delay and randomize jitter. + There exists the statement (Sally Floyd), that delay + can be estimated by a IntServ compliant formulae. + This result is true formally, but it is wrong in principle. + At first, it ignores delays introduced by link sharing. + And the second (and main) it limits bandwidth, + it is fatal flaw. + + ALGORITHM. + + --- Notations. + + $B$ is link bandwidth (bits/sec). + + $I$ is set of all flows, including flow $0$. + Every flow $a \in I$ has associated bandwidth slice $r_a < 1$ and + $\sum_{a \in I} r_a = 1$. + + --- Flow model. + + Let $m_a$ is number of backlogged bits in flow $a$. 
+ The flow is {\em active }, if $m_a > 0$. + This number is discontinuous function of time; + when a packet $i$ arrives: + \[ + m_a(t_i+0) - m_a(t_i-0) = L^i, + \] + where $L^i$ is the length of arrived packet. + The flow queue is drained continuously until $m_a == 0$: + \[ + {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}. + \] + I.e. flow rates are their allocated rates proportionally + scaled to take all available link bandwidth. Apparently, + it is not the only possible policy. F.e. CBQ classes + without borrowing would be modelled by: + \[ + {d m_a \over dt} = - B r_a . + \] + More complicated hierarchical bandwidth allocation + policies are possible, but, unfortunately, basic + flows equation have simple solution only for proportional + scaling. + + --- Departure times. + + We calculate time until the last bit of packet will be sent: + \[ + E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a }, + \] + where $\delta_a(t)$ is number of bits drained since $t_i$. + We have to evaluate $E_a^i$ for all queued packets, + then find packet with minimal $E_a^i$ and send it. + + It sounds good, but direct implementation of the algorithm + is absolutely infeasible. Luckily, if flow rates + are scaled proportionally, the equations have simple solution. + + The differential equation for $E_a^i$ is + \[ + {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} = + { B \over \sum_{b \in A} r_b} + \] + with initial condition + \[ + E_a^i (t_i) = { m_a(t_i) \over r_a } . + \] + + Let's introduce an auxiliary function $R(t)$: + + --- Round number. + + Consider the following model: we rotate over active flows, + sending $r_a B$ bits from every flow, so that we send + $B \sum_{a \in A} r_a$ bits per round, that takes + $\sum_{a \in A} r_a$ seconds. + + Hence, $R(t)$ (round number) is monotonically increasing + linear function of time when $A$ is not changed + \[ + { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a } + \] + and it is continuous when $A$ changes. 
+ + The central observation is that the quantity + $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all! + $R(t)$ does not depend on flow, so that $F_a^i$ can be + calculated only once on packet arrival, and we need not + recalculation of $E$ numbers and resorting queues. + Number $F_a^i$ is called finish number of the packet. + It is just value of $R(t)$, when the last bit of packet + will be sent out. + + Maximal finish number on flow is called finish number of flow + and minimal one is "start number of flow". + Apparently, flow is active if and only if $F_a \leq R$. + + When packet of length $L_i$ bit arrives to flow $a$ at time $t_i$, + we calculate number $F_a^i$ as: + + If flow was inactive ($F_a < R$): + $F_a^i = R(t) + {L_i \over B r_a}$ + otherwise + $F_a^i = F_a + {L_i \over B r_a}$ + + These equations complete the algorithm specification. + + It looks pretty hairy, but there exists a simple + procedure for solving these equations. + See procedure csz_update(), that is a generalization of + algorithm from S. Keshav's thesis Chapter 3 + "Efficient Implementation of Fair Queeing". + + NOTES. + + * We implement only the simplest variant of CSZ, + when flow-0 is explicit 4band priority fifo. + It is bad, but we need "peek" operation in addition + to "dequeue" to implement complete CSZ. + I do not want to make it, until it is not absolutely + necessary. + + * A primitive support for token bucket filtering + presents too. It directly contradicts to CSZ, but + though the Internet is on the globe ... :-) + yet "the edges of the network" really exist. + + BUGS. + + * Fixed point arithmetic is overcomplicated, suboptimal and even + wrong. Check it later. 
+*/ + + +/* This number is arbitrary */ + +#define CSZ_MAX_GUARANTEED 16 + +#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED) + +struct csz_head +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; +}; + +struct csz_flow +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; + +/* Parameters */ + unsigned long rate; /* Flow rate. Fixed point is at rate_log */ + unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */ + unsigned long max_bytes; /* Maximal length of queue */ +#ifdef CSZ_PLUS_TBF + unsigned long depth; /* Depth of token bucket, normalized + as L/(B*r_a) */ +#endif + +/* Variables */ +#ifdef CSZ_PLUS_TBF + unsigned long tokens; /* Tokens number: usecs */ + psched_time_t t_tbf; + unsigned long R_tbf; + int throttled; +#endif + unsigned peeked; + unsigned long start; /* Finish number of the first skb */ + unsigned long finish; /* Finish number of the flow */ + + struct sk_buff_head q; /* FIFO queue */ +}; + +#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log]) + +struct csz_sched_data +{ +/* Parameters */ + unsigned char cell_log; /* 1< 2.1sec is MAXIMAL value */ + +/* Variables */ +#ifdef CSZ_PLUS_TBF + struct timer_list wd_timer; + long wd_expires; +#endif + psched_time_t t_c; /* Time check-point */ + unsigned long R_c; /* R-number check-point */ + unsigned long rate; /* Current sum of rates of active flows */ + struct csz_head s; /* Flows sorted by "start" */ + struct csz_head f; /* Flows sorted by "finish" */ + + struct sk_buff_head other[4];/* Predicted (0) and the best efforts + classes (1,2,3) */ + struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */ +}; + +/* These routines (csz_insert_finish and csz_insert_start) are + the most time consuming part of all the algorithm. + + We insert to sorted list, so that time + is linear with respect to number of active flows in the worst case. 
+ Note that we have not very large number of guaranteed flows, + so that logarithmic algorithms (heap etc.) are useless, + they are slower than linear one when length of list <= 32. + + Heap would take sence if we used WFQ for best efforts + flows, but SFQ is better choice in this case. + */ + + +/* Insert flow "this" to the list "b" before + flow with greater finish number. + */ + +#if 0 +/* Scan forward */ +extern __inline__ void csz_insert_finish(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->fnext; + unsigned long finish = this->finish; + + while (f != b) { + if (((struct csz_flow*)f)->finish - finish > 0) + break; + f = f->fnext; + } + this->fnext = f; + this->fprev = f->fprev; + this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; +} +#else +/* Scan backward */ +extern __inline__ void csz_insert_finish(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->fprev; + unsigned long finish = this->finish; + + while (f != b) { + if (((struct csz_flow*)f)->finish - finish <= 0) + break; + f = f->fprev; + } + this->fnext = f->fnext; + this->fprev = f; + this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; +} +#endif + +/* Insert flow "this" to the list "b" before + flow with greater start number. + */ + +extern __inline__ void csz_insert_start(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->snext; + unsigned long start = this->start; + + while (f != b) { + if (((struct csz_flow*)f)->start - start > 0) + break; + f = f->snext; + } + this->snext = f; + this->sprev = f->sprev; + this->snext->sprev = this->sprev->snext = (struct csz_head*)this; +} + + +/* Calculate and return current round number. + It is another time consuming part, but + it is impossible to avoid it. + + Fixed point arithmetic is not ... does not ... Well, it is just CRAP. 
+ */ + +static unsigned long csz_update(struct Qdisc *sch) +{ + struct csz_sched_data *q = (struct csz_sched_data*)sch->data; + struct csz_flow *a; + unsigned long F; + unsigned long tmp; + psched_time_t now; + unsigned long delay; + unsigned long R_c; + + PSCHED_GET_TIME(now); + delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset); + + if (delay>>q->delta_log) { +do_reset: + /* Delta is too large. + It is possible if MTU/BW > 1<delta_log + (i.e. configuration error) or because of hardware + fault. We have no choice... + */ + qdisc_reset(sch); + return 0; + } + + q->t_c = now; + + for (;;) { + a = (struct csz_flow*)q->f.fnext; + + /* No more active flows. Reset R and exit. */ + if (a == (struct csz_flow*)&q->f) { +#ifdef CSZ_DEBUG + if (q->rate) { + printk("csz_update: rate!=0 on inactive csz\n"); + q->rate = 0; + } +#endif + q->R_c = 0; + return 0; + } + + F = a->finish; + +#ifdef CSZ_DEBUG + if (q->rate == 0) { + printk("csz_update: rate=0 on active csz\n"); + goto do_reset; + } +#endif + + /* + * tmp = (t - q->t_c)/q->rate; + */ + + tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log); + + tmp += q->R_c; + + /* OK, this flow (and all flows with greater + finish numbers) is still active */ + if (F - tmp > 0) + break; + + /* It is more not active */ + + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + + /* + * q->t_c += (F - q->R_c)*q->rate + */ + + tmp = ((F-q->R_c)*q->rate)<R_log; + R_c = F; + q->rate -= a->rate; + + if (delay - tmp >= 0) { + delay -= tmp; + continue; + } + delay = 0; + } + + q->R_c = tmp; + return tmp; +} + +static int +csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned flow_id = CSZ_FLOW_ID(skb); + unsigned long R; + int prio; + struct csz_flow *this; + + if (flow_id >= CSZ_MAX_GUARANTEED) { + prio = flow_id - CSZ_MAX_GUARANTEED; + flow_id = 0; + } + + this = &q->flow[flow_id]; + if (this->q.qlen >= this->max_bytes || this->L_tab == 
NULL) { + kfree_skb(skb, FREE_WRITE); + return 0; + } + + R = csz_update(sch); + + if (this->finish - R >= 0) { + /* It was active */ + this->finish += L2R(q,this,skb->len); + } else { + /* It is inactive; activate it */ + this->finish = R + L2R(q,this,skb->len); + q->rate += this->rate; + csz_insert_finish(&q->f, this); + } + + /* If this flow was empty, remember start number + and insert it into start queue */ + if (this->q.qlen == 0) { + this->start = this->finish; + csz_insert_start(&q->s, this); + } + if (flow_id) + skb_queue_tail(&this->q, skb); + else + skb_queue_tail(&q->other[prio], skb); + sch->q.qlen++; + return 1; +} + +static __inline__ struct sk_buff * +skb_dequeue_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_dequeue(&q->other[i]); + if (skb) { + q->flow[0].q.qlen--; + return skb; + } + } + return NULL; +} + +static __inline__ struct sk_buff * +skb_peek_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_peek(&q->other[i]); + if (skb) + return skb; + } + return NULL; +} + +#ifdef CSZ_PLUS_TBF + +static void csz_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct csz_sched_data *q = (struct csz_sched_data*)sch->data; + + q->wd_timer.expires = 0; + q->wd_timer.function = NULL; + + qdisc_wakeup(sch->dev); +} + +static __inline__ void +csz_move_queue(struct csz_flow *this, long delta) +{ + this->fprev->fnext = this->fnext; + this->fnext->fprev = this->fprev; + + this->start += delta; + this->finish += delta; + + csz_insert_finish(this); +} + +static __inline__ int csz_enough_tokens(struct csz_sched_data *q, + struct csz_flow *this, + struct sk_buff *skb) +{ + long toks; + long shift; + psched_time_t now; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len); + + shift = 0; + if (this->throttled) { + /* Remember aposteriory delay */ + + unsigned long R = csz_update(q); + 
shift = R - this->R_tbf; + this->R_tbf = R; + } + + if (toks >= 0) { + /* Now we have enough tokens to proceed */ + + this->tokens = toks <= this->depth ? toks ? this->depth; + this->t_tbf = now; + + if (!this->throttled) + return 1; + + /* Flow was throttled. Update its start&finish numbers + with delay calculated aposteriori. + */ + + this->throttled = 0; + if (shift > 0) + csz_move_queue(this, shift); + return 1; + } + + if (!this->throttled) { + /* Flow has just been throttled; remember + current round number to calculate aposteriori delay + */ + this->throttled = 1; + this->R_tbf = csz_update(q); + } + + /* Move all the queue to the time when it will be allowed to send. + We should translate time to round number, but it is impossible, + so that we made the most conservative estimate i.e. we suppose + that only this flow is active and, hence, R = t. + Really toks <= R <= toks/r_a. + + This apriory shift in R will be adjusted later to reflect + real delay. We cannot avoid it because of: + - throttled flow continues to be active from the viewpoint + of CSZ, so that it would acquire highest priority, + if you not adjusted start numbers. + - Eventually, finish number would become less than round + number and flow were declared inactive. 
+ */ + + toks = -toks; + + /* Remeber, that we should start watchdog */ + if (toks < q->wd_expires) + q->wd_expires = toks; + + toks >>= q->R_log; + shift += toks; + if (shift > 0) { + this->R_tbf += toks; + csz_move_queue(this, shift); + } + csz_insert_start(this); + return 0; +} +#endif + + +static struct sk_buff * +csz_dequeue(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct sk_buff *skb; + struct csz_flow *this; + +#ifdef CSZ_PLUS_TBF + q->wd_expires = 0; +#endif + this = (struct csz_flow*)q->s.snext; + + while (this != (struct csz_flow*)&q->s) { + + /* First of all: unlink from start list */ + this->sprev->snext = this->snext; + this->snext->sprev = this->sprev; + + if (this != &q->flow[0]) { /* Guaranteed flow */ + skb = __skb_dequeue(&this->q); + if (skb) { +#ifdef CSZ_PLUS_TBF + if (this->depth) { + if (!csz_enough_tokens(q, this, skb)) + continue; + } +#endif + if (this->q.qlen) { + struct sk_buff *nskb = skb_peek(&this->q); + this->start += L2R(q,this,nskb->len); + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } else { /* Predicted or best effort flow */ + skb = skb_dequeue_best(q); + if (skb) { + unsigned peeked = this->peeked; + this->peeked = 0; + + if (--this->q.qlen) { + struct sk_buff *nskb; + unsigned dequeued = L2R(q,this,skb->len); + + /* We got not the same thing that + peeked earlier; adjust start number + */ + if (peeked != dequeued && peeked) + this->start += dequeued - peeked; + + nskb = skb_peek_best(q); + peeked = L2R(q,this,nskb->len); + this->start += peeked; + this->peeked = peeked; + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } + } +#ifdef CSZ_PLUS_TBF + /* We are about to return no skb. + Schedule watchdog timer, if it occured because of shaping. 
+ */ + if (q->wd_expires) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = csz_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + add_timer(&q->wd_timer); + } +#endif + return NULL; +} + +static void +csz_reset(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct sk_buff *skb; + int i; + + for (i=0; i<4; i++) + while ((skb=skb_dequeue(&q->other[i])) != NULL) + kfree_skb(skb, 0); + + for (i=0; iflow + i; + while ((skb = skb_dequeue(&this->q)) != NULL) + kfree_skb(skb, FREE_WRITE); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + PSCHED_GET_TIME(&q->t_tbf); + q->tokens = q->depth; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.function = NULL; + } +#endif + sch->q.qlen = 0; +} + +static void +csz_destroy(struct Qdisc* sch) +{ +/* + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int i; + + for (i=0; i<4; i++) + qdisc_destroy(q->other[i]); + */ +} + +static int csz_init(struct Qdisc *sch, void *arg) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct cszinitctl *ctl = (struct cszinitctl*)arg; + int i; + + for (i=0; i<4; i++) + skb_queue_head_init(&q->other[i]); + + for (i=0; iflow + i; + skb_queue_head_init(&this->q); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; +#endif + if (ctl) { + if (ctl->flows != CSZ_MAX_GUARANTEED) + return -EINVAL; + q->cell_log = ctl->cell_log; + } + return 0; +} + +static int csz_control(struct Qdisc *sch, struct pschedctl *gctl) +{ +/* + struct 
csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct cszctl *ctl = (struct cszctl*)gctl->arg; + struct sk_buff *skb; + int i; + + if (op == PSCHED_TC_ATTACH) { + + } +*/ + return 0; +} + + + + +struct Qdisc_ops csz_ops = +{ + NULL, + "csz", + 0, + sizeof(struct csz_sched_data), + csz_enqueue, + csz_dequeue, + csz_reset, + csz_destroy, + csz_init, + csz_control, +}; + + +#ifdef MODULE +#include +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&csz_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 000000000000..8134baf16b32 --- /dev/null +++ b/net/sched/sch_fifo.c @@ -0,0 +1,179 @@ +/* + * net/sched/sch_fifo.c Simple FIFO "scheduler" + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* 1 band FIFO pseudo-"scheduler" */ + +struct fifo_sched_data +{ + int qmaxbytes; + int qmaxlen; + int qbytes; +}; + +static int +bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (q->qbytes <= q->qmaxbytes) { + skb_queue_tail(&sch->q, skb); + q->qbytes += skb->len; + return 0; + } + kfree_skb(skb, FREE_WRITE); + return 1; +} + +static struct sk_buff * +bfifo_dequeue(struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + struct sk_buff *skb; + + skb = skb_dequeue(&sch->q); + if (skb) + q->qbytes -= skb->len; + return skb; +} + +static void +bfifo_reset(struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q)) != NULL) { + q->qbytes -= skb->len; + 
kfree_skb(skb,FREE_WRITE); + } + if (q->qbytes) { + printk("fifo_reset: qbytes=%d\n", q->qbytes); + q->qbytes = 0; + } +} + +static int +pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (sch->q.qlen <= q->qmaxlen) { + skb_queue_tail(&sch->q, skb); + return 0; + } + kfree_skb(skb, FREE_WRITE); + return 1; +} + +static struct sk_buff * +pfifo_dequeue(struct Qdisc* sch) +{ + return skb_dequeue(&sch->q); +} + +static void +pfifo_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q))!=NULL) + kfree_skb(skb,FREE_WRITE); +} + + +static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */) +{ + struct fifo_sched_data *q; +/* + struct device *dev = sch->dev; + */ + + q = (struct fifo_sched_data *)sch->data; +/* + if (pkts<0) + pkts = dev->tx_queue_len; + if (bytes<0) + bytes = pkts*dev->mtu; + q->qmaxbytes = bytes; + q->qmaxlen = pkts; + */ + return 0; +} + +struct Qdisc_ops pfifo_ops = +{ + NULL, + "pfifo", + 0, + sizeof(struct fifo_sched_data), + pfifo_enqueue, + pfifo_dequeue, + pfifo_reset, + NULL, + fifo_init, +}; + +struct Qdisc_ops bfifo_ops = +{ + NULL, + "pfifo", + 0, + sizeof(struct fifo_sched_data), + bfifo_enqueue, + bfifo_dequeue, + bfifo_reset, + NULL, + fifo_init, +}; + +#ifdef MODULE +#include +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&pfifo_ops); + if (err == 0) { + err = register_qdisc(&bfifo_ops); + if (err) + unregister_qdisc(&pfifo_ops); + } + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 000000000000..83aa8d10eb09 --- /dev/null +++ b/net/sched/sch_generic.c @@ -0,0 +1,541 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct Qdisc_head qdisc_head = { &qdisc_head }; + +static struct Qdisc_ops *qdisc_base = NULL; + +/* NOTES. + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns number of enqueued packets i.e. this number is 1, + if packet was enqueued sucessfully and <1 if something (not + necessary THIS packet) was dropped. + + */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (strcmp(qops->id, q->id) == 0) + return -EEXIST; + qops->next = NULL; + qops->refcnt = 0; + *qp = qops; + return 0; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (!q) + return -ENOENT; + *qp = q->next; + return 0; +} + +struct Qdisc *qdisc_lookup(int handle) +{ + return NULL; +} + + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + in all curcumstances. It is difficult to invent anything more + fast or cheap. 
+ */ + +static int +noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct sk_buff * +noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +struct Qdisc noop_qdisc = +{ + { NULL }, + noop_enqueue, + noop_dequeue, +}; + +struct Qdisc noqueue_qdisc = +{ + { NULL }, + NULL, + NULL, +}; + + +/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */ + +static int +pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + + if (list->qlen <= skb->dev->tx_queue_len) { + skb_queue_tail(list, skb); + return 1; + } + qdisc->dropped++; + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct sk_buff * +pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + struct sk_buff *skb; + + for (prio = 0; prio < 3; prio++, list++) { + skb = skb_dequeue(list); + if (skb) + return skb; + } + return NULL; +} + +static void +pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + + for (prio=0; prio < 3; prio++) + skb_queue_purge(list+prio); +} + +static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) +{ + int i; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data); + + for(i=0; i<3; i++) + skb_queue_head_init(list+i); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops = +{ + NULL, + "pfifo_fast", + 1, + 3 * sizeof(struct sk_buff_head), + pfifo_fast_enqueue, + pfifo_fast_dequeue, + pfifo_fast_reset, + NULL, + pfifo_fast_init +}; + +static struct Qdisc * +qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) +{ + struct Qdisc *sch; + int size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + if (!sch) + return NULL; + 
memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + skb_queue_head_init(&sch->failure_q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + if (ops->init && ops->init(sch, arg)) + return NULL; + ops->refcnt++; + return sch; +} + +void qdisc_reset(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + if (ops) { + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + skb_queue_purge(&qdisc->failure_q); + end_bh_atomic(); + } +} + +void qdisc_destroy(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + if (ops) { + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + skb_queue_purge(&qdisc->failure_q); + ops->refcnt--; + end_bh_atomic(); + kfree(qdisc); + } +} + +static void dev_do_watchdog(unsigned long dummy); + +static struct timer_list dev_watchdog = + { NULL, NULL, 0L, 0L, &dev_do_watchdog }; + +static void dev_do_watchdog(unsigned long dummy) +{ + struct Qdisc_head *h; + + for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) { + qdisc_restart(dev); + } + } + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); +} + + +void dev_activate(struct device *dev) +{ + /* No queueing discipline is attached to device; + create default one i.e. 
pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual intrfaces + */ + + if (dev->qdisc_sleeping == &noop_qdisc) { + if (dev->tx_queue_len) { + struct Qdisc *qdisc; + qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL); + if (qdisc == NULL) + return; + dev->qdisc_sleeping = qdisc; + } else + dev->qdisc_sleeping = &noqueue_qdisc; + } + + start_bh_atomic(); + if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { + dev->qdisc->tx_timeo = 5*HZ; + dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo; + if (!dev_watchdog.expires) { + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); + } + } + end_bh_atomic(); +} + +void dev_deactivate(struct device *dev) +{ + struct Qdisc *qdisc; + + start_bh_atomic(); + + qdisc = dev->qdisc; + dev->qdisc = &noop_qdisc; + + qdisc_reset(qdisc); + + if (qdisc->h.forw) { + struct Qdisc_head **hp, *h; + + for (hp = &qdisc_head.forw; (h = *hp) != &qdisc_head; hp = &h->forw) { + if (h == &qdisc->h) { + *hp = h->forw; + break; + } + } + } + + end_bh_atomic(); +} + +void dev_init_scheduler(struct device *dev) +{ + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; +} + +void dev_shutdown(struct device *dev) +{ + struct Qdisc *qdisc; + + start_bh_atomic(); + qdisc = dev->qdisc_sleeping; + dev->qdisc_sleeping = &noop_qdisc; + qdisc_destroy(qdisc); + end_bh_atomic(); +} + +void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + start_bh_atomic(); + oqdisc = dev->qdisc_sleeping; + + /* Destroy old scheduler */ + if (oqdisc) + qdisc_destroy(oqdisc); + + /* ... and attach new one */ + dev->qdisc_sleeping = qdisc; + dev->qdisc = &noop_qdisc; + end_bh_atomic(); + + if (dev->flags & IFF_UP) + dev_activate(dev); +} + +/* Kick the queue "q". + Note, that this procedure is called by watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. 
+ >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called only from NET BH +*/ + + +int qdisc_restart(struct device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + + skb = skb_dequeue(&q->failure_q); + if (!skb) { + skb = q->dequeue(q); + if (netdev_nit && skb) + dev_queue_xmit_nit(skb,dev); + } + if (skb) { + if (dev->hard_start_xmit(skb, dev) == 0) { + q->tx_last = jiffies; + return -1; + } +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "netdevice %s defers output.\n", dev->name); +#endif + skb_queue_head(&q->failure_q, skb); + return -1; + } + return q->q.qlen; +} + +void qdisc_run_queues(void) +{ + struct Qdisc_head **hp, *h; + + hp = &qdisc_head.forw; + while ((h = *hp) != &qdisc_head) { + int res = -1; + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + + while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) + /* NOTHING */; + + /* The explanation is necessary here. + qdisc_restart called dev->hard_start_xmit, + if device is virtual, it could trigger one more + dev_queue_xmit and new device could appear + in active chain. In this case we cannot unlink + empty queue, because we lost back pointer. + No problem, we will unlink it during the next round. 
+ */ + + if (res == 0 && *hp == h) { + *hp = h->forw; + h->forw = NULL; + continue; + } + hp = &h->forw; + } +} + + +int tc_init(struct pschedctl *pctl) +{ + struct Qdisc *q; + struct Qdisc_ops *qops; + + if (pctl->handle) { + q = qdisc_lookup(pctl->handle); + if (q == NULL) + return -ENOENT; + qops = q->ops; + if (pctl->ifindex && q->dev->ifindex != pctl->ifindex) + return -EINVAL; + } + return -EINVAL; +} + +int tc_destroy(struct pschedctl *pctl) +{ + return -EINVAL; +} + +int tc_attach(struct pschedctl *pctl) +{ + return -EINVAL; +} + +int tc_detach(struct pschedctl *pctl) +{ + return -EINVAL; +} + + +int psched_ioctl(void *arg) +{ + struct pschedctl ctl; + struct pschedctl *pctl = &ctl; + int err; + + if (copy_from_user(&ctl, arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.arglen > 0) { + pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL); + if (pctl == NULL) + return -ENOBUFS; + memcpy(pctl, &ctl, sizeof(ctl)); + if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) { + kfree(pctl); + return -EFAULT; + } + } + + rtnl_lock(); + + switch (ctl.command) { + case PSCHED_TC_INIT: + err = tc_init(pctl); + break; + case PSCHED_TC_DESTROY: + err = tc_destroy(pctl); + break; + case PSCHED_TC_ATTACH: + err = tc_attach(pctl); + break; + case PSCHED_TC_DETACH: + err = tc_detach(pctl); + break; + default: + err = -EINVAL; + } + + rtnl_unlock(); + + if (pctl != &ctl) + kfree(pctl); + return err; +} + +__initfunc(int pktsched_init(void)) +{ +#define INIT_QDISC(name) { \ + extern struct Qdisc_ops name##_ops; \ + register_qdisc(&##name##_ops); \ + } + + skb_queue_head_init(&noop_qdisc.failure_q); + skb_queue_head_init(&noqueue_qdisc.failure_q); + + register_qdisc(&pfifo_fast_ops); +#ifdef CONFIG_NET_SCH_CBQ + INIT_QDISC(cbq); +#endif +#ifdef CONFIG_NET_SCH_CSZ + INIT_QDISC(csz); +#endif +#ifdef CONFIG_NET_SCH_RED + INIT_QDISC(red); +#endif +#ifdef CONFIG_NET_SCH_SFQ + INIT_QDISC(sfq); +#endif +#ifdef CONFIG_NET_SCH_TBF + INIT_QDISC(tbf); +#endif +#ifdef 
CONFIG_NET_SCH_PFIFO + INIT_QDISC(pfifo); + INIT_QDISC(bfifo); +#endif +#ifdef CONFIG_NET_SCH_PRIO + INIT_QDISC(prio); +#endif + return 0; +} diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 000000000000..a3806eda4352 --- /dev/null +++ b/net/sched/sch_prio.c @@ -0,0 +1,146 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* New N-band generic scheduler */ + +struct prio_sched_data +{ + int qbytes; + int bands; + u8 prio2band[8]; + struct Qdisc *queues[8]; +}; + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio = q->prio2band[skb->priority&7]; + struct Qdisc *qdisc; + + qdisc = q->queues[prio]; + if (qdisc->enqueue(skb, qdisc) == 0) { + q->qbytes += skb->len; + sch->q.qlen++; + return 0; + } + return 1; +} + +static struct sk_buff * +prio_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = 0; prio < q->bands; prio++) { + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + q->qbytes -= skb->len; + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static void +prio_reset(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; priobands; prio++) + qdisc_reset(q->queues[prio]); + q->qbytes = 0; +} + +static void +prio_destroy(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; priobands; prio++) { + qdisc_destroy(q->queues[prio]); + q->queues[prio] = &noop_qdisc; + } +} + +static 
int prio_init(struct Qdisc *sch, void *arg) +{ + const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; + struct prio_sched_data *q; + int i; + + q = (struct prio_sched_data *)sch->data; + q->bands = 3; + memcpy(q->prio2band, prio2band, sizeof(prio2band)); + for (i=0; ibands; i++) + q->queues[i] = &noop_qdisc; + return 0; +} + +struct Qdisc_ops prio_ops = +{ + NULL, + "prio", + 0, + sizeof(struct prio_sched_data), + prio_enqueue, + prio_dequeue, + prio_reset, + prio_destroy, + prio_init, +}; + +#ifdef MODULE +#include +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&prio_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 000000000000..fd3ee43ac79d --- /dev/null +++ b/net/sched/sch_red.c @@ -0,0 +1,303 @@ +/* + * net/sched/sch_red.c Random Early Detection scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + written down in Fig.17 of the paper. + +Short description. 
+------------------ + + When new packet arrives we calculate average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is filter time constant (choosen as 2^(-Wlog)), controlling + inertia of algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1!). + + NB. SF&VJ assumed that Pb[avg] is linear function. I think it + is wrong. I'd make: + P[th_min] = 0, P[th_max] = 1; + dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number. + + I choose max_P as a number between 0.01 and 0.1, so that + C1 = max_P/(th_max-th_min) is power of two: C1 = 2^(-C1log) + + Parameters, settable by user (with default values): + + qmaxbytes=256K - hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect algorithm behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be achieved + if RED works correctly. + qth_min=32K + qth_max=128K - qth_max should be at least 2*qth_min + Wlog=8 - log(1/W). + Alog=Wlog - fixed point position in th_min and th_max. + Rlog=10 + C1log=24 - C1log = trueC1log+Alog-Rlog + so that trueC1log=22 and max_P~0.02 + + +NOTES: + +Upper bound on W. +----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 -th_min/S < (1-(1-W)^L)/W + + For th_min/S = 32 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc. 
+ */ + +struct red_sched_data +{ +/* Parameters */ + unsigned long qmaxbytes; /* HARD maximal queue length */ + unsigned long qth_min; /* Min average length threshold: A scaled */ + unsigned long qth_max; /* Max average length threshold: A scaled */ + char Alog; /* Point position in average lengths */ + char Wlog; /* log(W) */ + char Rlog; /* random number bits */ + char C1log; /* log(1/C1) */ + char Slog; + char Stab[256]; + +/* Variables */ + unsigned long qbytes; /* Queue length in bytes */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + unsigned qR; /* Cached random number [0..1data; + + psched_time_t now; + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + PSCHED_SET_PASTPERFECT(q->qidlestart); + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<Slog)-1, 0); + +/* It is wrong, but I do not think that SF+VJ proposal is reasonable + and did not invented anything more clever 8) + + The problem: ideally, average length queue recalcultion should + be done over constant clock intervals. It is too expensive, so that + calculation is driven by outgoing packets. + When queue is idle we have to model this clock by hands. + + SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size + dummy packets as burst after idle time, i.e. + + q->qave *= (1-W)^m + + It is apparently overcomplicated solution (f.e. we have to precompute + a table to make this calculation for reasonable time) + I believe, that a simpler model may be used here, + but it is field for experiments. 
+*/ + q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF]; + } + + q->qave += ((q->qbytes<Alog) - q->qave) >> q->Wlog; + + if (q->qave < q->qth_min) { +enqueue: + q->qcount = -1; + if (q->qbytes <= q->qmaxbytes) { + skb_queue_tail(&sch->q, skb); + q->qbytes += skb->len; + return 1; + } +drop: + kfree_skb(skb, FREE_WRITE); + return 0; + } + if (q->qave >= q->qth_max) { + q->qcount = -1; + goto drop; + } + q->qcount++; + if (q->qcount++) { + if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = red_random(q->Rlog); + goto drop; + } + q->qR = red_random(q->Rlog); + goto enqueue; +} + +static struct sk_buff * +red_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + skb = skb_dequeue(&sch->q); + if (skb) { + q->qbytes -= skb->len; + return skb; + } + PSCHED_GET_TIME(q->qidlestart); + return NULL; +} + +static void +red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=skb_dequeue(&sch->q))!=NULL) { + q->qbytes -= skb->len; + kfree_skb(skb,FREE_WRITE); + } + if (q->qbytes) { + printk("red_reset: qbytes=%lu\n", q->qbytes); + q->qbytes = 0; + } + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; +} + +static int red_init(struct Qdisc *sch, struct pschedctl *pctl) +{ + struct red_sched_data *q; + struct redctl *ctl = (struct redctl*)pctl->args; + + q = (struct red_sched_data *)sch->data; + + if (pctl->arglen < sizeof(struct redctl)) + return -EINVAL; + + q->Wlog = ctl->Wlog; + q->Alog = ctl->Alog; + q->Rlog = ctl->Rlog; + q->C1log = ctl->C1log; + q->Slog = ctl->Slog; + q->qth_min = ctl->qth_min; + q->qth_max = ctl->qth_max; + q->qmaxbytes = ctl->qmaxbytes; + memcpy(q->Stab, ctl->Stab, 256); + + q->qcount = -1; + PSCHED_SET_PASTPERFECT(q->qidlestart); + return 0; +} + +struct Qdisc_ops red_ops = +{ + NULL, + "red", + 0, + sizeof(struct red_sched_data), + 
red_enqueue, + red_dequeue, + red_reset, + NULL, + red_init, + NULL +}; + + +#ifdef MODULE +#include +int init_module(void) +{ + int err; + + /* Load once and never free it. */ + MOD_INC_USE_COUNT; + + err = register_qdisc(&red_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 000000000000..65c3906b47a1 --- /dev/null +++ b/net/sched/sch_sfq.c @@ -0,0 +1,333 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + It is not the thing that usually called (W)FQ nowadays. It does not + use any timestamp mechanism, but instead processes queues + in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considred as one. 
+ + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolation interactive + traffic from non-interactive. It means, that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + maximal mtu to 2^15-1; number of hash buckets to 1024. + The only goal of this restrictions was that all data + fitted to one 4K page :-). Struct sfq_sched_data is + organized in anti-cache manner: all the data for bucket + scattered over different locations. It is not good, + but it allowed to put it into 4K. + + It is easy to increase these values. +*/ + +#define SFQ_DEPTH 128 +#define SFQ_HASH_DIVISOR 1024 + +#define SFQ_HASH(a) 0 + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned char sfq_index; + +struct sfq_head +{ + sfq_index next; + sfq_index prev; +}; + +struct sfq_sched_data +{ +/* Parameters */ + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + +/* Variables */ + sfq_index tail; /* Index of current slot in round */ + sfq_index max_depth; /* Maximal depth */ + + sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index next[SFQ_DEPTH]; /* Active slots link */ + short allot[SFQ_DEPTH]; /* Current allotment per slot */ + unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ + struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ + struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ +}; + +extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d = q->qs[x].qlen; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +extern __inline__ void sfq_dec(struct 
sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + sfq_link(q, x); +} + +extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + sfq_link(q, x); +} + +static __inline__ void sfq_drop(struct sfq_sched_data *q) +{ + struct sk_buff *skb; + sfq_index d = q->max_depth; + + /* Queue is full! Find the longest slot and + drop a packet from it */ + + if (d != 1) { + sfq_index x = q->dep[d].next; + skb = q->qs[x].prev; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb, FREE_WRITE); + sfq_dec(q, x); +/* + sch->q.qlen--; + */ + return; + } + + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb, FREE_WRITE); + sfq_dec(q, d); +/* + sch->q.qlen--; + */ + q->ht[q->hash[d]] = SFQ_DEPTH; + return; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = SFQ_HASH(skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < SFQ_DEPTH-1) + return 1; + + sfq_drop(q); + return 0; +} + +static struct sk_buff * +sfq_dequeue(struct Qdisc* sch) +{ + 
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + struct sk_buff *skb; + sfq_index a, old_a; + + /* No active slots */ + if (q->tail == SFQ_DEPTH) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + sfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? */ + if (q->qs[a].qlen == 0) { + a = q->next[a]; + if (a == old_a) { + q->tail = SFQ_DEPTH; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + return skb; +} + +static void +sfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb, FREE_WRITE); +} + + +static int sfq_open(struct Qdisc *sch, void *arg) +{ + struct sfq_sched_data *q; + int i; + + q = (struct sfq_sched_data *)sch->data; + + for (i=0; iht[i] = SFQ_DEPTH; + for (i=0; iqs[i]); + q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; + q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + } + q->max_depth = 0; + q->tail = SFQ_DEPTH; + q->quantum = sch->dev->mtu; + if (sch->dev->hard_header) + q->quantum += sch->dev->hard_header_len; + for (i=0; i + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + + None. + + ALGORITHM. + + Sequence of packets satisfy token bucket filter with + rate $r$ and depth $b$, if all the numbers defined by: + \begin{eqnarray*} + n_0 &=& b, \\ + n_i &=& {\rm max} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ), + \end{eqnarray*} + where $t_i$ --- departure time of $i$-th packet and + $L_i$ -- its length, never less than zero. 
+ + It is convenient to rescale $n_i$ by factor $r$, so + that the sequence has "canonical" form: + \[ + n_0 = b/r, + n_i = max { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r }, + \] + + If a packet has n_i < 0, we throttle filter + by $-n_i$ usecs. + + NOTES. + + If TBF throttles, it starts watchdog timer, which will wake up it + after 0...10 msec. + If no new packets will arrive during this period, + or device will not be awaken by EOI for previous packet, + tbf could stop its activity for 10 msec. + + It means that tbf will sometimes introduce pathological + 10msec delays to flow corresponding to rate*10msec bytes. + For 10Mbit/sec flow it is about 12Kb, on 100Mbit/sec -- ~100Kb. + This number puts lower reasonbale bound on token bucket depth, + but even if depth is larger traffic is erratic at large rates. + + This problem is not specific for THIS implementation. Really, + there exists statement that any attempt to shape traffic + in transit will increase delays and jitter much more than + we expected naively. + + Particularily, it means that delay/jitter sensitive traffic + MUST NOT be shaped. Cf. CBQ (wrong) and CSZ (correct) approaches. +*/ + +struct tbf_sched_data +{ +/* Parameters */ + int cell_log; /* 1<= MTU/B */ + unsigned long max_bytes; /* Maximal length of backlog: bytes */ + +/* Variables */ + unsigned long bytes; /* Current length of backlog */ + unsigned long tokens; /* Current number of tokens */ + psched_time_t t_c; /* Time check-point */ + struct timer_list wd_timer; /* Watchdog timer */ +}; + +#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log]) + +static int +tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + __skb_queue_tail(&sch->q, skb); + if ((q->bytes += skb->len) <= q->max_bytes) + return 1; + + /* Drop action: undo the things that we just made, + * i.e. 
make tail drop + */ + + __skb_unlink(skb, &sch->q); + q->bytes -= skb->len; + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static void tbf_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + q->wd_timer.function = NULL; + + qdisc_wakeup(sch->dev); +} + + +static struct sk_buff * +tbf_dequeue(struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + + if (skb) { + psched_time_t now; + long toks; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0) + + q->tokens - L2T(q,skb->len); + + if (toks >= 0) { + q->t_c = now; + q->tokens = toks <= q->depth ? toks : q->depth; + q->bytes -= skb->len; + return skb; + } + + /* Maybe, we have in queue a shorter packet, + which can be sent now. It sounds cool, + but, however, wrong in principle. + We MUST NOT reorder packets in these curcumstances. + + Really, if we splitted flow to independent + subflows, it would be very good solution. + Look at sch_csz.c. 
+ */ + __skb_queue_head(&sch->q, skb); + + if (!sch->dev->tbusy) { + if (q->wd_timer.function) + del_timer(&q->wd_timer); + q->wd_timer.function = tbf_watchdog; + q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks); + add_timer(&q->wd_timer); + } + } + return NULL; +} + + +static void +tbf_reset(struct Qdisc* sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sch->q)) != NULL) + kfree_skb(skb, FREE_WRITE); + q->bytes = 0; + PSCHED_GET_TIME(q->t_c); + q->tokens = q->depth; + if (q->wd_timer.function) { + del_timer(&q->wd_timer); + q->wd_timer.function = NULL; + } +} + +static int tbf_init(struct Qdisc* sch, void *arg) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + struct tbfctl *ctl = (struct tbfctl*)arg; + + PSCHED_GET_TIME(q->t_c); + init_timer(&q->wd_timer); + q->wd_timer.function = NULL; + q->wd_timer.data = (unsigned long)sch; + if (ctl) { + q->max_bytes = ctl->bytes; + q->depth = ctl->depth; + q->tokens = q->tokens; + q->cell_log = ctl->cell_log; + memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long)); + } + return 0; +} + +struct Qdisc_ops tbf_ops = +{ + NULL, + "tbf", + 0, + sizeof(struct tbf_sched_data), + tbf_enqueue, + tbf_dequeue, + tbf_reset, + NULL, + tbf_init, + NULL, +}; + + +#ifdef MODULE +#include +int init_module(void) +{ + int err; + + /* Load once and never free it. 
*/ + MOD_INC_USE_COUNT; + + err = register_qdisc(&tbf_ops); + if (err) + MOD_DEC_USE_COUNT; + return err; +} + +void cleanup_module(void) +{ +} +#endif diff --git a/net/socket.c b/net/socket.c index ee19a84f2c6b..697a06cd3b49 100644 --- a/net/socket.c +++ b/net/socket.c @@ -74,18 +74,16 @@ #include #include #include +#include #if defined(CONFIG_KERNELD) && defined(CONFIG_NET) #include #endif -#include - #include #include #include -#include #include #include #include @@ -103,7 +101,8 @@ static ssize_t sock_write(struct file *file, const char *buf, size_t size, loff_t *ppos); static int sock_close(struct inode *inode, struct file *file); -static unsigned int sock_poll(struct file *file, poll_table *wait); +static unsigned int sock_poll(struct file *file, + struct poll_table_struct *wait); static int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); static int sock_fasync(struct file *filp, int on); @@ -1158,8 +1157,11 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) * skbuff accounting stops it from going too far. * I hope this is correct. 
*/ - if (msg_sys.msg_controllen > sizeof(ctl) && - msg_sys.msg_controllen <= 256) + if (msg_sys.msg_controllen > 256) { + err = -EINVAL; + goto failed2; + } + if (msg_sys.msg_controllen > sizeof(ctl)) { ctl_buf = kmalloc(msg_sys.msg_controllen, GFP_KERNEL); if (ctl_buf == NULL) @@ -1176,11 +1178,11 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) msg_sys.msg_control = ctl_buf; } msg_sys.msg_flags = flags; - if (current->files->fd[fd]->f_flags & O_NONBLOCK) - msg_sys.msg_flags |= MSG_DONTWAIT; if ((sock = sockfd_lookup(fd,&err))!=NULL) { + if (current->files->fd[fd]->f_flags & O_NONBLOCK) + msg_sys.msg_flags |= MSG_DONTWAIT; err = sock_sendmsg(sock, &msg_sys, total_len); sockfd_put(sock); } @@ -1246,11 +1248,10 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) cmsg_ptr = (unsigned long)msg_sys.msg_control; msg_sys.msg_flags = 0; - if (current->files->fd[fd]->f_flags&O_NONBLOCK) - flags |= MSG_DONTWAIT; - if ((sock = sockfd_lookup(fd, &err))!=NULL) { + if (current->files->fd[fd]->f_flags&O_NONBLOCK) + flags |= MSG_DONTWAIT; err=sock_recvmsg(sock, &msg_sys, total_len, flags); if(err>=0) len=err; @@ -1392,9 +1393,10 @@ asmlinkage int sys_socketcall(int call, unsigned long *args) int sock_register(struct net_proto_family *ops) { - if (ops->family < 0 || ops->family >= NPROTO) - return -1; - + if (ops->family >= NPROTO) { + printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); + return -ENOBUFS; + } net_families[ops->family]=ops; return 0; } @@ -1450,13 +1452,6 @@ __initfunc(void sock_init(void)) sk_init(); - /* - * The netlink device handler may be needed early. - */ - -#ifdef CONFIG_NETLINK - init_netlink(); -#endif /* * Wan router layer. @@ -1479,6 +1474,17 @@ __initfunc(void sock_init(void)) */ proto_init(); + + /* + * The netlink device handler may be needed early. 
+ */ + +#ifdef CONFIG_RTNETLINK + rtnetlink_init(); +#endif +#ifdef CONFIG_NETLINK_DEV + init_netlink(); +#endif } int socket_get_info(char *buffer, char **start, off_t offset, int length) diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index b5495df9379c..73f805f40608 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -73,10 +73,10 @@ EXPORT_SYMBOL(svc_wake_up); /* RPC statistics */ #ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(rpc_proc_init); EXPORT_SYMBOL(rpc_proc_register); EXPORT_SYMBOL(rpc_register_sysctl); EXPORT_SYMBOL(rpc_proc_unregister); -EXPORT_SYMBOL(rpc_proc_init); EXPORT_SYMBOL(rpc_proc_read); EXPORT_SYMBOL(svc_proc_register); EXPORT_SYMBOL(svc_proc_unregister); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 1acd01749d29..5f5e8593e80f 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -24,7 +24,11 @@ extern ctl_table ipv4_table[]; extern ctl_table ipx_table[]; #endif -extern ctl_table core_table[], unix_table[]; +extern ctl_table core_table[]; + +#ifdef CONFIG_UNIX +extern ctl_table unix_table[]; +#endif #ifdef CONFIG_NET extern ctl_table ether_table[], e802_table[]; @@ -44,7 +48,9 @@ extern ctl_table tr_table[]; ctl_table net_table[] = { {NET_CORE, "core", NULL, 0, 0555, core_table}, +#ifdef CONFIG_UNIX {NET_UNIX, "unix", NULL, 0, 0555, unix_table}, +#endif #ifdef CONFIG_NET {NET_802, "802", NULL, 0, 0555, e802_table}, {NET_ETHER, "ethernet", NULL, 0, 0555, ether_table}, diff --git a/net/unix/Makefile b/net/unix/Makefile index afce06790d92..f0bebfae360b 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -9,6 +9,7 @@ O_TARGET := unix.o O_OBJS := af_unix.o garbage.o +M_OBJS := $(O_TARGET) ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_unix.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 936d61220173..09a517f68a03 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -26,6 +26,7 @@ * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting 
+ * Kirk Petersen : Made this a module * * Known differences from reference BSD that was tested: * @@ -57,6 +58,7 @@ * with BSD names. */ +#include #include #include #include @@ -310,6 +312,9 @@ static void unix_destroy_socket(unix_socket *sk) sk->dead=1; unix_delayed_delete(sk); /* Try every so often until buffers are all freed */ } + + /* socket destroyed, decrement count */ + MOD_DEC_USE_COUNT; } static int unix_listen(struct socket *sock, int backlog) @@ -373,6 +378,10 @@ static int unix_create(struct socket *sock, int protocol) sk->mtu=4096; sk->protinfo.af_unix.list=&unix_sockets_unbound; unix_insert_socket(sk); + + /* socket created, increment count */ + MOD_INC_USE_COUNT; + return 0; } @@ -1465,7 +1474,14 @@ struct net_proto_family unix_family_ops = { unix_create }; +#ifdef MODULE +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); + +int init_module(void) +#else __initfunc(void unix_proto_init(struct net_proto *pro)) +#endif { struct sk_buff *dummy_skb; struct proc_dir_entry *ent; @@ -1474,14 +1490,33 @@ __initfunc(void unix_proto_init(struct net_proto *pro)) if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) { printk(KERN_CRIT "unix_proto_init: panic\n"); +#ifdef MODULE + return -1; +#else return; +#endif } sock_register(&unix_family_ops); #ifdef CONFIG_PROC_FS ent = create_proc_entry("net/unix", 0, 0); ent->read_proc = unix_read_proc; #endif + +#ifdef MODULE + unix_sysctl_register(); + + return 0; +#endif } + +#ifdef MODULE +void cleanup_module(void) +{ + sock_unregister(AF_UNIX); + unix_sysctl_unregister(); +} +#endif + /* * Local variables: * compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c" diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index f487ae95adbd..d492e8e2b6c6 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -29,4 +29,31 @@ ctl_table unix_table[] = { &proc_dointvec_jiffies}, {0} }; -#endif + +#ifdef MODULE +static 
struct ctl_table_header * unix_sysctl_header; +static struct ctl_table unix_root_table[]; +static struct ctl_table unix_net_table[]; + +ctl_table unix_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, unix_net_table}, + {0} +}; + +ctl_table unix_net_table[] = { + {NET_UNIX, "unix", NULL, 0, 0555, unix_table}, + {0} +}; + +void unix_sysctl_register(void) +{ + unix_sysctl_header = register_sysctl_table(unix_root_table, 0); +} + +void unix_sysctl_unregister(void) +{ + unregister_sysctl_table(unix_sysctl_header); +} +#endif /* MODULE */ + +#endif /* CONFIG_SYSCTL */ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 2970a82b9de1..bc473e317ae7 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -440,6 +440,7 @@ static struct sock *x25_alloc_socket(void) sock_init_data(NULL, sk); + skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index e4cd99ae7ba4..c8ffb33ef2da 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -177,7 +177,6 @@ void x25_establish_link(struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; @@ -208,7 +207,6 @@ void x25_terminate_link(struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; @@ -236,7 +234,6 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 96b459a4ee19..1c4cb3bc71f6 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -174,6 +174,7 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vr = 0; sk->protinfo.x25->va = 0; sk->protinfo.x25->vl = 0; + x25_requeue_frames(sk); break; 
case X25_CLEAR_REQUEST: @@ -199,11 +200,9 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vl = 0; sk->protinfo.x25->state = X25_STATE_4; } else { - if (sk->protinfo.x25->condition & X25_COND_PEER_RX_BUSY) { - sk->protinfo.x25->va = nr; - } else { - x25_check_iframes_acked(sk, nr); - } + x25_frames_acked(sk, nr); + if (frametype == X25_RNR) + x25_requeue_frames(sk); } break; @@ -221,11 +220,7 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->state = X25_STATE_4; break; } - if (sk->protinfo.x25->condition & X25_COND_PEER_RX_BUSY) { - sk->protinfo.x25->va = nr; - } else { - x25_check_iframes_acked(sk, nr); - } + x25_frames_acked(sk, nr); if (sk->protinfo.x25->condition & X25_COND_OWN_RX_BUSY) break; if (ns == sk->protinfo.x25->vr) { @@ -298,6 +293,7 @@ static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vs = 0; sk->protinfo.x25->vl = 0; sk->protinfo.x25->state = X25_STATE_3; + x25_requeue_frames(sk); break; case X25_CLEAR_REQUEST: diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c index aa8fc2c1bf4c..5283092a110c 100644 --- a/net/x25/x25_out.c +++ b/net/x25/x25_out.c @@ -126,8 +126,8 @@ static void x25_send_iframe(struct sock *sk, struct sk_buff *skb) void x25_kick(struct sock *sk) { - struct sk_buff *skb; - unsigned short end; + struct sk_buff *skb, *skbn; + unsigned short start, end; int modulus; if (sk->protinfo.x25->state != X25_STATE_3) @@ -149,11 +149,15 @@ void x25_kick(struct sock *sk) return; modulus = (sk->protinfo.x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS; + + start = (skb_peek(&sk->protinfo.x25->ack_queue) == NULL) ? 
sk->protinfo.x25->va : sk->protinfo.x25->vs; end = (sk->protinfo.x25->va + sk->protinfo.x25->facilities.winsize_out) % modulus; - if (sk->protinfo.x25->vs == end) + if (start == end) return; + sk->protinfo.x25->vs = start; + /* * Transmit data until either we're out of data to send or * the window is full. @@ -162,13 +166,25 @@ void x25_kick(struct sock *sk) skb = skb_dequeue(&sk->write_queue); do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + /* - * Transmit the frame. + * Transmit the frame copy. */ - x25_send_iframe(sk, skb); + x25_send_iframe(sk, skbn); sk->protinfo.x25->vs = (sk->protinfo.x25->vs + 1) % modulus; + /* + * Requeue the original data frame. + */ + skb_queue_tail(&sk->protinfo.x25->ack_queue, skb); + } while (sk->protinfo.x25->vs != end && (skb = skb_dequeue(&sk->write_queue)) != NULL); sk->protinfo.x25->vl = sk->protinfo.x25->vr; @@ -195,15 +211,4 @@ void x25_enquiry_response(struct sock *sk) x25_stop_timer(sk); } -void x25_check_iframes_acked(struct sock *sk, unsigned short nr) -{ - if (sk->protinfo.x25->vs == nr) { - sk->protinfo.x25->va = nr; - } else { - if (sk->protinfo.x25->va != nr) { - sk->protinfo.x25->va = nr; - } - } -} - #endif diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index f2aff6d12b25..52e5be0cb6ab 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -50,6 +50,9 @@ void x25_clear_queues(struct sock *sk) while ((skb = skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb, FREE_WRITE); + while ((skb = skb_dequeue(&sk->protinfo.x25->ack_queue)) != NULL) + kfree_skb(skb, FREE_WRITE); + while ((skb = skb_dequeue(&sk->protinfo.x25->interrupt_in_queue)) != NULL) kfree_skb(skb, FREE_READ); @@ -60,6 +63,49 @@ void x25_clear_queues(struct sock *sk) kfree_skb(skb, FREE_READ); } + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. 
This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. +*/ +void x25_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + int modulus; + + modulus = (sk->protinfo.x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (sk->protinfo.x25->va != nr) { + while (skb_peek(&sk->protinfo.x25->ack_queue) != NULL && sk->protinfo.x25->va != nr) { + skb = skb_dequeue(&sk->protinfo.x25->ack_queue); + kfree_skb(skb, FREE_WRITE); + sk->protinfo.x25->va = (sk->protinfo.x25->va + 1) % modulus; + } + } +} + +void x25_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by x25_kick. This arrangement handles the possibility of an empty + * output queue. + */ + while ((skb = skb_dequeue(&sk->protinfo.x25->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + /* * Validate that the value of nr is between va and vs. Return true or * false for testing. -- 2.39.5