From dad12642374e31d275d01f5e94aac11769dc4392 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:34:58 -0500 Subject: [PATCH] Import 2.3.99pre9-4 --- Documentation/DocBook/Makefile | 2 +- Documentation/DocBook/kernel-hacking.tmpl | 1316 +++++++++++++++ Documentation/DocBook/kernel-locking.tmpl | 1221 ++++++++++++++ Documentation/DocBook/parportbook.tmpl | 2 +- Documentation/kbuild/config-language.txt | 2 +- Documentation/kernel-doc-nano-HOWTO.txt | 128 ++ arch/ppc/mbxboot/embed_config.c | 2 +- arch/sh/kernel/io_se.c | 2 - arch/sh/kernel/irq.c | 1 + arch/sparc/config.in | 7 +- arch/sparc/kernel/Makefile | 4 +- arch/sparc/kernel/entry.S | 2 +- arch/sparc/kernel/head.S | 2 +- arch/sparc/kernel/irq.c | 2 +- arch/sparc/kernel/process.c | 2 +- arch/sparc/kernel/setup.c | 2 +- arch/sparc/kernel/signal.c | 2 +- arch/sparc/kernel/sparc_ksyms.c | 2 +- arch/sparc/kernel/sun4d_irq.c | 2 +- arch/sparc/kernel/sys_sunos.c | 2 +- arch/sparc/kernel/time.c | 2 +- arch/sparc/kernel/traps.c | 2 +- arch/sparc/lib/rwsem.S | 2 +- arch/sparc/mm/btfixup.c | 2 +- arch/sparc/mm/hypersparc.S | 2 +- arch/sparc/mm/init.c | 2 +- arch/sparc/mm/srmmu.c | 2 +- arch/sparc/mm/sun4c.c | 2 +- arch/sparc/mm/swift.S | 2 +- arch/sparc/mm/tsunami.S | 2 +- arch/sparc/mm/viking.S | 2 +- arch/sparc64/config.in | 7 +- arch/sparc64/kernel/Makefile | 2 +- arch/sparc64/kernel/head.S | 2 +- arch/sparc64/kernel/ioctl32.c | 2 +- arch/sparc64/kernel/irq.c | 2 +- arch/sparc64/kernel/process.c | 2 +- arch/sparc64/kernel/setup.c | 2 +- arch/sparc64/kernel/sparc64_ksyms.c | 2 +- arch/sparc64/kernel/sys_sparc32.c | 2 +- arch/sparc64/kernel/sys_sunos32.c | 2 +- arch/sparc64/kernel/systbls.S | 6 +- arch/sparc64/kernel/time.c | 2 +- arch/sparc64/kernel/traps.c | 2 +- arch/sparc64/kernel/ttable.S | 2 +- arch/sparc64/lib/debuglocks.c | 2 +- arch/sparc64/mm/init.c | 2 +- arch/sparc64/mm/ultra.S | 2 +- arch/sparc64/prom/misc.c | 2 +- arch/sparc64/solaris/misc.c | 2 +- drivers/block/floppy.c | 10 +- 
drivers/char/Config.in | 2 +- drivers/char/bttv.c | 350 ++-- drivers/char/bttv.h | 2 + drivers/char/lp.c | 8 +- drivers/char/rio/rio_linux.h | 1 + drivers/char/rio/rioboot.c | 1 - drivers/char/rio/riocmd.c | 1 - drivers/char/rio/rioctrl.c | 1 - drivers/char/rio/riointr.c | 1 - drivers/char/rio/rioparam.c | 1 - drivers/char/rio/rioroute.c | 1 - drivers/char/rio/riotable.c | 1 - drivers/char/rio/riotty.c | 1 - drivers/char/videodev.c | 6 +- drivers/isdn/avmb1/kcapi.c | 2 +- drivers/net/hp100.c | 16 +- drivers/net/pppoe.c | 83 +- drivers/net/wan/comx-hw-comx.c | 5 - drivers/net/wan/comx-hw-locomx.c | 5 - drivers/net/wan/comx-hw-mixcom.c | 5 - drivers/net/wan/comx-proto-fr.c | 5 - drivers/net/wan/comx-proto-lapb.c | 5 - drivers/net/wan/comx.c | 5 - drivers/net/wavelan.c | 1 + drivers/net/wavelan.p.h | 2 + drivers/parport/share.c | 33 +- drivers/s390/block/dasd_proc.c | 58 +- drivers/scsi/BusLogic.c | 3 +- drivers/scsi/aha152x.c | 31 +- drivers/scsi/megaraid.c | 2 +- drivers/scsi/megaraid.h | 1 + drivers/sound/ac97_codec.c | 6 + drivers/sound/cmpci.c | 2 +- drivers/sound/trident.c | 29 +- drivers/video/riva/fbdev.c | 2 +- fs/coda/cache.c | 21 +- fs/dcache.c | 20 + fs/exec.c | 67 +- fs/nfs/dir.c | 85 +- fs/nfs/inode.c | 20 +- fs/nfsd/nfscache.c | 7 +- fs/openpromfs/inode.c | 2 +- fs/pipe.c | 2 +- fs/proc/base.c | 61 +- fs/proc/generic.c | 6 +- fs/proc/proc_devtree.c | 1 - fs/proc/root.c | 29 - fs/super.c | 38 +- fs/udf/namei.c | 28 +- include/asm-i386/uaccess.h | 22 +- include/asm-ppc/bitops.h | 1 + include/asm-sparc/bitops.h | 2 +- include/asm-sparc/ide.h | 2 +- include/asm-sparc/irq.h | 2 +- include/asm-sparc/pgalloc.h | 2 +- include/asm-sparc/system.h | 2 +- include/asm-sparc/winmacro.h | 2 +- include/asm-sparc64/delay.h | 2 +- include/asm-sparc64/ide.h | 2 +- include/asm-sparc64/irq.h | 2 +- include/asm-sparc64/oplib.h | 2 +- include/asm-sparc64/processor.h | 2 +- include/asm-sparc64/system.h | 2 +- include/asm-sparc64/timer.h | 2 +- include/asm-sparc64/unistd.h | 6 
+- include/linux/coda.h | 6 +- include/linux/dcache.h | 1 + include/linux/if_pppox.h | 5 +- include/linux/lvm.h | 6 +- include/linux/mount.h | 1 + include/linux/netfilter_ipv6.h | 10 + include/linux/netfilter_ipv6/ip6_tables.h | 452 ++++++ include/linux/proc_fs.h | 3 - include/linux/vmalloc.h | 49 +- init/main.c | 3 +- kernel/ksyms.c | 1 + mm/filemap.c | 13 +- mm/vmalloc.c | 18 +- mm/vmscan.c | 28 +- net/Makefile | 9 +- net/core/skbuff.c | 2 +- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 4 +- net/ipv6/Config.in | 4 + net/ipv6/netfilter/Config.in | 49 + net/ipv6/netfilter/Makefile | 180 +++ net/ipv6/netfilter/ip6_tables.c | 1795 +++++++++++++++++++++ net/ipv6/netfilter/ip6t_MARK.c | 66 + net/ipv6/netfilter/ip6t_limit.c | 135 ++ net/ipv6/netfilter/ip6t_mark.c | 50 + net/ipv6/netfilter/ip6table_filter.c | 183 +++ net/unix/af_unix.c | 2 +- scripts/header.tk | 106 +- scripts/tail.tk | 10 + scripts/tkgen.c | 31 +- scripts/tkparse.c | 4 +- 147 files changed, 6430 insertions(+), 702 deletions(-) create mode 100644 Documentation/DocBook/kernel-hacking.tmpl create mode 100644 Documentation/DocBook/kernel-locking.tmpl create mode 100644 Documentation/kernel-doc-nano-HOWTO.txt create mode 100644 include/linux/netfilter_ipv6/ip6_tables.h create mode 100644 net/ipv6/netfilter/Config.in create mode 100644 net/ipv6/netfilter/Makefile create mode 100644 net/ipv6/netfilter/ip6_tables.c create mode 100644 net/ipv6/netfilter/ip6t_MARK.c create mode 100644 net/ipv6/netfilter/ip6t_limit.c create mode 100644 net/ipv6/netfilter/ip6t_mark.c create mode 100644 net/ipv6/netfilter/ip6table_filter.c diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index b6cdf0567d9d..8a40bfade13a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -1,4 +1,4 @@ -BOOKS := wanbook.sgml z8530book.sgml mcabook.sgml videobook.sgml kernel-api.sgml parportbook.sgml +BOOKS := wanbook.sgml z8530book.sgml mcabook.sgml videobook.sgml kernel-api.sgml 
parportbook.sgml kernel-hacking.sgml kernel-locking.sgml PS := $(patsubst %.sgml, %.ps, $(BOOKS)) PDF := $(patsubst %.sgml, %.pdf, $(BOOKS)) diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl new file mode 100644 index 000000000000..8d87787ad673 --- /dev/null +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -0,0 +1,1316 @@ + + + + + Unreliable Guide To Hacking The Linux Kernel + + + + Paul + Rusty + Russell + +
+ rusty@linuxcare.com +
+
+
+
+ + + 2000 + Paul Russell + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + This is the first release of this document as part of the kernel tarball. + + +
+ + + + + Introduction + + Welcome, gentle reader, to Rusty's Unreliable Guide to Linux + Kernel Hacking. This document describes the common routines and + general requirements for kernel code: its goal is to serve as a + primer for Linux kernel development for experienced C + programmers. I avoid implementation details: that's what the + code is for, and I ignore whole tracts of useful routines. + + + Before you read this, please understand that I never wanted to + write this document, being grossly under-qualified, but I always + wanted to read it, and this was the only way. I hope it will + grow into a compendium of best practice, common starting points + and random information. + + + + + The Players + + + At any time each of the CPUs in a system can be: + + + + + + not associated with any process, serving a hardware interrupt; + + + + + + not associated with any process, serving a softirq, tasklet or bh; + + + + + + running in kernel space, associated with a process; + + + + + + running a process in user space. + + + + + + There is a strict ordering between these: other than the last + category (userspace) each can only be pre-empted by those above. + For example, while a softirq is running on a CPU, no other + softirq will pre-empt it, but a hardware interrupt can. However, + any other CPUs in the system execute independently. + + + + We'll see a number of ways that the user context can block + interrupts, to become truly non-preemptable. + + + + User Context + + + User context is when you are coming in from a system call or + other trap: you can sleep, and you own the CPU (except for + interrupts) until you call schedule(). + In other words, user context (unlike userspace) is not pre-emptable. + + + + + You are always in user context on module load and unload, + and on operations on the block device layer. 
+ + + + + In user context, the current pointer (indicating + the task we are currently executing) is valid, and + in_interrupt() + (include/asm/hardirq.h) is false + . + + + + + Beware that if you have interrupts or bottom halves disabled + (see below), in_interrupt() will return a + false positive. + + + + + + Hardware Interrupts (Hard IRQs) + + + Timer ticks, network cards and + keyboard are examples of real + hardware which produce interrupts at any time. The kernel runs + interrupt handlers, which services the hardware. The kernel + guarantees that this handler is never re-entered: if another + interrupt arrives, it is queued (or dropped). Because it + disables interrupts, this handler has to be fast: frequently it + simply acknowledges the interrupt, marks a `software interrupt' + for execution and exits. + + + + You can tell you are in a hardware interrupt, because + in_irq() returns true. + + + + Beware that this will return a false positive if interrupts are disabled + (see below). + + + + + + Software Interrupt Context: Bottom Halves, Tasklets, softirqs + + + Whenever a system call is about to return to userspace, or a + hardware interrupt handler exits, any `software interrupts' + which are marked pending (usually by hardware interrupts) are + run (kernel/softirq.c). + + + + Much of the real interrupt handling work is done here. Early in + the transition to SMP, there were only `bottom + halves' (BHs), which didn't take advantage of multiple CPUs. Shortly + after we switched from wind-up computers made of match-sticks and snot, + we abandoned this limitation. + + + + include/linux/interrupt.h lists the + different BH's. No matter how many CPUs you have, no two BHs will run at + the same time. This made the transition to SMP simpler, but sucks hard for + scalable performance. A very important bottom half is the timer + BH (include/linux/timer.h): you + can register to have it call functions for you in a given length of time. 
+ + + + 2.3.43 introduced softirqs, and re-implemented the (now + deprecated) BHs underneath them. Softirqs are fully-SMP + versions of BHs: they can run on as many CPUs at once as + required. This means they need to deal with any races in shared + data using their own locks. A bitmask is used to keep track of + which are enabled, so the 32 available softirqs should not be + used up lightly. (Yes, people will + notice). + + + + tasklets (include/linux/interrupt.h) + are like softirqs, except they are dynamically-registrable (meaning you + can have as many as you want), and they also guarantee that any tasklet + will only run on one CPU at any time, although different tasklets can + run simultaneously (unlike different BHs). + + + + The name `tasklet' is misleading: they have nothing to do with `tasks', + and probably more to do with some bad vodka Alexey Kuznetsov had at the + time. + + + + + You can tell you are in a softirq (or bottom half, or tasklet) + using the in_softirq() macro + (include/asm/softirq.h). + + + + Beware that this will return a false positive if a bh lock (see below) + is held. + + + + + + + Some Basic Rules + + + + No memory protection + + + If you corrupt memory, whether in user context or + interrupt context, the whole machine will crash. Are you + sure you can't do what you want in userspace? + + + + + + No floating point or MMX + + + The FPU context is not saved; even in user + context the FPU state probably won't + correspond with the current process: you would mess with some + user process' FPU state. If you really want + to do this, you would have to explicitly save/restore the full + FPU state (and avoid context switches). It + is generally a bad idea; use fixed point arithmetic first. + + + + + + A rigid stack limit + + + The kernel stack is about 6K in 2.2 (for most + architectures: it's about 14K on the Alpha), and shared + with interrupts so you can't use it all. 
Avoid deep + recursion and huge local arrays on the stack (allocate + them dynamically instead). + + + + + + The Linux kernel is portable + + + Let's keep it that way. Your code should be 64-bit clean, + and endian-independent. You should also minimize CPU + specific stuff, e.g. inline assembly should be cleanly + encapsulated and minimized to ease porting. Generally it + should be restricted to the architecture-dependent part of + the kernel tree. + + + + + + + + ioctls: Not writing a new system call + + + A system call generally looks like this + + + +asmlinkage int sys_mycall(int arg) +{ + return 0; +} + + + + First, in most cases you don't want to create a new system call. + You create a character device and implement an appropriate ioctl + for it. This is much more flexible than system calls, doesn't have + to be entered in every architecture's + include/asm/unistd.h and + arch/kernel/entry.S file, and is much more + likely to be accepted by Linus. + + + + Inside the ioctl you're in user context to a process. When an + error occurs you return a negated errno (see + include/linux/errno.h), + otherwise you return 0. + + + + After you have slept you should check if a signal occurred: the + Unix/Linux way of handling signals is to temporarily exit the + system call with the -ERESTARTSYS error. The + system call entry code will switch back to user context, process + the signal handler and then your system call will be restarted + (unless the user disabled that). So you should be prepared to + process the restart, e.g. if you're in the middle of manipulating + some data structure. + + + +if (signal_pending(current)) + return -ERESTARTSYS; + + + + If you're doing longer computations: first think userspace. If you + really want to do it in kernel you should + regularly check if you need to give up the CPU (remember there is + cooperative multitasking per CPU). 
Idiom: + + + +if (current->need_resched) + schedule(); /* Will sleep */ + + + + A short note on interface design: the UNIX system call motto is + "Provide mechanism not policy". + + + + + Recipes for Deadlock + + + You cannot call any routines which may sleep, unless: + + + + + You are in user context. + + + + + + You do not own any spinlocks. + + + + + + You have interrupts enabled (actually, Andi Kleen says + that the scheduling code will enable them for you, but + that's probably not what you wanted). + + + + + + Note that some functions may sleep implicitly: common ones are + the user space access functions (*_user) and memory allocation + functions without GFP_ATOMIC. + + + + You will eventually lock up your box if you break these rules. + + + + Really. + + + + + Common Routines + + + + <function>printk()</function> + <filename class=headerfile>include/linux/kernel.h</filename> + + + + printk() feeds kernel messages to the + console, dmesg, and the syslog daemon. It is useful for debugging + and reporting errors, and can be used inside interrupt context, + but use with caution: a machine which has its console flooded with + printk messages is unusable. It uses a format string mostly + compatible with ANSI C printf, and C string concatenation to give + it a first "priority" argument: + + + +printk(KERN_INFO "i = %u\n", i); + + + + See include/linux/kernel.h; + for other KERN_ values; these are interpreted by syslog as the + level. Special case: for printing an IP address use + + + +__u32 ipaddress; +printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress)); + + + + printk() internally uses a 1K buffer and does + not catch overruns. Make sure that will be enough. + + + + + You will know when you are a real kernel hacker + when you start typoing printf as printk in your user programs :) + + + + + + + + Another sidenote: the original Unix Version 6 sources had a + comment on top of its printf function: "Printf should not be + used for chit-chat". 
You should follow that advice. + + + + + + + <function>copy_[to/from]_user()</function> + / + <function>get_user()</function> + / + <function>put_user()</function> + <filename class=headerfile>include/asm/uaccess.h</filename> + + + + [SLEEPS] + + + + put_user() and get_user() + are used to get and put single values (such as an int, char, or + long) from and to userspace. A pointer into userspace should + never be simply dereferenced: data should be copied using these + routines. Both return -EFAULT or 0. + + + copy_to_user() and + copy_from_user() are more general: they copy + an arbitrary amount of data to and from userspace. + + + Unlike put_user() and + get_user(), they return the amount of + uncopied data (ie. 0 still means + success). + + + [Yes, this moronic interface makes me cringe. Please submit a + patch and become my hero --RR.] + + + The functions may sleep implicitly. This should never be called + outside user context (it makes no sense), with interrupts + disabled, or a spinlock held. + + + + + <function>kmalloc()</function>/<function>kfree()</function> + <filename class=headerfile>include/linux/slab.h</filename> + + + [MAY SLEEP: SEE BELOW] + + + + These routines are used to dynamically request pointer-aligned + chunks of memory, like malloc and free do in userspace, but + kmalloc() takes an extra flag word. + Important values: + + + + + + + GFP_KERNEL + + + + + May sleep and swap to free memory. Only allowed in user + context, but is the most reliable way to allocate memory. + + + + + + + + GFP_ATOMIC + + + + + Don't sleep. Less reliable than GFP_KERNEL, + but may be called from interrupt context. You should + really have a good out-of-memory + error-handling strategy. + + + + + + + + GFP_DMA + + + + + Allocate ISA DMA lower than 16MB. If you don't know what that + is you don't need it. Very unreliable. 
+ + + + + + + If you see a kmem_grow: Called nonatomically from int + warning message you called a memory allocation function + from interrupt context without GFP_ATOMIC. + You should really fix that. Run, don't walk. + + + + If you are allocating at least PAGE_SIZE + (include/asm/page.h) bytes, + consider using __get_free_pages() + + (include/linux/mm.h). It + takes an order argument (0 for page sized, 1 for double page, 2 + for four pages etc.) and the same memory priority flag word as + above. + + + + If you are allocating more than a page worth of bytes you can use + vmalloc(). It'll allocate virtual memory in + the kernel map. This block is not contiguous in physical memory, + but the MMU makes it look like it is for you + (so it'll only look contiguous to the CPUs, not to external device + drivers). If you really need large physically contiguous memory + for some weird device, you have a problem: it is poorly supported + in Linux because after some time memory fragmentation in a running + kernel makes it hard. The best way is to allocate the block early + in the boot process. + + + + Before inventing your own cache of often-used objects consider + using a slab cache in + include/linux/slab.h + + + + + <function>current</function> + <filename class=headerfile>include/asm/current.h</filename> + + + This global variable (really a macro) contains a pointer to + the current task structure, so is only valid in user context. + For example, when a process makes a system call, this will + point to the task structure of the calling process. It is + not NULL in interrupt context. + + + + + <function>local_irq_save()</function>/<function>local_irq_restore()</function> + <filename class=headerfile>include/asm/system.h</filename> + + + + These routines disable hard interrupts on the local CPU, and + restore them. They are reentrant; saving the previous state in + their one unsigned long flags argument. 
If you + know that interrupts are enabled, you can simply use + local_irq_disable() and + local_irq_enable(). + + + + + <function>local_bh_disable()</function>/<function>local_bh_enable()</function> + <filename class=headerfile>include/asm/softirq.h</filename> + + + These routines disable soft interrupts on the local CPU, and + restore them. They are reentrant; if soft interrupts were + disabled before, they will still be disabled after this pair + of functions has been called. They prevent softirqs, tasklets + and bottom halves from running on the current CPU. + + + + + <function>smp_processor_id</function>()/<function>cpu_[number/logical]_map()</function> + <filename class=headerfile>include/asm/smp.h</filename> + + + smp_processor_id() returns the current + processor number, between 0 and NR_CPUS (the + maximum number of CPUs supported by Linux, currently 32). These + values are not necessarily continuous: to get a number between 0 + and smp_num_cpus() (the number of actual + processors in this machine), the + cpu_number_map() function is used to map the + processor id to a logical number. + cpu_logical_map() does the reverse. + + + + + <type>__init</type>/<type>__exit</type>/<type>__initdata</type> + <filename class=headerfile>include/linux/init.h</filename> + + + After boot, the kernel frees up a special section; functions + marked with __init and data structures marked with + __initdata are dropped after boot is complete (within + modules this directive is currently ignored). __exit + is used to declare a function which is only required on exit: the + function will be dropped if this file is not compiled as a module. + See the header file for use. + + + + + + <function>__initcall()</function>/<function>module_init()</function> + <filename class=headerfile>include/linux/init.h</filename> + + Many parts of the kernel are well served as a module + (dynamically-loadable parts of the kernel). 
Using the + module_init() and + module_exit() macros it is easy to write code + without #ifdefs which can operate either as a module or built into + the kernel. + + + + The module_init() macro defines which + function is to be called at module insertion time (if the file is + compiled as a module), or at boot time: if the file is not + compiled as a module the module_init() macro + becomes equivalent to __initcall(), which + through linker magic ensures that the function is called on boot. + + + + The function can return a negative error number to cause + module loading to fail (unfortunately, this has no effect if + the module is compiled into the kernel). For modules, this is + called in user context, with interrupts enabled, and the + kernel lock held, so it can sleep. + + + + + <function>module_exit()</function> + <filename class=headerfile>include/linux/init.h</filename> + + + This macro defines the function to be called at module removal + time (or never, in the case of the file compiled into the + kernel). It will only be called if the module usage count has + reached zero. This function can also sleep, but cannot fail: + everything must be cleaned up by the time it returns. + + + + + <function>MOD_INC_USE_COUNT</function>/<function>MOD_DEC_USE_COUNT</function> + <filename class=headerfile>include/linux/module.h</filename> + + + These manipulate the module usage count, to protect against + removal (a module also can't be removed if another module uses + one of its exported symbols: see below). Every reference to + the module from user context should be reflected by this + counter (e.g. for every data structure or socket) before the + function sleeps. To quote Tim Waugh: + + + +/* THIS IS BAD */ +foo_open (...) +{ + stuff.. + if (fail) + return -EBUSY; + sleep.. (might get unloaded here) + stuff.. + MOD_INC_USE_COUNT; + return 0; +} + +/* THIS IS GOOD */ +foo_open (...) +{ + MOD_INC_USE_COUNT; + stuff.. 
+ if (fail) { + MOD_DEC_USE_COUNT; + return -EBUSY; + } + sleep.. (safe now) + stuff.. + return 0; +} + + + + + + Wait Queues + <filename class=headerfile>include/linux/wait.h</filename> + + + [SLEEPS] + + + + A wait queue is used to wait for someone to wake you up when a + certain condition is true. They must be used carefully to ensure + there is no race condition. You declare a + wait_queue_head_t, and then processes which want to + wait for that condition declare a wait_queue_t + referring to themselves, and place that in the queue. + + + + Declaring + + + You declare a wait_queue_head_t using the + DECLARE_WAIT_QUEUE_HEAD() macro, or using the + init_waitqueue_head() routine in your + initialization code. + + + + + Queuing + + + Placing yourself in the waitqueue is fairly complex, because you + must put yourself in the queue before checking the condition. + There is a macro to do this: + wait_event_interruptible() + + include/linux/sched.h The + first argument is the wait queue head, and the second is an + expression which is evaluated; the macro returns + 0 when this expression is true, or + -ERESTARTSYS if a signal is received. + The wait_event() version ignores signals. + + + + + Waking Up Queued Tasks + + + Call wake_up() + + include/linux/sched.h;, + which will wake up every process in the queue. The exception is + if one has TASK_EXCLUSIVE set, in which case + the remainder of the queue will not be woken. + + + + + + Atomic Operations + + + Certain operations are guaranteed atomic on all platforms. The + first class of operations work on atomic_t + + include/asm/atomic.h; this + contains a signed integer (at least 32 bits long), and you must use + these functions to manipulate or read atomic_t variables. + atomic_read() and + atomic_set() get and set the counter, + atomic_add(), + atomic_sub(), + atomic_inc(), + atomic_dec(), and + atomic_dec_and_test() (returns + true if it was decremented to zero). + + + + Yes. It returns true (i.e. 
!= 0) if the + atomic variable is zero. + + + + Note that these functions are slower than normal arithmetic, and + so should not be used unnecessarily. On some platforms they + are much slower, like 32-bit Sparc where they use a spinlock. + + + + The second class of atomic operations is atomic bit operations, + defined in + + include/asm/bitops.h. These + operations generally take a pointer to the bit pattern, and a bit + number: 0 is the least significant bit. + set_bit(), clear_bit() + and change_bit() set, clear, and flip the + given bit. test_and_set_bit(), + test_and_clear_bit() and + test_and_change_bit() do the same thing, + except return true if the bit was previously set; these are + particularly useful for very simple locking. + + + + It is possible to call these operations with bit indices greater + than 31. The resulting behavior is strange on big-endian + platforms though so it is a good idea not to do this. + + + + + Symbols + + + Within the kernel proper, the normal linking rules apply + (ie. unless a symbol is declared to be file scope with the + static keyword, it can be used anywhere in the + kernel). However, for modules, a special exported symbol table is + kept which limits the entry points to the kernel proper. Modules + can also export symbols. + + + + <function>EXPORT_SYMBOL()</function> + <filename class=headerfile>include/linux/module.h</filename> + + + This is the classic method of exporting a symbol, and it works + for both modules and non-modules. In the kernel all these + declarations are often bundled into a single file to help + genksyms (which searches source files for these declarations). + See the comment on genksyms and Makefiles below. + + + + + <function>EXPORT_SYMTAB</function> + + + For convenience, a module usually exports all non-file-scope + symbols (ie. all those not declared static). 
If this + is defined before + + include/linux/module.h is + included, then only symbols explicitly exported with + EXPORT_SYMBOL() will be exported. + + + + + + Routines and Conventions + + + Double-linked lists + <filename class=headerfile>include/linux/list.h</filename> + + + There are three sets of linked-list routines in the kernel + headers, but this one seems to be winning out (and Linus has + used it). If you don't have some particular pressing need for + a single list, it's a good choice. In fact, I don't care + whether it's a good choice or not, just use it so we can get + rid of the others. + + + + + Return Conventions + + + For code called in user context, it's very common to defy C + convention, and return 0 for success, + and a negative error number + (eg. -EFAULT) for failure. This can be + unintuitive at first, but it's fairly widespread in the networking + code, for example. + + + + The filesystem code uses ERR_PTR() + + (include/linux/fs.h) to + encode a negative error number into a pointer, and + IS_ERR() and PTR_ERR() + to get it back out again: avoids a separate pointer parameter for + the error number. Icky, but in a good way. + + + + + Breaking Compilation + + + Linus and the other developers sometimes change function or + structure names in development kernels; this is not done just to + keep everyone on their toes: it reflects a fundamental change + (eg. can no longer be called with interrupts on, or does extra + checks, or doesn't do checks which were caught before). Usually + this is accompanied by a fairly complete note to the linux-kernel + mailing list; search the archive. Simply doing a global replace + on the file usually makes things worse. 
+ + + + + Initializing structure members + + + The preferred method of initializing structures is to use the + gcc Labeled Elements extension, eg: + + +static struct block_device_operations opt_fops = { + open: opt_open, + release: opt_release, + ioctl: opt_ioctl, + check_media_change: opt_media_change, +}; + + + + This makes it easy to grep for, and makes it clear which + structure fields are set. You should do this because it looks + cool. + + + + + GNU Extensions + + + GNU Extensions are explicitly allowed in the Linux kernel. + Note that some of the more complex ones are not very well + supported, due to lack of general use, but the following are + considered standard (see the GCC info page section "C + Extensions" for more details - Yes, really the info page, the + man page is only a short summary of the stuff in info): + + + + + Inline functions + + + + + Statement expressions (ie. the ({ and }) constructs). + + + + + Declaring attributes of a function / variable / type + (__attribute__) + + + + + Labeled elements + + + + + typeof + + + + + Zero length arrays + + + + + Macro varargs + + + + + Arithmetic on void pointers + + + + + Non-Constant initializers + + + + + Assembler Instructions (not outside arch/ and include/asm/) + + + + + Function names as strings (__FUNCTION__) + + + + + __builtin_constant_p() + + + + + + Be wary when using long long in the kernel, the code gcc generates for + it is horrible and worse: division and multiplication does not work + on i386 because the GCC runtime functions for it are missing from + the kernel environment. + + + + + + + C++ + + + Using C++ in the kernel is usually a bad idea, because the + kernel does not provide the necessary runtime environment + and the include files are not tested for it. It is still + possible, but not recommended. If you really want to do + this, forget about exceptions at least. 
+ + + + + #if + + + It is generally considered cleaner to use macros in header files + (or at the top of .c files) to abstract away functions rather than + using `#if' pre-processor statements throughout the source code. + + + + + + Putting Your Stuff in the Kernel + + + In order to get your stuff into shape for official inclusion, or + even to make a neat patch, there's administrative work to be + done: + + + + + Figure out whose pond you've been pissing in. Look at the top of + the source files, inside the MAINTAINERS + file, and last of all in the CREDITS file. + You should coordinate with this person to make sure you're not + duplicating effort, or trying something that's already been + rejected. + + + + Make sure you put your name and EMail address at the top of + any files you create or mangle significantly. This is the + first place people will look when they find a bug, or when + they want to make a change. + + + + + + Usually you want a configuration option for your kernel hack. + Edit Config.in in the appropriate directory + (but under arch/ it's called + config.in). The Config Language used is not + bash, even though it looks like bash; the safe way is to use only + the constructs that you already see in + Config.in files (see + Documentation/kbuild/config-language.txt). + It's good to run "make xconfig" at least once to test (because + it's the only one with a static parser). + + + + Variables which can be Y or N use bool followed by a + tagline and the config define name (which must start with + CONFIG_). The tristate function is the same, but + allows the answer M (which defines + CONFIG_FOO_MODULE in your source, instead of + CONFIG_FOO) if CONFIG_MODULES + is enabled. + + + + You may well want to make your CONFIG option only visible if + CONFIG_EXPERIMENTAL is enabled: this serves as a + warning to users. There are many other fancy things you can do: see + the various Config.in files for ideas.
+ + + + + + Edit the Makefile: the CONFIG variables are + exported here so you can conditionalize compilation with `ifeq'. + If your file exports symbols then add the names to + MX_OBJS or OX_OBJS instead + of M_OBJS or O_OBJS, so + that genksyms will find them. + + + + + + Document your option in Documentation/Configure.help. Mention + incompatibilities and issues here. Definitely + end your description with if in doubt, say N + (or, occasionally, `Y'); this is for people who have no + idea what you are talking about. + + + + + + Put yourself in CREDITS if you've done + something noteworthy, usually beyond a single file (your name + should be at the top of the source files anyway). + MAINTAINERS means you want to be consulted + when changes are made to a subsystem, and hear about bugs; it + implies a more-than-passing commitment to some part of the code. + + + + + + + Kernel Cantrips + + + Some favorites from browsing the source. Feel free to add to this + list. + + + + include/linux/brlock.h: + + +extern inline void br_read_lock (enum brlock_indices idx) +{ + /* + * This causes a link-time bug message if an + * invalid index is used: + */ + if (idx >= __BR_END) + __br_lock_usage_bug(); + + read_lock(&__brlock_array[smp_processor_id()][idx]); +} + + + + include/linux/fs.h: + + +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. + */ + #define ERR_PTR(err) ((void *)((long)(err))) + #define PTR_ERR(ptr) ((long)(ptr)) + #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) + + + + include/asm-i386/uaccess.h: + + + +#define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? 
\ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + + + + arch/sparc/kernel/head.S: + + + +/* + * Sun people can't spell worth damn. "compatability" indeed. + * At least we *know* we can't spell, and use a spell-checker. + */ + +/* Uh, actually Linus it is I who cannot spell. Too much murky + * Sparc assembly will do this to ya. + */ +C_LABEL(cputypvar): + .asciz "compatability" + +/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ + .align 4 +C_LABEL(cputypvar_sun4m): + .asciz "compatible" + + + + arch/sparc/lib/checksum.S: + + + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + + + + + Thanks + + + Thanks to Andi Kleen for the idea, answering my questions, fixing + my mistakes, filling content, etc. Philipp Rumpf for more spelling + and clarity fixes, and some excellent non-obvious points. Werner + Almesberger for giving me a great summary of + disable_irq(), and Jes Sorensen and Andrea + Arcangeli added caveats. Michael Elizabeth Chastain for checking + and adding to the Configure section. Telsa Gwynne for teaching me DocBook. + + +
+ diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl new file mode 100644 index 000000000000..397e7e954401 --- /dev/null +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -0,0 +1,1221 @@ + + + + + Unreliable Guide To Locking + + + + Paul + Rusty + Russell + +
+ rusty@linuxcare.com +
+
+
+
+ + + 2000 + Paul Russell + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + Introduction + + Welcome, to Rusty's Remarkably Unreliable Guide to Kernel + Locking issues. This document describes the locking systems in + the Linux Kernel as we approach 2.4. + + + It looks like SMP + is here to stay; so everyone hacking on the kernel + these days needs to know the fundamentals of concurrency and locking + for SMP. + + + + The Problem With Concurrency + + (Skip this if you know what a Race Condition is). + + + In a normal program, you can increment a counter like so: + + + very_important_count++; + + + + This is what they would expect to happen: + + + + Expected Results + + + + + + Instance 1 + Instance 2 + + + + + + read very_important_count (5) + + + + add 1 (6) + + + + write very_important_count (6) + + + + + read very_important_count (6) + + + + add 1 (7) + + + + write very_important_count (7) + + + + +
+ + + This is what might happen: + + + + Possible Results + + + + + Instance 1 + Instance 2 + + + + + + read very_important_count (5) + + + + + read very_important_count (5) + + + add 1 (6) + + + + + add 1 (6) + + + write very_important_count (6) + + + + + write very_important_count (6) + + + +
+ + + This overlap, where what actually happens depends on the + relative timing of multiple tasks, is called a race condition. + The piece of code containing the concurrency issue is called a + critical region. And especially since Linux started running + on SMP machines, race conditions became one of the major issues in kernel + design and implementation. + + + The solution is to recognize when these simultaneous accesses + occur, and use locks to make sure that only one instance can + enter the critical region at any time. There are many + friendly primitives in the Linux kernel to help you do this. + And then there are the unfriendly primitives, but I'll pretend + they don't exist. + +
+
+ + + Two Main Types of Kernel Locks: Spinlocks and Semaphores + + + There are two main types of kernel locks. The fundamental type + is the spinlock + (include/asm/spinlock.h), + which is a very simple single-holder lock: if you can't get the + spinlock, you keep trying (spinning) until you can. Spinlocks are + very small and fast, and can be used anywhere. + + + The second type is a semaphore + (include/asm/semaphore.h): it + can have more than one holder at any time (the number decided at + initialization time), although it is most commonly used as a + single-holder lock (a mutex). If you can't get a semaphore, + your task will put itself on the queue, and be woken up when the + semaphore is released. This means the CPU will do something + else while you are waiting, but there are many cases when you + simply can't sleep, and so have to use a spinlock instead. + + + + Locks and Uniprocessor Kernels + + + For kernels compiled without CONFIG_SMP, spinlocks + do not exist at all. This is an excellent design decision: when + no-one else can run at the same time, there is no reason to + have a lock at all. + + + + You should always test your locking code with CONFIG_SMP + enabled, even if you don't have an SMP test box, because it + will still catch some (simple) kinds of deadlock. + + + + Semaphores still exist, because they are required for + synchronization between user + contexts, as we will see below. + + + + + Read/Write Lock Variants + + + Both spinlocks and semaphores have read/write variants: + rwlock_t and struct rw_semaphore. + These divide users into two classes: the readers and the writers. If + you are only reading the data, you can get a read lock, but to write to + the data you need the write lock. Many people can hold a read lock, + but a writer must be sole holder. + + + + This means much smoother locking if your code divides up + neatly along reader/writer lines. All the discussions below + also apply to read/write variants. 
+ + + + + Locking Only In User Context + + + If you have a data structure which is only ever accessed from + user context, then you can use a simple semaphore + (linux/asm/semaphore.h) to protect it. This + is the most trivial case: you initialize the semaphore to the number + of resources available (usually 1), and call + down_interruptible() to grab the semaphore, and + up() to release it. There is also a + down(), which should be avoided, because it + will not return if a signal is received. + + + + Example: linux/net/core/netfilter.c allows + registration of new setsockopt() and + getsockopt() calls, with + nf_register_sockopt(). Registration and + de-registration are only done on module load and unload (and boot + time, where there is no concurrency), and the list of registrations + is only consulted for an unknown setsockopt() + or getsockopt() system call. The + nf_sockopt_mutex is perfect to protect this, + especially since the setsockopt and getsockopt calls may well + sleep. + + + + + Locking Between User Context and BHs + + + If a bottom half shares + data with user context, you have two problems. Firstly, the current + user context can be interrupted by a bottom half, and secondly, the + critical region could be entered from another CPU. This is where + spin_lock_bh() + (include/linux/spinlock.h) is + used. It disables bottom halves on that CPU, then grabs the lock. + spin_unlock_bh() does the reverse. + + + + This works perfectly for UP + as well: the spin lock vanishes, and this macro + simply becomes local_bh_disable() + (include/asm/softirq.h), which + protects you from the bottom half being run. + + + + + Locking Between User Context and Tasklets/Soft IRQs + + + This is exactly the same as above, because + local_bh_disable() actually disables all + softirqs and tasklets + on that CPU as well. It should really be + called `local_softirq_disable()', but the name has been preserved + for historical reasons. 
Similarly, + spin_lock_bh() would now be called + spin_lock_softirq() in a perfect world. + + + + + Locking Between Bottom Halves + + + Sometimes a bottom half might want to share data with + another bottom half (especially remember that timers are run + off a bottom half). + + + + The Same BH + + + Since a bottom half is never run on two CPUs at once, you + don't need to worry about your bottom half being run twice + at once, even on SMP. + + + + + Different BHs + + + Since only one bottom half ever runs at a time, you + don't need to worry about race conditions with other bottom + halves. Beware that things might change under you, however, + if someone changes your bottom half to a tasklet. If you + want to make your code future-proof, pretend you're already + running from a tasklet (see below), and doing the extra + locking. Of course, if it's five years before that happens, + you're gonna look like a damn fool. + + + + + + Locking Between Tasklets + + + Sometimes a tasklet might want to share data with another + tasklet, or a bottom half. + + + + The Same Tasklet + + Since a tasklet is never run on two CPUs at once, you don't + need to worry about your tasklet being reentrant (running + twice at once), even on SMP. + + + + + Different Tasklets + + If another tasklet (or bottom half, such as a timer) wants + to share data with your tasklet, you will both need to use + spin_lock() and + spin_unlock() calls. + spin_lock_bh() is + unnecessary here, as you are already in a tasklet, and + none will be run on the same CPU. + + + + + + Locking Between Softirqs + + + Often a softirq might + want to share data with itself, a tasklet, or a bottom half. + + + + The Same Softirq + + + The same softirq can run on the other CPUs: you can use a + per-CPU array (see the Per-CPU Data section) for better + performance. If you're going so far as to use a softirq, + you probably care about scalable performance enough + to justify the extra complexity.
+ + + + You'll need to use spin_lock() and + spin_unlock() for shared data. + + + + + Different Softirqs + + + You'll need to use spin_lock() and + spin_unlock() for shared data, whether it + be a timer (which can be running on a different CPU), bottom half, + tasklet or the same or another softirq. + + + + + + + Hard IRQ Context + + + Hardware interrupts usually communicate with a bottom half, + tasklet or softirq. Frequently this involves putting work in a + queue, which the BH/softirq will take out. + + + + Locking Between Hard IRQ and Softirqs/Tasklets/BHs + + + If a hardware irq handler shares data with a softirq, you have + two concerns. Firstly, the softirq processing can be + interrupted by a hardware interrupt, and secondly, the + critical region could be entered by a hardware interrupt on + another CPU. This is where spin_lock_irq() is + used. It is defined to disable interrupts on that cpu, then grab + the lock. spin_unlock_irq() does the reverse. + + + + This works perfectly for UP as well: the spin lock vanishes, + and this macro simply becomes local_irq_disable() + (include/asm/smp.h), which + protects you from the softirq/tasklet/BH being run. + + + + spin_lock_irqsave() + (include/linux/spinlock.h) is a variant + which saves whether interrupts were on or off in a flags word, + which is passed to spin_unlock_irqrestore(). This + means that the same code can be used inside a hard irq handler (where + interrupts are already off) and in softirqs (where the irq + disabling is required). + + + + + + Common Techniques + + + This section lists some common dilemmas and the standard + solutions used in the Linux kernel code. If you use these, + people will find your code simpler to understand. + + + + If I could give you one piece of advice: never sleep with anyone + crazier than yourself. But if I had to give you advice on + locking: keep it simple. + + + + Lock data, not code. + + + + Be reluctant to introduce new locks.
+ + + + Strangely enough, this is the exact reverse of my advice when + you have slept with someone crazier than yourself. + + + + No Writers in Interrupt Context + + + There is a fairly common case where an interrupt handler needs + access to a critical region, but does not need write access. + In this case, you do not need to use + read_lock_irq(), but only + read_lock() everywhere (since if an interrupt + occurs, the irq handler will only try to grab a read lock, which + won't deadlock). You will still need to use + write_lock_irq(). + + + + Similar logic applies to locking between softirqs/tasklets/BHs + which never need a write lock, and user context: + read_lock() and + write_lock_bh(). + + + + + Deadlock: Simple and Advanced + + + There is a coding bug where a piece of code tries to grab a + spinlock twice: it will spin forever, waiting for the lock to + be released (spinlocks and writelocks are not re-entrant in + Linux). This is trivial to diagnose: not a + stay-up-five-nights-talk-to-fluffy-code-bunnies kind of + problem. + + + + For a slightly more complex case, imagine you have a region + shared by a bottom half and user context. If you use a + spin_lock() call to protect it, it is + possible that the user context will be interrupted by the bottom + half while it holds the lock, and the bottom half will then spin + forever trying to get the same lock. + + + + Both of these are called deadlock, and as shown above, it can + occur even with a single CPU (although not on UP compiles, + since spinlocks vanish on kernel compiles with + CONFIG_SMP=n. You'll still get data corruption + in the second example). + + + + This complete lockup is easy to diagnose: on SMP boxes the + watchdog timer or compiling with DEBUG_SPINLOCKS set + (include/linux/spinlock.h) will show this up + immediately when it happens. + + + + A more complex problem is the so-called `deadly embrace', + involving two or more locks. 
Say you have a hash table: each + entry in the table is a spinlock, and a chain of hashed + objects. Inside a softirq handler, you sometimes want to + alter an object from one place in the hash to another: you + grab the spinlock of the old hash chain and the spinlock of + the new hash chain, and delete the object from the old one, + and insert it in the new one. + + + + There are two problems here. First, if your code ever + tries to move the object to the same chain, it will deadlock + with itself as it tries to lock it twice. Secondly, if the + same softirq on another CPU is trying to move another object + in the reverse direction, the following could happen: + + + + Consequences + + + + + + CPU 1 + CPU 2 + + + + + + Grab lock A -> OK + Grab lock B -> OK + + + Grab lock B -> spin + Grab lock A -> spin + + + +
+ + + The two CPUs will spin forever, waiting for the other to give up + their lock. It will look, smell, and feel like a crash. + + + + Preventing Deadlock + + + Textbooks will tell you that if you always lock in the same + order, you will never get this kind of deadlock. Practice + will tell you that this approach doesn't scale: when I + create a new lock, I don't understand enough of the kernel + to figure out where in the 5000 lock hierarchy it will fit. + + + + The best locks are encapsulated: they never get exposed in + headers, and are never held around calls to non-trivial + functions outside the same file. You can read through this + code and see that it will never deadlock, because it never + tries to grab another lock while it has that one. People + using your code don't even need to know you are using a + lock. + + + + A classic problem here is when you provide callbacks or + hooks: if you call these with the lock held, you risk simple + deadlock, or a deadly embrace (who knows what the callback + will do?). Remember, the other programmers are out to get + you, so don't do this. + + + + + Overzealous Prevention Of Deadlocks + + + Deadlocks are problematic, but not as bad as data + corruption. Code which grabs a read lock, searches a list, + fails to find what it wants, drops the read lock, grabs a + write lock and inserts the object has a race condition. + + + + If you don't see why, please stay the fuck away from my code. + + +
+ + + Per-CPU Data + + + A great technique for avoiding locking which is used fairly + widely is to duplicate information for each CPU. For example, + if you wanted to keep a count of a common condition, you could + use a spin lock and a single counter. Nice and simple. + + + + If that was too slow [it's probably not], you could instead + use a counter for each CPU [don't], then none of them need an + exclusive lock [you're wasting your time here]. To make sure + the CPUs don't have to synchronize caches all the time, align + the counters to cache boundaries by appending + `__cacheline_aligned' to the declaration + (include/linux/cache.h). + [Can't you think of anything better to do?] + + + + They will need a read lock to access their own counters, + however. That way you can use a write lock to grant exclusive + access to all of them at once, to tally them up. + + + + + Big Reader Locks + + + A classic example of per-CPU information is Ingo's `big + reader' locks + (include/linux/brlock.h). These + use the Per-CPU Data techniques described above to create a lock which + is very fast to get a read lock, but agonizingly slow for a write + lock. + + + + Fortunately, there are a limited number of these locks + available: you have to go through a strict interview process + to get one. + + + + + Avoiding Locks: Read And Write Ordering + + + Sometimes it is possible to avoid locking. Consider the + following case from the 2.2 firewall code, which inserted an + element into a singly linked list in user context: + + + + new->next = i->next; + i->next = new; + + + + Here the author (Alan Cox, who knows what he's doing) assumes + that the pointer assignments are atomic. This is important, + because networking packets would traverse this list on bottom + halves without a lock. Depending on their exact timing, they + would either see the new element in the list with a valid + next pointer, or it would not be in the + list yet.
+ + + + Of course, the writes must be in this + order, otherwise the new element appears in the list with an + invalid next pointer, and any other + CPU iterating at the wrong time will jump through it into garbage. + Because modern CPUs reorder, Alan's code actually read as follows: + + + + new->next = i->next; + wmb(); + i->next = new; + + + + The wmb() is a write memory barrier + (include/asm/system.h): neither + the compiler nor the CPU will allow any writes to memory after the + wmb() to be visible to other hardware + before any of the writes before the wmb(). + + + + As i386 does not do write reordering, this bug would never + show up on that platform. On other SMP platforms, however, it + will. + + + + There is also rmb() for read ordering: to ensure + any previous variable reads occur before following reads. The simple + mb() macro combines both + rmb() and wmb(). + + + + Dropping or gaining a spinlock, and any atomic operation are + all defined to act as memory barriers (ie. as per the + mb() macro). + + + + There is a similar, but unrelated, problem with code like the + following: + + + + if (!(ctrack->status & IPS_CONFIRMED)) { + spin_lock_bh(&ip_conntrack_lock); + if (!(ctrack->status & IPS_CONFIRMED)) { + clean_from_lists(h->ctrack); + h->ctrack->status |= IPS_CONFIRMED; + } + spin_unlock_bh(&ip_conntrack_lock); + } + + + + In this case, the author has tried to be tricky: knowing that + the CONFIRMED bit is set and never reset in the status word, + you can test it outside the lock, and frequently avoid + grabbing the lock at all. However, the compiler could cache + the value in a register, rather than rereading it once the + lock is obtained, creating a subtle race. The way to get + around this is to declare the status field `volatile', or use + a temporary volatile pointer to achieve the same effect in + this one place. 
+ + + + + Avoiding Locks: Atomic Operations + + + There are a number of atomic operations defined in + include/asm/atomic.h: these + are guaranteed to be seen atomically from all CPUs in the system, thus + avoiding races. If your shared data consists of a single counter, say, + these operations might be simpler than using spinlocks (although for + anything non-trivial using spinlocks is clearer). + + + + Note that the atomic operations are defined to act as both + read and write barriers on all platforms. + + + + + Protecting A Collection of Objects: Reference Counts + + + Locking a collection of objects is fairly easy: you get a + single spinlock, and you make sure you grab it before + searching, adding or deleting an object. + + + + The purpose of this lock is not to protect the individual + objects: you might have a separate lock inside each one for + that. It is to protect the data structure + containing the objects from race conditions. Often + the same lock is used to protect the contents of all the + objects as well, for simplicity, but they are inherently + orthogonal (and many other big words designed to confuse). + + + + Changing this to a read-write lock will often help markedly if + reads are far more common than writes. If not, there is + another approach you can use to reduce the time the lock is + held: reference counts. + + + + In this approach, an object has an owner, who sets the + reference count to one. Whenever you get a pointer to the + object, you increment the reference count (a `get' operation). + Whenever you relinquish a pointer, you decrement the reference + count (a `put' operation). When the owner wants to destroy + it, they mark it dead, and do a put. + + + + Whoever drops the reference count to zero (usually implemented + with atomic_dec_and_test()) actually cleans + up and frees the object.
+ + + + This means that you are guaranteed that the object won't + vanish underneath you, even though you no longer have a lock + for the collection. + + + + Here's some skeleton code: + + + + void create_foo(struct foo *x) + { + atomic_set(&x->use, 1); + spin_lock_bh(&list_lock); + ... insert in list ... + spin_unlock_bh(&list_lock); + } + + struct foo *get_foo(int desc) + { + struct foo *ret; + + spin_lock_bh(&list_lock); + ... find in list ... + if (ret) atomic_inc(&ret->use); + spin_unlock_bh(&list_lock); + + return ret; + } + + void put_foo(struct foo *x) + { + if (atomic_dec_and_test(&x->use)) + kfree(x); + } + + void destroy_foo(struct foo *x) + { + spin_lock_bh(&list_lock); + ... remove from list ... + spin_unlock_bh(&list_lock); + + put_foo(x); + } + + + + Macros To Help You + + There are a set of debugging macros tucked inside + include/linux/netfilter_ipv4/lockhelp.h + and listhelp.h: these are very + useful for ensuring that locks are held in the right places to protect + infrastructure. + + + + + + Things Which Sleep + + + You can never call the following routines while holding a + spinlock, as they may sleep: + + + + + + Accesses to + userspace: + + + + + copy_from_user() + + + + + copy_to_user() + + + + + get_user() + + + + + put_user() + + + + + + + + kmalloc(GFP_KERNEL) + + + + + + printk(), which can be called from + user context, interestingly enough. + + + + + + + The Fucked Up Sparc + + + Alan Cox says the irq disable/enable is in the register + window on a sparc. Andi Kleen says when you do + restore_flags in a different function you mess up all the + register windows. + + + + So never pass the flags word set by + spin_lock_irqsave() and brethren to another + function (unless it's declared inline). Usually no-one + does this, but now you've been warned. Dave Miller can never do + anything in a straightforward manner (I can say that, because I have + pictures of him and a certain PowerPC maintainer in a compromising + position).
+ + + + + Racing Timers: A Kernel Pastime + + + Timers can produce their own special problems with races. + Consider a collection of objects (list, hash, etc) where each + object has a timer which is due to destroy it. + + + + If you want to destroy the entire collection (say on module + removal), you might do the following: + + + + /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE + HUNGARIAN NOTATION */ + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + del_timer(&list->timer); + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + + + Sooner or later, this will crash on SMP, because a timer can + have just gone off before the spin_lock_bh(), + and it will only get the lock after we + spin_unlock_bh(), and then try to free + the element (which has already been freed!). + + + + This can be avoided by checking the result of + del_timer(): if it returns + 1, the timer has been deleted. + If 0, it means (in this + case) that it is currently running, so we can do: + + + + retry: + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + if (!del_timer(&list->timer)) { + /* Give timer a chance to delete this */ + spin_unlock_bh(&list_lock); + goto retry; + } + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + + + Another common problem is deleting timers which restart + themselves (by calling add_timer() at the end + of their timer function). Because this is a fairly common case + which is prone to races, the function del_timer_sync() + (include/linux/timer.h) is + provided to handle this case. It returns the number of times the timer + had to be deleted before we finally stopped it from adding itself back + in. + + +
+ + + Further reading + + + + + Documentation/spinlocks.txt: + Linus Torvalds' spinlocking tutorial in the kernel sources. + + + + + + Unix Systems for Modern Architectures: Symmetric + Multiprocessing and Caching for Kernel Programmers: + + + + Curt Schimmel's very good introduction to kernel level + locking (not written for Linux, but nearly everything + applies). The book is expensive, but really worth every + penny to understand SMP locking. [ISBN: 0201633388] + + + + + + + Thanks + + + Thanks to Telsa Gwynne for DocBooking, neatening and adding + style. + + + + Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul + Mackerras, Ruedi Aschwanden, Alan Cox for proofreading, + correcting, flaming, commenting. + + + + Thanks to the cabal for having no influence on this document. + + + + + Glossary + + + bh + + + Bottom Half: for historical reasons, functions with + `_bh' in them often now refer to any software interrupt, e.g. + spin_lock_bh() blocks any software interrupt + on the current CPU. Bottom halves are deprecated, and will + eventually be replaced by tasklets. Only one bottom half will be + running at any time. + + + + + + Hardware Interrupt / Hardware IRQ + + + Hardware interrupt request. in_irq() returns + true in a hardware interrupt handler (it + also returns true when interrupts are blocked). + + + + + + Interrupt Context + + + Not user context: processing a hardware irq or software irq. + Indicated by the in_interrupt() macro + returning true (although it also + returns true when interrupts or BHs are blocked). + + + + + + SMP + + + Symmetric Multi-Processor: kernels compiled for multiple-CPU + machines. (CONFIG_SMP=y). + + + + + + softirq + + + Strictly speaking, one of up to 32 enumerated software + interrupts which can run on multiple CPUs at once. + Sometimes used to refer to tasklets and bottom halves as + well (ie. all software interrupts). + + + + + + Software Interrupt / Software IRQ + + + Software interrupt handler. 
in_irq() returns + false; in_softirq() + returns true. Tasklets, softirqs and + bottom halves all fall into the category of `software interrupts'. + + + + + + tasklet + + + A dynamically-registrable software interrupt, + which is guaranteed to only run on one CPU at a time. + + + + + + UP + + + Uni-Processor: Non-SMP. (CONFIG_SMP=n). + + + + + + User Context + + + The kernel executing on behalf of a particular + process or kernel thread (given by the current() + macro.) Not to be confused with userspace. Can be interrupted by + software or hardware interrupts. + + + + + + Userspace + + + A process executing its own code outside the kernel. + + + + + +
+ diff --git a/Documentation/DocBook/parportbook.tmpl b/Documentation/DocBook/parportbook.tmpl index a6ac266d1419..8d7791df5c8b 100644 --- a/Documentation/DocBook/parportbook.tmpl +++ b/Documentation/DocBook/parportbook.tmpl @@ -2052,7 +2052,7 @@ ssize_t write_printer (int fd, const void *ptr, size_t count) - + API reference diff --git a/Documentation/kbuild/config-language.txt b/Documentation/kbuild/config-language.txt index c446e7ac3268..b37dafb7e5e1 100644 --- a/Documentation/kbuild/config-language.txt +++ b/Documentation/kbuild/config-language.txt @@ -193,7 +193,7 @@ output files. Configure: implemented Menuconfig: implemented -Xconfig: does not display, but writes to output files +Xconfig: implemented mconfig: implemented Example: diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt new file mode 100644 index 000000000000..585d557abace --- /dev/null +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -0,0 +1,128 @@ +kernel-doc nano-HOWTO +===================== + +Many places in the source tree have extractable documentation in the +form of block comments above functions. The components of this system +are: + +- scripts/kernel-doc + + This is a perl script that hunts for the block comments and can mark + them up directly into DocBook, man, text, and HTML. (No, not + texinfo.) + +- Documentation/DocBook/*.tmpl + + These are SGML template files, which are normal SGML files with + special place-holders for where the extracted documentation should + go. + +- scripts/docproc.c + + This is a program for converting SGML template files into SGML + files. It invokes kernel-doc, giving it the list of functions that + are to be documented. + +- scripts/gen-all-syms + + This is a script that lists the EXPORT_SYMBOL symbols in a list of C + files. + +- scripts/docgen + + This script invokes docproc, telling it which functions are to be + documented (this list comes from gen-all-syms). 
+ +- Makefile + + The targets 'sgmldocs', 'psdocs', and 'pdfdocs' are used to build + DocBook files, PostScript files, and PDF files in + Documentation/DocBook. + +- Documentation/DocBook/Makefile + + This is where C files are associated with SGML templates. + + +How to extract the documentation +-------------------------------- + +If you just want to read the ready-made books on the various +subsystems (see Documentation/DocBook/*.tmpl), just type 'make +psdocs', or 'make pdfdocs', depending on your preference. If you +would rather read a different format, you can type 'make sgmldocs' and +then use DocBook tools to convert Documentation/DocBook/*.sgml to a +format of your choice (for example, 'db2html ...'). + +If you want to see man pages instead, you can do this: + +$ cd linux +$ scripts/kernel-doc -man $(find -name '*.c') | split-man.pl /tmp/man + +Here is split-man.pl: + +--> +#!/usr/bin/perl + +if ($#ARGV < 0) { + die "where do I put the results?\n"; +} + +mkdir $ARGV[0],0777 or die "Can't create $ARGV[0]: $!\n"; +$state = 0; +while (<STDIN>) { + if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) { + if ($state == 1) { close OUT } + $state = 1; + $fn = "$ARGV[0]/$1.4"; + print STDERR "Creating $fn\n"; + open OUT, ">$fn" or die "can't open $fn: $!\n"; + print OUT $_; + } elsif ($state != 0) { + print OUT $_; + } +} + +close OUT; +<-- + +If you just want to view the documentation for one function in one +file, you can do this: + +$ scripts/kernel-doc -man -function fn file | nroff -man | less + +or this: + +$ scripts/kernel-doc -text -function fn file + + +How to add extractable documentation to your source files +--------------------------------------------------------- + +The format of the block comment is like this: + +/** + * function_name(:)? (- short description)? +(* @parameterx: (description of parameter x)?)* +(* a blank line)? + * (Description:)? (Description of function)? + * (section header: (section description)?
)* +(*)?*/ + +The short function description cannot be multiline, but the other +descriptions can be. + +All descriptive text is further processed, scanning for the following special +patterns, which are highlighted appropriately. + +'funcname()' - function +'$ENVVAR' - environment variable +'&struct_name' - name of a structure (up to two words including 'struct') +'@parameter' - name of a parameter +'%CONST' - name of a constant. + +Take a look around the source tree for examples. + +Tim. +*/ + diff --git a/arch/ppc/mbxboot/embed_config.c b/arch/ppc/mbxboot/embed_config.c index eca2cb764f59..048f8bcfe529 100644 --- a/arch/ppc/mbxboot/embed_config.c +++ b/arch/ppc/mbxboot/embed_config.c @@ -3,7 +3,7 @@ * not have boot monitor support for board information. */ #include -#include +#include #ifdef CONFIG_8xx #include #endif diff --git a/arch/sh/kernel/io_se.c b/arch/sh/kernel/io_se.c index 0622d1cdb472..1726980b6eca 100644 --- a/arch/sh/kernel/io_se.c +++ b/arch/sh/kernel/io_se.c @@ -7,8 +7,6 @@ * I/O routine for Hitachi SolutionEngine. * */ -#include - #include #include #include diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 4245f6293da6..b7d01e9a25bd 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -13,6 +13,7 @@ * Naturally it's not a 1:1 relation, but there are similarities. */ +#include #include #include #include diff --git a/arch/sparc/config.in b/arch/sparc/config.in index 279afb38f9c5..9faf12053f95 100644 --- a/arch/sparc/config.in +++ b/arch/sparc/config.in @@ -1,4 +1,4 @@ -# $Id: config.in,v 1.92 2000/03/29 11:56:48 davem Exp $ +# $Id: config.in,v 1.93 2000/05/22 08:12:19 davem Exp $ # For a description of the syntax of this configuration file, # see the Configure script. 
# @@ -186,6 +186,11 @@ if [ "$CONFIG_NET" = "y" ]; then if [ "$CONFIG_NETDEVICES" = "y" ]; then tristate ' Dummy net driver support' CONFIG_DUMMY tristate ' Bonding driver support' CONFIG_BONDING + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_NETLINK" = "y" ]; then + tristate ' Ethertap network tap (EXPERIMENTAL)' CONFIG_ETHERTAP + fi + fi tristate ' PPP (point-to-point) support' CONFIG_PPP if [ ! "$CONFIG_PPP" = "n" ]; then dep_tristate ' PPP support for async serial ports' CONFIG_PPP_ASYNC $CONFIG_PPP diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index fe83b081ac7b..f63f542bad55 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -1,4 +1,4 @@ -# $Id: Makefile,v 1.53 2000/03/31 04:06:19 davem Exp $ +# $Id: Makefile,v 1.54 2000/05/12 23:51:24 davem Exp $ # Makefile for the linux kernel. # # Note! Dependencies are done automagically by 'make dep', which also @@ -59,13 +59,11 @@ check_asm: dummy @echo "#ifndef CONFIG_SMP" >> asm_offsets.h @echo "" >> asm_offsets.h @echo "#include " > tmp.c - @echo "#undef __SMP__" >> tmp.c @echo "#undef CONFIG_SMP" >> tmp.c @echo "#include " >> tmp.c $(CPP) $(CPPFLAGS) tmp.c -o tmp.i @echo "/* Automatically generated. Do not edit. */" > check_asm.c @echo "#include " >> check_asm.c - @echo "#undef __SMP__" >> check_asm.c @echo "#undef CONFIG_SMP" >> check_asm.c @echo "#include " >> check_asm.c @echo 'struct task_struct _task;' >> check_asm.c diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S index 8141d0e24977..61518872f27c 100644 --- a/arch/sparc/kernel/entry.S +++ b/arch/sparc/kernel/entry.S @@ -1,4 +1,4 @@ -/* $Id: entry.S,v 1.163 1999/11/19 04:11:24 davem Exp $ +/* $Id: entry.S,v 1.164 2000/05/09 17:40:13 davem Exp $ * arch/sparc/kernel/entry.S: Sparc trap low-level entry points. * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/head.S b/arch/sparc/kernel/head.S index 30df8a0f2d67..713dff79911f 100644 --- a/arch/sparc/kernel/head.S +++ b/arch/sparc/kernel/head.S @@ -1,4 +1,4 @@ -/* $Id: head.S,v 1.102 2000/01/29 01:08:54 anton Exp $ +/* $Id: head.S,v 1.103 2000/05/09 17:40:13 davem Exp $ * head.S: The initial boot code for the Sparc port of Linux. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/irq.c b/arch/sparc/kernel/irq.c index 5eaa8beda8e9..77a6f47696c5 100644 --- a/arch/sparc/kernel/irq.c +++ b/arch/sparc/kernel/irq.c @@ -1,4 +1,4 @@ -/* $Id: irq.c,v 1.102 2000/02/25 05:44:35 davem Exp $ +/* $Id: irq.c,v 1.103 2000/05/09 17:40:13 davem Exp $ * arch/sparc/kernel/irq.c: Interrupt request handling routines. On the * Sparc the IRQ's are basically 'cast in stone' * and you are supposed to probe the prom's device diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 96cd44aea56e..e18d91e0edc5 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -1,4 +1,4 @@ -/* $Id: process.c,v 1.146 2000/03/01 02:53:27 davem Exp $ +/* $Id: process.c,v 1.147 2000/05/09 17:40:13 davem Exp $ * linux/arch/sparc/kernel/process.c * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/setup.c b/arch/sparc/kernel/setup.c index 25ea5cb8dc75..5398a93818ab 100644 --- a/arch/sparc/kernel/setup.c +++ b/arch/sparc/kernel/setup.c @@ -1,4 +1,4 @@ -/* $Id: setup.c,v 1.117 2000/03/27 12:14:54 davem Exp $ +/* $Id: setup.c,v 1.118 2000/05/09 17:40:13 davem Exp $ * linux/arch/sparc/kernel/setup.c * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/signal.c b/arch/sparc/kernel/signal.c index 41a8a68b75e0..e75d5d000960 100644 --- a/arch/sparc/kernel/signal.c +++ b/arch/sparc/kernel/signal.c @@ -1,4 +1,4 @@ -/* $Id: signal.c,v 1.102 2000/04/08 02:11:36 davem Exp $ +/* $Id: signal.c,v 1.103 2000/05/09 17:40:13 davem Exp $ * linux/arch/sparc/kernel/signal.c * * Copyright (C) 1991, 1992 Linus Torvalds diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index d4944b5027c5..186f1282520a 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -1,4 +1,4 @@ -/* $Id: sparc_ksyms.c,v 1.96 2000/03/16 09:12:49 jj Exp $ +/* $Id: sparc_ksyms.c,v 1.97 2000/05/09 17:40:13 davem Exp $ * arch/sparc/kernel/ksyms.c: Sparc specific ksyms support. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c index 7d8fab67f954..192a6bb037cb 100644 --- a/arch/sparc/kernel/sun4d_irq.c +++ b/arch/sparc/kernel/sun4d_irq.c @@ -1,4 +1,4 @@ -/* $Id: sun4d_irq.c,v 1.24 1999/12/27 06:08:34 anton Exp $ +/* $Id: sun4d_irq.c,v 1.25 2000/05/09 17:40:13 davem Exp $ * arch/sparc/kernel/sun4d_irq.c: * SS1000/SC2000 interrupt handling. * diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index 81f32ba18db5..0dba9b1ca777 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -1,4 +1,4 @@ -/* $Id: sys_sunos.c,v 1.122 2000/04/27 02:49:03 davem Exp $ +/* $Id: sys_sunos.c,v 1.123 2000/05/22 07:29:39 davem Exp $ * sys_sunos.c: SunOS specific syscall compatibility support. * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 31fcc6166ad8..05bb225d78ec 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -1,4 +1,4 @@ -/* $Id: time.c,v 1.54 2000/04/13 08:14:30 anton Exp $ +/* $Id: time.c,v 1.55 2000/05/09 17:40:13 davem Exp $ * linux/arch/sparc/kernel/time.c * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/kernel/traps.c b/arch/sparc/kernel/traps.c index 4717d9bd3929..ca74c09fcecc 100644 --- a/arch/sparc/kernel/traps.c +++ b/arch/sparc/kernel/traps.c @@ -1,4 +1,4 @@ -/* $Id: traps.c,v 1.61 2000/01/21 11:38:41 jj Exp $ +/* $Id: traps.c,v 1.62 2000/05/09 17:40:13 davem Exp $ * arch/sparc/kernel/traps.c * * Copyright 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S index ebbfd3255057..98b757cb67c6 100644 --- a/arch/sparc/lib/rwsem.S +++ b/arch/sparc/lib/rwsem.S @@ -1,4 +1,4 @@ -/* $Id: rwsem.S,v 1.4 2000/02/13 07:59:39 anton Exp $ +/* $Id: rwsem.S,v 1.5 2000/05/09 17:40:13 davem Exp $ * Assembly part of rw semaphores. * * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) diff --git a/arch/sparc/mm/btfixup.c b/arch/sparc/mm/btfixup.c index 72b8cff3db6c..3447b839eada 100644 --- a/arch/sparc/mm/btfixup.c +++ b/arch/sparc/mm/btfixup.c @@ -1,4 +1,4 @@ -/* $Id: btfixup.c,v 1.9 1999/12/27 06:30:02 anton Exp $ +/* $Id: btfixup.c,v 1.10 2000/05/09 17:40:13 davem Exp $ * btfixup.c: Boot time code fixup and relocator, so that * we can get rid of most indirect calls to achieve single * image sun4c and srmmu kernel. diff --git a/arch/sparc/mm/hypersparc.S b/arch/sparc/mm/hypersparc.S index 5c1c0143dcc0..10812273b5bb 100644 --- a/arch/sparc/mm/hypersparc.S +++ b/arch/sparc/mm/hypersparc.S @@ -1,4 +1,4 @@ -/* $Id: hypersparc.S,v 1.14 1999/08/14 03:51:47 anton Exp $ +/* $Id: hypersparc.S,v 1.15 2000/05/09 17:40:13 davem Exp $ * hypersparc.S: High speed Hypersparc mmu/cache operations. 
* * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index 87e30eda1269..3f350004675d 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -1,4 +1,4 @@ -/* $Id: init.c,v 1.84 2000/03/15 23:26:26 anton Exp $ +/* $Id: init.c,v 1.85 2000/05/09 17:40:13 davem Exp $ * linux/arch/sparc/mm/init.c * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 1fa50e946d95..0b46e7d25fec 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1,4 +1,4 @@ -/* $Id: srmmu.c,v 1.208 2000/02/14 04:52:33 jj Exp $ +/* $Id: srmmu.c,v 1.209 2000/05/09 17:40:13 davem Exp $ * srmmu.c: SRMMU specific routines for memory management. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index 9671e7ee7715..b4b9a82768e7 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -1,4 +1,4 @@ -/* $Id: sun4c.c,v 1.191 2000/04/08 02:11:41 davem Exp $ +/* $Id: sun4c.c,v 1.192 2000/05/09 17:40:13 davem Exp $ * sun4c.c: Doing in software what should be done in hardware. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/mm/swift.S b/arch/sparc/mm/swift.S index f8e2635bca84..2d952f5ad311 100644 --- a/arch/sparc/mm/swift.S +++ b/arch/sparc/mm/swift.S @@ -1,4 +1,4 @@ -/* $Id: swift.S,v 1.4 2000/02/12 03:08:47 zaitcev Exp $ +/* $Id: swift.S,v 1.5 2000/05/09 17:40:13 davem Exp $ * swift.S: MicroSparc-II mmu/cache operations. * * Copyright (C) 1999 David S. Miller (davem@redhat.com) diff --git a/arch/sparc/mm/tsunami.S b/arch/sparc/mm/tsunami.S index 932713eef66e..1eb8fd6da207 100644 --- a/arch/sparc/mm/tsunami.S +++ b/arch/sparc/mm/tsunami.S @@ -1,4 +1,4 @@ -/* $Id: tsunami.S,v 1.3 1999/10/09 05:32:19 zaitcev Exp $ +/* $Id: tsunami.S,v 1.4 2000/05/09 17:40:13 davem Exp $ * tsunami.S: High speed MicroSparc-I mmu/cache operations. 
* * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/mm/viking.S b/arch/sparc/mm/viking.S index 4e74f34e4ce1..4e083225799a 100644 --- a/arch/sparc/mm/viking.S +++ b/arch/sparc/mm/viking.S @@ -1,4 +1,4 @@ -/* $Id: viking.S,v 1.15 2000/01/15 00:51:36 anton Exp $ +/* $Id: viking.S,v 1.16 2000/05/09 17:40:13 davem Exp $ * viking.S: High speed Viking cache/mmu operations * * Copyright (C) 1997 Eddie C. Dost (ecd@skynet.be) diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in index 4c53b12a16ab..d27061c03675 100644 --- a/arch/sparc64/config.in +++ b/arch/sparc64/config.in @@ -1,4 +1,4 @@ -# $Id: config.in,v 1.109 2000/05/02 06:35:59 davem Exp $ +# $Id: config.in,v 1.112 2000/05/22 08:12:19 davem Exp $ # For a description of the syntax of this configuration file, # see the Configure script. # @@ -204,6 +204,11 @@ if [ "$CONFIG_NET" = "y" ]; then if [ "$CONFIG_NETDEVICES" = "y" ]; then tristate ' Dummy net driver support' CONFIG_DUMMY tristate ' Bonding driver support' CONFIG_BONDING + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_NETLINK" = "y" ]; then + tristate ' Ethertap network tap (EXPERIMENTAL)' CONFIG_ETHERTAP + fi + fi tristate ' PPP (point-to-point) support' CONFIG_PPP if [ ! "$CONFIG_PPP" = "n" ]; then dep_tristate ' PPP support for async serial ports' CONFIG_PPP_ASYNC $CONFIG_PPP diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile index b57e662da6bc..edb6b05718aa 100644 --- a/arch/sparc64/kernel/Makefile +++ b/arch/sparc64/kernel/Makefile @@ -1,4 +1,4 @@ -# $Id: Makefile,v 1.53 2000/03/31 04:06:22 davem Exp $ +# $Id: Makefile,v 1.54 2000/05/12 23:51:24 davem Exp $ # Makefile for the linux kernel. # # Note! 
Dependencies are done automagically by 'make dep', which also diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S index b3fade43ee47..5e855ad2c781 100644 --- a/arch/sparc64/kernel/head.S +++ b/arch/sparc64/kernel/head.S @@ -1,4 +1,4 @@ -/* $Id: head.S,v 1.64 2000/03/06 22:33:42 davem Exp $ +/* $Id: head.S,v 1.65 2000/05/09 17:40:13 davem Exp $ * head.S: Initial boot code for the Sparc64 port of Linux. * * Copyright (C) 1996,1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c index 84fa9936d830..6c00ba2f69c0 100644 --- a/arch/sparc64/kernel/ioctl32.c +++ b/arch/sparc64/kernel/ioctl32.c @@ -1,4 +1,4 @@ -/* $Id: ioctl32.c,v 1.89 2000/05/06 10:38:42 davem Exp $ +/* $Id: ioctl32.c,v 1.90 2000/05/22 07:29:39 davem Exp $ * ioctl32.c: Conversion between 32bit and 64bit native ioctls. * * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 16ec61db093d..eb6f789f6685 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -1,4 +1,4 @@ -/* $Id: irq.c,v 1.86 2000/04/15 06:02:50 davem Exp $ +/* $Id: irq.c,v 1.87 2000/05/09 17:40:13 davem Exp $ * irq.c: UltraSparc IRQ handling/init/registry. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index f691e9ca5720..060d1c477b9c 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -1,4 +1,4 @@ -/* $Id: process.c,v 1.106 2000/04/15 06:02:50 davem Exp $ +/* $Id: process.c,v 1.107 2000/05/09 17:40:14 davem Exp $ * arch/sparc64/kernel/process.c * * Copyright (C) 1995, 1996 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index 824097fa0d75..0f3e5c0691df 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -1,4 +1,4 @@ -/* $Id: setup.c,v 1.53 2000/03/15 14:42:52 jj Exp $ +/* $Id: setup.c,v 1.54 2000/05/09 17:40:14 davem Exp $ * linux/arch/sparc64/kernel/setup.c * * Copyright (C) 1995,1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 7bf31c1045b2..12a872447040 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -1,4 +1,4 @@ -/* $Id: sparc64_ksyms.c,v 1.83 2000/04/19 08:38:25 davem Exp $ +/* $Id: sparc64_ksyms.c,v 1.84 2000/05/09 17:40:14 davem Exp $ * arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 72cb305ebd88..12592b747d6b 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -1,4 +1,4 @@ -/* $Id: sys_sparc32.c,v 1.146 2000/05/09 04:48:34 davem Exp $ +/* $Id: sys_sparc32.c,v 1.147 2000/05/22 07:29:40 davem Exp $ * sys_sparc32.c: Conversion between 32bit and 64bit native syscalls. * * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index 5ae41792ec3d..7aeb4781fafa 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -1,4 +1,4 @@ -/* $Id: sys_sunos32.c,v 1.46 2000/04/27 02:49:03 davem Exp $ +/* $Id: sys_sunos32.c,v 1.47 2000/05/22 07:29:40 davem Exp $ * sys_sunos32.c: SunOS binary compatability layer on sparc64. * * Copyright (C) 1995, 1996, 1997 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index eb02486f5a52..9c20a2752186 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -1,4 +1,4 @@ -/* $Id: systbls.S,v 1.72 2000/04/13 07:30:34 jj Exp $ +/* $Id: systbls.S,v 1.73 2000/05/10 14:23:39 jj Exp $ * systbls.S: System call entry point tables for OS compatibility. * The native Linux system call table lives here also. * @@ -83,7 +83,7 @@ sys_call_table: /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys_chown, sys_mknod /*15*/ .word sys_chmod, sys_lchown, sparc_brk, sys_perfctr, sys_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid -/*25*/ .word sys_time, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall +/*25*/ .word sys_nis_syscall, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall /*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice .word sys_nis_syscall, sys_sync, sys_kill, sys_newstat, sys_sendfile /*40*/ .word sys_newlstat, sys_dup, sys_pipe, sys_times, sys_nis_syscall @@ -124,7 +124,7 @@ sys_call_table: .word sys_ipc, sys_nis_syscall, sys_clone, sys_nis_syscall, sys_adjtimex /*220*/ .word sys_nis_syscall, sys_create_module, sys_delete_module, sys_get_kernel_syms, sys_getpgid .word sys_bdflush, sys_sysfs, sys_nis_syscall, sys_setfsuid, sys_setfsgid -/*230*/ .word sys_select, sys_time, sys_nis_syscall, sys_stime, sys_nis_syscall +/*230*/ .word sys_select, sys_nis_syscall, sys_nis_syscall, sys_stime, sys_nis_syscall .word sys_nis_syscall, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .word sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler .word sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index c630356e127e..6a84d4c29bc4 100644 --- 
a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -1,4 +1,4 @@ -/* $Id: time.c,v 1.25 2000/04/13 05:29:44 davem Exp $ +/* $Id: time.c,v 1.26 2000/05/09 17:40:14 davem Exp $ * time.c: UltraSparc timer and TOD clock support. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index cf20ffc1a396..7b5d3261457f 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -1,4 +1,4 @@ -/* $Id: traps.c,v 1.65 2000/01/21 11:39:01 jj Exp $ +/* $Id: traps.c,v 1.66 2000/05/09 17:40:14 davem Exp $ * arch/sparc64/kernel/traps.c * * Copyright (C) 1995,1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S index 027d2e124de6..b1081b9fe84a 100644 --- a/arch/sparc64/kernel/ttable.S +++ b/arch/sparc64/kernel/ttable.S @@ -1,4 +1,4 @@ -/* $Id: ttable.S,v 1.30 1999/12/01 23:52:03 davem Exp $ +/* $Id: ttable.S,v 1.31 2000/05/09 17:40:14 davem Exp $ * ttable.S: Sparc V9 Trap Table(s) with SpitFire extensions. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c index d1516a40ba15..f8ebf129f1e8 100644 --- a/arch/sparc64/lib/debuglocks.c +++ b/arch/sparc64/lib/debuglocks.c @@ -1,4 +1,4 @@ -/* $Id: debuglocks.c,v 1.4 2000/01/31 04:59:10 davem Exp $ +/* $Id: debuglocks.c,v 1.5 2000/05/09 17:40:14 davem Exp $ * debuglocks.c: Debugging versions of SMP locking primitives. * * Copyright (C) 1998 David S. Miller (davem@redhat.com) diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 997e1a1b5b82..88ac931039a7 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1,4 +1,4 @@ -/* $Id: init.c,v 1.151 2000/04/26 17:09:32 davem Exp $ +/* $Id: init.c,v 1.152 2000/05/09 17:40:14 davem Exp $ * arch/sparc64/mm/init.c * * Copyright (C) 1996-1999 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S index a12bfb4d01e0..90cc898ff42b 100644 --- a/arch/sparc64/mm/ultra.S +++ b/arch/sparc64/mm/ultra.S @@ -1,4 +1,4 @@ -/* $Id: ultra.S,v 1.42 2000/05/05 18:47:41 davem Exp $ +/* $Id: ultra.S,v 1.43 2000/05/09 17:40:14 davem Exp $ * ultra.S: Don't expand these all over the place... * * Copyright (C) 1997, 2000 David S. Miller (davem@redhat.com) diff --git a/arch/sparc64/prom/misc.c b/arch/sparc64/prom/misc.c index 90c77ea744ad..a9ee22cd7e82 100644 --- a/arch/sparc64/prom/misc.c +++ b/arch/sparc64/prom/misc.c @@ -1,4 +1,4 @@ -/* $Id: misc.c,v 1.17 2000/04/15 06:02:50 davem Exp $ +/* $Id: misc.c,v 1.18 2000/05/09 17:40:14 davem Exp $ * misc.c: Miscellaneous prom functions that don't belong * anywhere else. * diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 56f1280f8ff4..eaa8fed3298e 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -1,4 +1,4 @@ -/* $Id: misc.c,v 1.26 2000/04/14 09:59:02 davem Exp $ +/* $Id: misc.c,v 1.27 2000/05/09 17:40:14 davem Exp $ * misc.c: Miscelaneous syscall emulation for Solaris * * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 64c70e33a5d8..8f47348189d3 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -200,15 +200,21 @@ static void register_devfs_entries (int drive) __init; static devfs_handle_t devfs_handle = NULL; #define K_64 0x10000 /* 64KB */ -#include /* the following is the mask of allowed drives. By default units 2 and * 3 of both floppy controllers are disabled, because switching on the * motor of these drives causes system hangs on some PCI computers. drive * 0 is the low bit (0x1), and drive 7 is the high bit (0x80). Bits are on if - * a drive is allowed. */ + * a drive is allowed. 
+ * + * NOTE: This must come before we include the arch floppy header because + * some ports reference this variable from there. -DaveM + */ static int allowed_drive_mask = 0x33; + +#include + static int irqdma_allocated = 0; #define MAJOR_NR FLOPPY_MAJOR diff --git a/drivers/char/Config.in b/drivers/char/Config.in index 25e7c65a5f0b..792832a306e4 100644 --- a/drivers/char/Config.in +++ b/drivers/char/Config.in @@ -205,7 +205,7 @@ if [ "$CONFIG_VIDEO_DEV" != "n" ]; then fi comment 'Video Adapters' if [ "$CONFIG_I2C_ALGOBIT" = "y" -o "$CONFIG_I2C_ALGOBIT" = "m" ]; then - dep_tristate ' BT848 Video For Linux' CONFIG_VIDEO_BT848 $CONFIG_VIDEO_DEV $CONFIG_PCI $CONFIG_I2C_ALGOBIT + dep_tristate ' BT848 Video For Linux' CONFIG_VIDEO_BT848 $CONFIG_VIDEO_DEV $CONFIG_PCI $CONFIG_I2C_ALGOBIT $CONFIG_SOUND fi dep_tristate ' Mediavision Pro Movie Studio Video For Linux' CONFIG_VIDEO_PMS $CONFIG_VIDEO_DEV if [ "$CONFIG_ALL_PPC" = "y" ]; then diff --git a/drivers/char/bttv.c b/drivers/char/bttv.c index f7c9b0db6b10..b17efd42963d 100644 --- a/drivers/char/bttv.c +++ b/drivers/char/bttv.c @@ -43,12 +43,13 @@ #include #include #include +#include #include "bttv.h" #include "tuner.h" -#define DEBUG(x) /* Debug driver */ -#define IDEBUG(x) /* Debug interrupt handler */ +#define DEBUG(x) /* Debug driver */ +#define IDEBUG(x) /* Debug interrupt handler */ #define MIN(a,b) (((a)>(b))?(b):(a)) #define MAX(a,b) (((a)>(b))?(a):(b)) @@ -466,7 +467,7 @@ static struct i2c_client i2c_client_template = { NULL }; -static int init_bttv_i2c(struct bttv *btv) +static int __init init_bttv_i2c(struct bttv *btv) { /* i2c bit_adapter */ memcpy(&btv->i2c_adap, &i2c_adap_template, sizeof(struct i2c_adapter)); @@ -488,7 +489,7 @@ static int init_bttv_i2c(struct bttv *btv) } /* read I2C */ -static int I2CRead(struct bttv *btv, unsigned char addr, char *probe_for) +static int __init I2CRead(struct bttv *btv, unsigned char addr, char *probe_for) { unsigned char buffer = 0; @@ -513,7 +514,7 @@ static int 
I2CRead(struct bttv *btv, unsigned char addr, char *probe_for) } /* write I2C */ -static int I2CWrite(struct bttv *btv, unsigned char addr, unsigned char b1, +static int __init I2CWrite(struct bttv *btv, unsigned char addr, unsigned char b1, unsigned char b2, int both) { unsigned char buffer[2]; @@ -530,7 +531,7 @@ static int I2CWrite(struct bttv *btv, unsigned char addr, unsigned char b1, } /* read EEPROM */ -static void readee(struct bttv *btv, unsigned char *eedata, int addr) +static void __init readee(struct bttv *btv, unsigned char *eedata, int addr) { int i; @@ -557,7 +558,7 @@ static struct HAUPPAUGE_TUNER int id; char *name; } -hauppauge_tuner[] = +hauppauge_tuner[] __initdata = { { TUNER_ABSENT, "" }, { TUNER_ABSENT, "External" }, @@ -605,8 +606,7 @@ hauppauge_tuner[] = { TUNER_ABSENT, "Temic 4046FM5" }, }; -static void -hauppauge_eeprom(struct bttv *btv) +static void __init hauppauge_eeprom(struct bttv *btv) { if (eeprom_data[9] < sizeof(hauppauge_tuner)/sizeof(struct HAUPPAUGE_TUNER)) { @@ -615,10 +615,11 @@ hauppauge_eeprom(struct bttv *btv) printk("bttv%d: Hauppauge eeprom: tuner=%s (%d)\n",btv->nr, hauppauge_tuner[eeprom_data[9]].name,btv->tuner_type); } + + return; } -static void -hauppauge_boot_msp34xx(struct bttv *btv) +static void __init hauppauge_boot_msp34xx(struct bttv *btv) { int i; @@ -654,7 +655,7 @@ hauppauge_boot_msp34xx(struct bttv *btv) /* This is basically the same procedure as * used by Alessandro Rubini in his pxc200 * driver, but using BTTV functions */ -static void init_PXC200(struct bttv *btv) +static void __init init_PXC200(struct bttv *btv) { static const int vals[] = { 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0d, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, @@ -704,7 +705,7 @@ static struct CARD { unsigned id; int cardnr; char *name; -} cards[] = { +} cards[] __initdata = { { 0x00011002, BTTV_HAUPPAUGE878, "ATI TV Wonder" }, { 0x00011461, BTTV_AVPHONE98, "AVerMedia TVPhone98" }, { 0x00031461, BTTV_AVPHONE98, "AVerMedia TVPhone98" }, @@ -912,8 
+913,7 @@ static struct tvcard tvcards[] = }; #define TVCARDS (sizeof(tvcards)/sizeof(struct tvcard)) -static void -dump_eeprom(struct bttv *btv,int addr) +static void __init dump_eeprom(struct bttv *btv,int addr) { int i; @@ -930,8 +930,7 @@ dump_eeprom(struct bttv *btv,int addr) } } -static int -idcard_eeprom(struct bttv *btv) +static int __init idcard_eeprom(struct bttv *btv) { unsigned id; int i,n; @@ -1737,8 +1736,7 @@ static void bt848_set_geo(struct bttv *btv, struct tvnorm *tvn; unsigned long flags; - save_flags(flags); - cli(); + spin_lock_irqsave(&btv->s_lock, flags); tvn=&tvnorms[btv->win.norm]; @@ -1792,7 +1790,7 @@ static void bt848_set_geo(struct bttv *btv, btwrite(format, BT848_COLOR_FMT); btwrite(bswap | BT848_COLOR_CTL_GAMMA, BT848_COLOR_CTL); - restore_flags(flags); + spin_unlock_irqrestore(&btv->s_lock, flags); } @@ -1856,6 +1854,7 @@ static int vgrab(struct bttv *btv, struct video_mmap *mp) { unsigned int *ro, *re; unsigned int *vbuf; + unsigned long flags; if(btv->fbuffer==NULL) { @@ -1891,7 +1890,7 @@ static int vgrab(struct bttv *btv, struct video_mmap *mp) if (debug) printk("bttv%d: cap vgrab: queue %d (%d:%dx%d)\n", btv->nr,mp->frame,mp->format,mp->width,mp->height); - cli(); + spin_lock_irqsave(&btv->s_lock, flags); btv->gbuf[mp->frame].stat = GBUFFER_GRABBING; btv->gbuf[mp->frame].fmt = palette2fmt[mp->format]; btv->gbuf[mp->frame].width = mp->width; @@ -1912,7 +1911,7 @@ static int vgrab(struct bttv *btv, struct video_mmap *mp) btv->gqueue[btv->gq_in++] = mp->frame; btv->gq_in = btv->gq_in % MAX_GBUFFERS; - sti(); + spin_unlock_irqrestore(&btv->s_lock, flags); btor(3, BT848_CAP_CTL); btor(3, BT848_GPIO_DMA_CTL); return 0; @@ -1933,23 +1932,25 @@ static long bttv_read(struct video_device *v, char *buf, unsigned long count, in todo=count; while (todo && todo>(q=VBIBUF_SIZE-btv->vbip)) { + unsigned long flags; + if(copy_to_user((void *) buf, (void *) btv->vbibuf+btv->vbip, q)) return -EFAULT; todo-=q; buf+=q; - cli(); + 
spin_lock_irqsave(&btv->s_lock, flags); if (todo && q==VBIBUF_SIZE-btv->vbip) { if(nonblock) { - sti(); + spin_unlock_irqrestore(&btv->s_lock, flags); if(count==todo) return -EWOULDBLOCK; return count-todo; } + spin_unlock_irqrestore(&btv->s_lock, flags); interruptible_sleep_on(&btv->vbiq); - sti(); if(signal_pending(current)) { if(todo==count) @@ -1957,7 +1958,8 @@ static long bttv_read(struct video_device *v, char *buf, unsigned long count, in else return count-todo; } - } + } else + spin_unlock_irqrestore(&btv->s_lock, flags); } if (todo) { @@ -1980,7 +1982,7 @@ static void bt848_restart(struct bttv *btv) { if (verbose) printk("bttv%d: resetting chip\n",btv->nr); - btwrite(0xfffffUL, BT848_INT_STAT); + btwrite(~0x0UL, BT848_INT_STAT); btand(~15, BT848_GPIO_DMA_CTL); btwrite(0, BT848_SRESET); btwrite(virt_to_bus(btv->risc_jmp+2), @@ -2006,6 +2008,8 @@ static int bttv_open(struct video_device *dev, int flags) int i,ret; ret = -EBUSY; + + MOD_INC_USE_COUNT; down(&btv->lock); if (btv->user) goto out_unlock; @@ -2027,11 +2031,11 @@ static int bttv_open(struct video_device *dev, int flags) set_pll(btv); btv->user++; up(&btv->lock); - MOD_INC_USE_COUNT; return 0; out_unlock: up(&btv->lock); + MOD_DEC_USE_COUNT; return ret; } @@ -2749,6 +2753,8 @@ static long vbi_read(struct video_device *v, char *buf, unsigned long count, todo=count; while (todo && todo>(q=VBIBUF_SIZE-btv->vbip)) { + unsigned long flags; + if (btv->needs_restart) { down(&btv->lock); bt848_restart(btv); @@ -2759,18 +2765,18 @@ static long vbi_read(struct video_device *v, char *buf, unsigned long count, todo-=q; buf+=q; - cli(); + spin_lock_irqsave(&btv->s_lock, flags); if (todo && q==VBIBUF_SIZE-btv->vbip) { if(nonblock) { - sti(); + spin_unlock_irqrestore(&btv->s_lock, flags); if(count==todo) return -EWOULDBLOCK; return count-todo; } + spin_unlock_irqrestore(&btv->s_lock, flags); interruptible_sleep_on(&btv->vbiq); - sti(); if(signal_pending(current)) { if(todo==count) @@ -2778,7 +2784,8 @@ static long 
vbi_read(struct video_device *v, char *buf, unsigned long count, else return count-todo; } - } + } else + spin_unlock_irqrestore(&btv->s_lock, flags); } if (todo) { @@ -2807,6 +2814,8 @@ static int vbi_open(struct video_device *dev, int flags) { struct bttv *btv=(struct bttv *)(dev-2); + MOD_INC_USE_COUNT; + down(&btv->lock); if (btv->needs_restart) bt848_restart(btv); @@ -2815,7 +2824,6 @@ static int vbi_open(struct video_device *dev, int flags) bt848_set_risc_jmps(btv,-1); up(&btv->lock); - MOD_INC_USE_COUNT; return 0; } @@ -2887,6 +2895,7 @@ static int radio_open(struct video_device *dev, int flags) struct bttv *btv = (struct bttv *)(dev-1); unsigned long v; + MOD_INC_USE_COUNT; down(&btv->lock); if (btv->user) goto busy_unlock; @@ -2899,11 +2908,11 @@ static int radio_open(struct video_device *dev, int flags) bt848_muxsel(btv,0); up(&btv->lock); - MOD_INC_USE_COUNT; return 0; busy_unlock: up(&btv->lock); + MOD_DEC_USE_COUNT; return -EBUSY; } @@ -3009,7 +3018,7 @@ static struct video_device radio_template= #define TRITON_PEER_CONCURRENCY (1<<3) -static void handle_chipset(void) +static void __init handle_chipset(void) { struct pci_dev *dev = NULL; @@ -3041,7 +3050,7 @@ static void handle_chipset(void) /* can tda9855.c handle this too maybe? */ -static void init_tda9840(struct bttv *btv) +static void __init init_tda9840(struct bttv *btv) { /* Horrible Hack */ I2CWrite(btv, I2C_TDA9840, TDA9840_SW, 0x2a, 1); /* sound mode switching */ @@ -3057,17 +3066,16 @@ static void init_tda9840(struct bttv *btv) /* Figure out card and tuner type */ -static void idcard(int i) +static void __init idcard(struct bttv *btv) { - struct bttv *btv = &bttvs[i]; int type,eeprom = 0; btwrite(0, BT848_GPIO_OUT_EN); - DEBUG(printk(KERN_DEBUG "bttv%d: GPIO: 0x%08x\n", i, btread(BT848_GPIO_DATA))); + DEBUG(printk(KERN_DEBUG "bttv%d: GPIO: 0x%08x\n", btv->nr, btread(BT848_GPIO_DATA))); /* Default the card to the user-selected one. 
*/ - if (card[i] >= 0 && card[i] < TVCARDS) - btv->type=card[i]; + if (card[btv->nr] >= 0 && card[btv->nr] < TVCARDS) + btv->type=card[btv->nr]; /* If we were asked to auto-detect, then do so! */ if (btv->type == BTTV_UNKNOWN) { @@ -3220,6 +3228,10 @@ static void idcard(int i) static void bt848_set_risc_jmps(struct bttv *btv, int flags) { + unsigned long irq_flags; + + spin_lock_irqsave(&btv->s_lock, irq_flags); + if (-1 == flags) { /* defaults */ flags = 0; @@ -3306,18 +3318,17 @@ static void bt848_set_risc_jmps(struct bttv *btv, int flags) bt848_dma(btv, 3); else bt848_dma(btv, 0); + + spin_unlock_irqrestore(&btv->s_lock, irq_flags); } -static int -init_video_dev(struct bttv *btv) +static int __init init_video_dev(struct bttv *btv) { - int num = btv - bttvs; - memcpy(&btv->video_dev,&bttv_template, sizeof(bttv_template)); memcpy(&btv->vbi_dev,&vbi_template, sizeof(vbi_template)); memcpy(&btv->radio_dev,&radio_template,sizeof(radio_template)); - idcard(num); + idcard(btv); if(video_register_device(&btv->video_dev,VFL_TYPE_GRABBER)<0) return -1; @@ -3326,7 +3337,7 @@ init_video_dev(struct bttv *btv) video_unregister_device(&btv->video_dev); return -1; } - if (radio[num]) + if (radio[btv->nr]) { if(video_register_device(&btv->radio_dev, VFL_TYPE_RADIO)<0) { @@ -3338,9 +3349,8 @@ init_video_dev(struct bttv *btv) return 1; } -static int init_bt848(int i) +static int __init init_bt848(struct bttv *btv) { - struct bttv *btv = &bttvs[i]; int j; btv->user=0; @@ -3350,14 +3360,14 @@ static int init_bt848(int i) * might help to make a new card work */ if (verbose >= 2) printk("bttv%d: gpio: out_enable=0x%x, data=0x%x, in=0x%x\n", - i, + btv->nr, btread(BT848_GPIO_OUT_EN), btread(BT848_GPIO_DATA), btread(BT848_GPIO_REG_INP)); /* reset the bt848 */ btwrite(0, BT848_SRESET); - DEBUG(printk(KERN_DEBUG "bttv%d: bt848_mem: 0x%lx\n",i,(unsigned long) btv->bt848_mem)); + DEBUG(printk(KERN_DEBUG "bttv%d: bt848_mem: 0x%lx\n", btv->nr, (unsigned long) btv->bt848_mem)); /* not 
registered yet */ btv->video_dev.minor = -1; @@ -3465,7 +3475,7 @@ static int init_bt848(int i) btwrite(0x00, BT848_O_SCLOOP); /* clear interrupt status */ - btwrite(0xfffffUL, BT848_INT_STAT); + btwrite(~0x0UL, BT848_INT_STAT); /* set interrupt mask */ btwrite(btv->triton1| @@ -3505,7 +3515,6 @@ static void bttv_irq(int irq, void *dev_id, struct pt_regs * regs) astat=stat&btread(BT848_INT_MASK); if (!astat) return; - btwrite(astat,BT848_INT_STAT); IDEBUG(printk ("bttv%d: astat=%08x\n", btv->nr, astat)); IDEBUG(printk ("bttv%d: stat=%08x\n", btv->nr, stat)); @@ -3667,7 +3676,9 @@ static void bttv_irq(int irq, void *dev_id, struct pt_regs * regs) { IDEBUG(printk ("bttv%d: IRQ_I2CDONE\n", btv->nr)); } - + + btwrite(astat,BT848_INT_STAT); + count++; if (count > 10) printk (KERN_WARNING "bttv%d: irq loop %d\n", @@ -3687,7 +3698,76 @@ static void bttv_irq(int irq, void *dev_id, struct pt_regs * regs) * Scan for a Bt848 card, request the irq and map the io memory */ -int configure_bt848(struct pci_dev *dev, int bttv_num) +static void __init bttv_remove(struct pci_dev *pci_dev) +{ + u8 command; + int j; + struct bttv *btv = pci_dev->driver_data; + + /* unregister i2c_bus */ + i2c_bit_del_bus(&btv->i2c_adap); + + /* turn off all capturing, DMA and IRQs */ + btand(~15, BT848_GPIO_DMA_CTL); + + /* first disable interrupts before unmapping the memory! */ + btwrite(0, BT848_INT_MASK); + btwrite(~0x0UL,BT848_INT_STAT); + btwrite(0x0, BT848_GPIO_OUT_EN); + + /* disable PCI bus-mastering */ + pci_read_config_byte(btv->dev, PCI_COMMAND, &command); + /* Should this be &=~ ?? 
*/ + command&=~PCI_COMMAND_MASTER; + pci_write_config_byte(btv->dev, PCI_COMMAND, command); + + /* unmap and free memory */ + for (j = 0; j < gbuffers; j++) + if (btv->gbuf[j].risc) + kfree(btv->gbuf[j].risc); + if (btv->gbuf) + kfree((void *) btv->gbuf); + + if (btv->risc_scr_odd) + kfree((void *) btv->risc_scr_odd); + + if (btv->risc_scr_even) + kfree((void *) btv->risc_scr_even); + + DEBUG(printk(KERN_DEBUG "free: risc_jmp: 0x%p.\n", btv->risc_jmp)); + if (btv->risc_jmp) + kfree((void *) btv->risc_jmp); + + DEBUG(printk(KERN_DEBUG "bt848_vbibuf: 0x%p.\n", btv->vbibuf)); + if (btv->vbibuf) + vfree((void *) btv->vbibuf); + + free_irq(btv->irq,btv); + DEBUG(printk(KERN_DEBUG "bt848_mem: 0x%p.\n", btv->bt848_mem)); + if (btv->bt848_mem) + iounmap(btv->bt848_mem); + + if(btv->video_dev.minor!=-1) + video_unregister_device(&btv->video_dev); + if(btv->vbi_dev.minor!=-1) + video_unregister_device(&btv->vbi_dev); + if (radio[btv->nr] && btv->radio_dev.minor != -1) + video_unregister_device(&btv->radio_dev); + + release_mem_region(btv->bt848_adr, + pci_resource_len(btv->dev,0)); + /* wake up any waiting processes + because shutdown flag is set, no new processes (in this queue) + are expected + */ + btv->shutdown=1; + wake_up(&btv->gpioq); + + return; +} + + +static int __init bttv_probe(struct pci_dev *dev, const struct pci_device_id *pci_id) { int result; unsigned char command; @@ -3696,6 +3776,8 @@ int configure_bt848(struct pci_dev *dev, int bttv_num) unsigned int cmd; #endif + printk(KERN_INFO "bttv: Bt8xx card found (%d).\n", bttv_num); + btv=&bttvs[bttv_num]; btv->dev=dev; btv->nr = bttv_num; @@ -3711,6 +3793,7 @@ int configure_bt848(struct pci_dev *dev, int bttv_num) btv->vbip=VBIBUF_SIZE; init_waitqueue_head(&btv->gpioq); + btv->s_lock = SPIN_LOCK_UNLOCKED; btv->shutdown=0; btv->id=dev->device; @@ -3718,7 +3801,7 @@ int configure_bt848(struct pci_dev *dev, int bttv_num) btv->bt848_adr=pci_resource_start(dev, 0); if (pci_enable_device(dev)) return -EIO; - if 
(!request_mem_region(pci_resource_start(dev,0), + if (!request_mem_region(btv->bt848_adr, pci_resource_len(dev,0), "bttv")) { return -EBUSY; @@ -3782,119 +3865,53 @@ int configure_bt848(struct pci_dev *dev, int bttv_num) if (!(command&BT878_EN_TBFX)) { printk("bttv: 430FX compatibility could not be enabled\n"); + free_irq(btv->irq,btv); result = -1; goto fail; } } + + dev->driver_data = btv; + + if(init_bt848(btv) < 0) { + bttv_remove(dev); + return -EIO; + } + + bttv_num++; + return 0; fail: - release_mem_region(pci_resource_start(btv->dev,0), + release_mem_region(btv->bt848_adr, pci_resource_len(btv->dev,0)); return result; } -static int find_bt848(void) -{ - struct pci_dev *dev; - int result=0; - - bttv_num=0; - - pci_for_each_dev(dev) { - if (dev->vendor == PCI_VENDOR_ID_BROOKTREE) - if ((dev->device == PCI_DEVICE_ID_BT848)|| - (dev->device == PCI_DEVICE_ID_BT849)|| - (dev->device == PCI_DEVICE_ID_BT878)|| - (dev->device == PCI_DEVICE_ID_BT879)) - result=configure_bt848(dev,bttv_num++); - if (result) - return result; - } - if(bttv_num) - printk(KERN_INFO "bttv: %d Bt8xx card(s) found.\n", bttv_num); - return bttv_num; -} - -static void release_bttv(void) -{ - u8 command; - int i,j; - struct bttv *btv; - - for (i=0;ii2c_adap); - - /* turn off all capturing, DMA and IRQs */ - btand(~15, BT848_GPIO_DMA_CTL); - - /* first disable interrupts before unmapping the memory! */ - btwrite(0, BT848_INT_MASK); - btwrite(0xffffffffUL,BT848_INT_STAT); - btwrite(0x0, BT848_GPIO_OUT_EN); - - /* disable PCI bus-mastering */ - pci_read_config_byte(btv->dev, PCI_COMMAND, &command); - /* Should this be &=~ ?? 
*/ - command&=~PCI_COMMAND_MASTER; - pci_write_config_byte(btv->dev, PCI_COMMAND, command); - - /* unmap and free memory */ - for (j = 0; j < gbuffers; j++) - if (btv->gbuf[j].risc) - kfree(btv->gbuf[j].risc); - if (btv->gbuf) - kfree((void *) btv->gbuf); - - if (btv->risc_scr_odd) - kfree((void *) btv->risc_scr_odd); - - if (btv->risc_scr_even) - kfree((void *) btv->risc_scr_even); - - DEBUG(printk(KERN_DEBUG "free: risc_jmp: 0x%p.\n", btv->risc_jmp)); - if (btv->risc_jmp) - kfree((void *) btv->risc_jmp); - - DEBUG(printk(KERN_DEBUG "bt848_vbibuf: 0x%p.\n", btv->vbibuf)); - if (btv->vbibuf) - vfree((void *) btv->vbibuf); - - - free_irq(btv->irq,btv); - DEBUG(printk(KERN_DEBUG "bt848_mem: 0x%p.\n", btv->bt848_mem)); - if (btv->bt848_mem) - iounmap(btv->bt848_mem); +static struct pci_device_id bttv_pci_tbl[] __initdata = { + {PCI_VENDOR_ID_BROOKTREE, PCI_DEVICE_ID_BT848, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_BROOKTREE, PCI_DEVICE_ID_BT849, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_BROOKTREE, PCI_DEVICE_ID_BT878, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_BROOKTREE, PCI_DEVICE_ID_BT879, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; - if(btv->video_dev.minor!=-1) - video_unregister_device(&btv->video_dev); - if(btv->vbi_dev.minor!=-1) - video_unregister_device(&btv->vbi_dev); - if (radio[btv->nr] && btv->radio_dev.minor != -1) - video_unregister_device(&btv->radio_dev); +MODULE_DEVICE_TABLE(pci, bttv_pci_tbl); - release_mem_region(pci_resource_start(btv->dev,0), - pci_resource_len(btv->dev,0)); - /* wake up any waiting processes - because shutdown flag is set, no new processes (in this queue) - are expected - */ - btv->shutdown=1; - wake_up(&btv->gpioq); - } -} +static struct pci_driver bttv_pci_driver = { + name:"bttv", + id_table:bttv_pci_tbl, + probe:bttv_probe, + remove:bttv_remove, +}; -#ifdef MODULE -int init_module(void) -#else -int init_bttv_cards(struct video_init *unused) -#endif +static int __init bttv_init_module(void) 
{ - int i; + bttv_num = 0; printk(KERN_INFO "bttv: driver version %d.%d.%d loaded\n", (BTTV_VERSION_CODE >> 16) & 0xff, @@ -3909,31 +3926,18 @@ int init_bttv_cards(struct video_init *unused) gbuffers,gbufsize/1024,gbuffers*gbufsize/1024); handle_chipset(); - if (find_bt848()<=0) - return -EIO; - - /* initialize Bt848s */ - for (i=0; i #define RIO_NBOARDS 4 #define RIO_PORTSPERBOARD 128 diff --git a/drivers/char/rio/rioboot.c b/drivers/char/rio/rioboot.c index 64b180930f09..8168f213339f 100644 --- a/drivers/char/rio/rioboot.c +++ b/drivers/char/rio/rioboot.c @@ -35,7 +35,6 @@ static char *_rioboot_c_sccs_ = "@(#)rioboot.c 1.3"; #endif #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/riocmd.c b/drivers/char/rio/riocmd.c index 71adef52b909..835d815d35b0 100644 --- a/drivers/char/rio/riocmd.c +++ b/drivers/char/rio/riocmd.c @@ -35,7 +35,6 @@ static char *_riocmd_c_sccs_ = "@(#)riocmd.c 1.2"; #endif #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c index f23c872d75a6..fa68646d5c1d 100644 --- a/drivers/char/rio/rioctrl.c +++ b/drivers/char/rio/rioctrl.c @@ -35,7 +35,6 @@ static char *_rioctrl_c_sccs_ = "@(#)rioctrl.c 1.3"; #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/riointr.c b/drivers/char/rio/riointr.c index 427bbf7233e9..e92609839f9b 100644 --- a/drivers/char/rio/riointr.c +++ b/drivers/char/rio/riointr.c @@ -35,7 +35,6 @@ static char *_riointr_c_sccs_ = "@(#)riointr.c 1.2"; #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/rioparam.c b/drivers/char/rio/rioparam.c index aa4b782d3c1c..550e2c17d98d 100644 --- a/drivers/char/rio/rioparam.c +++ b/drivers/char/rio/rioparam.c @@ -35,7 +35,6 @@ static char *_rioparam_c_sccs_ = "@(#)rioparam.c 1.3"; #endif #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/rioroute.c 
b/drivers/char/rio/rioroute.c index aba66213b539..2f13eb273217 100644 --- a/drivers/char/rio/rioroute.c +++ b/drivers/char/rio/rioroute.c @@ -34,7 +34,6 @@ static char *_rioroute_c_sccs_ = "@(#)rioroute.c 1.3"; #endif #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/riotable.c b/drivers/char/rio/riotable.c index 09f9d8f7ef85..6329a6e18dba 100644 --- a/drivers/char/rio/riotable.c +++ b/drivers/char/rio/riotable.c @@ -34,7 +34,6 @@ static char *_riotable_c_sccs_ = "@(#)riotable.c 1.2"; #endif #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/rio/riotty.c b/drivers/char/rio/riotty.c index 84ea9020994e..cda417f3f4da 100644 --- a/drivers/char/rio/riotty.c +++ b/drivers/char/rio/riotty.c @@ -37,7 +37,6 @@ static char *_riotty_c_sccs_ = "@(#)riotty.c 1.3"; #define __EXPLICIT_DEF_H__ #define __NO_VERSION__ -#include #include #include #include diff --git a/drivers/char/videodev.c b/drivers/char/videodev.c index 364ae8e69525..618f02f85d22 100644 --- a/drivers/char/videodev.c +++ b/drivers/char/videodev.c @@ -41,7 +41,6 @@ static struct video_device *video_device[VIDEO_NUM_DEVICES]; #ifdef CONFIG_VIDEO_BT848 -extern int init_bttv_cards(struct video_init *); extern int i2c_tuner_init(struct video_init *); #endif #ifdef CONFIG_VIDEO_BWQCAM @@ -59,9 +58,8 @@ extern int init_zoran_cards(struct video_init *); static struct video_init video_init_list[]={ #ifdef CONFIG_VIDEO_BT848 - {"i2c-tuner", i2c_tuner_init}, - {"bttv", init_bttv_cards}, -#endif + {"i2c-tuner", i2c_tuner_init}, +#endif #ifdef CONFIG_VIDEO_BWQCAM {"bw-qcam", init_bw_qcams}, #endif diff --git a/drivers/isdn/avmb1/kcapi.c b/drivers/isdn/avmb1/kcapi.c index 140023d49eb3..6da2d24c1a2d 100644 --- a/drivers/isdn/avmb1/kcapi.c +++ b/drivers/isdn/avmb1/kcapi.c @@ -320,7 +320,7 @@ endloop: *eof = 1; if (off >= len+begin) return 0; - *start = page + (begin-off); + *start = page + (off-begin); return ((count < begin+len-off) ? 
count : begin+len-off); } diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index 39625e4814fa..443eaffd2446 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -3123,11 +3123,11 @@ int hp100_port[5] = { 0, -1, -1, -1, -1 }; MODULE_PARM(hp100_port, "1-5i"); #endif -#ifndef LINUX_2_1 -static char devname[5][IFNAMSIZ] = { "", "", "", "", "" }; -static char *hp100_name[5] = { devname[0], devname[1], - devname[2], devname[3], - devname[4] }; +/* Allocate 5 string of length IFNAMSIZ, one string for each device */ +char hp100_name[5][IFNAMSIZ] = { "", "", "", "", "" }; +#ifdef LINUX_2_1 +/* Allow insmod to write those 5 strings individually */ +MODULE_PARM(hp100_name, "1-5c" __MODULE_STRING(IFNAMSIZ)); #endif /* List of devices */ @@ -3159,9 +3159,11 @@ int init_module( void ) /* Create device and set basics args */ hp100_devlist[i] = kmalloc(sizeof(struct net_device), GFP_KERNEL); memset(hp100_devlist[i], 0x00, sizeof(struct net_device)); -#ifndef LINUX_2_1 +#if LINUX_VERSION_CODE >= 0x020362 /* 2.3.99-pre7 */ + memcpy(hp100_devlist[i]->name, hp100_name[i], IFNAMSIZ); /* Copy name */ +#else hp100_devlist[i]->name = hp100_name[i]; -#endif +#endif /* LINUX_VERSION_CODE >= 0x020362 */ hp100_devlist[i]->base_addr = hp100_port[i]; hp100_devlist[i]->init = &hp100_probe; diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index 9e772466c16e..841c0e12ced9 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -46,10 +47,12 @@ static int __attribute__((unused)) pppoe_debug = 7; -#define PPPOE_HASH_SIZE 16 +#define PPPOE_HASH_BITS 4 +#define PPPOE_HASH_SIZE (1<> ( j * PPPOE_HASH_BITS ); + } + } - i = (int) (hash.c[0] & (PPPOE_HASH_SIZE - 1)); + for (i = 0; i < (sizeof(unsigned long)*8) / PPPOE_HASH_BITS ; ++i) + hash ^= sid >> (i*PPPOE_HASH_BITS); - return i; + return hash & ( PPPOE_HASH_SIZE - 1 ); } static struct pppox_opt *item_hash_table[PPPOE_HASH_SIZE] = { 0, }; @@ 
-105,7 +103,7 @@ static struct pppox_opt *item_hash_table[PPPOE_HASH_SIZE] = { 0, }; /********************************************************************** * * Set/get/delete/rehash items (internal versions) - * + * **********************************************************************/ static struct pppox_opt *__get_item(unsigned long sid, unsigned char *addr) { @@ -162,7 +160,7 @@ static struct pppox_opt *__delete_item(unsigned long sid, char *addr) return ret; } -static struct pppox_opt *__find_on_dev(struct net_device *dev, +static struct pppox_opt *__find_on_dev(struct net_device *dev, struct pppox_opt *start) { struct pppox_opt *po; @@ -195,9 +193,9 @@ static struct pppox_opt *__find_on_dev(struct net_device *dev, /********************************************************************** * * Set/get/delete/rehash items - * + * **********************************************************************/ -static inline struct pppox_opt *get_item(unsigned long sid, +static inline struct pppox_opt *get_item(unsigned long sid, unsigned char *addr) { struct pppox_opt *po; @@ -239,7 +237,7 @@ static inline struct pppox_opt *delete_item(unsigned long sid, char *addr) return ret; } -static struct pppox_opt *find_on_dev(struct net_device *dev, +static struct pppox_opt *find_on_dev(struct net_device *dev, struct pppox_opt *start) { struct pppox_opt *po; @@ -255,7 +253,7 @@ static struct pppox_opt *find_on_dev(struct net_device *dev, * Certain device events require that sockets be unconnected * **************************************************************************/ -static int pppoe_device_event(struct notifier_block *this, +static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { int error = NOTIFY_DONE; @@ -277,7 +275,7 @@ static int pppoe_device_event(struct notifier_block *this, if (po->sk->state & PPPOX_CONNECTED) pppox_unbind_sock(po->sk); - + if (po->sk->state & PPPOX_CONNECTED) { lock_sock(po->sk); po->sk->shutdown = 
RCV_SHUTDOWN&SEND_SHUTDOWN; @@ -313,7 +311,7 @@ static struct notifier_block pppoe_notifier = { * Receive a PPPoE Session frame. * ***********************************************************************/ -static int pppoe_rcv(struct sk_buff *skb, +static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) @@ -336,6 +334,19 @@ static int pppoe_rcv(struct sk_buff *skb, skb_pull(skb, sizeof(struct pppoe_hdr)); ppp_input(&po->chan, skb); + } else if( sk->state & PPPOX_RELAY ){ + struct pppox_opt *relay_po; + + relay_po = get_item_by_addr( &po->pppoe_relay ); + + if( relay_po == NULL || + !( relay_po->sk->state & PPPOX_CONNECTED ) ) + goto abort; + + skb_pull(skb, sizeof(struct pppoe_hdr)); + if( !__pppoe_xmit( relay_po->sk , skb) ) + goto abort; + } else { sock_queue_rcv_skb(sk, skb); } @@ -353,7 +364,7 @@ abort: * -- This is solely for detection of PADT frames * ***********************************************************************/ -static int pppoe_disc_rcv(struct sk_buff *skb, +static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) @@ -539,7 +550,7 @@ int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, dev = dev_get_by_name(sp->sa_addr.pppoe.dev); - error = -ENODEV; + error = -ENODEV; if (!dev) goto end; @@ -768,13 +779,11 @@ end: /************************************************************************ * - * xmit function called by generic PPP driver - * sends PPP frame over PPPoE socket + * xmit function for internal use. 
* ***********************************************************************/ -int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) +int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *) chan->private; struct net_device *dev = sk->protinfo.pppox->pppoe_dev; struct pppoe_hdr hdr; struct pppoe_hdr *ph; @@ -833,6 +842,20 @@ int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) return 0; } + +/************************************************************************ + * + * xmit function called by generic PPP driver + * sends PPP frame over PPPoE socket + * + ***********************************************************************/ +int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) +{ + struct sock *sk = (struct sock *) chan->private; + return __pppoe_xmit(sk, skb); +} + + struct ppp_channel_ops pppoe_chan_ops = { pppoe_xmit , NULL }; int pppoe_rcvmsg(struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm) diff --git a/drivers/net/wan/comx-hw-comx.c b/drivers/net/wan/comx-hw-comx.c index 9def988f9bea..23899bde98a9 100644 --- a/drivers/net/wan/comx-hw-comx.c +++ b/drivers/net/wan/comx-hw-comx.c @@ -1031,11 +1031,6 @@ static int comxhw_write_proc(struct file *file, const char *buffer, char *page; - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - printk(KERN_ERR "comx_write_proc: file <-> data internal error\n"); - return -EIO; - } - if(ch->init_status & HW_OPEN) { return -EAGAIN; } diff --git a/drivers/net/wan/comx-hw-locomx.c b/drivers/net/wan/comx-hw-locomx.c index 5548fb558fdc..94561e859920 100644 --- a/drivers/net/wan/comx-hw-locomx.c +++ b/drivers/net/wan/comx-hw-locomx.c @@ -330,11 +330,6 @@ static int locomx_write_proc(struct file *file, const char *buffer, int val; char *page; - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - printk(KERN_ERR "hw_write_proc: file <-> data internal error\n"); - return -EIO; - } - if (!(page = (char 
*)__get_free_page(GFP_KERNEL))) { return -ENOMEM; } diff --git a/drivers/net/wan/comx-hw-mixcom.c b/drivers/net/wan/comx-hw-mixcom.c index bdda08c72e11..3ee2b690b7a9 100644 --- a/drivers/net/wan/comx-hw-mixcom.c +++ b/drivers/net/wan/comx-hw-mixcom.c @@ -742,11 +742,6 @@ static int mixcom_write_proc(struct file *file, const char *buffer, char *page; int value; - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - printk(KERN_ERR "mixcom_write_proc: file <-> data internal error\n"); - return -EIO; - } - if (!(page = (char *)__get_free_page(GFP_KERNEL))) { return -ENOMEM; } diff --git a/drivers/net/wan/comx-proto-fr.c b/drivers/net/wan/comx-proto-fr.c index 50cc61b02a05..f32c84e76846 100644 --- a/drivers/net/wan/comx-proto-fr.c +++ b/drivers/net/wan/comx-proto-fr.c @@ -641,11 +641,6 @@ static int fr_write_proc(struct file *file, const char *buffer, fr = ch->LINE_privdata; } - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - printk(KERN_ERR "comxfr_write_proc: file <-> data internal error\n"); - return -EIO; - } - if (!(page = (char *)__get_free_page(GFP_KERNEL))) { return -ENOMEM; } diff --git a/drivers/net/wan/comx-proto-lapb.c b/drivers/net/wan/comx-proto-lapb.c index af0ef3af34d6..abf8977ff3be 100644 --- a/drivers/net/wan/comx-proto-lapb.c +++ b/drivers/net/wan/comx-proto-lapb.c @@ -221,11 +221,6 @@ static int comxlapb_write_proc(struct file *file, const char *buffer, unsigned long parm; char *page; - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - printk(KERN_ERR "comxlapb_write_proc: file <-> data internal error\n"); - return -EIO; - } - if (lapb_getparms(dev->priv, &parms)) { return -ENODEV; } diff --git a/drivers/net/wan/comx.c b/drivers/net/wan/comx.c index e51991fdf8e3..d8e76deefb28 100644 --- a/drivers/net/wan/comx.c +++ b/drivers/net/wan/comx.c @@ -598,11 +598,6 @@ static int comx_write_proc(struct file *file, const char *buffer, u_long count, char str[30]; int ret=0; - if (file->f_dentry->d_inode->i_ino != entry->low_ino) { - 
printk(KERN_ERR "comx_write_proc: file <-> data internal error\n"); - return -EIO; - } - if (count > PAGE_SIZE) { printk(KERN_ERR "count is %lu > %d!!!\n", count, (int)PAGE_SIZE); return -ENOSPC; diff --git a/drivers/net/wavelan.c b/drivers/net/wavelan.c index b6be90321d60..51fc0b9a67b2 100644 --- a/drivers/net/wavelan.c +++ b/drivers/net/wavelan.c @@ -4211,6 +4211,7 @@ int init_module(void) break; } memset(dev, 0x00, sizeof(struct net_device)); + memcpy(dev->name, name[i], IFNAMSIZ); /* Copy name */ dev->base_addr = io[i]; dev->irq = irq[i]; dev->init = &wavelan_config; diff --git a/drivers/net/wavelan.p.h b/drivers/net/wavelan.p.h index 593b5655c722..3e20776fb213 100644 --- a/drivers/net/wavelan.p.h +++ b/drivers/net/wavelan.p.h @@ -695,8 +695,10 @@ static unsigned short iobase[] = /* Parameters set by insmod */ static int io[4] = { 0, 0, 0, 0 }; static int irq[4] = { 0, 0, 0, 0 }; +static char name[4][IFNAMSIZ] = { "", "", "", "" }; MODULE_PARM(io, "1-4i"); MODULE_PARM(irq, "1-4i"); +MODULE_PARM(name, "1-4c" __MODULE_STRING(IFNAMSIZ)); #endif /* MODULE */ #endif /* WAVELAN_P_H */ diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 63a185029fd8..c85addc5601f 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -602,6 +602,7 @@ parport_register_device(struct parport *port, const char *name, void parport_unregister_device(struct pardevice *dev) { struct parport *port; + unsigned long flags; #ifdef PARPORT_PARANOID if (dev == NULL) { @@ -614,11 +615,14 @@ void parport_unregister_device(struct pardevice *dev) port = dev->port->physport; + read_lock_irqsave (&port->cad_lock, flags); if (port->cad == dev) { + read_unlock_irqrestore (&port->cad_lock, flags); printk(KERN_DEBUG "%s: %s forgot to release port\n", port->name, dev->name); parport_release (dev); } + read_unlock_irqrestore (&port->cad_lock, flags); spin_lock(&port->pardevice_lock); if (dev->next) @@ -663,14 +667,17 @@ int parport_claim(struct pardevice *dev) struct parport 
*port = dev->port->physport; unsigned long flags; + read_lock_irqsave (&port->cad_lock, flags); if (port->cad == dev) { + read_unlock_irqrestore (&port->cad_lock, flags); printk(KERN_INFO "%s: %s already owner\n", dev->port->name,dev->name); return 0; } + read_unlock_irqrestore (&port->cad_lock, flags); -try_again: /* Preempt any current device */ + write_lock_irqsave (&port->cad_lock, flags); if ((oldcad = port->cad) != NULL) { if (oldcad->preempt) { if (oldcad->preempt(oldcad->private)) @@ -680,7 +687,9 @@ try_again: goto blocked; if (port->cad != oldcad) { - printk(KERN_WARNING + /* I think we'll actually deadlock rather than + get here, but just in case.. */ + printk(KERN_WARNING "%s: %s released port when preempted!\n", port->name, oldcad->name); if (port->cad) @@ -707,9 +716,7 @@ try_again: } /* Now we do the change of devices */ - write_lock_irqsave(&port->cad_lock, flags); port->cad = dev; - write_unlock_irqrestore(&port->cad_lock, flags); #ifdef CONFIG_PARPORT_1284 /* If it's a mux port, select it. */ @@ -729,6 +736,7 @@ try_again: /* Restore control registers */ port->ops->restore_state(port, dev->state); + write_unlock_irqrestore(&port->cad_lock, flags); dev->time = jiffies; return 0; @@ -736,13 +744,10 @@ blocked: /* If this is the first time we tried to claim the port, register an interest. This is only allowed for devices sleeping in parport_claim_or_block(), or those with a wakeup function. */ + + /* The cad_lock is still held for writing here */ if (dev->waiting & 2 || dev->wakeup) { - spin_lock_irqsave (&port->waitlist_lock, flags); - if (port->cad == NULL) { - /* The port got released in the meantime. */ - spin_unlock_irqrestore (&port->waitlist_lock, flags); - goto try_again; - } + spin_lock (&port->waitlist_lock); if (test_and_set_bit(0, &dev->waiting) == 0) { /* First add ourselves to the end of the wait list. 
*/ dev->waitnext = NULL; @@ -753,8 +758,9 @@ blocked: } else port->waithead = port->waittail = dev; } - spin_unlock_irqrestore (&port->waitlist_lock, flags); + spin_unlock (&port->waitlist_lock); } + write_unlock_irqrestore (&port->cad_lock, flags); return -EAGAIN; } @@ -826,7 +832,9 @@ void parport_release(struct pardevice *dev) unsigned long flags; /* Make sure that dev is the current device */ + write_lock_irqsave(&port->cad_lock, flags); if (port->cad != dev) { + write_unlock_irqrestore (&port->cad_lock, flags); printk(KERN_WARNING "%s: %s tried to release parport " "when not owner\n", port->name, dev->name); return; @@ -846,7 +854,6 @@ void parport_release(struct pardevice *dev) } #endif - write_lock_irqsave(&port->cad_lock, flags); port->cad = NULL; write_unlock_irqrestore(&port->cad_lock, flags); @@ -863,7 +870,7 @@ void parport_release(struct pardevice *dev) return; } else if (pd->wakeup) { pd->wakeup(pd->private); - if (dev->port->cad) + if (dev->port->cad) /* racy but no matter */ return; } else { printk(KERN_ERR "%s: don't know how to wake %s\n", port->name, pd->name); diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index e757c435fb66..5a735401c98d 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -19,64 +19,16 @@ extern int dasd_proc_read_statistics ( char *, char **, off_t, int); extern int dasd_proc_read_debug ( char *, char **, off_t, int); #endif /* DASD_PROFILE */ -struct proc_dir_entry dasd_proc_root_entry = { - 0, - 4,"dasd", - S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR | S_IWGRP, - 1,0,0, - 0, - NULL, -}; - -struct proc_dir_entry dasd_proc_devices_entry = { - 0, - 7,"devices", - S_IFREG | S_IRUGO | S_IXUGO | S_IWUSR | S_IWGRP, - 1,0,0, - 0, - NULL, - &dasd_proc_read_devices, -}; - -#ifdef DASD_PROFILE -struct proc_dir_entry dasd_proc_stats_entry = { - 0, - 10,"statistics", - S_IFREG | S_IRUGO | S_IXUGO | S_IWUSR | S_IWGRP, - 1,0,0, - 0, - NULL, - &dasd_proc_read_statistics, -}; - -struct 
proc_dir_entry dasd_proc_debug_entry = { - 0, - 5,"debug", - S_IFREG | S_IRUGO | S_IXUGO | S_IWUSR | S_IWGRP, - 1,0,0, - 0, - NULL, - &dasd_proc_read_debug, -}; -#endif /* DASD_PROFILE */ - -struct proc_dir_entry dasd_proc_device_template = { - 0, - 6,"dd????", - S_IFBLK | S_IRUGO | S_IWUSR | S_IWGRP, - 1,0,0, - 0, - NULL, -}; +static struct proc_dir_entry *dasd_proc_root_entry; void dasd_proc_init ( void ) { - proc_register( & proc_root, & dasd_proc_root_entry); - proc_register( & dasd_proc_root_entry, & dasd_proc_devices_entry); + dasd_proc_root_entry = proc_mkdir("dasd", NULL); + create_proc_info_entry("devices",0,&dasd_proc_root_entry,dasd_proc_read_devices); #ifdef DASD_PROFILE - proc_register( & dasd_proc_root_entry, & dasd_proc_stats_entry); - proc_register( & dasd_proc_root_entry, & dasd_proc_debug_entry); + create_proc_info_entry("statistics",0,&dasd_proc_root_entry,dasd_proc_read_statistics); + create_proc_info_entry("debug",0,&dasd_proc_root_entry,dasd_proc_read_debug); #endif /* DASD_PROFILE */ } diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 3e491d886e8c..aa61a55f8b25 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -4354,7 +4354,8 @@ Target Requested Completed Requested Completed Requested Completed\n\ HostAdapter, Length, BusLogic_MessageBufferSize); if ((Length -= Offset) <= 0) return 0; if (Length >= BytesAvailable) Length = BytesAvailable; - *StartPointer = &HostAdapter->MessageBuffer[Offset]; + memcpy(ProcBuffer, HostAdapter->MessageBuffer + Offset, Length); + *StartPointer = ProcBuffer; return Length; } diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index b11872c42173..240041cf0436 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -13,9 +13,13 @@ * General Public License for more details. 
* * - * $Id: aha152x.c,v 2.0 1999/12/25 15:07:32 fischer Exp fischer $ + * $Id: aha152x.c,v 2.1 2000/05/17 16:23:17 fischer Exp fischer $ * * $Log: aha152x.c,v $ + * Revision 2.1 2000/05/17 16:23:17 fischer + * - signature update + * - fix for data out w/o scatter gather + * * Revision 2.0 1999/12/25 15:07:32 fischer * - interrupt routine completly reworked * - basic support for new eh code @@ -202,7 +206,7 @@ #include -#ifdef PCMCIA +#if defined(PCMCIA) #undef MODULE #endif @@ -275,7 +279,6 @@ #define DPRINTK(when,msgs...) #define DO_LOCK(flags) spin_lock_irqsave(&QLOCK,flags) #define DO_UNLOCK(flags) spin_unlock_irqrestore(&QLOCK,flags) -#define DEBUG_DEFAULT 0 #endif #define LEAD "(scsi%d:%d:%d) " @@ -290,6 +293,7 @@ (cmd) ? ((cmd)->lun & 0x07) : -1 #define DELAY_DEFAULT 100 +#define DEBUG_DEFAULT 0 /* possible irq range */ #if defined(PCMCIA) @@ -1714,7 +1718,9 @@ static void reset_ports(struct Scsi_Host *shpnt) */ int aha152x_host_reset(Scsi_Cmnd * SCpnt) { +#if defined(AHA152X_DEBUG) struct Scsi_Host *shpnt = SCpnt->host; +#endif DPRINTK(debug_eh, DEBUG_LEAD "aha152x_host_reset(%p)\n", CMDINFO(SCpnt), SCpnt); @@ -2731,14 +2737,19 @@ static void datao_end(struct Scsi_Host *shpnt) CURRENT_SC->resid += data_count; - data_count -= CURRENT_SC->SCp.ptr - CURRENT_SC->SCp.buffer->address; - while(data_count>0) { - CURRENT_SC->SCp.buffer--; - CURRENT_SC->SCp.buffers_residual++; - data_count -= CURRENT_SC->SCp.buffer->length; + if(CURRENT_SC->use_sg) { + data_count -= CURRENT_SC->SCp.ptr - CURRENT_SC->SCp.buffer->address; + while(data_count>0) { + CURRENT_SC->SCp.buffer--; + CURRENT_SC->SCp.buffers_residual++; + data_count -= CURRENT_SC->SCp.buffer->length; + } + CURRENT_SC->SCp.ptr = CURRENT_SC->SCp.buffer->address - data_count; + CURRENT_SC->SCp.this_residual = CURRENT_SC->SCp.buffer->length + data_count; + } else { + CURRENT_SC->SCp.ptr -= data_count; + CURRENT_SC->SCp.this_residual += data_count; } - CURRENT_SC->SCp.ptr = CURRENT_SC->SCp.buffer->address - 
data_count; - CURRENT_SC->SCp.this_residual = CURRENT_SC->SCp.buffer->length + data_count; } DPRINTK(debug_datao, DEBUG_LEAD "datao_end: request_bufflen=%d; resid=%d; stcnt=%d\n", diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 1a4360b7e031..f051d391ad7f 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -1481,7 +1481,7 @@ int mega_findCard (Scsi_Host_Template * pHostTmpl, if ((flag & BOARD_QUARTZ) && (skip_id == -1)) { u16 magic; pci_read_config_word(pdev, PCI_CONF_AMISIG, &magic); - if (magic != AMI_SIGNATURE) + if ((magic != AMI_SIGNATURE) && (magic != AMI_SIGNATURE_471)) continue; /* not an AMI board */ } printk (KERN_INFO "megaraid: found 0x%4.04x:0x%4.04x: in %s\n", diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index f650b5ae6372..dc6207ba2fc2 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -109,6 +109,7 @@ #define PCI_CONF_IRQ_OFFSET 0x3c #define PCI_CONF_AMISIG 0xa0 #define AMI_SIGNATURE 0x3344 +#define AMI_SIGNATURE_471 0xCCCC #if LINUX_VERSION_CODE < 0x20100 #define MEGARAID \ diff --git a/drivers/sound/ac97_codec.c b/drivers/sound/ac97_codec.c index 13e588c98530..b26c3a8d5d3f 100644 --- a/drivers/sound/ac97_codec.c +++ b/drivers/sound/ac97_codec.c @@ -71,6 +71,7 @@ static struct { {0x83847605, "SigmaTel STAC9704" , NULL}, {0x83847608, "SigmaTel STAC9708" , NULL}, {0x83847609, "SigmaTel STAC9721/23" , sigmatel_init}, + {0x54524106, "TriTech TR28026" , NULL}, {0x54524108, "TriTech TR28028" , NULL}, {0x574D4C00, "Wolfson WM9704" , NULL}, {0x00000000, NULL, NULL} @@ -330,6 +331,10 @@ static int ac97_recmask_io(struct ac97_codec *codec, int rw, int mask) /* else, write the first set in the mask as the output */ + /* clear out current set value first (AC97 supports only 1 input!) 
*/ + val = (1 << ac97_rm2oss[codec->codec_read(codec, AC97_RECORD_SELECT)&0x07]); + if (mask != val) mask &= ~val; + val = ffs(mask); val = ac97_oss_rm[val-1]; val |= val << 8; /* set both channels */ @@ -418,6 +423,7 @@ static int ac97_mixer_ioctl(struct ac97_codec *codec, unsigned int cmd, unsigned switch (_IOC_NR(cmd)) { case SOUND_MIXER_RECSRC: /* Arg contains a bit for each recording source */ if (!codec->recmask_io) return -EINVAL; + if(!val) return 0; if (!(val &= codec->record_sources)) return -EINVAL; codec->recmask_io(codec, 0, val); diff --git a/drivers/sound/cmpci.c b/drivers/sound/cmpci.c index 7067bbc1f845..df05c9b64bde 100644 --- a/drivers/sound/cmpci.c +++ b/drivers/sound/cmpci.c @@ -586,7 +586,7 @@ static void start_adc(struct cm_state *s) /* --------------------------------------------------------------------- */ -#define DMABUF_DEFAULTORDER (17-PAGE_SHIFT) +#define DMABUF_DEFAULTORDER (16-PAGE_SHIFT) #define DMABUF_MINORDER 1 static void dealloc_dmabuf(struct dmabuf *db) diff --git a/drivers/sound/trident.c b/drivers/sound/trident.c index a1f462f3b34d..6a3b8b4d6c1d 100644 --- a/drivers/sound/trident.c +++ b/drivers/sound/trident.c @@ -29,6 +29,10 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
* * History + * v0.14.3 May 20 2000 Aaron Holtzman + * Fix kfree'd memory access in release + * Fix race in open while looking for a free virtual channel slot + * remove open_wait wq (which appears to be unused) * v0.14.2 Mar 29 2000 Ching Ling Lee * Add clear to silence advance in trident_update_ptr * fix invalid data of the end of the sound @@ -166,10 +170,6 @@ struct trident_state { unsigned int magic; struct trident_card *card; /* Card info */ - /* single open lock mechanism, only used for recording */ - struct semaphore open_sem; - wait_queue_head_t open_wait; - /* file mode */ mode_t open_mode; @@ -261,6 +261,9 @@ struct trident_card { /* We keep trident cards in a linked list */ struct trident_card *next; + /* single open lock mechanism, only used for recording */ + struct semaphore open_sem; + /* The trident has a certain amount of cross channel interaction so we use a single per card lock */ spinlock_t lock; @@ -1904,6 +1907,7 @@ static int trident_open(struct inode *inode, struct file *file) /* find an avaiable virtual channel (instance of /dev/dsp) */ while (card != NULL) { + down(&card->open_sem); for (i = 0; i < NR_HW_CH; i++) { if (card->states[i] == NULL) { state = card->states[i] = (struct trident_state *) @@ -1915,6 +1919,7 @@ static int trident_open(struct inode *inode, struct file *file) goto found_virt; } } + up(&card->open_sem); card = card->next; } /* no more virtual channel avaiable */ @@ -1939,10 +1944,8 @@ static int trident_open(struct inode *inode, struct file *file) state->card = card; state->magic = TRIDENT_STATE_MAGIC; init_waitqueue_head(&dmabuf->wait); - init_MUTEX(&state->open_sem); file->private_data = state; - down(&state->open_sem); /* set default sample format. 
According to OSS Programmer's Guide /dev/dsp should be default to unsigned 8-bits, mono, with sample rate 8kHz and @@ -1985,7 +1988,7 @@ static int trident_open(struct inode *inode, struct file *file) } state->open_mode |= file->f_mode & (FMODE_READ | FMODE_WRITE); - up(&state->open_sem); + up(&card->open_sem); #ifdef DEBUG printk(KERN_ERR "trident: open virtual channel %d, hard channel %d\n", @@ -1999,6 +2002,7 @@ static int trident_open(struct inode *inode, struct file *file) static int trident_release(struct inode *inode, struct file *file) { struct trident_state *state = (struct trident_state *)file->private_data; + struct trident_card *card = state->card; struct dmabuf *dmabuf = &state->dmabuf; VALIDATE_STATE(state); @@ -2009,25 +2013,25 @@ static int trident_release(struct inode *inode, struct file *file) } /* stop DMA state machine and free DMA buffers/channels */ - down(&state->open_sem); + down(&card->open_sem); if (file->f_mode & FMODE_WRITE) { stop_dac(state); dealloc_dmabuf(state); state->card->free_pcm_channel(state->card, dmabuf->channel->num); } + if (file->f_mode & FMODE_READ) { stop_adc(state); dealloc_dmabuf(state); state->card->free_pcm_channel(state->card, dmabuf->channel->num); } - kfree(state->card->states[state->virt]); - state->card->states[state->virt] = NULL; - state->open_mode &= (~file->f_mode) & (FMODE_READ|FMODE_WRITE); + card->states[state->virt] = NULL; + kfree(state); /* we're covered by the open_sem */ - up(&state->open_sem); + up(&card->open_sem); MOD_DEC_USE_COUNT; return 0; @@ -2408,6 +2412,7 @@ static int __init trident_probe(struct pci_dev *pci_dev, const struct pci_device card->banks[BANK_A].bitmap = 0UL; card->banks[BANK_B].addresses = &bank_b_addrs; card->banks[BANK_B].bitmap = 0UL; + init_MUTEX(&card->open_sem); spin_lock_init(&card->lock); devs = card; diff --git a/drivers/video/riva/fbdev.c b/drivers/video/riva/fbdev.c index 5aad57367377..4b12369645c8 100644 --- a/drivers/video/riva/fbdev.c +++ 
b/drivers/video/riva/fbdev.c @@ -69,7 +69,7 @@ if(!(expr)) { \ printk( "Assertion failed! %s,%s,%s,line=%d\n",\ #expr,__FILE__,__FUNCTION__,__LINE__); \ - BUG(); + BUG(); \ } #else #define assert(expr) diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 81307f4db31a..68be7a69d3a8 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -259,8 +259,6 @@ int coda_cache_check(struct inode *inode, int mask) void coda_purge_dentries(struct inode *inode) { - struct list_head *tmp, *head = &inode->i_dentry; - if (!inode) return ; @@ -268,23 +266,7 @@ void coda_purge_dentries(struct inode *inode) iget(inode->i_sb, inode->i_ino); /* catch the dentries later if some are still busy */ coda_flag_inode(inode, C_PURGE); - -restart: - tmp = head; - while ((tmp = tmp->next) != head) { - struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); - if (!dentry->d_count) { - CDEBUG(D_DOWNCALL, - "coda_free_dentries: freeing %s/%s, i_count=%d\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - inode->i_count); - dget(dentry); - d_drop(dentry); - dput(dentry); - goto restart; - } - - } + d_prune_aliases(inode); iput(inode); } @@ -311,7 +293,6 @@ static void coda_flag_children(struct dentry *parent, int flag) void coda_flag_inode_children(struct inode *inode, int flag) { - struct list_head *alias; struct dentry *alias_de; ENTRY; diff --git a/fs/dcache.c b/fs/dcache.c index 6203994f998b..ad897ff38644 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -248,6 +248,26 @@ struct dentry * d_find_alias(struct inode *inode) return NULL; } +/* + * Try to kill dentries associated with this inode. + * WARNING: you must own a reference to inode. 
+ */ +void d_prune_aliases(struct inode *inode) +{ + struct list_head *tmp, *head = &inode->i_dentry; +restart: + tmp = head; + while ((tmp = tmp->next) != head) { + struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); + if (!dentry->d_count) { + dget(dentry); + d_drop(dentry); + dput(dentry); + goto restart; + } + } +} + /* * Throw away a dentry - free the inode, dput the parent. * This requires that the LRU list has already been diff --git a/fs/exec.c b/fs/exec.c index eded5971f2b6..6811398438e1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -45,8 +45,8 @@ #include #endif -static struct linux_binfmt *formats = (struct linux_binfmt *) NULL; -static spinlock_t binfmt_lock = SPIN_LOCK_UNLOCKED; +static struct linux_binfmt *formats; +static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; int register_binfmt(struct linux_binfmt * fmt) { @@ -56,17 +56,17 @@ int register_binfmt(struct linux_binfmt * fmt) return -EINVAL; if (fmt->next) return -EBUSY; - spin_lock(&binfmt_lock); + write_lock(&binfmt_lock); while (*tmp) { if (fmt == *tmp) { - spin_unlock(&binfmt_lock); + write_unlock(&binfmt_lock); return -EBUSY; } tmp = &(*tmp)->next; } fmt->next = formats; formats = fmt; - spin_unlock(&binfmt_lock); + write_unlock(&binfmt_lock); return 0; } @@ -74,16 +74,16 @@ int unregister_binfmt(struct linux_binfmt * fmt) { struct linux_binfmt ** tmp = &formats; - spin_lock(&binfmt_lock); + write_lock(&binfmt_lock); while (*tmp) { if (fmt == *tmp) { *tmp = fmt->next; - spin_unlock(&binfmt_lock); + write_unlock(&binfmt_lock); return 0; } tmp = &(*tmp)->next; } - spin_unlock(&binfmt_lock); + write_unlock(&binfmt_lock); return -EINVAL; } @@ -103,35 +103,34 @@ asmlinkage long sys_uselib(const char * library) { int fd, retval; struct file * file; - struct linux_binfmt * fmt; - lock_kernel(); fd = sys_open(library, 0, 0); - retval = fd; if (fd < 0) - goto out; + return fd; file = fget(fd); retval = -ENOEXEC; - if (file && file->f_op && file->f_op->read) { - spin_lock(&binfmt_lock); - for 
(fmt = formats ; fmt ; fmt = fmt->next) { - if (!fmt->load_shlib) - continue; - if (!try_inc_mod_count(fmt->module)) - continue; - spin_unlock(&binfmt_lock); - retval = fmt->load_shlib(file); - spin_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC) - break; + if (file) { + if(file->f_op && file->f_op->read) { + struct linux_binfmt * fmt; + + read_lock(&binfmt_lock); + for (fmt = formats ; fmt ; fmt = fmt->next) { + if (!fmt->load_shlib) + continue; + if (!try_inc_mod_count(fmt->module)) + continue; + read_unlock(&binfmt_lock); + retval = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); } - spin_unlock(&binfmt_lock); + fput(file); } - fput(file); sys_close(fd); -out: - unlock_kernel(); return retval; } @@ -747,14 +746,14 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } #endif for (try=0; try<2; try++) { - spin_lock(&binfmt_lock); + read_lock(&binfmt_lock); for (fmt = formats ; fmt ; fmt = fmt->next) { int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; if (!fn) continue; if (!try_inc_mod_count(fmt->module)) continue; - spin_unlock(&binfmt_lock); + read_unlock(&binfmt_lock); retval = fn(bprm, regs); if (retval >= 0) { put_binfmt(fmt); @@ -764,16 +763,16 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) current->did_exec = 1; return retval; } - spin_lock(&binfmt_lock); + read_lock(&binfmt_lock); put_binfmt(fmt); if (retval != -ENOEXEC) break; if (!bprm->file) { - spin_unlock(&binfmt_lock); + read_unlock(&binfmt_lock); return retval; } } - spin_unlock(&binfmt_lock); + read_unlock(&binfmt_lock); if (retval != -ENOEXEC) { break; #ifdef CONFIG_KMOD diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f639d1a95c01..f35ef3bdb5da 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -605,11 +605,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) int error; nfs_zap_caches(dir_i); + 
NFS_CACHEINV(inode); error = NFS_PROTO(dir_i)->remove(dir, &dentry->d_name); - if (error >= 0) { - if (inode->i_nlink) - inode->i_nlink --; - } } iput(inode); } @@ -678,7 +675,6 @@ static int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, nfs_renew_times(dentry); error = 0; } - NFS_CACHEINV(dentry->d_parent->d_inode); return error; } @@ -772,7 +768,6 @@ static int nfs_mkdir(struct inode *dir_i, struct dentry *dentry, int mode) d_drop(dentry); #endif nfs_zap_caches(dir_i); - dir_i->i_nlink++; error = NFS_PROTO(dir_i)->mkdir(dir, &dentry->d_name, &attr, &fhandle, &fattr); if (!error && fhandle.size != 0) @@ -793,12 +788,6 @@ static int nfs_rmdir(struct inode *dir_i, struct dentry *dentry) nfs_zap_caches(dir_i); error = NFS_PROTO(dir_i)->rmdir(dir, &dentry->d_name); - /* Update i_nlink */ - if (!error) { - if (dir_i->i_nlink) - dir_i->i_nlink--; - } - return error; } @@ -881,7 +870,7 @@ out: * Remove a file after making sure there are no pending writes, * and after checking that the file has only one user. * - * We update inode->i_nlink and free the inode prior to the operation + * We invalidate the attribute cache and free the inode prior to the operation * to avoid possible races if the server reuses the inode. */ static int nfs_safe_remove(struct dentry *dentry) @@ -895,13 +884,6 @@ static int nfs_safe_remove(struct dentry *dentry) dentry->d_parent->d_name.name, dentry->d_name.name, inode->i_ino); - if (dentry->d_count > 1) { -#ifdef NFS_PARANOIA -printk("nfs_safe_remove: %s/%s busy, d_count=%d\n", -dentry->d_parent->d_name.name, dentry->d_name.name, dentry->d_count); -#endif - goto out; - } /* * Unhash the dentry while we remove the file ... 
*/ @@ -909,22 +891,26 @@ dentry->d_parent->d_name.name, dentry->d_name.name, dentry->d_count); d_drop(dentry); rehash = 1; } + if (dentry->d_count > 1) { +#ifdef NFS_PARANOIA + printk("nfs_safe_remove: %s/%s busy, d_count=%d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_count); +#endif + goto out; + } nfs_zap_caches(dir_i); + NFS_CACHEINV(inode); error = NFS_PROTO(dir_i)->remove(dir, &dentry->d_name); if (error < 0) goto out; /* - * Update i_nlink and free the inode + * Free the inode */ - if (inode->i_nlink) - inode->i_nlink --; d_delete(dentry); - /* - * Rehash the negative dentry if the operation succeeded. - */ - if (rehash) - d_add(dentry, NULL); out: + if (rehash) + d_rehash(dentry); return error; } @@ -1018,14 +1004,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) */ d_drop(dentry); nfs_zap_caches(dir_i); + NFS_CACHEINV(inode); error = NFS_PROTO(dir_i)->link(old_dentry, dir, &dentry->d_name); - if (!error) { - /* - * Update the link count immediately, as some apps - * (e.g. pine) test this after making a link. - */ - inode->i_nlink++; - } return error; } @@ -1058,8 +1038,17 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; - struct dentry *dentry = NULL; - int error, rehash = 0; + struct dentry *dentry = NULL, *rehash = NULL; + int error = -EBUSY; + + /* + * To prevent any new references to the target during the rename, + * we unhash the dentry and free the inode in advance. 
+ */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, @@ -1076,7 +1065,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (!new_inode) goto go_ahead; - error = -EBUSY; if (S_ISDIR(new_inode->i_mode)) goto out; else if (new_dentry->d_count > 1) { @@ -1090,10 +1078,10 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* silly-rename the existing target ... */ err = nfs_sillyrename(new_dir, new_dentry); if (!err) { - new_dentry = dentry; + new_dentry = rehash = dentry; new_inode = NULL; - /* hash the replacement target */ - d_add(new_dentry, NULL); + /* instantiate the replacement target */ + d_instantiate(new_dentry, NULL); } /* dentry still busy? */ @@ -1117,14 +1105,6 @@ go_ahead: shrink_dcache_parent(old_dentry); } - /* - * To prevent any new references to the target during the rename, - * we unhash the dentry and free the inode in advance. - */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = 1; - } if (new_inode) d_delete(new_dentry); @@ -1134,15 +1114,12 @@ go_ahead: &old_dentry->d_name, new_dentry->d_parent, &new_dentry->d_name); - NFS_CACHEINV(old_dir); - NFS_CACHEINV(new_dir); - /* Update the dcache if needed */ +out: if (rehash) - d_add(new_dentry, NULL); + d_rehash(rehash); if (!error && !S_ISDIR(old_inode->i_mode)) d_move(old_dentry, new_dentry); -out: /* new dentry created? 
*/ if (dentry) dput(dentry); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5b1092846a45..44e3030c3814 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -555,22 +555,18 @@ nfs_free_dentries(struct inode *inode) struct list_head *tmp, *head = &inode->i_dentry; int unhashed; -restart: + if (S_ISDIR(inode->i_mode)) { + struct dentry *dentry = d_find_alias(inode); + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } + } + d_prune_aliases(inode); tmp = head; unhashed = 0; while ((tmp = tmp->next) != head) { struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); - dprintk("nfs_free_dentries: found %s/%s, d_count=%d, hashed=%d\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - dentry->d_count, !d_unhashed(dentry)); - if (!list_empty(&dentry->d_subdirs)) - shrink_dcache_parent(dentry); - if (!dentry->d_count) { - dget(dentry); - d_drop(dentry); - dput(dentry); - goto restart; - } if (d_unhashed(dentry)) unhashed++; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 79ef12a7bc33..c47830ff30a1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -49,16 +49,21 @@ nfsd_cache_init(void) struct svc_cacherep *rp; struct nfscache_head *rh; size_t i; + unsigned long order; if (cache_initialized) return; i = CACHESIZE * sizeof (struct svc_cacherep); - nfscache = kmalloc (i, GFP_KERNEL); + for (order = 0; (PAGE_SIZE << order) < i; order++) + ; + nfscache = (struct svc_cacherep *) + __get_free_pages(GFP_KERNEL, order); if (!nfscache) { printk (KERN_ERR "nfsd: cannot allocate %d bytes for reply cache\n", i); return; } + memset(nfscache, 0, i); i = HASHSIZE * sizeof (struct nfscache_head); hash_list = kmalloc (i, GFP_KERNEL); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 921d84cc6d67..3b875a4452e2 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1,4 +1,4 @@ -/* $Id: inode.c,v 1.10 2000/03/24 01:32:51 davem Exp $ +/* $Id: inode.c,v 1.11 2000/05/22 07:29:42 davem Exp $ * openpromfs.c: 
/proc/openprom handling routines * * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) diff --git a/fs/pipe.c b/fs/pipe.c index a2abaed9efcf..b97851fab7be 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -465,7 +465,7 @@ fail_page: return NULL; } -static struct vfsmount *pipe_mnt = NULL; +static struct vfsmount *pipe_mnt; static struct inode * get_pipe_inode(void) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 7da659b57f4b..d513987d828b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * For hysterical raisins we keep the same inumbers as in the old procfs. @@ -866,6 +867,28 @@ static struct inode_operations proc_base_inode_operations = { lookup: proc_base_lookup, }; +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + char tmp[30]; + sprintf(tmp, "%d", current->pid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static int proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char tmp[30]; + sprintf(tmp, "%d", current->pid); + return vfs_follow_link(nd,tmp); +} + +static struct inode_operations proc_self_inode_operations = { + readlink: proc_self_readlink, + follow_link: proc_self_follow_link, +}; + struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) { unsigned int pid, c; @@ -877,6 +900,23 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) pid = 0; name = dentry->d_name.name; len = dentry->d_name.len; + if (len == 4 && !memcmp(name, "self", 4)) { + inode = get_empty_inode(); + if (!inode) + return ERR_PTR(-ENOMEM); + inode->i_sb = dir->i_sb; + inode->i_dev = dir->i_sb->s_dev; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = fake_ino(0, PROC_PID_INO); + inode->u.proc_i.file = NULL; + inode->u.proc_i.task = NULL; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_uid = inode->i_gid = 0; + inode->i_size = 64; + inode->i_op = 
&proc_self_inode_operations; + d_add(dentry, inode); + return NULL; + } while (len-- > 0) { c = *name - '0'; name++; @@ -921,7 +961,8 @@ void proc_pid_delete_inode(struct inode *inode) { if (inode->u.proc_i.file) fput(inode->u.proc_i.file); - free_task_struct(inode->u.proc_i.task); + if (inode->u.proc_i.task) + free_task_struct(inode->u.proc_i.task); } #define PROC_NUMBUF 10 @@ -937,7 +978,7 @@ static int get_pid_list(int index, unsigned int *pids) struct task_struct *p; int nr_pids = 0; - index -= FIRST_PROCESS_ENTRY; + index--; read_lock(&tasklist_lock); for_each_task(p) { int pid = p->pid; @@ -958,9 +999,17 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr = filp->f_pos; + unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; unsigned int nr_pids, i; + if (!nr) { + ino_t ino = fake_ino(0,PROC_PID_INO); + if (filldir(dirent, "self", 4, filp->f_pos, ino) < 0) + return 0; + filp->f_pos++; + nr++; + } + nr_pids = get_pid_list(nr, pid_array); for (i = 0; i < nr_pids; i++) { @@ -968,11 +1017,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) ino_t ino = fake_ino(pid,PROC_PID_INO); unsigned long j = PROC_NUMBUF; - do { - j--; - buf[j] = '0' + (pid % 10); - pid /= 10; - } while (pid); + do buf[--j] = '0' + (pid % 10); while (pid/=10); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino) < 0) break; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 68aa49c1ced4..1585657a2d7b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -140,9 +140,13 @@ proc_file_lseek(struct file * file, loff_t offset, int orig) { switch (orig) { case 0: + if (offset < 0) + return -EINVAL; file->f_pos = offset; return(file->f_pos); case 1: + if (offset + file->f_pos < 0) + return -EINVAL; file->f_pos += offset; return(file->f_pos); case 2: @@ -339,7 +343,7 @@ static struct inode_operations proc_dir_inode_operations = { lookup: proc_lookup, }; 
-int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { int i; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 88d41c3c21f4..c64166f78aca 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -112,7 +112,6 @@ static void add_node(struct device_node *np, struct proc_dir_entry *de) al = proc_symlink(at, de, ent->name); if (al == 0) break; - proc_register(de, al); *lastp = al; lastp = &al->next; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 8088d064d903..075a5843df66 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -22,38 +22,9 @@ struct proc_dir_entry *proc_net, *proc_bus, *proc_root_fs, *proc_root_driver; struct proc_dir_entry *proc_sys_root; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char *buffer, int buflen) -{ - char tmp[30]; - sprintf(tmp, "%d", current->pid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static int proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char tmp[30]; - sprintf(tmp, "%d", current->pid); - return vfs_follow_link(nd,tmp); -} - -static struct inode_operations proc_self_inode_operations = { - readlink: proc_self_readlink, - follow_link: proc_self_follow_link -}; - -static struct proc_dir_entry proc_root_self = { - 0, 4, "self", - S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO, 1, 0, 0, - 64, &proc_self_inode_operations, -}; - void __init proc_root_init(void) { proc_misc_init(); - proc_register(&proc_root, &proc_root_self); proc_net = proc_mkdir("net", 0); #ifdef CONFIG_SYSVIPC proc_mkdir("sysvipc", 0); diff --git a/fs/super.c b/fs/super.c index 993668a91490..f1d873331056 100644 --- a/fs/super.c +++ b/fs/super.c @@ -315,6 +315,7 @@ static struct vfsmount *add_vfsmnt(struct super_block *sb, strcpy(name, dir_name); mnt->mnt_dirname = name; } + mnt->mnt_owner = current->uid; if (parent) list_add(&mnt->mnt_child, &parent->mnt_mounts); 
@@ -1021,9 +1022,6 @@ asmlinkage long sys_umount(char * name, int flags) char *kname; int retval; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - lock_kernel(); kname = getname(name); retval = PTR_ERR(kname); @@ -1038,6 +1036,11 @@ asmlinkage long sys_umount(char * name, int flags) retval = -EINVAL; if (nd.dentry!=nd.mnt->mnt_root) goto dput_and_out; + + retval = -EPERM; + if (!capable(CAP_SYS_ADMIN) && current->uid!=nd.mnt->mnt_owner) + goto dput_and_out; + dput(nd.dentry); /* puts nd.mnt */ down(&mount_sem); @@ -1060,6 +1063,21 @@ asmlinkage long sys_oldumount(char * name) return sys_umount(name,0); } +static int mount_is_safe(struct nameidata *nd) +{ + if (capable(CAP_SYS_ADMIN)) + return 0; + if (S_ISLNK(nd->dentry->d_inode->i_mode)) + return -EPERM; + if (nd->dentry->d_inode->i_mode & S_ISVTX) { + if (current->uid != nd->dentry->d_inode->i_uid) + return -EPERM; + } + if (permission(nd->dentry->d_inode, MAY_WRITE)) + return -EPERM; + return 0; +} + /* * do loopback mount. */ @@ -1069,18 +1087,22 @@ static int do_loopback(char *old_name, char *new_name) int err = 0; if (!old_name || !*old_name) return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &old_nd)) + if (path_init(old_name, LOOKUP_POSITIVE, &old_nd)) err = path_walk(old_name, &old_nd); if (err) goto out; - if (path_init(new_name, LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &new_nd)) + if (path_init(new_name, LOOKUP_POSITIVE, &new_nd)) err = path_walk(new_name, &new_nd); if (err) goto out1; - err = -EPERM; - if (!capable(CAP_SYS_ADMIN) && - current->uid != new_nd.dentry->d_inode->i_uid) + err = mount_is_safe(&new_nd); + if (err) + goto out2; + err = -EINVAL; + if (S_ISDIR(new_nd.dentry->d_inode->i_mode) != + S_ISDIR(old_nd.dentry->d_inode->i_mode)) goto out2; + down(&mount_sem); err = -ENOENT; if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry)) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index bded47be366d..d56ff9a0ccf1 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -346,7 
+346,7 @@ udf_add_entry(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; - if (dentry->d_name.len) + if (dentry) { if ( !(udf_char_to_ustr(&unifilename, dentry->d_name.name, dentry->d_name.len)) ) { @@ -447,20 +447,17 @@ udf_add_entry(struct inode *dir, struct dentry *dentry, } } - if (!lfi) + if (!lfi || !dentry) continue; - if ((flen = udf_get_filename(nameptr, fname, lfi))) - { - if (udf_match(flen, fname, &(dentry->d_name))) - { - if (fibh->sbh != fibh->ebh) - udf_release_data(fibh->ebh); - udf_release_data(fibh->sbh); - udf_release_data(bh); - *err = -EEXIST; - return NULL; - } + if ((flen = udf_get_filename(nameptr, fname, lfi)) && + udf_match(flen, fname, &(dentry->d_name))) { + if (fibh->sbh != fibh->ebh) + udf_release_data(fibh->ebh); + udf_release_data(fibh->sbh); + udf_release_data(bh); + *err = -EEXIST; + return NULL; } } } @@ -691,7 +688,6 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode) struct udf_fileident_bh fibh; int err; struct FileIdentDesc cfi, *fi; - struct dentry parent; err = -EMLINK; if (dir->i_nlink >= (256<i_nlink))-1) @@ -704,10 +700,8 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode) inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; - parent.d_name.len = 0; - parent.d_name.name = NULL; inode->i_size = 0; - if (!(fi = udf_add_entry(inode, &parent, &fibh, &cfi, &err))) + if (!(fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err))) { inode->i_nlink--; mark_inode_dirty(inode); diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h index 2e3081a5c366..3f3bc50f4067 100644 --- a/include/asm-i386/uaccess.h +++ b/include/asm-i386/uaccess.h @@ -135,16 +135,8 @@ extern void __put_user_bad(void); :"0" (ptr),"d" (x) \ :"cx") -#define put_user(x,ptr) \ -({ int __ret_pu; \ - switch(sizeof (*(ptr))) { \ - case 1: __put_user_x(1,__ret_pu,(__typeof__(*(ptr)))(x),ptr); break; \ - case 2: __put_user_x(2,__ret_pu,(__typeof__(*(ptr)))(x),ptr); break; \ - 
case 4: __put_user_x(4,__ret_pu,(__typeof__(*(ptr)))(x),ptr); break; \ - default: __put_user_x(X,__ret_pu,x,ptr); break; \ - } \ - __ret_pu; \ -}) +#define put_user(x,ptr) \ + __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) #define __get_user(x,ptr) \ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) @@ -158,6 +150,16 @@ extern void __put_user_bad(void); __pu_err; \ }) + +#define __put_user_check(x,ptr,size) \ +({ \ + long __pu_err = -EFAULT; \ + __typeof__(*(ptr)) *__pu_addr = (ptr); \ + if (access_ok(VERIFY_WRITE,__pu_addr,size)) \ + __put_user_size((x),__pu_addr,(size),__pu_err); \ + __pu_err; \ +}) + #define __put_user_size(x,ptr,size,retval) \ do { \ retval = 0; \ diff --git a/include/asm-ppc/bitops.h b/include/asm-ppc/bitops.h index 8fea036b99c9..f62ad151f29c 100644 --- a/include/asm-ppc/bitops.h +++ b/include/asm-ppc/bitops.h @@ -6,6 +6,7 @@ #ifndef _PPC_BITOPS_H #define _PPC_BITOPS_H +#include #include extern void set_bit(int nr, volatile void *addr); diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h index 09a08baa2eb2..b2c8acfffb3d 100644 --- a/include/asm-sparc/bitops.h +++ b/include/asm-sparc/bitops.h @@ -1,4 +1,4 @@ -/* $Id: bitops.h,v 1.55 2000/02/09 03:28:32 davem Exp $ +/* $Id: bitops.h,v 1.56 2000/05/09 17:40:15 davem Exp $ * bitops.h: Bit string operations on the Sparc. * * Copyright 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc/ide.h b/include/asm-sparc/ide.h index 2bfbcb964477..79b0e47a1344 100644 --- a/include/asm-sparc/ide.h +++ b/include/asm-sparc/ide.h @@ -1,4 +1,4 @@ -/* $Id: ide.h,v 1.4 2000/03/12 03:56:12 davem Exp $ +/* $Id: ide.h,v 1.5 2000/05/22 07:29:43 davem Exp $ * ide.h: SPARC PCI specific IDE glue. * * Copyright (C) 1997 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h index 1f40557b486a..512a8e8186a0 100644 --- a/include/asm-sparc/irq.h +++ b/include/asm-sparc/irq.h @@ -1,4 +1,4 @@ -/* $Id: irq.h,v 1.28 2000/01/22 06:06:58 zaitcev Exp $ +/* $Id: irq.h,v 1.29 2000/05/09 17:40:15 davem Exp $ * irq.h: IRQ registers on the Sparc. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc/pgalloc.h b/include/asm-sparc/pgalloc.h index 7a2b81bebf16..c0850c9a9f33 100644 --- a/include/asm-sparc/pgalloc.h +++ b/include/asm-sparc/pgalloc.h @@ -1,4 +1,4 @@ -/* $Id: pgalloc.h,v 1.3 2000/02/03 10:13:31 jj Exp $ */ +/* $Id: pgalloc.h,v 1.4 2000/05/09 17:40:15 davem Exp $ */ #ifndef _SPARC_PGALLOC_H #define _SPARC_PGALLOC_H diff --git a/include/asm-sparc/system.h b/include/asm-sparc/system.h index 00e33e3bff44..69cc06de2307 100644 --- a/include/asm-sparc/system.h +++ b/include/asm-sparc/system.h @@ -1,4 +1,4 @@ -/* $Id: system.h,v 1.81 2000/02/28 04:00:44 anton Exp $ */ +/* $Id: system.h,v 1.82 2000/05/09 17:40:15 davem Exp $ */ #include #ifndef __SPARC_SYSTEM_H diff --git a/include/asm-sparc/winmacro.h b/include/asm-sparc/winmacro.h index e760822574df..619f5e944d02 100644 --- a/include/asm-sparc/winmacro.h +++ b/include/asm-sparc/winmacro.h @@ -1,4 +1,4 @@ -/* $Id: winmacro.h,v 1.21 1999/08/14 03:52:13 anton Exp $ +/* $Id: winmacro.h,v 1.22 2000/05/09 17:40:15 davem Exp $ * winmacro.h: Window loading-unloading macros. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc64/delay.h b/include/asm-sparc64/delay.h index 4a9bfdb1c700..1dc636453165 100644 --- a/include/asm-sparc64/delay.h +++ b/include/asm-sparc64/delay.h @@ -1,4 +1,4 @@ -/* $Id: delay.h,v 1.8 2000/04/13 04:45:59 davem Exp $ +/* $Id: delay.h,v 1.9 2000/05/09 17:40:15 davem Exp $ * delay.h: Linux delay routines on the V9. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu). 
diff --git a/include/asm-sparc64/ide.h b/include/asm-sparc64/ide.h index f301ab20230b..471ee71a055d 100644 --- a/include/asm-sparc64/ide.h +++ b/include/asm-sparc64/ide.h @@ -1,4 +1,4 @@ -/* $Id: ide.h,v 1.17 1999/12/15 22:18:49 davem Exp $ +/* $Id: ide.h,v 1.18 2000/05/22 07:29:43 davem Exp $ * ide.h: Ultra/PCI specific IDE glue. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h index cff5236edf03..111ad568b06e 100644 --- a/include/asm-sparc64/irq.h +++ b/include/asm-sparc64/irq.h @@ -1,4 +1,4 @@ -/* $Id: irq.h,v 1.17 1999/09/21 14:39:41 davem Exp $ +/* $Id: irq.h,v 1.18 2000/05/09 17:40:15 davem Exp $ * irq.h: IRQ registers on the 64-bit Sparc. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc64/oplib.h b/include/asm-sparc64/oplib.h index 963ce73e44ff..cf6cbc58967a 100644 --- a/include/asm-sparc64/oplib.h +++ b/include/asm-sparc64/oplib.h @@ -1,4 +1,4 @@ -/* $Id: oplib.h,v 1.12 1999/11/19 05:53:12 davem Exp $ +/* $Id: oplib.h,v 1.13 2000/05/09 17:40:15 davem Exp $ * oplib.h: Describes the interface and available routines in the * Linux Prom library. * diff --git a/include/asm-sparc64/processor.h b/include/asm-sparc64/processor.h index b731a18aa050..b7b124a56fdc 100644 --- a/include/asm-sparc64/processor.h +++ b/include/asm-sparc64/processor.h @@ -1,4 +1,4 @@ -/* $Id: processor.h,v 1.63 2000/03/27 10:38:57 davem Exp $ +/* $Id: processor.h,v 1.64 2000/05/09 17:40:15 davem Exp $ * include/asm-sparc64/processor.h * * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index 895cd74df18e..00303c239944 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -1,4 +1,4 @@ -/* $Id: system.h,v 1.58 2000/05/05 18:47:41 davem Exp $ */ +/* $Id: system.h,v 1.59 2000/05/09 17:40:15 davem Exp $ */ #ifndef __SPARC64_SYSTEM_H #define __SPARC64_SYSTEM_H diff --git a/include/asm-sparc64/timer.h b/include/asm-sparc64/timer.h index 8eb30d7d89a1..4aa85bedef9c 100644 --- a/include/asm-sparc64/timer.h +++ b/include/asm-sparc64/timer.h @@ -1,4 +1,4 @@ -/* $Id: timer.h,v 1.2 1998/03/15 17:23:52 ecd Exp $ +/* $Id: timer.h,v 1.3 2000/05/09 17:40:15 davem Exp $ * timer.h: System timer definitions for sun5. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h index abf6c09ebf25..c7fd4cd191c4 100644 --- a/include/asm-sparc64/unistd.h +++ b/include/asm-sparc64/unistd.h @@ -1,4 +1,4 @@ -/* $Id: unistd.h,v 1.42 2000/01/29 17:57:26 jj Exp $ */ +/* $Id: unistd.h,v 1.44 2000/05/16 16:42:33 jj Exp $ */ #ifndef _SPARC64_UNISTD_H #define _SPARC64_UNISTD_H @@ -246,7 +246,9 @@ #define __NR_setfsuid 228 /* Linux Specific */ #define __NR_setfsgid 229 /* Linux Specific */ #define __NR__newselect 230 /* Linux Specific */ -#define __NR_time 231 /* Linux Specific */ +#ifdef __KERNEL__ +#define __NR_time 231 /* Linux sparc32 */ +#endif /* #define __NR_oldstat 232 Linux Specific */ #define __NR_stime 233 /* Linux Specific */ /* #define __NR_oldfstat 234 Linux Specific */ diff --git a/include/linux/coda.h b/include/linux/coda.h index 6e1a939bedba..cbd042242ef5 100644 --- a/include/linux/coda.h +++ b/include/linux/coda.h @@ -61,8 +61,9 @@ Mellon the rights to redistribute these changes without encumbrance. 
-/* Catch new _KERNEL defn for NetBSD */ -#ifdef __NetBSD__ +/* Catch new _KERNEL defn for NetBSD and DJGPP/__CYGWIN32__ */ +#if defined(__NetBSD__) || \ + ((defined(DJGPP) || defined(__CYGWIN32__)) && !defined(KERNEL)) #include #endif @@ -91,7 +92,6 @@ struct timespec { long ts_nsec; }; #else /* DJGPP but not KERNEL */ -#include #include typedef unsigned long long u_quad_t; #endif /* !KERNEL */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 62227b511fcc..12bbfbdc1b77 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -164,6 +164,7 @@ extern struct dentry * d_alloc_root(struct inode *); extern void d_genocide(struct dentry *); extern struct dentry *d_find_alias(struct inode *); +extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int have_submounts(struct dentry *); diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 873a7d9d630e..0129c67b9ecf 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -16,12 +16,13 @@ #ifndef __LINUX_IF_PPPOX_H #define __LINUX_IF_PPPOX_H -#include -#include #include #include + #ifdef __KERNEL__ +#include +#include #include #include #include diff --git a/include/linux/lvm.h b/include/linux/lvm.h index 703d8f72e6fd..7afbc1178155 100644 --- a/include/linux/lvm.h +++ b/include/linux/lvm.h @@ -83,11 +83,13 @@ #include #ifndef __KERNEL__ +#define ____NOT_KERNEL____ #define __KERNEL__ +#endif #include +#ifdef ____NOT_KERNEL____ +#undef ____NOT_KERNEL____ #undef __KERNEL__ -#else -#include #endif #include diff --git a/include/linux/mount.h b/include/linux/mount.h index fcec956470db..61ab19b1fc52 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -28,6 +28,7 @@ struct vfsmount char *mnt_devname; /* Name of device e.g. 
/dev/dsk/hda1 */ char *mnt_dirname; /* Name of directory mounted on */ struct list_head mnt_list; + uid_t mnt_owner; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index be5ea2afaa3a..aa1ad6f0fdf2 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -54,4 +54,14 @@ #define NF_IP6_NUMHOOKS 5 +enum nf_ip_hook_priorities { + NF_IP6_PRI_FIRST = INT_MIN, + NF_IP6_PRI_CONNTRACK = -200, + NF_IP6_PRI_MANGLE = -150, + NF_IP6_PRI_NAT_DST = -100, + NF_IP6_PRI_FILTER = 0, + NF_IP6_PRI_NAT_SRC = 100, + NF_IP6_PRI_LAST = INT_MAX, +}; + #endif /*__LINUX_IP6_NETFILTER_H*/ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h new file mode 100644 index 000000000000..f3617397cbe9 --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -0,0 +1,452 @@ +/* + * 25-Jul-1998 Major changes to allow for ip chain table + * + * 3-Jan-2000 Named tables to allow packet selection for different uses. + */ + +/* + * Format of an IP6 firewall descriptor + * + * src, dst, src_mask, dst_mask are always stored in network byte order. + * flags are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. + */ + +#ifndef _IP6_TABLES_H +#define _IP6_TABLES_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#endif +#include + +#define IP6T_FUNCTION_MAXNAMELEN 30 +#define IP6T_TABLE_MAXNAMELEN 32 + +/* Yes, Virginia, you have to zero the padding. 
*/ +struct ip6t_ip6 { + /* Source and destination IP6 addr */ + struct in6_addr src, dst; + /* Mask for src and dest IP6 addr */ + struct in6_addr smsk, dmsk; + char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; + unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ]; + + /* ARGH, HopByHop uses 0, so can't do 0 = ANY, + instead IP6T_F_NOPROTO must be set */ + u_int16_t proto; + /* TOS to match iff flags & IP6T_F_TOS */ + u_int8_t tos; + + /* Flags word */ + u_int8_t flags; + /* Inverse flags */ + u_int8_t invflags; +}; + +/* FIXME: If alignment in kernel different from userspace? --RR */ +struct ip6t_entry_match +{ + union { + struct { + u_int16_t match_size; + + /* Used by userspace */ + char name[IP6T_FUNCTION_MAXNAMELEN]; + } user; + struct { + u_int16_t match_size; + + /* Used inside the kernel */ + struct ip6t_match *match; + } kernel; + + /* Total length */ + u_int16_t match_size; + } u; + + unsigned char data[0]; +}; + +struct ip6t_entry_target +{ + union { + struct { + u_int16_t target_size; + + /* Used by userspace */ + char name[IP6T_FUNCTION_MAXNAMELEN]; + } user; + struct { + u_int16_t target_size; + + /* Used inside the kernel */ + struct ip6t_target *target; + } kernel; + + /* Total length */ + u_int16_t target_size; + } u; + + unsigned char data[0]; +}; + +struct ip6t_standard_target +{ + struct ip6t_entry_target target; + int verdict; +}; + +struct ip6t_counters +{ + u_int64_t pcnt, bcnt; /* Packet and byte counters */ +}; + +/* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). */ +#define IP6T_F_PROTO 0x01 /* Set if rule cares about upper + protocols */ +#define IP6T_F_TOS 0x02 /* Match the TOS. */ +#define IP6T_F_MASK 0x03 /* All possible flag bits mask. */ + +/* Values for "inv" field in struct ip6t_ip6. */ +#define IP6T_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ +#define IP6T_INV_VIA_OUT 0x02 /* Invert the sense of OUT IFACE */ +#define IP6T_INV_TOS 0x04 /* Invert the sense of TOS. 
*/ +#define IP6T_INV_SRCIP 0x08 /* Invert the sense of SRC IP. */ +#define IP6T_INV_DSTIP 0x10 /* Invert the sense of DST IP. */ +#define IP6T_INV_FRAG 0x20 /* Invert the sense of FRAG. */ +#define IP6T_INV_PROTO 0x40 /* Invert the sense of PROTO. */ +#define IP6T_INV_MASK 0x7F /* All possible flag bits mask. */ + +/* This structure defines each of the firewall rules. Consists of 3 + parts which are 1) general IP header stuff 2) match specific + stuff 3) the target to perform if the rule matches */ +struct ip6t_entry +{ + struct ip6t_ip6 ipv6; + + /* Mark with fields that we care about. */ + unsigned int nfcache; + + /* Size of ip6t_entry + matches */ + u_int16_t target_offset; + /* Size of ip6t_entry + matches + target */ + u_int16_t next_offset; + + /* Back pointer */ + unsigned int comefrom; + + /* Packet and byte counters. */ + struct ip6t_counters counters; + + /* The matches (if any), then the target. */ + unsigned char elems[0]; +}; + +/* + * New IP firewall options for [gs]etsockopt at the RAW IP level. + * Unlike BSD Linux inherits IP options so you don't have to use + * a raw socket for this. Instead we check rights in the calls. */ +#define IP6T_BASE_CTL 64 /* base for firewall socket options */ + +#define IP6T_SO_SET_REPLACE (IP6T_BASE_CTL) +#define IP6T_SO_SET_ADD_COUNTERS (IP6T_BASE_CTL + 1) +#define IP6T_SO_SET_MAX IP6T_SO_SET_ADD_COUNTERS + +#define IP6T_SO_GET_INFO (IP6T_BASE_CTL) +#define IP6T_SO_GET_ENTRIES (IP6T_BASE_CTL + 1) +#define IP6T_SO_GET_MAX IP6T_SO_GET_ENTRIES + +/* CONTINUE verdict for targets */ +#define IP6T_CONTINUE 0xFFFFFFFF + +/* For standard target */ +#define IP6T_RETURN (-NF_MAX_VERDICT - 1) + +/* TCP matching stuff */ +struct ip6t_tcp +{ + u_int16_t spts[2]; /* Source port range. */ + u_int16_t dpts[2]; /* Destination port range. 
*/ + u_int8_t option; /* TCP Option iff non-zero*/ + u_int8_t flg_mask; /* TCP flags mask byte */ + u_int8_t flg_cmp; /* TCP flags compare byte */ + u_int8_t invflags; /* Inverse flags */ +}; + +/* Values for "invflags" field in struct ip6t_tcp. */ +#define IP6T_TCP_INV_SRCPT 0x01 /* Invert the sense of source ports. */ +#define IP6T_TCP_INV_DSTPT 0x02 /* Invert the sense of dest ports. */ +#define IP6T_TCP_INV_FLAGS 0x04 /* Invert the sense of TCP flags. */ +#define IP6T_TCP_INV_OPTION 0x08 /* Invert the sense of option test. */ +#define IP6T_TCP_INV_MASK 0x0F /* All possible flags. */ + +/* UDP matching stuff */ +struct ip6t_udp +{ + u_int16_t spts[2]; /* Source port range. */ + u_int16_t dpts[2]; /* Destination port range. */ + u_int8_t invflags; /* Inverse flags */ +}; + +/* Values for "invflags" field in struct ip6t_udp. */ +#define IP6T_UDP_INV_SRCPT 0x01 /* Invert the sense of source ports. */ +#define IP6T_UDP_INV_DSTPT 0x02 /* Invert the sense of dest ports. */ +#define IP6T_UDP_INV_MASK 0x03 /* All possible flags. */ + +/* ICMP matching stuff */ +struct ip6t_icmp +{ + u_int8_t type; /* type to match */ + u_int8_t code[2]; /* range of code */ + u_int8_t invflags; /* Inverse flags */ +}; + +/* Values for "invflags" field in struct ip6t_icmp. */ +#define IP6T_ICMP_INV 0x01 /* Invert the sense of type/code test */ + +/* The argument to IP6T_SO_GET_INFO */ +struct ip6t_getinfo +{ + /* Which table: caller fills this in. */ + char name[IP6T_TABLE_MAXNAMELEN]; + + /* Kernel fills these in. */ + /* Which hook entry points are valid: bitmask */ + unsigned int valid_hooks; + + /* Hook entry points: one per netfilter hook. */ + unsigned int hook_entry[NF_IP6_NUMHOOKS]; + + /* Underflow points. */ + unsigned int underflow[NF_IP6_NUMHOOKS]; + + /* Number of entries */ + unsigned int num_entries; + + /* Size of entries. */ + unsigned int size; +}; + +/* The argument to IP6T_SO_SET_REPLACE. */ +struct ip6t_replace +{ + /* Which table. 
*/ + char name[IP6T_TABLE_MAXNAMELEN]; + + /* Which hook entry points are valid: bitmask. You can't + change this. */ + unsigned int valid_hooks; + + /* Number of entries */ + unsigned int num_entries; + + /* Total size of new entries */ + unsigned int size; + + /* Hook entry points. */ + unsigned int hook_entry[NF_IP6_NUMHOOKS]; + + /* Underflow points. */ + unsigned int underflow[NF_IP6_NUMHOOKS]; + + /* Information about old entries: */ + /* Number of counters (must be equal to current number of entries). */ + unsigned int num_counters; + /* The old entries' counters. */ + struct ip6t_counters *counters; + + /* The entries (hang off end: not really an array). */ + struct ip6t_entry entries[0]; +}; + +/* The argument to IP6T_SO_ADD_COUNTERS. */ +struct ip6t_counters_info +{ + /* Which table. */ + char name[IP6T_TABLE_MAXNAMELEN]; + + unsigned int num_counters; + + /* The counters (actually `number' of these). */ + struct ip6t_counters counters[0]; +}; + +/* The argument to IP6T_SO_GET_ENTRIES. */ +struct ip6t_get_entries +{ + /* Which table: user fills this in. */ + char name[IP6T_TABLE_MAXNAMELEN]; + + /* User fills this in: total entry size. */ + unsigned int size; + + /* The entries. */ + unsigned char entries[0]; +}; + +/* Standard return verdict, or do jump. */ +#define IP6T_STANDARD_TARGET "" +/* Error verdict. */ +#define IP6T_ERROR_TARGET "ERROR" + +/* Helper functions */ +extern __inline__ struct ip6t_entry_target * +ip6t_get_target(struct ip6t_entry *e) +{ + return (void *)e + e->target_offset; +} + +/* fn returns 0 to continue iteration */ +#define IP6T_MATCH_ITERATE(e, fn, args...) 
\ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct ip6t_entry_match *__m; \ + \ + for (__i = sizeof(struct ip6t_entry); \ + __i < (e)->target_offset; \ + __i += __m->u.match_size) { \ + __m = (void *)(e) + __i; \ + \ + __ret = fn(__m , ## args); \ + if (__ret != 0) \ + break; \ + } \ + __ret; \ +}) + +/* fn returns 0 to continue iteration */ +#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct ip6t_entry *__e; \ + \ + for (__i = 0; __i < (size); __i += __e->next_offset) { \ + __e = (void *)(entries) + __i; \ + \ + __ret = fn(__e , ## args); \ + if (__ret != 0) \ + break; \ + } \ + __ret; \ +}) + +/* + * Main firewall chains definitions and global var's definitions. + */ + +#ifdef __KERNEL__ + +#include +extern void ip6t_init(void) __init; + +struct ip6t_match +{ + struct list_head list; + + const char name[IP6T_FUNCTION_MAXNAMELEN]; + + /* Return true or false: return FALSE and set *hotdrop = 1 to + force immediate packet drop. */ + int (*match)(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop); + + /* Called when user tries to insert an entry of this type. */ + /* Should return true or false. */ + int (*checkentry)(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask); + + /* Called when entry of this type deleted. */ + void (*destroy)(void *matchinfo, unsigned int matchinfosize); + + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; +}; + +/* Registration hooks for targets. */ +struct ip6t_target +{ + struct list_head list; + + const char name[IP6T_FUNCTION_MAXNAMELEN]; + + /* Returns verdict. 
*/ + unsigned int (*target)(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userdata); + + /* Called when user tries to insert an entry of this type: + hook_mask is a bitmask of hooks from which it can be + called. */ + /* Should return true or false. */ + int (*checkentry)(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask); + + /* Called when entry of this type deleted. */ + void (*destroy)(void *targinfo, unsigned int targinfosize); + + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; +}; + +extern int ip6t_register_target(struct ip6t_target *target); +extern void ip6t_unregister_target(struct ip6t_target *target); + +extern int ip6t_register_match(struct ip6t_match *match); +extern void ip6t_unregister_match(struct ip6t_match *match); + +/* Furniture shopping... */ +struct ip6t_table +{ + struct list_head list; + + /* A unique name... */ + char name[IP6T_TABLE_MAXNAMELEN]; + + /* Seed table: copied in register_table */ + struct ip6t_replace *table; + + /* What hooks you will enter on */ + unsigned int valid_hooks; + + /* Lock for the curtain */ + rwlock_t lock; + + /* Man behind the curtain... 
*/ + struct ip6t_table_info *private; +}; + +extern int ip6t_register_table(struct ip6t_table *table); +extern void ip6t_unregister_table(struct ip6t_table *table); +extern unsigned int ip6t_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ip6t_table *table, + void *userdata); + +#define IP6T_ALIGN(s) (((s) + (__alignof__(struct ip6t_entry)-1)) & ~(__alignof__(struct ip6t_entry)-1)) + +#endif /*__KERNEL__*/ +#endif /* _IP6_TABLES_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b7621b293f54..d0487c3df455 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -90,8 +90,6 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry); void proc_pid_delete_inode(struct inode *inode); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -extern int proc_register(struct proc_dir_entry *, struct proc_dir_entry *); - extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); @@ -167,7 +165,6 @@ extern inline void proc_net_remove(const char *name) #else -extern inline int proc_register(struct proc_dir_entry *a, struct proc_dir_entry *b) { return 0; } extern inline struct proc_dir_entry *proc_net_create(const char *name, mode_t mode, get_info_t *get_info) {return NULL;} extern inline void proc_net_remove(const char *name) {} diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index fefe0681d132..568a89a469d9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -18,18 +18,49 @@ struct vm_struct { struct vm_struct * next; }; -struct vm_struct * get_vm_area(unsigned long size, unsigned long flags); -void vfree(void * addr); -void * vmalloc(unsigned long size); -long vread(char *buf, char *addr, unsigned long count); -void vmfree_area_pages(unsigned long address, 
unsigned long size); -int vmalloc_area_pages(unsigned long address, unsigned long size); - -/* vmlist_lock is a read-write spinlock that protects vmlist +extern struct vm_struct * get_vm_area (unsigned long size, unsigned long flags); +extern void vfree(void * addr); +extern void * __vmalloc (unsigned long size, int gfp_mask); +extern long vread(char *buf, char *addr, unsigned long count); +extern void vmfree_area_pages(unsigned long address, unsigned long size); +extern int vmalloc_area_pages(unsigned long address, unsigned long size , int gfp_mask); + +extern struct vm_struct * vmlist; + + +/* + * Allocate any pages + */ + +static inline void * vmalloc (unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); +} + +/* + * Allocate ISA addressable pages for broke crap + */ + +static inline void * vmalloc_dma (unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL|GFP_DMA); +} + +/* + * vmalloc 32bit PA addressable pages - eg for PCI 32bit devices + */ + +static inline void * vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL); +} + +/* + * vmlist_lock is a read-write spinlock that protects vmlist * Used in mm/vmalloc.c (get_vm_area() and vfree()) and fs/proc/kcore.c. 
*/ extern rwlock_t vmlist_lock; - + extern struct vm_struct * vmlist; #endif diff --git a/init/main.c b/init/main.c index 04180c8e9340..b6237a44e094 100644 --- a/init/main.c +++ b/init/main.c @@ -735,8 +735,7 @@ static void __init do_basic_setup(void) #ifdef CONFIG_BLK_DEV_INITRD root_mountflags = real_root_mountflags; - if (mount_initrd && ROOT_DEV != real_root_dev - && MAJOR(ROOT_DEV) == RAMDISK_MAJOR && MINOR(ROOT_DEV) == 0) { + if (mount_initrd && MAJOR(ROOT_DEV) == RAMDISK_MAJOR && MINOR(ROOT_DEV) == 0) { int error; int i, pid; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 309a37d3e65c..d5660d90c7cb 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -220,6 +220,7 @@ EXPORT_SYMBOL(dput); EXPORT_SYMBOL(have_submounts); EXPORT_SYMBOL(d_genocide); EXPORT_SYMBOL(d_find_alias); +EXPORT_SYMBOL(d_prune_aliases); EXPORT_SYMBOL(prune_dcache); EXPORT_SYMBOL(shrink_dcache_sb); EXPORT_SYMBOL(shrink_dcache_parent); diff --git a/mm/filemap.c b/mm/filemap.c index be69c2291d8d..b1e2b8547fe6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -244,13 +244,19 @@ repeat: spin_unlock(&pagecache_lock); } +/* + * nr_dirty represents the number of dirty pages that we will write async + * before doing sync writes. We can only do sync writes if we can + * wait for IO (__GFP_IO set). + */ int shrink_mmap(int priority, int gfp_mask) { - int ret = 0, count; + int ret = 0, count, nr_dirty; struct list_head * page_lru; struct page * page = NULL; count = nr_lru_pages / (priority + 1); + nr_dirty = priority; /* we need pagemap_lru_lock for list_del() ... subtle code below */ spin_lock(&pagemap_lru_lock); @@ -258,10 +264,10 @@ int shrink_mmap(int priority, int gfp_mask) page = list_entry(page_lru, struct page, lru); list_del(page_lru); - count--; if (PageTestandClearReferenced(page)) goto dispose_continue; + count--; /* * Avoid unscalable SMP locking for pages we can * immediate tell are untouchable.. @@ -287,7 +293,8 @@ int shrink_mmap(int priority, int gfp_mask) * of zone - it's old. 
*/ if (page->buffers) { - if (!try_to_free_buffers(page, 1)) + int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0)); + if (!try_to_free_buffers(page, wait)) goto unlock_continue; /* page was locked, inode can't go away under us */ if (!page->mapping) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 00879933cfff..b3f1cf5b76e0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -90,7 +90,8 @@ void vmfree_area_pages(unsigned long address, unsigned long size) flush_tlb_all(); } -static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned long size) +static inline int alloc_area_pte (pte_t * pte, unsigned long address, + unsigned long size, int gfp_mask) { unsigned long end; @@ -102,7 +103,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo struct page * page; if (!pte_none(*pte)) printk(KERN_ERR "alloc_area_pte: page already exists\n"); - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + page = alloc_page(gfp_mask); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, PAGE_KERNEL)); @@ -112,7 +113,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo return 0; } -static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size) +static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, int gfp_mask) { unsigned long end; @@ -124,7 +125,7 @@ static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; - if (alloc_area_pte(pte, address, end - address)) + if (alloc_area_pte(pte, address, end - address, gfp_mask)) return -ENOMEM; address = (address + PMD_SIZE) & PMD_MASK; pmd++; @@ -132,7 +133,8 @@ static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo return 0; } -int vmalloc_area_pages(unsigned long address, unsigned long size) +inline int vmalloc_area_pages (unsigned long address, + unsigned long size, int gfp_mask) { pgd_t 
* dir; unsigned long end = address + size; @@ -146,7 +148,7 @@ int vmalloc_area_pages(unsigned long address, unsigned long size) pmd = pmd_alloc_kernel(dir, address); if (!pmd) return -ENOMEM; - if (alloc_area_pmd(pmd, address, end - address)) + if (alloc_area_pmd(pmd, address, end - address, gfp_mask)) return -ENOMEM; if (pgd_val(olddir) != pgd_val(*dir)) set_pgdir(address, *dir); @@ -210,7 +212,7 @@ void vfree(void * addr) printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); } -void * vmalloc(unsigned long size) +void * __vmalloc (unsigned long size, int gfp_mask) { void * addr; struct vm_struct *area; @@ -226,7 +228,7 @@ void * vmalloc(unsigned long size) return NULL; } addr = area->addr; - if (vmalloc_area_pages(VMALLOC_VMADDR(addr), size)) { + if (vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask)) { vfree(addr); BUG(); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6a6318339a6d..1919c096122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,9 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un if ((page-mem_map >= max_mapnr) || PageReserved(page)) goto out_failed; + if (mm->swap_cnt) + mm->swap_cnt--; + /* Don't look at this pte if it's been accessed recently. 
*/ if (pte_young(pte)) { /* @@ -76,7 +79,6 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: UnlockPage(page); - mm->swap_cnt--; vma->vm_mm->rss--; flush_tlb_page(vma, address); page_cache_release(page); @@ -142,7 +144,6 @@ drop_pte: struct file *file = vma->vm_file; if (file) get_file(file); pte_clear(page_table); - mm->swap_cnt--; vma->vm_mm->rss--; flush_tlb_page(vma, address); vmlist_access_unlock(vma->vm_mm); @@ -174,7 +175,6 @@ drop_pte: add_to_swap_cache(page, entry); /* Put the swap entry into the pte after the page is in swapcache */ - mm->swap_cnt--; vma->vm_mm->rss--; set_pte(page_table, swp_entry_to_pte(entry)); flush_tlb_page(vma, address); @@ -363,7 +363,7 @@ static int swap_out(unsigned int priority, int gfp_mask) * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = (nr_threads << 1) >> (priority >> 1); + counter = (nr_threads << 2) >> (priority >> 2); if (counter < 1) counter = 1; @@ -430,16 +430,17 @@ out: * latency. */ #define FREE_COUNT 8 -#define SWAP_COUNT 8 +#define SWAP_COUNT 16 static int do_try_to_free_pages(unsigned int gfp_mask) { int priority; int count = FREE_COUNT; + int swap_count; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - priority = 32; + priority = 64; do { while (shrink_mmap(priority, gfp_mask)) { if (!--count) @@ -471,12 +472,11 @@ static int do_try_to_free_pages(unsigned int gfp_mask) * put in the swap cache), so we must not count this * as a "count" success. */ - { - int swap_count = SWAP_COUNT; - while (swap_out(priority, gfp_mask)) - if (--swap_count < 0) - break; - } + swap_count = SWAP_COUNT; + while (swap_out(priority, gfp_mask)) + if (--swap_count < 0) + break; + } while (--priority >= 0); /* Always end on a shrink_mmap.. 
*/ @@ -484,8 +484,8 @@ static int do_try_to_free_pages(unsigned int gfp_mask) if (!--count) goto done; } - - return 0; + /* Return 1 if we freed at least one page (count was decremented), 0 otherwise */ + return (count != FREE_COUNT); done: return 1; diff --git a/net/Makefile b/net/Makefile index afdfbb7129d2..dce68b627097 100644 --- a/net/Makefile +++ b/net/Makefile @@ -10,7 +10,7 @@ MOD_SUB_DIRS := ipv4 ALL_SUB_DIRS := 802 ax25 bridge core ethernet ipv4 ipv6 ipx unix appletalk \ netrom rose lapb x25 wanrouter netlink sched packet sunrpc \ - econet irda decnet atm khttpd ipv4/netfilter + econet irda decnet atm khttpd ipv4/netfilter ipv6/netfilter SUB_DIRS := core ethernet sched MOD_LIST_NAME := NET_MISC_MODULES @@ -36,9 +36,16 @@ endif ifeq ($(CONFIG_IPV6),y) SUB_DIRS += ipv6 +ifeq ($(CONFIG_NETFILTER),y) +SUB_DIRS += ipv6/netfilter +MOD_SUB_DIRS += ipv6/netfilter +endif else ifeq ($(CONFIG_IPV6),m) MOD_SUB_DIRS += ipv6 + ifeq ($(CONFIG_NETFILTER),y) + MOD_SUB_DIRS += ipv6/netfilter + endif endif endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2c4116ccccf1..173506c3d449 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox * Florian La Roche * - * Version: $Id: skbuff.c,v 1.72 2000/04/13 00:55:54 davem Exp $ + * Version: $Id: skbuff.c,v 1.73 2000/05/22 07:29:44 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 50f09f1ec1e4..b51d1c4e97e5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -268,10 +268,10 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int dev->priv = (void*)(dev+1); nt = (struct ip_tunnel*)dev->priv; nt->dev = dev; - strcpy(dev->name, nt->parms.name); dev->init = ipgre_tunnel_init; dev->new_style = 1; memcpy(&nt->parms, parms, sizeof(*parms)); + strcpy(dev->name, nt->parms.name); if (dev->name[0] == 0) { int i; for (i=1; i<100; i++) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ed0c9f3e9645..4069795fb1c3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.33 2000/05/05 02:17:17 davem Exp $ + * Version: $Id: ipip.c,v 1.34 2000/05/22 08:12:19 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -237,10 +237,10 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) dev->priv = (void*)(dev+1); nt = (struct ip_tunnel*)dev->priv; nt->dev = dev; - strcpy(dev->name, nt->parms.name); dev->init = ipip_tunnel_init; dev->new_style = 1; memcpy(&nt->parms, parms, sizeof(*parms)); + strcpy(dev->name, nt->parms.name); if (dev->name[0] == 0) { int i; for (i=1; i<100; i++) { diff --git a/net/ipv6/Config.in b/net/ipv6/Config.in index 80498a9fec68..dd313de812b7 100644 --- a/net/ipv6/Config.in +++ b/net/ipv6/Config.in @@ -12,3 +12,7 @@ if [ "$CONFIG_NETLINK" = "y" ]; then fi #bool ' IPv6: flow policy support' CONFIG_RT6_POLICY #bool ' IPv6: firewall support' CONFIG_IPV6_FIREWALL + +if [ "$CONFIG_NETFILTER" != "n" ]; then + source net/ipv6/netfilter/Config.in +fi diff --git a/net/ipv6/netfilter/Config.in b/net/ipv6/netfilter/Config.in new file mode 100644 index 000000000000..626e1634d0de --- /dev/null +++ b/net/ipv6/netfilter/Config.in @@ -0,0 +1,49 @@ +# +# IP netfilter configuration +# +mainmenu_option next_comment +comment ' IPv6: 
Netfilter Configuration' + +#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK +#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK +#fi + +#if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_NETLINK" = "y" ]; then +# tristate 'Userspace queueing via NETLINK (EXPERIMENTAL)' CONFIG_IP6_NF_QUEUE +#fi +tristate 'IP6 tables support (required for filtering/masq/NAT)' CONFIG_IP6_NF_IPTABLES +if [ "$CONFIG_IP6_NF_IPTABLES" != "n" ]; then +# The simple matches. + dep_tristate ' limit match support' CONFIG_IP6_NF_MATCH_LIMIT $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES + dep_tristate ' netfilter MARK match support' CONFIG_IP6_NF_MATCH_MARK $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES +# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES +# fi +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES +# fi + +# The targets + dep_tristate ' Packet filtering' CONFIG_IP6_NF_FILTER $CONFIG_IP6_NF_IPTABLES + + if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then +# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER +# fi + fi + + dep_tristate ' Packet mangling' CONFIG_IP6_NF_MANGLE $CONFIG_IP6_NF_IPTABLES + if [ "$CONFIG_IP6_NF_MANGLE" != "n" ]; then +# 
dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE + dep_tristate ' MARK target support' CONFIG_IP6_NF_TARGET_MARK $CONFIG_IP6_NF_MANGLE + fi + #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES +fi + +endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile new file mode 100644 index 000000000000..3dfbefc264d4 --- /dev/null +++ b/net/ipv6/netfilter/Makefile @@ -0,0 +1,180 @@ +# +# Makefile for the netfilter modules on top of IPv6. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := netfilter.o +MOD_LIST_NAME := IPV6_MODULES +M_OBJS := + +IP6_NF_CONNTRACK_OBJ:=ip6_conntrack_core.o ip6_conntrack_proto_generic.o ip6_conntrack_proto_tcp.o ip6_conntrack_proto_udp.o ip6_conntrack_proto_icmp.o + +# Link order matters here. 
+ifeq ($(CONFIG_IP6_NF_CONNTRACK),y) +O_OBJS += ip6_conntrack_standalone.o $(IP6_NF_CONNTRACK_OBJ) +else + ifeq ($(CONFIG_IP6_NF_CONNTRACK),m) + MI_OBJS += $(IP6_NF_CONNTRACK_OBJ) + MIX_OBJS += ip6_conntrack_standalone.o + M_OBJS += ip6_conntrack.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_FTP),y) +O_OBJS += ip6_conntrack_ftp.o +else + ifeq ($(CONFIG_IP6_NF_FTP),m) + MX_OBJS += ip6_conntrack_ftp.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_IPTABLES),y) +O_OBJS += ip6_tables.o +else + ifeq ($(CONFIG_IP6_NF_IPTABLES),m) + MX_OBJS += ip6_tables.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_LIMIT),y) +O_OBJS += ip6t_limit.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_LIMIT),m) + M_OBJS += ip6t_limit.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_MARK),y) +O_OBJS += ip6t_mark.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_MARK),m) + M_OBJS += ip6t_mark.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_MAC),y) +O_OBJS += ip6t_mac.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_MAC),m) + M_OBJS += ip6t_mac.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_MULTIPORT),y) +O_OBJS += ip6t_multiport.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_MULTIPORT),m) + M_OBJS += ip6t_multiport.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_OWNER),y) +O_OBJS += ip6t_owner.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_OWNER),m) + M_OBJS += ip6t_owner.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_TOS),y) +O_OBJS += ip6t_tos.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_TOS),m) + M_OBJS += ip6t_tos.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_STATE),y) +O_OBJS += ip6t_state.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_STATE),m) + M_OBJS += ip6t_state.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_MATCH_UNCLEAN),y) +O_OBJS += ip6t_unclean.o +else + ifeq ($(CONFIG_IP6_NF_MATCH_UNCLEAN),m) + M_OBJS += ip6t_unclean.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_FILTER),y) +O_OBJS += ip6table_filter.o +else + ifeq ($(CONFIG_IP6_NF_FILTER),m) + M_OBJS += ip6table_filter.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_REJECT),y) +O_OBJS += ip6t_REJECT.o +else + 
ifeq ($(CONFIG_IP6_NF_TARGET_REJECT),m) + M_OBJS += ip6t_REJECT.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_MIRROR),y) +O_OBJS += ip6t_MIRROR.o +else + ifeq ($(CONFIG_IP6_NF_TARGET_MIRROR),m) + M_OBJS += ip6t_MIRROR.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_TOS),y) +O_OBJS += ip6t_TOS.o +else + ifeq ($(CONFIG_IP6_NF_TARGET_TOS),m) + M_OBJS += ip6t_TOS.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_MARK),y) +O_OBJS += ip6t_MARK.o +else + ifeq ($(CONFIG_IP6_NF_TARGET_MARK),m) + M_OBJS += ip6t_MARK.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_REDIRECT),y) +O_OBJS += ip6t_REDIRECT.o +else + ifeq ($(CONFIG_IP6_NF_TARGET_REDIRECT),m) + M_OBJS += ip6t_REDIRECT.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_TARGET_LOG),y) +O_OBJS += ip6t_LOG.o +else + ifeq ($(CONFIG_IP6_NF_TARGET_LOG),m) + M_OBJS += ip6t_LOG.o + endif +endif + +ifeq ($(CONFIG_IP6_NF_QUEUE),y) +O_OBJS += ip6_queue.o +else + ifeq ($(CONFIG_IP6_NF_QUEUE),m) + M_OBJS += ip6_queue.o + endif +endif + +include $(TOPDIR)/Rules.make + +ip6_conntrack.o: ip6_conntrack_standalone.o $(IP6_NF_CONNTRACK_OBJ) + $(LD) -r -o $@ $(IP6_NF_CONNTRACK_OBJ) ip6_conntrack_standalone.o + +ip6fwadm.o: ip6fwadm_core.o $(IP6_NF_COMPAT_LAYER) # prerequisite must name the object linked below + $(LD) -r -o $@ ip6fwadm_core.o $(IP6_NF_COMPAT_LAYER) + +ip6chains.o: ip6chains_core.o $(IP6_NF_COMPAT_LAYER) + $(LD) -r -o $@ ip6chains_core.o $(IP6_NF_COMPAT_LAYER) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c new file mode 100644 index 000000000000..2d9c356e9ccd --- /dev/null +++ b/net/ipv6/netfilter/ip6_tables.c @@ -0,0 +1,1795 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IPV6_HDR_LEN (sizeof(struct ipv6hdr)) + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +/* Mutex protects lists (only traversed in user context). */ +static DECLARE_MUTEX(ip6t_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#include +#include + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +/* Locking is simple: we assume at worst case there will be one packet + in user context and one from bottom halves (or soft irq if Alexey's + softnet patch was applied). + + We keep a set of rules for each CPU, so we can avoid write-locking + them; doing a readlock_bh() stops packets coming through if we're + in user context. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ip6t_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. 
--RR */ + unsigned int number; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP6_NUMHOOKS]; + unsigned int underflow[NF_IP6_NUMHOOKS]; + + char padding[SMP_ALIGN((NF_IP6_NUMHOOKS*2+2)*sizeof(unsigned int))]; + + /* ip6t_entry tables: one per CPU */ + char entries[0]; +}; + +static LIST_HEAD(ip6t_target); +static LIST_HEAD(ip6t_match); +static LIST_HEAD(ip6t_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*cpu_number_map(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +static int ip6_masked_addrcmp(struct in6_addr addr1, struct in6_addr mask, + struct in6_addr addr2) +{ + int i; + for( i = 0; i < 16; i++){ + if((addr1.s6_addr[i] & mask.s6_addr[i]) != + (addr2.s6_addr[i] & mask.s6_addr[i])) + return 1; + } + return 0; +} + +/* takes in current header and pointer to the header */ +/* if another header exists, sets hdrptr to the next header + and returns the new header value, else returns 0 */ +static u_int8_t ip6_nexthdr(u_int8_t currenthdr, u_int8_t *hdrptr) +{ + int i; + u_int8_t hdrlen, nexthdr = 0; + switch(currenthdr){ + case IPPROTO_AH: + /* whoever decided to do the length of AUTH for ipv6 + in 32bit units unlike other headers should be beaten... 
+ repeatedly...with a large stick...no, an even LARGER + stick...no, you're still not thinking big enough */ + nexthdr = *hdrptr; + hdrlen = hdrptr[1] * 4 + 8; + hdrptr = hdrptr + hdrlen; + break; + /* RFC 2402: the AH length byte (hdrptr[1]) counts 32-bit words minus 2, hence * 4 + 8. NOTE(review): hdrptr is passed by value, so the advances made in this switch are not visible to the caller. */ + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + nexthdr = *hdrptr; + hdrlen = hdrptr[1] * 8 + 8; + hdrptr = hdrptr + hdrlen; + break; + case IPPROTO_FRAGMENT: + nexthdr = *hdrptr; + hdrptr = hdrptr + 8; + break; + } + return nexthdr; + +} + +/* Returns whether matches rule or not. */ +static inline int +ip6_packet_match(const struct ipv6hdr *ipv6, + const char *indev, + const char *outdev, + const struct ip6t_ip6 *ip6info, + int isfrag) +{ + size_t i; + unsigned long ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) + + if (FWINV(ip6_masked_addrcmp(ipv6->saddr,ip6info->smsk,ip6info->src), + IP6T_INV_SRCIP) + || FWINV(ip6_masked_addrcmp(ipv6->daddr,ip6info->dmsk,ip6info->dst), + IP6T_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); +/* + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + ipinfo->smsk.s_addr, ipinfo->src.s_addr, + ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, + ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + return 0; + } + + /* Look for ifname matches; this should unroll nicely. */ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)ip6info->iniface)[i]) + & ((const unsigned long *)ip6info->iniface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ip6info->iniface, + ip6info->invflags&IP6T_INV_VIA_IN ?" 
(INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)ip6info->outiface)[i]) + & ((const unsigned long *)ip6info->outiface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ip6info->outiface, + ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + return 0; + } + +/* ... might want to do something with class and flowlabel here ... */ + + /* look for the desired protocol header */ + if((ip6info->flags & IP6T_F_PROTO)) { + u_int8_t currenthdr = ipv6->nexthdr; + u_int8_t *hdrptr; + hdrptr = (u_int8_t *)(ipv6 + 1); + do { + if (ip6info->proto == currenthdr) { + if(ip6info->invflags & IP6T_INV_PROTO) + return 0; + return 1; + } + currenthdr = ip6_nexthdr(currenthdr, hdrptr); + } while(currenthdr); + if (!(ip6info->invflags & IP6T_INV_PROTO)) + return 0; + } + return 1; +} + +/* should be ip6 safe */ +static inline int +ip6_checkentry(const struct ip6t_ip6 *ipv6) +{ + if (ipv6->flags & ~IP6T_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ipv6->flags & ~IP6T_F_MASK); + return 0; + } + if (ipv6->invflags & ~IP6T_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ipv6->invflags & ~IP6T_INV_MASK); + return 0; + } + return 1; +} + +static unsigned int +ip6t_error(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("ip6_tables: error: `%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline +int do_match(struct ip6t_entry_match *m, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + /* Stop iteration if it doesn't match */ + if (!m->u.kernel.match->match(skb, in, out, m->data, + offset, hdr, datalen, hotdrop)) + return 1; + 
else + return 0; +} + +static inline struct ip6t_entry * +get_entry(void *base, unsigned int offset) +{ + return (struct ip6t_entry *)(base + offset); +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. Walks the rules of `table' for `hook' under read_lock_bh(&table->lock); a match that sets hotdrop forces NF_DROP. */ +unsigned int +ip6t_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ip6t_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ] = { 0 }; + u_int16_t offset = 0; + struct ipv6hdr *ipv6; + void *protohdr; + u_int16_t datalen; + int hotdrop = 0; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + void *table_base; + struct ip6t_entry *e, *back; + + /* Initialization: protohdr is the first byte after the fixed IPv6 header, so IPV6_HDR_LEN must be added in bytes, not in u_int32_t units. */ + ipv6 = (*pskb)->nh.ipv6h; + protohdr = (u_int8_t *)ipv6 + IPV6_HDR_LEN; + datalen = (*pskb)->len - IPV6_HDR_LEN; + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. 
*/ + + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + +#ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ip6t_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ip6t_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ip6t_entry *)table_base)->comefrom, + ((struct ip6t_entry *)table_base)->comefrom); + } + ((struct ip6t_entry *)table_base)->comefrom = 0x57acc001; +#endif + + /* For return from builtin chain */ + back = get_entry(table_base, table->private->underflow[hook]); + + do { + IP_NF_ASSERT(e); + IP_NF_ASSERT(back); + (*pskb)->nfcache |= e->nfcache; + if (ip6_packet_match(ipv6, indev, outdev, &e->ipv6, offset)) { + struct ip6t_entry_target *t; + + if (IP6T_MATCH_ITERATE(e, do_match, + *pskb, in, out, + offset, protohdr, + datalen, &hotdrop) != 0) + goto no_match; + + ADD_COUNTER(e->counters, ntohs(ipv6->payload_len) + IPV6_HDR_LEN, 1); + + t = ip6t_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct ip6t_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != IP6T_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct ip6t_entry *next + = (void *)e + e->next_offset; + next->comefrom + = (void *)back - table_base; + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + abs. 
verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif + verdict = t->u.kernel.target->target(pskb, + hook, + in, out, + t->data, + userdata); + +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ip6t_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IP6T_CONTINUE) { + printk("Target %s reentered!\n", + t->u.kernel.target->name); + verdict = NF_DROP; + } + ((struct ip6t_entry *)table_base)->comefrom + = 0x57acc001; +#endif + /* Target might have changed stuff; recompute the header pointers (byte offset, as at initialization). */ + ipv6 = (*pskb)->nh.ipv6h; + protohdr = (u_int8_t *)ipv6 + IPV6_HDR_LEN; + datalen = (*pskb)->len - IPV6_HDR_LEN; + + if (verdict == IP6T_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + + no_match: + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom = 0xdead57ac; +#endif + read_unlock_bh(&table->lock); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* If it succeeds, returns element and locks mutex */ +static inline void * +find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + +#if 1 + duprintf("find_inlist: searching for `%s' in %s.\n", + name, head == &ip6t_target ? "ip6t_target" + : head == &ip6t_match ? "ip6t_match" + : head == &ip6t_tables ? 
"ip6t_tables" : "UNKNOWN"); +#endif + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + char modulename[IP6T_FUNCTION_MAXNAMELEN + strlen(prefix) + 1]; + strcpy(modulename, prefix); + strcat(modulename, name); + duprintf("find_inlist: loading `%s'.\n", modulename); + request_module(modulename); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct ip6t_table * +find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_tables, name, "ip6table_", error, mutex); +} + +static inline struct ip6t_match * +find_match_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_match, name, "ip6t_", error, mutex); +} + +static inline struct ip6t_target * +find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_target, name, "ip6t_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ip6t_ip6 *ipv6) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ipv6); i++) + if (((char *)ipv6)[i]) + break; + + return (i == sizeof(*ipv6)); +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. 
*/ +static int +mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ip6t_entry *e + = (struct ip6t_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct ip6t_standard_target *t + = (void *)ip6t_get_target(e); + + if (e->comefrom & (1 << NF_IP6_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP6_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ip6t_entry) + && (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ipv6)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<comefrom + & (1 << NF_IP6_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ip6t_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ip6t_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... 
this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ip6t_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + + if (m->u.kernel.match->me) + __MOD_DEC_USE_COUNT(m->u.kernel.match->me); + + return 0; +} + +static inline int +standard_check(const struct ip6t_entry_target *t, + unsigned int max_offset) +{ + struct ip6t_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != IP6T_ALIGN(sizeof(struct ip6t_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IP6T_ALIGN(sizeof(struct ip6t_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ip6t_entry)) { + duprintf("ip6t_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ip6t_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ip6t_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + unsigned int *i) +{ + int ret; + struct ip6t_match *match; + + match = find_match_lock(m->u.user.name, &ret, &ip6t_mutex); + if (!match) { + // duprintf("check_match: `%s' not found\n", m->u.name); + return ret; + } + if (match->me) + __MOD_INC_USE_COUNT(match->me); + m->u.kernel.match = match; + up(&ip6t_mutex); + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ipv6, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + if (m->u.kernel.match->me) + __MOD_DEC_USE_COUNT(m->u.kernel.match->me); + duprintf("ip_tables: check 
failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ip6t_target ip6t_standard_target; + +static inline int +check_entry(struct ip6t_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ip6t_entry_target *t; + struct ip6t_target *target; + int ret; + unsigned int j; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ip6t_get_target(e); + target = find_target_lock(t->u.user.name, &ret, &ip6t_mutex); + if (!target) { + // duprintf("check_entry: `%s' not found\n", t->u.name); + return ret; + } + if (target->me) + __MOD_INC_USE_COUNT(target->me); + t->u.kernel.target = target; + up(&ip6t_mutex); + + if (t->u.kernel.target == &ip6t_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + if (t->u.kernel.target->me) + __MOD_DEC_USE_COUNT(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IP6T_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ip6t_entry *e, + struct ip6t_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 + || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ip6t_entry) + sizeof(struct 
ip6t_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP6_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IP6T_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ip6t_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ip6t_entry *e, unsigned int *i) +{ + struct ip6t_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + t = ip6t_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + if (t->u.kernel.target->me) + __MOD_DEC_USE_COUNT(t->u.kernel.target->me); + + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ip6t_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. 
*/ + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < smp_num_cpus; i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size*i), + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ip6t_table_info * +replace_table(struct ip6t_table *table, + unsigned int num_counters, + struct ip6t_table_info *newinfo, + int *error) +{ + struct ip6t_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ip6t_entry *table_base; + unsigned int i; + + for (i = 0; i < smp_num_cpus; i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ip6t_table_info *t, + struct ip6t_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + i = 0; + IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ip6t_table *table, + void *userptr) +{ + unsigned int off, num, countersize; + struct ip6t_entry *e; + struct ip6t_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ip6t_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ip6t_entry_match *m; + struct ip6t_entry_target *t; + + e = (struct ip6t_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ip6t_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ip6t_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ip6t_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ip6t_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ip6t_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ip6t_get_entries *entries, + struct ip6t_get_entries *uptr) +{ + int ret; + struct ip6t_table *t; + + t = find_table_lock(entries->name, &ret, &ip6t_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entries); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&ip6t_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int +do_replace(void *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct ip6t_table *t; + struct ip6t_table_info *newinfo, *oldinfo; + struct ip6t_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + 
SMP_ALIGN(tmp.size) * smp_num_cpus); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ip6t_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto free_newinfo_counters_untrans_unlock; + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + /* Silent error: too late now. */ + copy_to_user(tmp.counters, counters, + sizeof(struct ip6t_counters) * tmp.num_counters); + vfree(counters); + up(&ip6t_mutex); + return 0; + + free_newinfo_counters_untrans_unlock: + up(&ip6t_mutex); + free_newinfo_counters_untrans: + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static inline int +add_counter_to_entry(struct ip6t_entry *e, + const struct ip6t_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void *user, unsigned int len) +{ + unsigned int i; + struct ip6t_counters_info tmp, *paddc; + struct ip6t_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ip6t_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IP6T_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ip6t_mutex); + free: + vfree(paddc); + + return ret; +} + +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int cmd, void *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: { + char name[IP6T_TABLE_MAXNAMELEN]; + 
struct ip6t_table *t; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ip6t_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + t = find_table_lock(name, &ret, &ip6t_mutex); + if (t) { + struct ip6t_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&ip6t_mutex); + } + } + break; + + case IP6T_SO_GET_ENTRIES: { + struct ip6t_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ip6t_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. 
*/ +int +ip6t_register_target(struct ip6t_target *target) +{ + int ret; + + MOD_INC_USE_COUNT; + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&ip6t_target, target)) { + duprintf("ip6t_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + MOD_DEC_USE_COUNT; + } + up(&ip6t_mutex); + return ret; +} + +void +ip6t_unregister_target(struct ip6t_target *target) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_target, target); + up(&ip6t_mutex); + MOD_DEC_USE_COUNT; +} + +int +ip6t_register_match(struct ip6t_match *match) +{ + int ret; + + MOD_INC_USE_COUNT; + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) { + MOD_DEC_USE_COUNT; + return ret; + } + if (list_named_insert(&ip6t_match, match)) { + ret = 0; + } else { + duprintf("ip6t_register_match: `%s' already in list!\n", + match->name); + MOD_DEC_USE_COUNT; + ret = -EINVAL; + } + up(&ip6t_mutex); + + return ret; +} + +void +ip6t_unregister_match(struct ip6t_match *match) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_match, match); + up(&ip6t_mutex); + MOD_DEC_USE_COUNT; +} + +int ip6t_register_table(struct ip6t_table *table) +{ + int ret; + struct ip6t_table_info *newinfo; + static struct ip6t_table_info bootstrap + = { 0, 0, { 0 }, { 0 }, { }, { } }; + + MOD_INC_USE_COUNT; + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + SMP_ALIGN(table->table->size) * smp_num_cpus); + if (!newinfo) { + ret = -ENOMEM; + MOD_DEC_USE_COUNT; + return ret; + } + memcpy(newinfo->entries, table->table->entries, table->table->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, table->table->size, + table->table->num_entries, + table->table->hook_entry, + table->table->underflow); + if (ret != 0) { + vfree(newinfo); + MOD_DEC_USE_COUNT; + return ret; + } + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) { + vfree(newinfo); + MOD_DEC_USE_COUNT; + return ret; + } + + /* Don't autoload: we'd eat our tail... 
*/ + if (list_named_find(&ip6t_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. */ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + table->lock = RW_LOCK_UNLOCKED; + list_prepend(&ip6t_tables, table); + + unlock: + up(&ip6t_mutex); + return ret; + + free_unlock: + vfree(newinfo); + MOD_DEC_USE_COUNT; + goto unlock; +} + +void ip6t_unregister_table(struct ip6t_table *table) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_tables, table); + up(&ip6t_mutex); + + /* Decrease module usage counts and free resources */ + IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); + MOD_DEC_USE_COUNT; +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct tcphdr *tcp, + u_int16_t datalen, + int invert, + int *hotdrop) +{ + unsigned int i = sizeof(struct tcphdr); + const u_int8_t *opt = (u_int8_t *)tcp; + + duprintf("tcp_match: finding option\n"); + /* If we don't have the whole header, drop packet. */ + if (tcp->doff * 4 > datalen) { + *hotdrop = 1; + return 0; + } + + while (i < tcp->doff * 4) { + if (opt[i] == option) return !invert; + if (opt[i] < 2) i++; + else i += opt[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct tcphdr *tcp = hdr; + const struct ip6t_tcp *tcpinfo = matchinfo; + + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. 
Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + return 0; + } else if (offset == 0 && datalen < sizeof(struct tcphdr)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* FIXME: Try tcp doff >> packet len against various stacks --RR */ + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + /* Must not be a fragment. */ + return !offset + && port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(tcp->source), + !!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT)) + && port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(tcp->dest), + !!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT)) + && FWINVTCP((((unsigned char *)tcp)[13] + & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IP6T_TCP_INV_FLAGS) + && (!tcpinfo->option + || tcp_find_option(tcpinfo->option, tcp, datalen, + tcpinfo->invflags + & IP6T_TCP_INV_OPTION, + hotdrop)); +} + +/* Called when user tries to insert an entry of this type. 
*/
+/* Accepts a "tcp" match only for rules that select TCP without
+ * inverting the protocol, whose match data has the expected aligned size
+ * and which set no inversion flag outside IP6T_TCP_INV_MASK.
+ * Returns 1 to accept, 0 to reject.
+ */
+static int
+tcp_checkentry(const char *tablename,
+	       const struct ip6t_ip6 *ipv6,
+	       void *matchinfo,
+	       unsigned int matchsize,
+	       unsigned int hook_mask)
+{
+	const struct ip6t_tcp *info = matchinfo;
+
+	/* Must specify proto == TCP ... */
+	if (ipv6->proto != IPPROTO_TCP)
+		return 0;
+	if (ipv6->invflags & IP6T_INV_PROTO)
+		return 0;
+	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_tcp)))
+		return 0;
+
+	/* ... and no unknown invflags */
+	return !(info->invflags & ~IP6T_TCP_INV_MASK);
+}
+
+/* UDP match: source and destination port ranges, each invertible
+ * through udpinfo->invflags.  Never matches fragments; sets *hotdrop for
+ * a first fragment too short to hold a UDP header.
+ */
+static int
+udp_match(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *matchinfo,
+	  int offset,
+	  const void *hdr,
+	  u_int16_t datalen,
+	  int *hotdrop)
+{
+	const struct udphdr *uh = hdr;
+	const struct ip6t_udp *info = matchinfo;
+
+	if (offset == 0 && datalen < sizeof(struct udphdr)) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		duprintf("Dropping evil UDP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	if (!port_match(info->spts[0], info->spts[1],
+			ntohs(uh->source),
+			!!(info->invflags & IP6T_UDP_INV_SRCPT)))
+		return 0;
+
+	return port_match(info->dpts[0], info->dpts[1],
+			  ntohs(uh->dest),
+			  !!(info->invflags & IP6T_UDP_INV_DSTPT)) ? 1 : 0;
+}
+
+/* Called when user tries to insert an entry of this type. 
*/ +static int +udp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & IP6T_INV_PROTO)) { + duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_udp))) { + duprintf("ip6t_udp: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_udp))); + return 0; + } + if (udpinfo->invflags & ~IP6T_UDP_INV_MASK) { + duprintf("ip6t_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static int +icmp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct icmphdr *icmp = hdr; + const struct ip6t_icmp *icmpinfo = matchinfo; + + if (offset == 0 && datalen < 2) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* Must not be a fragment. */ + return !offset + && icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + icmp->type, icmp->code, + !!(icmpinfo->invflags&IP6T_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +icmp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ipv6->proto == IPPROTO_ICMP + && !(ipv6->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_icmp)) + && !(icmpinfo->invflags & ~IP6T_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ip6t_target ip6t_standard_target += { { NULL, NULL }, IP6T_STANDARD_TARGET, NULL, NULL, NULL }; +static struct ip6t_target ip6t_error_target += { { NULL, NULL }, IP6T_ERROR_TARGET, ip6t_error, NULL, NULL }; + +static struct nf_sockopt_ops ip6t_sockopts += { { NULL, NULL }, PF_INET6, IP6T_BASE_CTL, IP6T_SO_SET_MAX+1, do_ip6t_set_ctl, + IP6T_BASE_CTL, IP6T_SO_GET_MAX+1, do_ip6t_get_ctl, 0, NULL }; + +static struct ip6t_match tcp_matchstruct += { { NULL, NULL }, "tcp", &tcp_match, &tcp_checkentry, NULL }; +static struct ip6t_match udp_matchstruct += { { NULL, NULL }, "udp", &udp_match, &udp_checkentry, NULL }; +static struct ip6t_match icmp_matchstruct += { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL }; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct ip6t_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int ip6t_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_tables, print_name, struct ip6t_table *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + /* `start' hack - see fs/proc/generic.c 
line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ip6t_mutex); + list_append(&ip6t_target, &ip6t_standard_target); + list_append(&ip6t_target, &ip6t_error_target); + list_append(&ip6t_match, &tcp_matchstruct); + list_append(&ip6t_match, &udp_matchstruct); + list_append(&ip6t_match, &icmp_matchstruct); + up(&ip6t_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + if (!proc_net_create("ip6_tables_names", 0, ip6t_get_tables)) { + nf_unregister_sockopt(&ip6t_sockopts); + return -ENOMEM; + } +#endif + + printk("ip6_tables: (c)2000 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ip6t_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("ip6_tables_names"); +#endif +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c new file mode 100644 index 000000000000..dd8bb322691c --- /dev/null +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -0,0 +1,66 @@ +/* This is a module which is used for setting the NFMARK field of an skb. 
*/
+#include
+#include
+#include
+#include
+/* NOTE(review): the operands of the #include lines in this file are
+ * missing in this copy, so the headers it depends on cannot be told
+ * from here. */
+#include
+#include
+/* Target hook: stores the mark carried in targinfo into the packet's
+ * nfmark when it differs, flagging the packet with NFC_ALTERED, and
+ * always lets rule traversal continue.
+ *
+ * NOTE(review): this file lives under net/ipv6/netfilter but uses the
+ * ipt_ / IPT_ identifiers and registers through ipt_register_target() --
+ * confirm whether the ip6t_ equivalents were intended.
+ */
+static unsigned int
+target(struct sk_buff **pskb,
+       unsigned int hooknum,
+       const struct net_device *in,
+       const struct net_device *out,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_mark_target_info *markinfo = targinfo;
+
+	if((*pskb)->nfmark != markinfo->mark) {
+		(*pskb)->nfmark = markinfo->mark;
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+/* Rule-insertion check: the target data must have exactly the aligned
+ * size of struct ipt_mark_target_info and the rule must be in the
+ * "mangle" table.  Returns 1 to accept, 0 (after a warning) to reject.
+ */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
+		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	return 1;
+}
+/* Registration record for the "MARK" target. */
+static struct ipt_target ipt_mark_reg
+= { { NULL, NULL }, "MARK", target, checkentry, NULL, THIS_MODULE };
+/* Module init: registers the target; any registration failure is
+ * reported as -EINVAL. */
+static int __init init(void)
+{
+	if (ipt_register_target(&ipt_mark_reg))
+		return -EINVAL;
+
+	return 0;
+}
+/* Module exit: unregisters the target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_mark_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_limit.c b/net/ipv6/netfilter/ip6t_limit.c
new file mode 100644
index 000000000000..0d79a9a8bd73
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_limit.c
@@ -0,0 +1,135 @@
+/* Kernel module to control the rate
+ *
+ * Jérôme de Vivie
+ * Hervé Eychenne
+ *
+ * 2 September 1999: Changed from the target RATE to the match
+ *		`limit', removed logging.  Did I mention that
+ *		Alexey is a fucking genius?
+ *		Rusty Russell (rusty@rustcorp.com.au). 
*/
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+/* One lock guards the credit state of every limit rule. */
+static spinlock_t limit_lock = SPIN_LOCK_UNLOCKED;
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To avoid underflow, we multiply by 128 (ie. you get 128 credits per
+   jiffy).  Hence a cost of 2^32-1, means one pass per 32768 seconds
+   at 1024HZ (or one every 9 hours).  A cost of 1 means 12800 passes
+   per second at 100HZ.  */
+
+#define CREDITS_PER_JIFFY 128
+/* Match hook: token-bucket test on the rule's shared state (->master).
+ * Under limit_lock, credit grows by CREDITS_PER_JIFFY for each jiffy
+ * elapsed since ->prev and is capped at credit_cap.  Returns 1 and
+ * deducts ->cost when enough credit is available, 0 otherwise.
+ */
+static int
+ipt_limit_match(const struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		const void *matchinfo,
+		int offset,
+		const void *hdr,
+		u_int16_t datalen,
+		int *hotdrop)
+{
+	struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
+	unsigned long now = jiffies;
+
+	spin_lock_bh(&limit_lock);
+	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
+	if (r->credit > r->credit_cap)
+		r->credit = r->credit_cap;
+
+	if (r->credit >= r->cost) {
+		/* We're not limited. */
+		r->credit -= r->cost;
+		spin_unlock_bh(&limit_lock);
+		return 1;
+	}
+
+       	spin_unlock_bh(&limit_lock);
+	return 0;
+}
+
+/* Precision saver: converts a user-supplied value to credits, i.e.
+   user * HZ * CREDITS_PER_JIFFY / IPT_LIMIT_SCALE, dividing before
+   multiplying when the product would not fit in 32 bits. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
+}
+/* Rule-insertion check: verifies the match data size, rejects a zero
+ * burst or an avg * burst that wraps, then initialises the credit state
+ * and points ->master at this copy.  Returns 1 to accept, 0 to reject.
+ */
+static int
+ipt_limit_checkentry(const char *tablename,
+		     const struct ip6t_ip6 *ip,
+		     void *matchinfo,
+		     unsigned int matchsize,
+		     unsigned int hook_mask)
+{
+	struct ipt_rateinfo *r = matchinfo;
+
+	if (matchsize != IP6T_ALIGN(sizeof(struct ipt_rateinfo)))
+		return 0;
+
+	/* Check for overflow. */
+	if (r->burst == 0
+	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+		printk("Call rusty: overflow in ipt_limit: %u/%u\n",
+		       r->avg, r->burst);
+		return 0;
+	}
+
+	/* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
+	   128. */
+	r->prev = jiffies;
+	r->credit = user2credits(r->avg * r->burst);	 /* Credits full. */
+	r->credit_cap = user2credits(r->avg * r->burst); /* Credit ceiling. */
+	r->cost = user2credits(r->avg);
+
+	/* For SMP, we only want to use one set of counters. */
+	r->master = r;
+
+	return 1;
+}
+/* Registration record for the "limit" match. */
+static struct ip6t_match ipt_limit_reg
+= { { NULL, NULL }, "limit", ipt_limit_match, ipt_limit_checkentry, NULL,
+    THIS_MODULE };
+/* Module init: registers the match; any registration failure is
+ * reported as -EINVAL. */
+static int __init init(void)
+{
+	if (ip6t_register_match(&ipt_limit_reg))
+		return -EINVAL;
+	return 0;
+}
+/* Module exit: unregisters the match. */
+static void __exit fini(void)
+{
+	ip6t_unregister_match(&ipt_limit_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_mark.c b/net/ipv6/netfilter/ip6t_mark.c
new file mode 100644
index 000000000000..babe202a4e94
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_mark.c
@@ -0,0 +1,50 @@
+/* Kernel module to match NFMARK values. 
*/ +#include +#include + +#include +#include + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ipt_mark_info))) + return 0; + + return 1; +} + +static struct ip6t_match mark_match += { { NULL, NULL }, "mark", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + return ip6t_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c new file mode 100644 index 000000000000..cc40383858fc --- /dev/null +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -0,0 +1,183 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + */ +#include +#include + +#define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) + +/* Standard entry. 
*/ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[3]; + struct ip6t_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), + { [NF_IP6_LOCAL_IN] 0, + [NF_IP6_FORWARD] sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] sizeof(struct ip6t_standard) * 2 }, + { [NF_IP6_LOCAL_IN] 0, + [NF_IP6_FORWARD] sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] sizeof(struct ip6t_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_error), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ip6t_table packet_filter += { { NULL, NULL }, "filter", 
&initial_table.repl, + FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL }; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6t_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ip6t_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if 0 + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ip6t_ops[] += { { { NULL, NULL }, ip6t_hook, PF_INET6, NF_IP6_LOCAL_IN, NF_IP6_PRI_FILTER }, + { { NULL, NULL }, ip6t_hook, PF_INET6, NF_IP6_FORWARD, NF_IP6_PRI_FILTER }, + { { NULL, NULL }, ip6t_local_out_hook, PF_INET6, NF_IP6_LOCAL_OUT, + NF_IP6_PRI_FILTER } +}; + +/* Default to forward because I got too much mail already. 
*/ +static int forward = NF_ACCEPT; +MODULE_PARM(forward, "i"); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ip6t_register_table(&packet_filter); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 973e8435c97a..0a2a58c34c1d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: af_unix.c,v 1.95 2000/05/09 04:48:37 davem Exp $ + * Version: $Id: af_unix.c,v 1.96 2000/05/12 23:51:26 davem Exp $ * * Fixes: * Linus Torvalds : Assorted bug cures. 
diff --git a/scripts/header.tk b/scripts/header.tk index 6a88f6881062..f2189e239925 100644 --- a/scripts/header.tk +++ b/scripts/header.tk @@ -312,7 +312,7 @@ proc option_name {w mnum line text helpidx} { $w.x$line.l configure -activefore [cget $w.x$line.l -fg] \ -activeback [cget $w.x$line.l -bg] button $w.x$line.help -text "Help" -relief raised \ - -command "dohelp .dohelp $helpidx" + -command "dohelp .dohelp $helpidx .menu$mnum" pack $w.x$line.help -side right -fill y pack $w.x$line.l -side right -fill both -expand on } @@ -325,9 +325,9 @@ proc toggle_switch2 {w mnum line text variable} { -relief groove -width 2 -command "update_active" radiobutton $w.x$line.n -text "n" -variable $variable -value 0 \ -relief groove -width 2 -command "update_active" - + option_name $w $mnum $line $text $variable - + pack $w.x$line.n $w.x$line.m $w.x$line.y -side right -fill y } @@ -339,9 +339,9 @@ proc toggle_switch3 {w mnum line text variable} { -relief groove -width 2 -command "update_active" radiobutton $w.x$line.n -text "n" -variable $variable -value 0 \ -relief groove -width 2 -command "update_active" - + option_name $w $mnum $line $text $variable - + global CONFIG_MODULES if {($CONFIG_MODULES == 0)} then { $w.x$line.m configure -state disabled @@ -400,6 +400,17 @@ proc minimenu { w mnum line text variable helpidx } { pack $w.x$line -anchor w -fill both -expand on } +proc menusplit {w m n} { + if { $n > 2 } then { + set menuoptsize [expr [$m yposition 2] - [$m yposition 1]] + set maxsize [winfo screenheight $w] + set splitpoint [expr $maxsize * 4 / 5 / $menuoptsize - 1] + for {set i [expr $splitpoint + 1]} {$i <= $n} {incr i $splitpoint} { + $m entryconfigure $i -columnbreak 1 + } + } +} + proc submenu { w mnum line text subnum } { frame $w.x$line button $w.x$line.l -text "" -width 15 -relief groove @@ -412,11 +423,20 @@ proc submenu { w mnum line text subnum } { pack $w.x$line -anchor w -fill both -expand on } -proc comment {w line text } { -#nothing done for comments now. 
+proc comment {w mnum line text } { + frame $w.x$line + button $w.x$line.l -text "" -width 15 -relief groove + $w.x$line.l configure -activefore [cget $w.x$line.l -fg] \ + -activeback [cget $w.x$line.l -bg] -state disabled + button $w.x$line.m -text "$text" -relief groove -anchor w + $w.x$line.m configure -activefore [cget $w.x$line.m -fg] \ + -activeback [cget $w.x$line.m -bg] + pack $w.x$line.l -side left -fill both + pack $w.x$line.m -anchor w -side right -fill both -expand on + pack $w.x$line -anchor w -fill both -expand on } -proc dohelp {w var } { +proc dohelp {w var parent} { catch {destroy $w} toplevel $w -class Dialog @@ -442,45 +462,73 @@ ${var}:\\ " Documentation/Configure.help] set found [expr [string length "$message"] > 0] } - + frame $w.f1 + pack $w.f1 -fill both -expand on + + # Do the OK button + # + set oldFocus [focus] + frame $w.f2 + button $w.f2.ok -text "OK" \ + -width 10 -command "destroy $w; catch {focus $oldFocus}" + pack $w.f2.ok -side bottom -pady 6 -anchor n + pack $w.f2 -side bottom -padx 10 -anchor s + + scrollbar $w.f1.vscroll -command "$w.f1.canvas yview" + pack $w.f1.vscroll -side right -fill y + + canvas $w.f1.canvas -relief flat -borderwidth 0 \ + -yscrollcommand "$w.f1.vscroll set" + frame $w.f1.f + pack $w.f1.canvas -side right -fill y -expand on if { $found == 0 } then { if { $filefound == 0 } then { - message $w.f1.m -width 750 -aspect 300 -relief flat -text \ + message $w.f1.f.m -width 750 -aspect 300 -relief flat -text \ "No help available - unable to open file Documentation/Configure.help. This file should have come with your kernel." 
} else { - message $w.f1.m -width 400 -aspect 300 -relief flat -text \ + message $w.f1.f.m -width 400 -aspect 300 -relief flat -text \ "No help available for $var" } label $w.f1.bm -bitmap error wm title $w "RTFM" } else { - text $w.f1.m -width 73 -relief flat -wrap word - $w.f1.m insert 0.0 $message - $w.f1.m conf -state disabled -height [$w.f1.m index end] + text $w.f1.f.m -width 73 -relief flat -wrap word + $w.f1.f.m insert 0.0 $message + $w.f1.f.m conf -state disabled -height [$w.f1.f.m index end] label $w.f1.bm -bitmap info wm title $w "Configuration help" } - pack $w.f1.bm $w.f1.m -side left -padx 10 - pack $w.f1 -side top - set oldFocus [focus] - - # Do the OK button - # - frame $w.f2 - button $w.f2.ok -text "OK" \ - -width 10 -command "destroy $w; catch {focus $oldFocus}" - pack $w.f2.ok -side bottom -pady 6 -anchor n - pack $w.f2 -side bottom -padx 10 -anchor s + pack $w.f1.f.m -side left + pack $w.f1.bm $w.f1.f -side left -padx 10 - # Finish off the window - # focus $w - global winx; global winy - set winx [expr [winfo x .]+30]; set winy [expr [winfo y .]+30] + set winx [expr [winfo x $parent]+20] + set winy [expr [winfo y $parent]+20] wm geometry $w +$winx+$winy + set sizok [expr [winfo reqheight $w.f2.ok] + 12] + set maxy [expr [winfo screenheight .] * 3 / 4] + set canvtotal [winfo reqheight $w.f1.f.m] + if [expr $sizok + $canvtotal < $maxy] { + set sizy $canvtotal + } else { + set sizy [expr $maxy - $sizok] + } + $w.f1.canvas configure -height $sizy -width [winfo reqwidth $w.f1.f.m] \ + -scrollregion "0 0 [winfo reqwidth $w.f1.f.m] \ + [winfo reqheight $w.f1.f.m]" + $w.f1.canvas create window 0 0 -anchor nw -window $w.f1.f + update idletasks + + set maxy [winfo screenheight .] 
+ if [expr $sizok + $canvtotal < $maxy] { + set sizy [expr $sizok + $canvtotal] + } else { + set sizy $maxy + } + wm maxsize $w [winfo width $w] $sizy } proc wrapup {w } { diff --git a/scripts/tail.tk b/scripts/tail.tk index 7ca01f252357..3627424519e0 100644 --- a/scripts/tail.tk +++ b/scripts/tail.tk @@ -55,6 +55,16 @@ pack .f0.right.store .f0.right.load .f0.right.quit .f0.right.save \ pack .f0.left .f0.middle .f0.right -side left -padx 5 -pady 0 -fill y pack .f0 -padx 5 -pady 5 +update idletasks +set winy [expr 10 + [winfo reqheight .f0]] +set scry [lindex [wm maxsize .] 1] +set winx [expr 10 + [winfo reqwidth .f0]] +set scrx [lindex [wm maxsize .] 0] +if {$winx < $scrx} then {set maxx -1} else {set maxx $winx} +if {$winy < $scry} then {set maxy -1} else {set maxy $winy} +.f0 configure -width $winx -height $winy +wm maxsize . $maxx $maxy + # # If we cannot write our config files, disable the write button. # diff --git a/scripts/tkgen.c b/scripts/tkgen.c index e016218d728e..18d968eff8c8 100644 --- a/scripts/tkgen.c +++ b/scripts/tkgen.c @@ -265,6 +265,8 @@ void generate_if( struct kconfig * cfg, struct condition * ocond, || cfg->token == token_define_int || cfg->token == token_define_string || cfg->token == token_define_tristate || cfg->token == token_unset ) return; + if ( cfg->token == token_comment && line_num == -1 ) + return; } else { @@ -464,6 +466,7 @@ void generate_if( struct kconfig * cfg, struct condition * ocond, menu_num, line_num ); break; + case token_comment: case token_mainmenu_option: if ( line_num >= 0 ) { @@ -1135,6 +1138,7 @@ void dump_tk_script( struct kconfig * scfg ) case token_bool: case token_choice_header: case token_choice_item: + case token_comment: case token_dep_bool: case token_dep_tristate: case token_dep_mbool: @@ -1189,6 +1193,8 @@ void dump_tk_script( struct kconfig * scfg ) { int menu_line = 0; int nr_submenu = imenu; + int menu_name_omitted = 0; + int opt_count = 0; clear_globalflags(); start_proc( menu_first[imenu]->label, 
imenu, @@ -1210,6 +1216,21 @@ void dump_tk_script( struct kconfig * scfg ) cfg = menu_last[nr_submenu]; break; + case token_comment: + if ( !cfg->menu_line && !menu_name_omitted ) + { + cfg->menu_line = -1; + menu_name_omitted = 1; + } + else + { + menu_name_omitted = 1; + cfg->menu_line = menu_line++; + printf( "\tcomment $w.config.f %d %d \"%s\"\n", + cfg->menu_number, cfg->menu_line, cfg->label ); + } + break; + case token_bool: cfg->menu_line = menu_line++; printf( "\tbool $w.config.f %d %d \"%s\" %s\n", @@ -1227,8 +1248,10 @@ void dump_tk_script( struct kconfig * scfg ) printf( "\tminimenu $w.config.f %d %d \"%s\" tmpvar_%d %s\n", cfg->menu_number, cfg->menu_line, cfg->label, -(cfg->nameindex), vartable[cfg->next->nameindex].name ); - printf( "\tmenu $w.config.f.x%d.x.menu\n", cfg->menu_line ); + printf( "\tmenu $w.config.f.x%d.x.menu -title \"%s\"\n", + cfg->menu_line, cfg->label ); cfg1 = cfg; + opt_count = 0; break; case token_choice_item: @@ -1236,6 +1259,12 @@ void dump_tk_script( struct kconfig * scfg ) printf( "\t$w.config.f.x%d.x.menu add radiobutton -label \"%s\" -variable tmpvar_%d -value \"%s\" -command \"update_active\"\n", cfg1->menu_line, cfg->label, -(cfg1->nameindex), cfg->label ); + opt_count++; + if ( cfg->next && cfg->next->token != token_choice_item ) { + /* last option in the menu */ + printf( "\tmenusplit $w $w.config.f.x%d.x.menu %d\n", + cfg1->menu_line, opt_count ); + } break; case token_dep_bool: diff --git a/scripts/tkparse.c b/scripts/tkparse.c index bf27a3a8b4c9..e17f109a49d2 100644 --- a/scripts/tkparse.c +++ b/scripts/tkparse.c @@ -130,7 +130,7 @@ static const char * get_string( const char * pnt, char ** label ) static const char * get_qstring( const char * pnt, char ** label ) { char quote_char; - char newlabel [1024]; + char newlabel [2048]; char * pnt1; /* advance to the open quote */ @@ -705,7 +705,7 @@ static void tokenize_line( const char * pnt ) */ static void do_source( const char * filename ) { - char buffer [1024]; + 
char buffer [2048]; FILE * infile; const char * old_file; int old_lineno; -- 2.39.5