config X86
bool
default y
- help
- This is Linux's home port. Linux was originally native to the Intel
- 386, and runs on all the later x86 processors including the Intel
- 486, 586, Pentiums, and various instruction-set-compatible chips by
- AMD, Cyrix, and others.
config MMU
bool
config ISA
bool
- help
- Find out whether you have ISA slots on your motherboard. ISA is the
- name of a bus system, i.e. the way the CPU talks to the other stuff
- inside your box. Other bus systems are PCI, EISA, MicroChannel
- (MCA) or VESA. ISA is an older system, now being displaced by PCI;
- newer boards don't support it. If you have ISA, say Y, otherwise N.
config SBUS
bool
-config UID16
- bool
- default y
-
config RWSEM_GENERIC_SPINLOCK
bool
default y
default MK8
config MK8
- bool "AMD-Hammer"
+ bool "AMD-Opteron/Athlon64"
help
- Support for AMD Clawhammer/Sledgehammer CPUs. Only choice for x86-64
- currently so you should choose this if you want a x86-64 kernel. In fact
- you will have no other choice than to choose this.
+ Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
config GENERIC_CPU
bool "Generic-x86-64"
+ help
+ Generic x86-64 CPU.
endchoice
singleprocessor machines. On a singleprocessor machine, the kernel
will run faster if you say N here.
- Note that if you say Y here and choose architecture "586" or
- "Pentium" under "Processor family", the kernel will not work on 486
- architectures. Similarly, multiprocessor kernels for the "PPro"
- architecture may not work on all Pentium based boards.
-
- People using multiprocessor machines who say Y here should also say
- Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
- Management" code will be disabled if you say Y here.
-
- See also the <file:Documentation/smp.tex>,
- <file:Documentation/smp.txt>, <file:Documentation/i386/IO-APIC.txt>,
- <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
- <http://www.tldp.org/docs.html#howto>.
-
If you don't know what to do here, say N.
+# broken currently
config PREEMPT
+ depends on NOT_WORKING
bool "Preemptible Kernel"
- depends on !SMP
---help---
This option reduces the latency of the kernel when reacting to
real-time or interactive events by allowing a low priority process to
Say Y here if you are feeling brave and building a kernel for a
desktop, embedded or real-time system. Say N if you are unsure.
+# someone write a better help text please.
+config K8_NUMA
+ bool "K8 NUMA support"
+ depends on SMP
+ help
+	  Enable NUMA (Non-Uniform Memory Access) support for
+ AMD Opteron Multiprocessor systems. The kernel will try to allocate
+ memory used by a CPU on the local memory controller of the CPU
+ and in the future do more optimizations. This may improve performance
+ or it may not. Code is still experimental.
+ Say N if unsure.
+
+config DISCONTIGMEM
+ bool
+ depends on K8_NUMA
+ default y
+
+config NUMA
+ bool
+ depends on K8_NUMA
+ default y
+
config HAVE_DEC_LOCK
bool
depends on SMP
kernel will support. The maximum supported value is 32 and the
minimum value which makes sense is 2.
- This is purely to save memory - each supported CPU adds
- approximately eight kilobytes to the kernel image.
+ This is purely to save memory - each supported CPU requires
+ memory in the static kernel configuration.
config GART_IOMMU
bool "IOMMU support"
help
Support the K8 IOMMU. Needed to run systems with more than 4GB of memory
- properly with 32-bit devices. You should probably turn this on.
- The iommu can be turned off at runtime with the iommu=off parameter.
+	  properly with 32-bit PCI devices that do not support DAC (Dual Address
+ Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
+ Normally the kernel will take the right choice by itself.
+	  If unsure, say Y.
config DUMMY_IOMMU
bool
Note that, even if you say N here, Linux on the x86 architecture
will issue the hlt instruction if nothing is to be done, thereby
- sending the processor to sleep and saving power.
+	  sending the processor to limited sleep and saving power. However,
+ using ACPI will likely save more power.
config SOFTWARE_SUSPEND
bool "Software Suspend (EXPERIMENTAL)"
config PCI
bool "PCI support"
- help
- Find out whether you have a PCI motherboard. PCI is the name of a
- bus system, i.e. the way the CPU talks to the other stuff inside
- your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
- VESA. If you have PCI, say Y, otherwise N.
-
- The PCI-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>, contains valuable
- information about which PCI hardware does work under Linux and which
- doesn't.
# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
config PCI_DIRECT
bool
depends on PROC_FS
default y
- ---help---
- If you enabled support for /proc file system then the file
- /proc/kcore will contain the kernel core image. This can be used
- in gdb:
-
- $ cd /usr/src/linux ; gdb vmlinux /proc/kcore
- You have two choices here: ELF and A.OUT. Selecting ELF will make
- /proc/kcore appear in ELF core format as defined by the Executable
- and Linkable Format specification. Selecting A.OUT will choose the
- old "a.out" format which may be necessary for some old versions
- of binutils or on some architectures.
-
- This is especially useful if you have compiled the kernel with the
- "-g" option to preserve debugging information. It is mainly used
- for examining kernel data structures on the live kernel so if you
- don't understand what this means or are not a kernel hacker, just
- leave it at its default value ELF.
-
-#tristate 'Kernel support for a.out binaries' CONFIG_BINFMT_AOUT
config BINFMT_ELF
- tristate "Kernel support for ELF binaries"
- ---help---
- ELF (Executable and Linkable Format) is a format for libraries and
- executables used across different architectures and operating
- systems. Saying Y here will enable your kernel to run ELF binaries
- and enlarge it by about 13 KB. ELF support under Linux has now all
- but replaced the traditional Linux a.out formats (QMAGIC and ZMAGIC)
- because it is portable (this does *not* mean that you will be able
- to run executables from different architectures or operating systems
- however) and makes building run-time libraries very easy. Many new
- executables are distributed solely in ELF format. You definitely
- want to say Y here.
-
- Information about ELF is contained in the ELF HOWTO available from
- <http://www.tldp.org/docs.html#howto>.
-
- If you find that after upgrading from Linux kernel 1.2 and saying Y
- here, you still can't run any ELF binaries (they just crash), then
- you'll have to install the newest ELF runtime libraries, including
- ld.so (check the file <file:Documentation/Changes> for location and
- latest version).
-
- If you want to compile this as a module ( = code which can be
- inserted in and removed from the running kernel whenever you want),
- say M here and read <file:Documentation/modules.txt>. The module
- will be called binfmt_elf. Saying M or N here is dangerous because
- some crucial programs on your system might be in ELF format.
+ bool
+ default y
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
If you say Y here, it will be possible to plug wrapper-driven binary
formats into the kernel. You will like this especially when you use
programs that need an interpreter to run like Java, Python or
- Emacs-Lisp. It's also useful if you often run DOS executables under
- the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>). Once you have
- registered such a binary class with the kernel, you can start one of
- those programs simply by typing in its name at a shell prompt; Linux
- will automatically feed it to the correct interpreter.
+ Emacs-Lisp. Once you have registered such a binary class with the kernel,
+ you can start one of those programs simply by typing in its name at a shell
+ prompt; Linux will automatically feed it to the correct interpreter.
You can do other nice things, too. Read the file
<file:Documentation/binfmt_misc.txt> to learn how to use this
depends on IA32_EMULATION
default y
+
+config UID16
+ bool
+ depends on IA32_EMULATION
+ default y
+
endmenu
source "drivers/mtd/Kconfig"
best used in conjunction with the NMI watchdog so that spinlock
deadlocks are also debuggable.
+# !SMP for now because the context switch early causes GPF in segment reloading
+# and the GS base checking does the wrong thing then, causing a hang.
config CHECKING
bool "Additional run-time checks"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !SMP
help
Enables some internal consistency checks for kernel debugging.
You should normally say N.
bool "Debug __init statements"
depends on DEBUG_KERNEL
help
- Fill __init and __initdata at the end of boot. This is only for debugging.
+ Fill __init and __initdata at the end of boot. This helps debugging
+ illegal uses of __init and __initdata after initialization.
config KALLSYMS
bool "Load all symbols for debugging/kksymoops"
bool "Compile the kernel with frame pointers"
depends on DEBUG_KERNEL
help
- If you say Y here the resulting kernel image will be slightly larger
- and slower, but it will give very useful debugging information.
- If you don't debug the kernel, you can say N, but we may not be able
- to solve problems without frame pointers.
- Note this is normally not needed on x86-64.
+ Compile the kernel with frame pointers. This may help for some
+ debugging with external debuggers. Note the standard oops backtracer
+	  doesn't make use of it and the x86-64 kernel doesn't ensure a consistent
+ frame pointer through inline assembly (semaphores etc.)
+ Normally you should say N.
endmenu
boot := arch/x86_64/boot
-.PHONY: bzImage bzlilo bzdisk install archmrproper
+.PHONY: bzImage bzlilo install archmrproper \
+ fdimage fdimage144 fdimage288 archclean
#Default target when executing "make"
all: bzImage
bzdisk: vmlinux
$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
-install: vmlinux
+install fdimage fdimage144 fdimage288: vmlinux
$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
archclean:
echo ' install to $$(INSTALL_PATH) and run lilo'
endef
+CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
+
+
$(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
-zdisk: $(BOOTIMAGE)
- dd bs=8192 if=$(BOOTIMAGE) of=/dev/fd0
+# Set this if you want to pass append arguments to the zdisk/fdimage kernel
+FDARGS =
+
+$(obj)/mtools.conf: $(obj)/mtools.conf.in
+ sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+
+# This requires write access to /dev/fd0
+zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+ MTOOLSRC=$(src)/mtools.conf mformat a: ; sync
+ syslinux /dev/fd0 ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
+ MTOOLSRC=$(src)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
+
+# These require being root or having syslinux run setuid
+fdimage fdimage144: $(BOOTIMAGE) $(src)/mtools.conf
+ dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
+ MTOOLSRC=$(src)/mtools.conf mformat v: ; sync
+ syslinux $(obj)/fdimage ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(src)/mtools.conf mcopy - v:syslinux.cfg
+ MTOOLSRC=$(src)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
+
+fdimage288: $(BOOTIMAGE) $(src)/mtools.conf
+ dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
+ MTOOLSRC=$(src)/mtools.conf mformat w: ; sync
+ syslinux $(obj)/fdimage ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(src)/mtools.conf mcopy - w:syslinux.cfg
+ MTOOLSRC=$(src)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
zlilo: $(BOOTIMAGE)
if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
* modified by Drew Eckhardt
* modified by Bruce Evans (bde)
* modified by Chris Noe (May 1999) (as86 -> gas)
- *
- * 360k/720k disk support: Andrzej Krzysztofowicz <ankry@green.mif.pg.gda.pl>
+ * gutted by H. Peter Anvin (Jan 2003)
*
* BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
* addresses must be multiplied by 16 to obtain their respective linear
* addresses. To avoid confusion, linear addresses are written using leading
* hex while segment addresses are written as segment:offset.
*
- * bde - should not jump blindly, there may be systems with only 512K low
- * memory. Use int 0x12 to get the top of memory, etc.
- *
- * It then loads 'setup' directly after itself (0x90200), and the system
- * at 0x10000, using BIOS interrupts.
- *
- * NOTE! currently system is at most (8*65536-4096) bytes long. This should
- * be no problem, even in the future. I want to keep it simple. This 508 kB
- * kernel size should be enough, especially as this doesn't contain the
- * buffer cache as in minix (and especially now that the kernel is
- * compressed :-)
- *
- * The loader has been made as simple as possible, and continuous
- * read errors will result in a unbreakable loop. Reboot by hand. It
- * loads pretty fast by getting whole tracks at a time whenever possible.
*/
#include <asm/boot.h>
.global _start
_start:
-# First things first. Move ourself from 0x7C00 -> 0x90000 and jump there.
-
- movw $BOOTSEG, %ax
- movw %ax, %ds # %ds = BOOTSEG
- movw $INITSEG, %ax
- movw %ax, %es # %ax = %es = INITSEG
- movw $256, %cx
- subw %si, %si
- subw %di, %di
- cld
- rep
- movsw
- ljmp $INITSEG, $go
-
-# bde - changed 0xff00 to 0x4000 to use debugger at 0x6400 up (bde). We
-# wouldn't have to worry about this if we checked the top of memory. Also
-# my BIOS can be configured to put the wini drive tables in high memory
-# instead of in the vector table. The old stack might have clobbered the
-# drive table.
+ # Normalize the start address
+ jmpl $BOOTSEG, $start2
-go: movw $0x4000-12, %di # 0x4000 is an arbitrary value >=
- # length of bootsect + length of
- # setup + room for stack;
- # 12 is disk parm size.
- movw %ax, %ds # %ax and %es already contain INITSEG
+start2:
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
movw %ax, %ss
- movw %di, %sp # put stack at INITSEG:0x4000-12.
-
-# Many BIOS's default disk parameter tables will not recognize
-# multi-sector reads beyond the maximum sector number specified
-# in the default diskette parameter tables - this may mean 7
-# sectors in some cases.
-#
-# Since single sector reads are slow and out of the question,
-# we must take care of this by creating new parameter tables
-# (for the first disk) in RAM. We will set the maximum sector
-# count to 36 - the most we will encounter on an ED 2.88.
-#
-# High doesn't hurt. Low does.
-#
-# Segments are as follows: %cs = %ds = %es = %ss = INITSEG, %fs = 0,
-# and %gs is unused.
-
- movw %cx, %fs # %fs = 0
- movw $0x78, %bx # %fs:%bx is parameter table address
- pushw %ds
- ldsw %fs:(%bx), %si # %ds:%si is source
- movb $6, %cl # copy 12 bytes
- pushw %di # %di = 0x4000-12.
- rep # don't worry about cld
- movsw # already done above
- popw %di
- popw %ds
- movb $36, 0x4(%di) # patch sector count
- movw %di, %fs:(%bx)
- movw %es, %fs:2(%bx)
-
-# Get disk drive parameters, specifically number of sectors/track.
+ movw $0x7c00, %sp
+ sti
+ cld
-# It seems that there is no BIOS call to get the number of sectors.
-# Guess 36 sectors if sector 36 can be read, 18 sectors if sector 18
-# can be read, 15 if sector 15 can be read. Otherwise guess 9.
-# Note that %cx = 0 from rep movsw above.
+ movw $bugger_off_msg, %si
- movw $disksizes, %si # table of sizes to try
-probe_loop:
+msg_loop:
lodsb
- cbtw # extend to word
- movw %ax, sectors
- cmpw $disksizes+4, %si
- jae got_sectors # If all else fails, try 9
-
- xchgw %cx, %ax # %cx = track and sector
- xorw %dx, %dx # drive 0, head 0
- movw $0x0200, %bx # address = 512, in INITSEG (%es = %cs)
- movw $0x0201, %ax # service 2, 1 sector
- int $0x13
- jc probe_loop # try next value
-
-got_sectors:
- movb $0x03, %ah # read cursor pos
- xorb %bh, %bh
- int $0x10
- movw $9, %cx
- movb $0x07, %bl # page 0, attribute 7 (normal)
- # %bh is set above; int10 doesn't
- # modify it
- movw $msg1, %bp
- movw $0x1301, %ax # write string, move cursor
- int $0x10 # tell the user we're loading..
-
-# Load the setup-sectors directly after the moved bootblock (at 0x90200).
-# We should know the drive geometry to do it, as setup may exceed first
-# cylinder (for 9-sector 360K and 720K floppies).
-
- movw $0x0001, %ax # set sread (sector-to-read) to 1 as
- movw $sread, %si # the boot sector has already been read
- movw %ax, (%si)
-
- call kill_motor # reset FDC
- movw $0x0200, %bx # address = 512, in INITSEG
-next_step:
- movb setup_sects, %al
- movw sectors, %cx
- subw (%si), %cx # (%si) = sread
- cmpb %cl, %al
- jbe no_cyl_crossing
- movw sectors, %ax
- subw (%si), %ax # (%si) = sread
-no_cyl_crossing:
- call read_track
- pushw %ax # save it
- call set_next # set %bx properly; it uses %ax,%cx,%dx
- popw %ax # restore
- subb %al, setup_sects # rest - for next step
- jnz next_step
-
- pushw $SYSSEG
- popw %es # %es = SYSSEG
- call read_it
- call kill_motor
- call print_nl
-
-# After that we check which root-device to use. If the device is
-# defined (!= 0), nothing is done and the given device is used.
-# Otherwise, one of /dev/fd0H2880 (2,32) or /dev/PS0 (2,28) or /dev/at0 (2,8)
-# depending on the number of sectors we pretend to know we have.
-
-# Segments are as follows: %cs = %ds = %ss = INITSEG,
-# %es = SYSSEG, %fs = 0, %gs is unused.
-
- movw root_dev, %ax
- orw %ax, %ax
- jne root_defined
-
- movw sectors, %bx
- movw $0x0208, %ax # /dev/ps0 - 1.2Mb
- cmpw $15, %bx
- je root_defined
-
- movb $0x1c, %al # /dev/PS0 - 1.44Mb
- cmpw $18, %bx
- je root_defined
-
- movb $0x20, %al # /dev/fd0H2880 - 2.88Mb
- cmpw $36, %bx
- je root_defined
-
- movb $0, %al # /dev/fd0 - autodetect
-root_defined:
- movw %ax, root_dev
-
-# After that (everything loaded), we jump to the setup-routine
-# loaded directly after the bootblock:
-
- ljmp $SETUPSEG, $0
-
-# These variables are addressed via %si register as it gives shorter code.
-
-sread: .word 0 # sectors read of current track
-head: .word 0 # current head
-track: .word 0 # current track
-
-# This routine loads the system at address SYSSEG, making sure
-# no 64kB boundaries are crossed. We try to load it as fast as
-# possible, loading whole tracks whenever we can.
-
-read_it:
- movw %es, %ax # %es = SYSSEG when called
- testw $0x0fff, %ax
-die: jne die # %es must be at 64kB boundary
- xorw %bx, %bx # %bx is starting address within segment
-rp_read:
-#ifdef __BIG_KERNEL__ # look in setup.S for bootsect_kludge
- bootsect_kludge = 0x220 # 0x200 + 0x20 which is the size of the
- lcall *bootsect_kludge # bootsector + bootsect_kludge offset
-#else
- movw %es, %ax
- subw $SYSSEG, %ax
- movw %bx, %cx
- shr $4, %cx
- add %cx, %ax # check offset
-#endif
- cmpw syssize, %ax # have we loaded everything yet?
- jbe ok1_read
-
- ret
-
-ok1_read:
- movw sectors, %ax
- subw (%si), %ax # (%si) = sread
- movw %ax, %cx
- shlw $9, %cx
- addw %bx, %cx
- jnc ok2_read
-
- je ok2_read
-
- xorw %ax, %ax
- subw %bx, %ax
- shrw $9, %ax
-ok2_read:
- call read_track
- call set_next
- jmp rp_read
-
-read_track:
- pusha
- pusha
- movw $0xe2e, %ax # loading... message 2e = .
+ andb %al, %al
+ jz die
+ movb $0xe, %ah
movw $7, %bx
int $0x10
- popa
-
-# Accessing head, track, sread via %si gives shorter code.
+ jmp msg_loop
- movw 4(%si), %dx # 4(%si) = track
- movw (%si), %cx # (%si) = sread
- incw %cx
- movb %dl, %ch
- movw 2(%si), %dx # 2(%si) = head
- movb %dl, %dh
- andw $0x0100, %dx
- movb $2, %ah
- pushw %dx # save for error dump
- pushw %cx
- pushw %bx
- pushw %ax
- int $0x13
- jc bad_rt
-
- addw $8, %sp
- popa
- ret
-
-set_next:
- movw %ax, %cx
- addw (%si), %ax # (%si) = sread
- cmp sectors, %ax
- jne ok3_set
- movw $0x0001, %ax
- xorw %ax, 2(%si) # change head
- jne ok4_set
- incw 4(%si) # next track
-ok4_set:
+die:
+ # Allow the user to press a key, then reboot
xorw %ax, %ax
-ok3_set:
- movw %ax, (%si) # set sread
- shlw $9, %cx
- addw %cx, %bx
- jnc set_next_fin
- movw %es, %ax
- addb $0x10, %ah
- movw %ax, %es
- xorw %bx, %bx
-set_next_fin:
- ret
-
-bad_rt:
- pushw %ax # save error code
- call print_all # %ah = error, %al = read
- xorb %ah, %ah
- xorb %dl, %dl
- int $0x13
- addw $10, %sp
- popa
- jmp read_track
-
-# print_all is for debugging purposes.
-#
-# it will print out all of the registers. The assumption is that this is
-# called from a routine, with a stack frame like
-#
-# %dx
-# %cx
-# %bx
-# %ax
-# (error)
-# ret <- %sp
-
-print_all:
- movw $5, %cx # error code + 4 registers
- movw %sp, %bp
-print_loop:
- pushw %cx # save count remaining
- call print_nl # <-- for readability
- cmpb $5, %cl
- jae no_reg # see if register name is needed
+ int $0x16
+ int $0x19
- movw $0xe05 + 'A' - 1, %ax
- subb %cl, %al
- int $0x10
- movb $'X', %al
- int $0x10
- movb $':', %al
- int $0x10
-no_reg:
- addw $2, %bp # next register
- call print_hex # print it
- popw %cx
- loop print_loop
- ret
-
-print_nl:
- movw $0xe0d, %ax # CR
- int $0x10
- movb $0xa, %al # LF
- int $0x10
- ret
-
-# print_hex is for debugging purposes, and prints the word
-# pointed to by %ss:%bp in hexadecimal.
-
-print_hex:
- movw $4, %cx # 4 hex digits
- movw (%bp), %dx # load word into %dx
-print_digit:
- rolw $4, %dx # rotate to use low 4 bits
- movw $0xe0f, %ax # %ah = request
- andb %dl, %al # %al = mask for nybble
- addb $0x90, %al # convert %al to ascii hex
- daa # in only four instructions!
- adc $0x40, %al
- daa
- int $0x10
- loop print_digit
- ret
+ # int 0x19 should never return. In case it does anyway,
+ # invoke the BIOS reset code...
+ ljmp $0xf000,$0xfff0
-# This procedure turns off the floppy drive motor, so
-# that we enter the kernel in a known state, and
-# don't have to worry about it later.
-# NOTE: Doesn't save %ax or %dx; do it yourself if you need to.
-kill_motor:
- movw $0x3f2, %dx
- xorb %al, %al
- outb %al, %dx
- ret
+bugger_off_msg:
+ .ascii "Direct booting from floppy is no longer supported.\r\n"
+ .ascii "Please use a boot loader program instead.\r\n"
+ .ascii "\n"
+ .ascii "Remove disk and press any key to reboot . . .\r\n"
+ .byte 0
-sectors: .word 0
-disksizes: .byte 36, 18, 15, 9
-msg1: .byte 13, 10
- .ascii "Loading"
-# XXX: This is a fairly snug fit.
+ # Kernel attributes; used by setup
-.org 497
+ .org 497
setup_sects: .byte SETUPSECTS
root_flags: .word ROOT_RDONLY
syssize: .word SYSSIZE
--- /dev/null
+#
+# mtools configuration file for "make (b)zdisk"
+#
+
+# Actual floppy drive
+drive a:
+ file="/dev/fd0"
+
+# 1.44 MB floppy disk image
+drive v:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
+
+# 2.88 MB floppy disk image (mostly for virtual uses)
+drive w:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
+
+
sz = sb.st_size;
fprintf (stderr, "System is %d kB\n", sz/1024);
sys_size = (sz + 15) / 16;
- /* 0x28000*16 = 2.5 MB, conservative estimate for the current maximum */
- if (sys_size > (is_big_kernel ? 0x28000 : DEF_SYSSIZE))
+ /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
+ if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
die("System is too big. Try using %smodules.",
is_big_kernel ? "" : "bzImage or ");
- if (sys_size > 0xefff)
- fprintf(stderr,"warning: kernel is too big for standalone boot "
- "from floppy\n");
while (sz > 0) {
int l, n;
CONFIG_X86=y
CONFIG_MMU=y
CONFIG_SWAP=y
-CONFIG_UID16=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
CONFIG_SYSVIPC=y
# CONFIG_BSD_PROCESS_ACCT is not set
CONFIG_SYSCTL=y
-# CONFIG_LOG_BUF_SHIFT_17 is not set
-CONFIG_LOG_BUF_SHIFT_16=y
-# CONFIG_LOG_BUF_SHIFT_15 is not set
-# CONFIG_LOG_BUF_SHIFT_14 is not set
-# CONFIG_LOG_BUF_SHIFT_13 is not set
-# CONFIG_LOG_BUF_SHIFT_12 is not set
CONFIG_LOG_BUF_SHIFT=16
#
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_OBSOLETE_MODPARM=y
+# CONFIG_MODVERSIONS is not set
# CONFIG_KMOD is not set
#
# CONFIG_BINFMT_MISC is not set
CONFIG_IA32_EMULATION=y
CONFIG_COMPAT=y
+CONFIG_UID16=y
#
# Memory Technology Devices (MTD)
# Ethernet (10 or 100Mbit)
#
CONFIG_NET_ETHERNET=y
+# CONFIG_MII is not set
# CONFIG_HAPPYMEAL is not set
# CONFIG_SUNGEM is not set
# CONFIG_NET_VENDOR_3COM is not set
# CONFIG_DRM is not set
# CONFIG_MWAVE is not set
CONFIG_RAW_DRIVER=y
+# CONFIG_HANGCHECK_TIMER is not set
#
# Misc devices
# CONFIG_DEBUG_SLAB is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_DEBUG_SPINLOCK is not set
-CONFIG_CHECKING=y
# CONFIG_INIT_DEBUG is not set
CONFIG_KALLSYMS=y
# CONFIG_FRAME_POINTER is not set
return -1;
}
tsk->thread.i387.fxsave.mxcsr &= 0xffbf;
+ current->used_math = 1;
return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
}
.quad sys32_io_getevents
.quad sys32_io_submit
.quad sys_io_cancel
- .quad sys_ni_syscall /* 250 alloc_huge_pages */
+ .quad sys_fadvise64
.quad sys_ni_syscall /* free_huge_pages */
.quad sys_exit_group /* exit_group */
.quad sys_lookup_dcookie
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o
obj-$(CONFIG_ACPI) += acpi.o
-#obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o
+obj-$(CONFIG_ACPI_SLEEP) += wakeup.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+#include <asm/segment.h>
extern int acpi_disabled;
if (phys_addr < (end_pfn_map << PAGE_SHIFT))
return __va(phys_addr);
- printk("acpi mapping beyond end_pfn: %lx > %lx\n", phys_addr, end_pfn<<PAGE_SHIFT);
return NULL;
}
int __init
-acpi_boot_init (
- char *cmdline)
+acpi_boot_init (void)
{
int result = 0;
/*
* Initialize the ACPI boot-time table parser.
*/
- result = acpi_table_init(cmdline);
+ result = acpi_table_init();
if (result)
return result;
#ifdef CONFIG_ACPI_SLEEP
-#error not ported to x86-64 yet
-
-#ifdef DEBUG
-#include <linux/serial.h>
-#endif
+extern void acpi_prepare_wakeup(void);
+extern unsigned char acpi_wakeup[], acpi_wakeup_end[], s3_prot16[];
/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-
-/* new page directory that we will be using */
-static pmd_t *pmd;
-
-/* saved page directory */
-static pmd_t saved_pmd;
-
-/* page which we'll use for the new page directory */
-static pte_t *ptep;
-
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
-
-/*
- * acpi_create_identity_pmd
- *
- * Create a new, identity mapped pmd.
- *
- * Do this by creating new page directory, and marking all the pages as R/W
- * Then set it as the new Page Middle Directory.
- * And, of course, flush the TLB so it takes effect.
- *
- * We save the address of the old one, for later restoration.
- */
-static void acpi_create_identity_pmd (void)
-{
- pgd_t *pgd;
- int i;
-
- ptep = (pte_t*)__get_free_page(GFP_KERNEL);
-
- /* fill page with low mapping */
- for (i = 0; i < PTRS_PER_PTE; i++)
- set_pte(ptep + i, mk_pte_phys(i << PAGE_SHIFT, PAGE_SHARED));
-
- pgd = pgd_offset(current->active_mm, 0);
- pmd = pmd_alloc(current->mm,pgd, 0);
-
- /* save the old pmd */
- saved_pmd = *pmd;
-
- /* set the new one */
- set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(ptep)));
-
- /* flush the TLB */
- local_flush_tlb();
-}
-
-/*
- * acpi_restore_pmd
- *
- * Restore the old pmd saved by acpi_create_identity_pmd and
- * free the page that said function alloc'd
- */
-static void acpi_restore_pmd (void)
-{
- set_pmd(pmd, saved_pmd);
- local_flush_tlb();
- free_page((unsigned long)ptep);
-}
+unsigned long acpi_wakeup_address;
/**
* acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
*/
int acpi_save_state_mem (void)
{
- acpi_create_identity_pmd();
- acpi_copy_wakeup_routine(acpi_wakeup_address);
+ if (!acpi_wakeup_address)
+ return -1;
+ memcpy((void*)acpi_wakeup_address, acpi_wakeup, acpi_wakeup_end - acpi_wakeup);
return 0;
}
/**
* acpi_save_state_disk - save kernel state to disk
*
+ * Assume preemption/interrupts are already turned off and that we're running
+ * on the BP (note this doesn't imply SMP is handled correctly)
*/
int acpi_save_state_disk (void)
{
+ unsigned long pbase = read_cr3() & PAGE_MASK;
+ if (pbase >= 0xffffffffUL) {
+ printk(KERN_ERR "ACPI: High page table. Suspend disabled.\n");
return 1;
+ }
+ set_seg_base(smp_processor_id(), GDT_ENTRY_KERNELCS16, s3_prot16);
+ swap_low_mappings();
+ acpi_prepare_wakeup();
+ return 0;
}
/*
*/
void acpi_restore_state_mem (void)
{
- acpi_restore_pmd();
+ swap_low_mappings();
}
/**
* acpi_reserve_bootmem - do _very_ early ACPI initialisation
*
- * We allocate a page in low memory for the wakeup
+ * We allocate a page in 1MB low memory for the real-mode wakeup
* routine for when we come back from a sleep state. The
* runtime allocator allows specification of <16M pages, but not
* <1M pages.
void __init acpi_reserve_bootmem(void)
{
acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
- printk(KERN_DEBUG "ACPI: have wakeup address 0x%8.8lx\n", acpi_wakeup_address);
+ if (!acpi_wakeup_address) {
+ printk(KERN_ERR "ACPI: Cannot allocate lowmem. S3 disabled.\n");
+ return;
+ }
}
#endif /*CONFIG_ACPI_SLEEP*/
printk("Cannot allocate aperture memory hole (%p,%uK)\n",
p, aper_size>>10);
if (p)
- free_bootmem((unsigned long)p, aper_size);
+ free_bootmem_node(nd0, (unsigned long)p, aper_size);
return 0;
}
printk("Mapping aperture over %d KB of RAM @ %lx\n",
irq_exit();
}
-int disable_apic __initdata;
+int disable_apic;
/*
* This initializes the IO-APIC and APIC hardware if this is
printk(KERN_INFO "Apic disabled\n");
return -1;
}
- if (!smp_found_config && !cpu_has_apic)
+ if (!smp_found_config && !cpu_has_apic) {
+ disable_apic = 1;
return -1;
+ }
/*
* Complain if the BIOS pretends there is one.
if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_id])) {
printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
boot_cpu_id);
+ disable_apic = 1;
return -1;
}
/*
* arch/x86_64/kernel/bluesmoke.c - x86-64 Machine Check Exception Reporting
+ *
+
+RED-PEN: need to add power management to restore after S3 wakeup.
+
*/
#include <linux/init.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
-extern unsigned long table_start, table_end;
extern char _end[];
+/*
+ * PFN of last memory page.
+ */
+unsigned long end_pfn;
+
/*
* end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
* The direct mapping extends to end_pfn_map, so that we can directly access
- * ACPI and other tables without having to play with fixmaps.
+ * apertures, ACPI and other tables without having to play with fixmaps.
*/
unsigned long end_pfn_map;
unsigned long addr = *addrp, last = addr + size;
/* various gunk below that needed for SMP startup */
- if (addr < 7*PAGE_SIZE) {
- *addrp = 7*PAGE_SIZE;
+ if (addr < 0x8000) {
+ *addrp = 0x8000;
return 1;
}
-#if 0
/* direct mapping tables of the kernel */
if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
*addrp = table_end << PAGE_SHIFT;
return 1;
}
-#endif
/* initrd */
#ifdef CONFIG_BLK_DEV_INITRD
/*
* Find the highest page frame number we have available
*/
-void __init e820_end_of_ram(void)
+unsigned long __init e820_end_of_ram(void)
{
int i;
- end_pfn = 0;
+ unsigned long end_pfn = 0;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
end_pfn = end_user_pfn;
if (end_pfn > end_pfn_map)
end_pfn = end_pfn_map;
+
+ return end_pfn;
}
/*
#include <linux/init.h>
#include <linux/string.h>
#include <asm/io.h>
+#include <asm/processor.h>
/* Simple VGA output */
s = strsep(&opt, ",");
if (s != NULL) {
unsigned port;
- if (!strncmp(s,"0x",2))
+ if (!strncmp(s,"0x",2)) {
early_serial_base = simple_strtoul(s, &e, 16);
- else {
+ } else {
static int bases[] = { 0x3f8, 0x2f8 };
if (!strncmp(s,"ttyS",4))
s+=4;
* Exception entry point. This expects an error code/orig_rax on the stack
* and the exception handler in %rax.
*/
- ALIGN
-error_entry:
+ENTRY(error_entry)
/* rdi slot contains rax, oldrax contains error code */
pushq %rsi
movq 8(%rsp),%rsi /* load rax */
xorl %ebx,%ebx
swapgs
error_sti:
- bt $9,EFLAGS(%rsp)
- jnc 1f
- sti
-1: movq %rdi,RDI(%rsp)
+ movq %rdi,RDI(%rsp)
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp)
swapgs
gs_change:
movl %edi,%gs
-2: swapgs
+2: sfence /* workaround */
+ swapgs
popf
ret
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
- /* Fool rdmsr and reset %eax to avoid dependences */
- xorl %eax, %eax
+
/* Enable Long Mode */
btsl $_EFER_LME, %eax
/* Enable System Call */
jnz second
/* Load new GDT with the 64bit segment using 32bit descriptor */
- /* to avoid 32bit relocations we use fixed adresses here */
movl $(pGDT32 - __START_KERNEL_map), %eax
lgdt (%eax)
.quad 0x00cffe000000ffff /* __USER32_CS */
.quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
.quad 0x00affa000000ffff /* __USER_CS */
- .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
- .word 0 # base address = 0
- .word 0x9A00 # code read/exec
- .word 0x00CF # granularity = 4096, 386
- # (+5th nibble of limit)
- /* __KERNEL32_CS */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0,0 /* TSS */
.quad 0 /* LDT */
.quad 0,0,0 /* three TLS descriptors */
- .quad 0x00cff2000000ffff /* dummy descriptor for long base */
- .quad 0 /* pad to cache line boundary */
+ .quad 0 /* unused now */
+ .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
+ /* base must be patched for real base address. */
+ /* This should be a multiple of the cache line size */
gdt_end:
.globl gdt_end
#include <linux/string.h>
#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
boot_cpu_data.x86_mask = eax & 0xf;
}
-extern void start_kernel(void), pda_init(int), setup_early_printk(char *);
-extern int disable_apic;
-
void __init x86_64_start_kernel(char * real_mode_data)
{
char *s;
s = strstr(saved_command_line, "earlyprintk=");
if (s != NULL)
setup_early_printk(s+12);
+#ifdef CONFIG_DISCONTIGMEM
+ s = strstr(saved_command_line, "numa=");
+ if (s != NULL)
+ numa_setup(s+5);
+#endif
#ifdef CONFIG_X86_IO_APIC
if (strstr(saved_command_line, "disableapic"))
disable_apic = 1;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
/*
struct irqaction * action;
seq_printf(p, " ");
- for_each_cpu(j)
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
seq_printf(p, "CPU%d ",j);
seq_putc(p, '\n');
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
- for_each_cpu(j)
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
seq_printf(p, "%10u ",
kstat_cpu(j).irqs[i]);
#endif
seq_putc(p, '\n');
}
seq_printf(p, "NMI: ");
- for_each_cpu(j)
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
- for_each_cpu(j)
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
seq_putc(p, '\n');
#endif
#define DEBUGP(fmt...)
+/* TODO this should be in vmlist, but we must fix get_vm_area first to
+ handle out of bounds entries properly.
+ Also need to fix /proc/kcore, /dev/kmem */
+static struct vm_struct *mod_vmlist;
+
void module_free(struct module *mod, void *module_region)
{
struct vm_struct **prevp, *map;
if (!addr)
return;
write_lock(&vmlist_lock);
- for (prevp = &vmlist ; (map = *prevp) ; prevp = &map->next) {
+ for (prevp = &mod_vmlist ; (map = *prevp) ; prevp = &map->next) {
if ((unsigned long)map->addr == addr) {
*prevp = map->next;
write_unlock(&vmlist_lock);
write_lock(&vmlist_lock);
addr = (void *) MODULES_VADDR;
- for (p = &vmlist; (tmp = *p); p = &tmp->next) {
+ for (p = &mod_vmlist; (tmp = *p); p = &tmp->next) {
void *next;
DEBUGP("vmlist %p %lu addr %p\n", tmp->addr, tmp->size, addr);
if (size + (unsigned long) addr + PAGE_SIZE < (unsigned long) tmp->addr)
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
+#include <asm/proto.h>
/* Have we found an MP table */
int smp_found_config;
* Intel MP BIOS table parsing routines:
*/
-#ifndef CONFIG_X86_VISWS_APIC
/*
* Checksum an MP configuration block.
*/
smp_found_config = 1;
printk("found SMP MP-table at %08lx\n",
virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
if (mpf->mpf_physptr)
- reserve_bootmem(mpf->mpf_physptr, PAGE_SIZE);
+ reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
mpf_found = mpf;
return 1;
}
printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.kernel.org if you experience SMP problems!\n");
}
-#else
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesnt have a BIOS(-configuration table).
- * No problem for Linux.
- */
-void __init find_visws_smp(void)
-{
- smp_found_config = 1;
-
- phys_cpu_present_map |= 2; /* or in id 1 */
- apic_version[1] |= 0x10; /* integrated APIC */
- apic_version[0] |= 0x10;
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-}
-
-#endif
-
/*
* - Intel MP Configuration Table
- * - or SGI Visual Workstation configuration
*/
void __init find_smp_config (void)
{
#ifdef CONFIG_X86_LOCAL_APIC
find_intel_smp();
#endif
-#ifdef CONFIG_VISWS
- find_visws_smp();
-#endif
}
*
* This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
* an SMP box will direct the access to CPU %d.
+
+RED-PEN: need to get power management for S3 restore
+
*/
#include <linux/module.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
+#include <asm/msr.h>
extern void default_do_nmi(struct pt_regs *);
printk(KERN_INFO "testing NMI watchdog ... ");
- for_each_cpu(cpu) {
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
counts[cpu] = cpu_pda[cpu].__nmi_count;
- }
local_irq_enable();
mdelay((10*1000)/nmi_hz); // wait 10 ticks
- for_each_cpu(cpu) {
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!cpu_online(cpu))
+ continue;
if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck (%d)!\n",
cpu,
* Original code written by Keith Owens.
*/
-static void __pminit setup_k7_watchdog(void)
+static void setup_k7_watchdog(void)
{
int i;
unsigned int evntsel;
nmi_perfctr_msr = MSR_K7_PERFCTR0;
for(i = 0; i < 4; ++i) {
- wrmsr(MSR_K7_EVNTSEL0+i, 0, 0);
- wrmsr(MSR_K7_PERFCTR0+i, 0, 0);
+ /* Simulator may not support it */
+ if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
+ return;
+ wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
}
evntsel = K7_EVNTSEL_INT
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}
-void __pminit setup_apic_nmi_watchdog (void)
+void setup_apic_nmi_watchdog (void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (boot_cpu_data.x86 < 6)
return;
- /* Simics masquerades as AMD, but does not support
- performance counters */
- if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
- return;
setup_k7_watchdog();
break;
default:
also reload when it has changed.
when prev process used 64bit base always reload
to avoid an information leak. */
- if (unlikely((fsindex | next->fsindex) || prev->fs))
+ if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /* check if the user changed the selector
- if yes clear 64bit base. */
- if (unlikely(fsindex != prev->fsindex))
+ /* check if the user used a selector != 0
+ * if yes clear 64bit base, since overloaded base
+ * is always mapped to the Null selector
+ */
+ if (fsindex)
prev->fs = 0;
+ }
/* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
{
unsigned gsindex;
asm volatile("movl %%gs,%0" : "=g" (gsindex));
- if (unlikely((gsindex | next->gsindex) || prev->gs))
+ if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
- if (unlikely(gsindex != prev->gsindex))
+ if (gsindex)
prev->gs = 0;
+ }
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
prev->gsindex = gsindex;
switch (code) {
case ARCH_SET_GS:
-#if 1
- /* For now. We still have one unsolved bug in long gs base context
- switch handling. */
- return -EINVAL;
-#else
if (addr >= TASK_SIZE)
return -EPERM;
get_cpu();
- load_gs_index(__USER_LONGBASE);
- current->thread.gsindex = __USER_LONGBASE;
+ load_gs_index(0);
+ current->thread.gsindex = 0;
current->thread.gs = addr;
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
put_cpu();
break;
-#endif
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
if (addr >= TASK_SIZE)
return -EPERM;
get_cpu();
- asm volatile("movl %0,%%fs" :: "r" (__USER_LONGBASE));
- current->thread.fsindex = __USER_LONGBASE;
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ current->thread.fsindex = 0;
current->thread.fs = addr;
ret = checking_wrmsrl(MSR_FS_BASE, addr);
put_cpu();
#include <linux/seq_file.h>
#include <linux/root_dev.h>
#include <linux/pci.h>
+#include <linux/acpi.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/smp.h>
#include <asm/proto.h>
+#define Dprintk(x...) printk(x)
+
/*
* Machine setup..
*/
{
int i;
+ Dprintk("setup_arch\n");
+
ROOT_DEV = ORIG_ROOT_DEV;
drive_info = DRIVE_INFO;
screen_info = SCREEN_INFO;
parse_cmdline_early(cmdline_p);
-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
-
-#define MAXMEM (120UL * 1024 * 1024 * 1024 * 1024) /* 120TB */
-#define MAXMEM_PFN PFN_DOWN(MAXMEM)
-#define MAX_NONPAE_PFN (1 << 20)
-
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- start_pfn = PFN_UP(__pa_symbol(&_end));
-
- e820_end_of_ram();
+ end_pfn = e820_end_of_ram();
init_memory_mapping();
+#ifdef CONFIG_DISCONTIGMEM
+ numa_initmem_init(0, end_pfn);
+#else
contig_initmem_init();
+#endif
+
+ /* Reserve direct mapping */
+ reserve_bootmem_generic(table_start << PAGE_SHIFT,
+ (table_end - table_start) << PAGE_SHIFT);
/* reserve kernel */
- reserve_bootmem(HIGH_MEMORY, PFN_PHYS(start_pfn) - HIGH_MEMORY);
+ unsigned long kernel_end;
+ kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
+ reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
*/
- reserve_bootmem(0, PAGE_SIZE);
+ reserve_bootmem_generic(0, PAGE_SIZE);
#ifdef CONFIG_SMP
/*
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
+ reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
+
+ /* Reserve SMP trampoline */
+ reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
#endif
+
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
- reserve_bootmem(INITRD_START, INITRD_SIZE);
+ reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
initrd_start =
INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
initrd_end = initrd_start+INITRD_SIZE;
}
#endif
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
- * any memory using the bootmem allocator.
- */
-
-#ifdef CONFIG_SMP
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
paging_init();
#ifdef CONFIG_ACPI_BOOT
/*
* of MADT).
*/
if (!acpi_disabled)
- acpi_boot_init(*cmdline_p);
+ acpi_boot_init();
#endif
#ifdef CONFIG_X86_LOCAL_APIC
/*
/*
* X86-64 specific CPU setup.
* Copyright (C) 1995 Linus Torvalds
- * Copyright 2001, 2002 SuSE Labs / Andi Kleen.
+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
* See setup.c for older changelog.
* $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
*/
pml4_t *level4;
struct x8664_pda *pda = &cpu_pda[cpu];
+ /* Setup up data that may be needed in __get_free_pages early */
+ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+ wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
+
+ pda->me = pda;
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->cpudata_offset = 0;
+ pda->kernelstack =
+ (unsigned long)current_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
+
if (cpu == 0) {
/* others are initialized in smpboot.c */
pda->pcurrent = &init_task;
asm volatile("movq %0,%%cr3" :: "r" (__pa(level4)));
pda->irqstackptr += IRQSTACKSIZE-64;
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack =
- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
- pda->me = pda;
- pda->cpudata_offset = 0;
-
pda->active_mm = &init_mm;
pda->mmu_state = 0;
-
- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
- wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
}
#define EXCEPTION_STK_ORDER 0 /* >= N_EXCEPTION_STACKS*EXCEPTION_STKSZ */
/* CPU 0 is initialised in head64.c */
if (cpu != 0) {
+ pda_init(cpu);
estacks = (char *)__get_free_pages(GFP_ATOMIC, 0);
if (!estacks)
panic("Can't allocate exception stacks for CPU %d\n",cpu);
- pda_init(cpu);
} else
estacks = boot_exception_stacks;
handle_signal(unsigned long sig, siginfo_t *info, sigset_t *oldset,
struct pt_regs * regs)
{
- struct k_sigaction *ka = ¤t->sig->action[sig-1];
+ struct k_sigaction *ka = ¤t->sighand->action[sig-1];
#if DEBUG_SIG
printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig,
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* This code is released under the GNU General Public License version 2 or
* later.
}
}
+
+/* Slow. Should be only used for debugging. */
+int slow_smp_processor_id(void)
+{
+	int stack_location;
+	unsigned long sp = (unsigned long)&stack_location;
+	int cpu;
+	unsigned long estack;
+
+	/* Identify the CPU by checking which per-CPU interrupt or
+	   exception stack the current stack pointer falls into.
+	   Note: the old for_each_cpu(cpu, mask) iterated over an
+	   uninitialized mask; use an explicit online-CPU loop like the
+	   rest of this file. */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_online(cpu))
+			continue;
+		if (sp >= (u64)cpu_pda[cpu].irqstackptr - IRQSTACKSIZE &&
+		    sp <= (u64)cpu_pda[cpu].irqstackptr)
+			return cpu;
+
+		estack = init_tss[cpu].ist[0] - EXCEPTION_STKSZ;
+		if (sp >= estack && sp <= estack+(1<<(PAGE_SHIFT+EXCEPTION_STK_ORDER)))
+			return cpu;
+	}
+
+	/* Not on a recognized special stack - derive it from the
+	   thread stack instead. */
+	return stack_smp_processor_id();
+}
#include <asm/kdebug.h>
#include <asm/tlbflush.h>
-/* Bitmask of currently online CPUs */
-unsigned long cpu_online_map;
+extern int disable_apic;
-/* which CPU (physical APIC ID) maps to which logical CPU number */
-volatile int x86_apicid_to_cpu[NR_CPUS];
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-volatile int x86_cpu_to_apicid[NR_CPUS];
+/* Bitmask of currently online CPUs */
+unsigned long cpu_online_map = 1;
static volatile unsigned long cpu_callin_map;
volatile unsigned long cpu_callout_map;
extern unsigned char trampoline_data [];
extern unsigned char trampoline_end [];
-static unsigned char *trampoline_base;
/*
* Currently trivial. Write the real->protected mode
static unsigned long __init setup_trampoline(void)
{
+ void *tramp = __va(SMP_TRAMPOLINE_BASE);
extern volatile __u32 tramp_gdt_ptr;
tramp_gdt_ptr = __pa_symbol(&cpu_gdt_table);
- memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
- return virt_to_phys(trampoline_base);
-}
-
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
- trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- if (__pa(trampoline_base) >= 0x9F000)
- BUG();
+ memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
+ return virt_to_phys(tramp);
}
/*
*/
atomic_inc(&tsc_count_start);
+ sync_core();
rdtscll(tsc_values[smp_processor_id()]);
/*
* We clear the TSC in the last loop:
atomic_inc(&tsc_count_start);
while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
+ sync_core();
rdtscll(tsc_values[smp_processor_id()]);
if (i == NR_LOOPS-1)
write_tsc(0, 0);
cpu_init();
smp_callin();
+ /* otherwise gcc will move up the smp_processor_id before the cpu_init */
+ barrier();
+
Dprintk("cpu %d: waiting for commence\n", smp_processor_id());
while (!test_bit(smp_processor_id(), &smp_commenced_mask))
rep_nop();
*/
init_idle(idle,cpu);
- x86_cpu_to_apicid[cpu] = apicid;
- x86_apicid_to_cpu[apicid] = cpu;
idle->thread.rip = (unsigned long)start_secondary;
// idle->thread.rsp = (unsigned long)idle->thread_info + THREAD_SIZE - 512;
}
}
if (boot_error) {
- x86_cpu_to_apicid[cpu] = -1;
- x86_apicid_to_cpu[apicid] = -1;
clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
cpucount--;
{
int apicid, cpu;
- /*
- * Initialize the logical to physical CPU number mapping
- */
-
- for (apicid = 0; apicid < NR_CPUS; apicid++) {
- x86_apicid_to_cpu[apicid] = -1;
- }
-
/*
* Setup boot CPU information
*/
printk("CPU%d: ", 0);
print_cpu_info(&cpu_data[0]);
- x86_apicid_to_cpu[boot_cpu_id] = 0;
- x86_cpu_to_apicid[0] = boot_cpu_id;
current_thread_info()->cpu = 0;
smp_tune_scheduling();
io_apic_irqs = 0;
cpu_online_map = phys_cpu_present_map = 1;
phys_cpu_present_map = 1;
+ disable_apic = 1;
return;
}
io_apic_irqs = 0;
cpu_online_map = phys_cpu_present_map = 1;
phys_cpu_present_map = 1;
+ disable_apic = 1;
return;
}
continue;
do_boot_cpu(apicid);
-
- /*
- * Make sure we unmap all failed CPUs
- */
- if ((x86_apicid_to_cpu[apicid] == -1) &&
- (phys_cpu_present_map & (1 << apicid)))
- printk("phys CPU #%d not responding - cannot use it.\n",apicid);
}
/*
if (!file)
goto out;
}
-
down_write(¤t->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
up_write(¤t->mm->mmap_sem);
* Copyright (c) 1996 Ingo Molnar
* Copyright (c) 1998 Andrea Arcangeli
* Copyright (c) 2002 Vojtech Pavlik
+ * Copyright (c) 2003 Andi Kleen
*
*/
#include <linux/bcd.h>
#include <asm/vsyscall.h>
#include <asm/timex.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/apic.h>
+#endif
u64 jiffies_64;
+extern int using_apic_timer;
+
spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
extern int using_apic_timer;
* together by xtime_lock.
*/
-static spinlock_t time_offset_lock = SPIN_LOCK_UNLOCKED;
-static unsigned long timeoffset = 0;
-
inline unsigned int do_gettimeoffset(void)
{
unsigned long t;
+ sync_core();
rdtscll(t);
return (t - hpet.last_tsc) * (1000000L / HZ) / hpet.ticks + hpet.offset;
}
void do_gettimeofday(struct timeval *tv)
{
- unsigned long flags, t, seq;
+ unsigned long seq, t;
unsigned int sec, usec;
- spin_lock_irqsave(&time_offset_lock, flags);
do {
seq = read_seqbegin(&xtime_lock);
usec = xtime.tv_nsec / 1000;
t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset();
- if (t > timeoffset) timeoffset = t;
- usec += timeoffset;
+ usec += t;
} while (read_seqretry(&xtime_lock, seq));
- spin_unlock_irqrestore(&time_offset_lock, flags);
tv->tv_sec = sec + usec / 1000000;
tv->tv_usec = usec % 1000000;
void do_settimeofday(struct timeval *tv)
{
write_seqlock_irq(&xtime_lock);
- vxtime_lock();
tv->tv_usec -= do_gettimeoffset() +
(jiffies - wall_jiffies) * tick_usec;
xtime.tv_sec = tv->tv_sec;
xtime.tv_nsec = (tv->tv_usec * 1000);
- vxtime_unlock();
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
*/
write_seqlock(&xtime_lock);
- vxtime_lock();
{
unsigned long t;
+ sync_core();
rdtscll(t);
hpet.offset = (t - hpet.last_tsc) * (1000000L / HZ) / hpet.ticks + hpet.offset - 1000000L / HZ;
if (hpet.offset >= 1000000L / HZ)
hpet.ticks = min_t(long, max_t(long, (t - hpet.last_tsc) * (1000000L / HZ) / (1000000L / HZ - hpet.offset),
cpu_khz * 1000/HZ * 15 / 16), cpu_khz * 1000/HZ * 16 / 15);
hpet.last_tsc = t;
- timeoffset = 0;
}
/*
rtc_update = xtime.tv_sec + 660;
}
- vxtime_unlock();
write_sequnlock(&xtime_lock);
}
outb((1193182 / (1000 / 50)) & 0xff, 0x42);
outb((1193182 / (1000 / 50)) >> 8, 0x42);
rdtscll(start);
-
+ sync_core();
while ((inb(0x61) & 0x20) == 0);
+ sync_core();
rdtscll(end);
pit_init();
printk(KERN_INFO "time.c: Using 1.1931816 MHz PIT timer.\n");
- setup_irq(0, &irq0);
cpu_khz = pit_calibrate_tsc();
printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
hpet.ticks = cpu_khz * (1000 / HZ);
rdtscll(hpet.last_tsc);
+ setup_irq(0, &irq0);
}
__setup("report_lost_ticks", time_setup);
struct notifier_block *die_chain;
+/* Re-enable interrupts in a trap handler, but only if the interrupted
+   context had them enabled (IF set in the saved eflags). Keeps trap
+   handling from turning interrupts on behind an irq-disabled section. */
+static inline void conditional_sti(struct pt_regs *regs)
+{
+	if (regs->eflags & X86_EFLAGS_IF)
+		local_irq_enable();
+}
+
static int kstack_depth_to_print = 10;
#ifdef CONFIG_KALLSYMS
{
unsigned long addr;
unsigned long *irqstack, *irqstack_end, *estack_end;
- /* FIXME: should read the cpuid from the APIC; to still work with bogus %gs */
- const int cpu = smp_processor_id();
+ const int cpu = safe_smp_processor_id();
int i;
printk("\nCall Trace:");
{
unsigned long *stack;
int i;
- const int cpu = smp_processor_id();
+ const int cpu = safe_smp_processor_id();
unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);
int i;
int in_kernel = (regs->cs & 3) == 0;
unsigned long rsp;
-#ifdef CONFIG_SMP
- /* For SMP should get the APIC id here, just to protect against corrupted GS */
- const int cpu = smp_processor_id();
-#else
- const int cpu = 0;
-#endif
+ const int cpu = safe_smp_processor_id();
struct task_struct *cur = cpu_pda[cpu].pcurrent;
rsp = regs->rsp;
bust_spinlocks(1);
handle_BUG(regs);
printk("%s: %04lx\n", str, err & 0xffff);
- cpu = smp_processor_id();
+ cpu = safe_smp_processor_id();
/* racy, but better than risking deadlock. */
local_irq_disable();
if (!spin_trylock(&die_lock)) {
static void do_trap(int trapnr, int signr, char *str,
struct pt_regs * regs, long error_code, siginfo_t *info)
{
+ conditional_sti(regs);
+
#ifdef CONFIG_CHECKING
{
unsigned long gs;
- struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
+ struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
rdmsrl(MSR_GS_BASE, gs);
if (gs != (unsigned long)pda) {
wrmsrl(MSR_GS_BASE, pda);
asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
{
+ conditional_sti(regs);
+
#ifdef CONFIG_CHECKING
{
unsigned long gs;
- struct x8664_pda *pda = cpu_pda + hard_smp_processor_id();
+ struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
rdmsrl(MSR_GS_BASE, gs);
if (gs != (unsigned long)pda) {
wrmsrl(MSR_GS_BASE, pda);
#ifdef CONFIG_CHECKING
{
unsigned long gs;
- struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
+ struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
rdmsrl(MSR_GS_BASE, gs);
if (gs != (unsigned long)pda) {
wrmsrl(MSR_GS_BASE, pda);
asm("movq %%db6,%0" : "=r" (condition));
+ conditional_sti(regs);
+
if (notify_die(DIE_DEBUG, "debug", regs, error_code) == NOTIFY_BAD)
return;
struct task_struct * task;
siginfo_t info;
unsigned short cwd, swd;
-
/*
* Save the info for the exception handler and clear the error.
*/
asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code)
{
+ conditional_sti(regs);
math_error((void *)regs->rip);
}
asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs,
long error_code)
{
+ conditional_sti(regs);
simd_math_error((void *)regs->rip);
}
* linux/arch/x86_64/kernel/vsyscall.c
*
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
*
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary.
*
- * $Id: vsyscall.c,v 1.9 2002/03/21 13:42:58 ak Exp $
+ * Note: the concept clashes with user mode linux. If you use UML just
+ * set the kernel.vsyscall sysctl to 0.
*/
/*
* broken programs will segfault and there's no security risk until we choose to
* fix it.
*
+ * Add HPET support (port from 2.4). Still needed?
+ * Nop out vsyscall syscall to avoid anchor for buffer overflows when sysctl off.
+ *
* These are not urgent things that we need to address only before shipping the first
* production binary kernels.
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
+#include <linux/seqlock.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
-
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define NO_VSYSCALL 1
+int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-#ifdef NO_VSYSCALL
#include <asm/unistd.h>
-static int errno __section_vxtime_sequence;
-
-static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz)
-
-#else
static inline void timeval_normalize(struct timeval * tv)
{
time_t __sec;
}
}
-long __vxtime_sequence[2] __section_vxtime_sequence;
-
-
static inline void do_vgettimeofday(struct timeval * tv)
{
long sequence, t;
unsigned long sec, usec;
do {
- sequence = __vxtime_sequence[1];
- rmb();
+ sequence = read_seqbegin(&__xtime_lock);
+ sync_core();
rdtscll(t);
sec = __xtime.tv_sec;
- usec = __xtime.tv_usec +
+ usec = (__xtime.tv_nsec * 1000) +
(__jiffies - __wall_jiffies) * (1000000 / HZ) +
(t - __hpet.last_tsc) * (1000000 / HZ) / __hpet.ticks + __hpet.offset;
- rmb();
- } while (sequence != __vxtime_sequence[0]);
+ } while (read_seqretry(&__xtime_lock, sequence));
tv->tv_sec = sec + usec / 1000000;
tv->tv_usec = usec % 1000000;
}
+/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
static inline void do_get_tz(struct timezone * tz)
{
- long sequence;
-
- do {
- sequence = __vxtime_sequence[1];
- rmb();
-
*tz = __sys_tz;
+}
- rmb();
- } while (sequence != __vxtime_sequence[0]);
+/* Fallback used when the vsyscall is disabled via sysctl: issue the
+   real gettimeofday system call directly. The vsyscall page cannot
+   call normal kernel/libc entry points, hence the raw "syscall". */
+static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	int ret;
+	/* rax = __NR_gettimeofday, rdi = tv, rsi = tz; clobbers per
+	   __syscall_clobber (presumably rcx/r11 - TODO confirm). */
+	asm volatile("syscall"
+		: "=a" (ret)
+		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
+	return ret;
+}
-#endif
static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
-#ifdef NO_VSYSCALL
+ if (unlikely(!__sysctl_vsyscall))
return gettimeofday(tv,tz);
-#else
if (tv)
do_vgettimeofday(tv);
if (tz)
do_get_tz(tz);
return 0;
-#endif
}
static time_t __vsyscall(1) vtime(time_t * t)
{
struct timeval tv;
- vgettimeofday(&tv,NULL);
+ if (unlikely(!__sysctl_vsyscall))
+ gettimeofday(&tv, NULL);
+ else
+ do_vgettimeofday(&tv);
if (t)
*t = tv.tv_sec;
return tv.tv_sec;
static long __vsyscall(3) venosys_1(void)
{
return -ENOSYS;
+
}
static void __init map_vsyscall(void)
{
extern char __vsyscall_0;
- unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - __START_KERNEL_map;
+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}
--- /dev/null
+/*
+ * ACPI S3 entry/exit handling.
+ *
+ * Notes:
+ * Relies on kernel being loaded below 4GB.
+ * Needs restore_low_mappings called before.
+ *
+ * Copyright 2003 by Andi Kleen, SuSE Labs.
+ *
+ * Long mode entry loosely based on example code in chapter 14 of the x86-64 system
+ * programmer's manual.
+ *
+ * Notebook:
+
+ FIXME need to interface with suspend.c properly. do_magic. check i386. rename to suspend64.S
+
+ Need to fix vgacon,mtrr,bluesmoke to do resume
+
+ Interrupts should be off until the io-apic code has reinited the APIC.
+ Need support for that in the pm frame work or a special hack?
+
+ SMP support is non existent. Need to somehow restart the other CPUs again.
+ If CPU hotplug was working it could be used. Save/Restore needs to run on the same CPU.
+
+ Should check magic like i386 code
+
+ suspend code copies something. check what it is.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+
+#define O(x) (x-acpi_wakeup)
+
+ .text
+ .code16
+ENTRY(acpi_wakeup)
+	/* 16bit real mode entered from ACPI BIOS */
+	/* The machine has just been through BIOS setup after power down and
+	   everything set up by Linux needs to be restored. */
+	/* The code here needs to be position independent or manually relocated,
+	   because it is copied to a <1MB page for real mode execution */
+
+	/* A20 enabled (according to ACPI spec) */
+	/* cs = acpi_wakeup >> 4 ; eip = acpi_wakeup & 0xF */
+
+	movw %cs,%ax
+	movw %ax,%ds		/* make %ds point to acpi_wakeup */
+	movw %ax,%ss
+	movw $O(wakeup_stack),%sp	/* setup stack */
+
+	pushl $0
+	popfl			/* clear EFLAGS */
+
+	lgdt %ds:O(pGDT)	/* load kernel GDT */
+
+	movl $0x1,%eax		/* enable protected mode (set CR0.PE) */
+	movl %eax,%cr0
+
+	/* %edi carries the page table root into s3_prot16 */
+	movl %ds:O(wakeup_page_table),%edi
+	ljmpl $__KERNEL16_CS,$0	/* -> s3_prot16 (filled in earlier by caller) */
+
+	/* gdt descriptor (limit + base); patched in before suspend by
+	   acpi_prepare_wakeup with the saved kernel gdt descriptor. */
+pGDT:
+	.short 0
+	.quad 0
+
+	.align 4
+	/* physical address of the kernel page table root (cr3), filled in
+	   by acpi_prepare_wakeup */
+	.globl wakeup_page_table
+wakeup_page_table:
+	.long 0
+
+	.align 8
+	/* small temporary real-mode stack (128 bytes) */
+wakeup_stack:
+	.fill 128,1,0
+	.globl acpi_wakeup_end
+acpi_wakeup_end:
+	/* end of real mode trampoline */
+ /* end of real mode trampoline */
+
+	/* pointed to by __KERNEL16_CS:0 */
+	.code16
+ENTRY(s3_prot16)
+	/* Now in 16bit protected mode, still no paging, stack/data segments invalid */
+
+	/* Prepare everything for 64bit paging, but still keep it turned off */
+	movl %cr4,%eax
+	bts $5,%eax	/* set PAE bit */
+	movl %eax,%cr4
+
+	movl %edi,%cr3	/* load kernel page table (passed in from acpi_wakeup) */
+
+	movl $0x80000001,%eax
+	cpuid		/* no execute supported ? */
+	movl %edx,%esi
+
+	movl $MSR_EFER,%ecx
+	rdmsr
+	bts $_EFER_LME,%eax	/* enable long mode */
+	bt $20,%esi		/* NX supported ? */
+	jnc 1f
+	/* Was "bt", which only tests the bit and never enables NX.
+	   Must be "bts" to actually set EFER.NXE. */
+	bts $_EFER_NX,%eax
+1:
+	wrmsr	/* set temporary efer - real one is restored a bit later */
+
+	movl %cr0,%eax
+	bts $31,%eax	/* paging */
+	movl %eax,%cr0
+
+	/* running in identity mapping now */
+
+	/* go to 64bit code segment */
+	ljmpl $__KERNEL_CS,$s3_restore_state-__START_KERNEL_map
+
+	.code64
+	/* Read MSR \msr (edx:eax) and store it as one 64bit value at \target. */
+	.macro SAVEMSR msr,target
+	movl $\msr,%ecx
+	rdmsr
+	shlq $32,%rdx
+	orq %rax,%rdx
+	movq %rdx,\target(%rip)
+	.endm
+
+	/* Load the 64bit value at \src and write it back to MSR \msr. */
+	.macro RESTMSR msr,src
+	movl $\msr,%ecx
+	movq \src(%rip),%rax
+	movq %rax,%rdx
+	shrq $32,%rdx
+	wrmsr
+	.endm
+
+	/* Save control/debug register \reg to saved_\reg. */
+	.macro SAVECTL reg
+	movq %\reg,%rax
+	movq %rax,saved_\reg(%rip)
+	.endm
+
+	/* Restore control/debug register \reg from saved_\reg. */
+	.macro RESTCTL reg
+	movq saved_\reg(%rip),%rax
+	movq %rax,%\reg
+	.endm
+
+	/* Running in identity mapping, long mode */
+s3_restore_state_low:
+	/* Absolute indirect jump switches to the high kernel mapping. */
+	movq $s3_restore_state,%rax
+	jmpq *%rax
+
+	/* Running in real kernel mapping now */
+s3_restore_state:
+	/* Null %ds first so the rip-relative loads below are safe, then
+	   bring back stack and data segments saved by do_suspend_lowlevel. */
+	xorl %eax,%eax
+	movl %eax,%ds
+	movq saved_rsp(%rip),%rsp
+	movw saved_ss(%rip),%ss
+	movw saved_fs(%rip),%fs
+	movw saved_gs(%rip),%gs
+	movw saved_es(%rip),%es
+	movw saved_ds(%rip),%ds
+
+	lidt saved_idt
+	/* NOTE(review): ltr faults on a TSS descriptor already marked busy;
+	   verify the reloaded GDT copy has the busy bit handled. */
+	ltr saved_tr
+	lldt saved_ldt
+	/* gdt is already loaded */
+
+	RESTCTL cr0
+	RESTCTL cr4
+	/* cr3 is already loaded */
+
+	RESTMSR MSR_EFER,saved_efer
+	RESTMSR MSR_LSTAR,saved_lstar
+	RESTMSR MSR_CSTAR,saved_cstar
+	RESTMSR MSR_FS_BASE,saved_fs_base
+	RESTMSR MSR_GS_BASE,saved_gs_base
+	RESTMSR MSR_KERNEL_GS_BASE,saved_kernel_gs_base
+	RESTMSR MSR_SYSCALL_MASK,saved_syscall_mask
+
+	fxrstor fpustate(%rip)
+
+	RESTCTL dr0
+	RESTCTL dr1
+	RESTCTL dr2
+	RESTCTL dr3
+	RESTCTL dr6
+	RESTCTL dr7
+
+	movq saved_rflags(%rip),%rax
+	pushq %rax
+	popfq
+
+	/* Restore callee saved registers, then return to the caller of
+	   do_suspend_lowlevel on the restored stack. */
+	movq saved_rbp(%rip),%rbp
+	movq saved_rbx(%rip),%rbx
+	movq saved_r12(%rip),%r12
+	movq saved_r13(%rip),%r13
+	movq saved_r14(%rip),%r14
+	movq saved_r15(%rip),%r15
+	ret
+
+/* Patch the low-memory real-mode trampoline (at acpi_wakeup_address)
+   with the current gdt descriptor and page table root so that the
+   wakeup stub can reload them before paging is enabled. */
+ENTRY(acpi_prepare_wakeup)
+	sgdt saved_gdt
+
+	/* copy gdt descr and page table to low level wakeup code so that it can
+	   reload them early. The full 10 byte descriptor (2 byte limit +
+	   8 byte base) is copied with one qword and one word store. */
+	movq acpi_wakeup_address(%rip),%rax
+	movw saved_gdt+8(%rip),%cx
+	movw %cx,O(pGDT)+8(%rax)
+	movq saved_gdt(%rip),%rcx
+	movq %rcx,O(pGDT)(%rax)
+
+	/* Only the low 32 bits of cr3 are stored - relies on the kernel
+	   page table living below 4GB (see file header notes). */
+	movq %cr3,%rdi
+	movl %edi,O(wakeup_page_table)(%rax)
+	ret
+
+	/* Save CPU state. */
+	/* Everything saved here needs to be restored above. */
+	/* Entry: %edi == 0 -> save state and enter ACPI S3;
+	   %edi != 0 -> jump straight to the state-restore path. */
+ENTRY(do_suspend_lowlevel)
+	testl %edi,%edi
+	jnz s3_restore_state
+
+	SAVECTL cr0
+	SAVECTL cr4
+	SAVECTL cr3
+
+	str saved_tr
+	sidt saved_idt
+	sgdt saved_gdt
+	sldt saved_ldt
+
+	SAVEMSR MSR_EFER,saved_efer
+	SAVEMSR MSR_LSTAR,saved_lstar
+	SAVEMSR MSR_CSTAR,saved_cstar
+	SAVEMSR MSR_FS_BASE,saved_fs_base
+	SAVEMSR MSR_GS_BASE,saved_gs_base
+	SAVEMSR MSR_KERNEL_GS_BASE,saved_kernel_gs_base
+	SAVEMSR MSR_SYSCALL_MASK,saved_syscall_mask
+
+	movw %ds,saved_ds(%rip)
+	movw %es,saved_es(%rip)
+	movw %fs,saved_fs(%rip)
+	movw %gs,saved_gs(%rip)
+	movw %ss,saved_ss(%rip)
+	movq %rsp,saved_rsp(%rip)
+
+	pushfq
+	popq %rax
+	movq %rax,saved_rflags(%rip)
+
+	SAVECTL dr0
+	SAVECTL dr1
+	SAVECTL dr2
+	SAVECTL dr3
+	SAVECTL dr6
+	SAVECTL dr7
+
+	fxsave fpustate(%rip)
+
+	/* finally save callee saved registers */
+	movq %rbp,saved_rbp(%rip)
+	movq %rbx,saved_rbx(%rip)
+	movq %r12,saved_r12(%rip)
+	movq %r13,saved_r13(%rip)
+	movq %r14,saved_r14(%rip)
+	movq %r15,saved_r15(%rip)
+	/* Enter sleep state 3 (S3); control resumes via acpi_wakeup. */
+	movq $3,%rdi
+	call acpi_enter_sleep_state
+	ret /* should not happen */
+
+ .data
+ .align 8
+saved_efer: .quad 0
+saved_lstar: .quad 0
+saved_cstar: .quad 0
+saved_cr4: .quad 0
+saved_cr3: .quad 0
+saved_cr0: .quad 0
+saved_rbp: .quad 0
+saved_rbx: .quad 0
+saved_rsp: .quad 0
+saved_r12: .quad 0
+saved_r13: .quad 0
+saved_r14: .quad 0
+saved_r15: .quad 0
+saved_rflags: .quad 0
+saved_gs_base: .quad 0
+saved_fs_base: .quad 0
+saved_kernel_gs_base: .quad 0
+saved_syscall_mask: .quad 0
+saved_dr0: .quad 0
+saved_dr1: .quad 0
+saved_dr2: .quad 0
+saved_dr3: .quad 0
+saved_dr6: .quad 0
+saved_dr7: .quad 0
+saved_ds: .short 0
+saved_fs: .short 0
+saved_gs: .short 0
+saved_es: .short 0
+saved_ss: .short 0
+saved_idt: .short 0
+ .quad 0
+saved_ldt: .short 0
+saved_gdt: .short 0
+ .quad 0
+saved_tr: .short 0
+
+ .align 16
+fpustate: .fill 512,1,0
#
-# Makefile for the linux i386-specific parts of the memory manager.
+# Makefile for the linux x86_64-specific parts of the memory manager.
#
obj-y := init.o fault.o ioremap.o extable.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_K8_NUMA) += k8topology.o
/* get the address */
__asm__("movq %%cr2,%0":"=r" (address));
- if (page_fault_trace)
+ if (likely(regs->eflags & X86_EFLAGS_IF))
+ local_irq_enable();
+
+ if (unlikely(page_fault_trace))
printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
* If we're in an interrupt or have no user
* context, we must not take the fault..
*/
- if (in_atomic() || !mm)
+ if (unlikely(in_atomic() || !mm))
goto no_context;
again:
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
- if (vma->vm_start <= address)
+ if (likely(vma->vm_start <= address))
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
return;
}
#endif
- printk("%s[%d] segfault at rip:%lx rsp:%lx adr:%lx err:%lx\n",
+ printk(KERN_INFO
+ "%s[%d] segfault at rip:%lx rsp:%lx adr:%lx err:%lx\n",
tsk->comm, tsk->pid, regs->rip, regs->rsp, address,
error_code);
return i;
}
+/*
+ * Huge page address lookup: this port resolves huge pages through the
+ * pmd (see follow_huge_pmd below), so the by-address hook has nothing
+ * to resolve and always returns NULL.
+ */
+struct page *
+follow_huge_addr(struct mm_struct *mm,
+	struct vm_area_struct *vma, unsigned long address, int write)
+{
+	return NULL;
+}
+
+/* No special huge page VMA lookup either; always NULL. */
+struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
+{
+	return NULL;
+}
+
+/* True when the pmd maps a large (PSE) page instead of a page table. */
+int pmd_huge(pmd_t pmd)
+{
+	return !!(pmd_val(pmd) & _PAGE_PSE);
+}
+
+/*
+ * Return the struct page backing 'address' inside a huge pmd mapping,
+ * with an extra reference taken. The pmd is reinterpreted as a pte to
+ * get the head page, then offset by the position within the huge page.
+ */
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+	pmd_t *pmd, int write)
+{
+	struct page *page;
+
+	page = pte_page(*(pte_t *)pmd);
+	if (page) {
+		page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+		get_page(page);
+	}
+	return page;
+}
+
void free_huge_page(struct page *page)
{
BUG_ON(page_count(page));
BUG_ON(start & (HPAGE_SIZE - 1));
BUG_ON(end & (HPAGE_SIZE - 1));
- spin_lock(&htlbpage_lock);
- spin_unlock(&htlbpage_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
pte = huge_pte_offset(mm, address);
page = pte_page(*pte);
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
- struct inode = mapping->host;
+ struct inode *inode = mapping->host;
unsigned long addr;
int ret = 0;
*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
#include <linux/config.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
+#include <asm/smp.h>
-unsigned long start_pfn, end_pfn;
+#define Dprintk(x...) printk(x)
struct mmu_gather mmu_gathers[NR_CPUS];
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
- ptr = alloc_bootmem_low(PAGE_SIZE);
- if (!ptr)
+ ptr = alloc_bootmem_pages(PAGE_SIZE);
+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
+
+ Dprintk("spp_getpage %p\n", ptr);
return ptr;
}
pmd_t *pmd;
pte_t *pte;
+ Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+
level4 = pml4_offset_k(vaddr);
if (pml4_none(*level4)) {
printk("PML4 FIXMAP MISSING, it should be setup in head.S!\n");
pmd = (pmd_t *) spp_getpage();
set_pgd(pgd, __pgd(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
if (pmd != pmd_offset(pgd, 0)) {
- printk("PAGETABLE BUG #01!\n");
+ printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pgd,0));
return;
}
}
}
}
pte = pte_offset_kernel(pmd, vaddr);
+ /* CHECKME: */
if (pte_val(*pte))
pte_ERROR(*pte);
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, prot));
set_pte_phys(address, phys, prot);
}
-extern unsigned long start_pfn, end_pfn;
+unsigned long __initdata table_start, table_end;
+
extern pmd_t temp_boot_pmds[];
static struct temp_map {
{
struct temp_map *ti;
int i;
- unsigned long pfn = start_pfn++, paddr;
+ unsigned long pfn = table_end++, paddr;
void *adr;
- if (pfn >= end_pfn_map)
+ if (pfn >= end_pfn)
panic("alloc_low_page: ran out of memory");
for (i = 0; temp_mappings[i].allocated; i++) {
if (!temp_mappings[i].pmd)
panic("alloc_low_page: ran out of temp mappings");
}
ti = &temp_mappings[i];
- paddr = (pfn & (~511)) << PAGE_SHIFT;
+ paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
ti->allocated = 1;
__flush_tlb();
- adr = ti->address + (pfn & 511)*PAGE_SIZE;
+ adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
*index = i;
*phys = pfn * PAGE_SIZE;
return adr;
pgd = pgd + i;
for (; i < PTRS_PER_PGD; pgd++, i++) {
int map;
- unsigned long paddr = i*PGDIR_SIZE, pmd_phys;
+ unsigned long paddr, pmd_phys;
pmd_t *pmd;
+ paddr = (address & PML4_MASK) + i*PGDIR_SIZE;
if (paddr >= end) {
for (; i < PTRS_PER_PGD; i++, pgd++)
set_pgd(pgd, __pgd(0));
break;
}
+
+ if (!e820_mapped(paddr, paddr+PGDIR_SIZE, 0)) {
+ set_pgd(pgd, __pgd(0));
+ continue;
+ }
+
pmd = alloc_low_page(&map, &pmd_phys);
set_pgd(pgd, __pgd(pmd_phys | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
unsigned long pe;
- paddr = i*PGDIR_SIZE + j*PMD_SIZE;
if (paddr >= end) {
for (; j < PTRS_PER_PMD; j++, pmd++)
set_pmd(pmd, __pmd(0));
unsigned long adr;
unsigned long end;
unsigned long next;
+ unsigned long pgds, pmds, tables;
+
+ Dprintk("init_memory_mapping\n");
+
+ end = end_pfn_map << PAGE_SHIFT;
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ * Later we should allocate these tables in the local node of the memory
+ * mapped. Unfortunately this is done currently before the nodes are
+ * discovered.
+ */
+
+ pgds = (end + PGDIR_SIZE - 1) >> PGDIR_SHIFT;
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables = round_up(pgds*8, PAGE_SIZE) + round_up(pmds * 8, PAGE_SIZE);
+
+ table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+ if (table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ table_start >>= PAGE_SHIFT;
+ table_end = table_start;
+
+ end += __PAGE_OFFSET; /* turn virtual */
- end = PAGE_OFFSET + (end_pfn_map * PAGE_SIZE);
for (adr = PAGE_OFFSET; adr < end; adr = next) {
int map;
unsigned long pgd_phys;
pgd_t *pgd = alloc_low_page(&map, &pgd_phys);
- next = adr + (512UL * 1024 * 1024 * 1024);
+ next = adr + PML4_SIZE;
if (next > end)
next = end;
phys_pgd_init(pgd, adr-PAGE_OFFSET, next-PAGE_OFFSET);
}
asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
__flush_tlb_all();
+ early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
+ table_start<<PAGE_SHIFT,
+ table_end<<PAGE_SHIFT);
}
extern struct x8664_pda cpu_pda[NR_CPUS];
-void __init zap_low_mappings (void)
+static unsigned long low_pml4[NR_CPUS];
+
+void swap_low_mappings(void)
{
int i;
for (i = 0; i < NR_CPUS; i++) {
- if (cpu_pda[i].level4_pgt)
- cpu_pda[i].level4_pgt[0] = 0;
+ unsigned long t;
+ if (!cpu_pda[i].level4_pgt)
+ continue;
+ t = cpu_pda[i].level4_pgt[0];
+ cpu_pda[i].level4_pgt[0] = low_pml4[i];
+ low_pml4[i] = t;
}
flush_tlb_all();
}
+void zap_low_mappings(void)
+{
+ swap_low_mappings();
+}
+
+#ifndef CONFIG_DISCONTIGMEM
void __init paging_init(void)
{
{
}
return;
}
-
+#endif
static inline int page_is_ram (unsigned long pagenr)
{
int codesize, reservedpages, datasize, initsize;
int tmp;
- if (!mem_map)
- BUG();
-
+ /* How many end-of-memory variables you have, grandma! */
max_low_pfn = end_pfn;
max_pfn = end_pfn;
- max_mapnr = num_physpages = end_pfn;
+ num_physpages = end_pfn;
high_memory = (void *) __va(end_pfn * PAGE_SIZE);
/* clear the zero-page */
memset(empty_zero_page, 0, PAGE_SIZE);
+ reservedpages = 0;
+
/* this will put all low memory onto the freelists */
- totalram_pages += free_all_bootmem();
+#ifdef CONFIG_DISCONTIGMEM
+ totalram_pages += numa_free_all_bootmem();
+ tmp = 0;
+ /* should count reserved pages here for all nodes */
+#else
+ max_mapnr = end_pfn;
+ if (!mem_map) BUG();
- after_bootmem = 1;
+ totalram_pages += free_all_bootmem();
- reservedpages = 0;
for (tmp = 0; tmp < end_pfn; tmp++)
/*
* Only count reserved RAM pages
*/
if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
reservedpages++;
+#endif
+
+ after_bootmem = 1;
+
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- max_mapnr << (PAGE_SHIFT-10),
+ end_pfn << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
}
}
#endif
+
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
+ /* Should check here against the e820 map to avoid double free */
+#ifdef CONFIG_DISCONTIGMEM
+ int nid = phys_to_nid(phys);
+ if (phys < HIGH_MEMORY && nid)
+ panic("reserve of %lx at node %d", phys, nid);
+ reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else
+ reserve_bootmem(phys, len);
+#endif
+}
*/
if (phys_addr < virt_to_phys(high_memory)) {
char *t_addr, *t_end;
- struct page *page;
t_addr = __va(phys_addr);
t_end = t_addr + (size - 1);
+#ifndef CONFIG_DISCONTIGMEM
+ struct page *page;
for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
+#endif
}
/*
--- /dev/null
+/*
+ * AMD K8 NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * Doesn't use the ACPI SRAT table because it has a questionable license.
+ * Instead the northbridge registers are read directly.
+ * XXX in 2.5 we could use the generic SRAT code
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+
+/*
+ * Locate the K8 northbridge on PCI bus 0: the device whose function 0
+ * carries AMD device id 0x1100 and whose function 1 (the address map,
+ * read below) carries id 0x1101. Returns the PCI device number, or -1
+ * when no such device exists.
+ */
+static int find_northbridge(void)
+{
+	int num;
+
+	for (num = 0; num < 32; num++) {
+		u32 header;
+
+		header = read_pci_config(0, num, 0, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
+			continue;
+
+		header = read_pci_config(0, num, 1, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
+			continue;
+		return num;
+	}
+
+	return -1;
+}
+
+/*
+ * Read the DRAM base/limit register pairs of northbridge function 1 and
+ * register a bootmem node for each populated, non-interleaved range
+ * inside [start, end). Returns 0 on success, negative on failure.
+ */
+int __init k8_scan_nodes(unsigned long start, unsigned long end)
+{
+	unsigned long prevbase;
+	struct node nodes[MAXNODE];
+	int nodeid, numnodes, maxnode, i, nb;
+
+	nb = find_northbridge();
+	if (nb < 0)
+		return nb;
+
+	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+
+	/* NOTE(review): node-count field of config reg 0x60 — the 2-bit
+	   mask looks narrow; confirm against the BKDG. The value is not
+	   used below anyway. */
+	numnodes = (read_pci_config(0, nb, 0, 0x60 ) >> 4) & 3;
+
+	memset(&nodes,0,sizeof(nodes));
+	prevbase = 0;
+	maxnode = -1;
+	for (i = 0; i < MAXNODE; i++) {
+		unsigned long base,limit;
+
+		base = read_pci_config(0, nb, 1, 0x40 + i*8);
+		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+		/* Destination node id lives in the low bits of the limit
+		   register. */
+		nodeid = limit & 3;
+		if (!limit) {
+			printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, base);
+			continue;
+		}
+		/* Interleave enable/select bits; interleaving unsupported. */
+		if ((base >> 8) & 3 || (limit >> 8) & 3) {
+			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
+			       nodeid, (base>>8)&3, (limit>>8) & 3);
+			return -1;
+		}
+		if (nodeid > maxnode)
+			maxnode = nodeid;
+		if ((1UL << nodeid) & nodes_present) {
+			printk("Node %d already present. Skipping\n", nodeid);
+			continue;
+		}
+
+		/* The registers hold address bits 39:24 in bits 31:16;
+		   convert limit to a byte address. */
+		limit >>= 16;
+		limit <<= 24;
+
+		if (limit > end_pfn_map << PAGE_SHIFT)
+			limit = end_pfn_map << PAGE_SHIFT;
+		/* NOTE(review): base is still the raw register value here,
+		   while limit was already converted — this comparison looks
+		   suspicious; verify against the register layout. */
+		if (limit <= base) {
+			printk(KERN_INFO "Node %d beyond memory map\n", nodeid);
+			continue;
+		}
+
+		base >>= 16;
+		base <<= 24;
+
+		/* Clip the node range to the requested [start, end) window. */
+		if (base < start)
+			base = start;
+		if (limit > end)
+			limit = end;
+		if (limit == base)
+			continue;
+		if (limit < base) {
+			printk(KERN_INFO"Node %d bogus settings %lx-%lx. Ignored.\n",
+			       nodeid, base, limit);
+			continue;
+		}
+
+		/* Could sort here, but punt for now. Should not happen anyway. */
+		if (prevbase > base) {
+			printk(KERN_INFO "Node map not sorted %lx,%lx\n",
+			       prevbase,base);
+			return -1;
+		}
+
+		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
+		       nodeid, base, limit);
+
+		nodes[nodeid].start = base;
+		nodes[nodeid].end = limit;
+
+		prevbase = base;
+	}
+
+	if (maxnode <= 0)
+		return -1;
+
+	memnode_shift = compute_hash_shift(nodes,maxnode,end);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+		return -1;
+	}
+	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+
+	early_for_all_nodes(i) {
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
+
+	return 0;
+}
+
--- /dev/null
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/blk.h>
+#include <linux/ctype.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/numa.h>
+
+#define Dprintk(x...) printk(x)
+
+struct pglist_data *node_data[MAXNODE];
+bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+
+/* Physical address -> node hash: addr >> memnode_shift indexes memnodemap. */
+int memnode_shift;
+u8 memnodemap[NODEMAPSIZE];
+
+/* Set by "numa=off" on the command line. */
+static int numa_off __initdata;
+
+unsigned long nodes_present;	/* bitmask of node ids that were set up */
+int maxnode;			/* highest node id set up so far */
+
+/* Number of nodes to fake ("numa=<n>" on the command line); 0 = off. */
+static int emunodes __initdata;
+
+/*
+ * Find the smallest shift such that addr >> shift maps every address of
+ * every node range to a memnodemap slot without two nodes colliding.
+ * Brute force search starting at 16MB granularity. Returns the shift,
+ * or -1 if no conflict-free shift below 48 exists (memnodemap is then
+ * left zeroed).
+ */
+int compute_hash_shift(struct node *nodes, int numnodes, u64 maxmem)
+{
+	int i;
+	int shift = 24;
+	u64 addr;
+
+	/* When in doubt use brute force. */
+	while (shift < 48) {
+		/* 0xff marks an unpopulated slot. */
+		memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+		early_for_all_nodes (i) {
+			for (addr = nodes[i].start;
+			     addr < nodes[i].end;
+			     addr += (1UL << shift)) {
+				if (memnodemap[addr >> shift] != 0xff) {
+					printk("node %d shift %d addr %Lx conflict %d\n",
+					       i, shift, addr, memnodemap[addr>>shift]);
+					goto next;
+				}
+				memnodemap[addr >> shift] = i;
+			}
+		}
+		return shift;
+	next:
+		shift++;
+	}
+	memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
+	return -1;
+}
+
+/* Initialize bootmem allocator for a node */
+void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
+{
+	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
+	unsigned long nodedata_phys;
+	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+
+	start = round_up(start, ZONE_ALIGN);
+
+	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
+	/* Place the pg_data_t itself inside the node's own memory. */
+	nodedata_phys = find_e820_area(start, end, pgdat_size);
+	if (nodedata_phys == -1L)
+		panic("Cannot find memory pgdat in node %d\n", nodeid);
+
+	Dprintk("nodedata_phys %lx\n", nodedata_phys);
+
+	node_data[nodeid] = phys_to_virt(nodedata_phys);
+	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
+	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+	NODE_DATA(nodeid)->node_size = end_pfn - start_pfn;
+
+	/* Find a place for the bootmem map */
+	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
+	if (bootmap_start == -1L)
+		panic("Not enough continuous space for bootmap on node %d", nodeid);
+	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
+
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap_start >> PAGE_SHIFT,
+					 start_pfn, end_pfn);
+
+	/* Free the usable RAM of the node, then re-reserve what we used. */
+	e820_bootmem_free(NODE_DATA(nodeid), start, end);
+
+	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
+	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+
+	if (nodeid > maxnode)
+		maxnode = nodeid;
+	nodes_present |= (1UL << nodeid);
+}
+
+/* Initialize final allocator for a zone */
+void __init setup_node_zones(int nodeid)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long zones[MAX_NR_ZONES];
+	unsigned long dma_end_pfn;
+
+	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
+
+	printk("setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
+
+	/* All nodes > 0 have a zero length zone DMA */
+	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	if (start_pfn < dma_end_pfn) {
+		/* NOTE(review): assumes end_pfn >= dma_end_pfn here; a node
+		   entirely below MAX_DMA_ADDRESS would underflow ZONE_NORMAL.
+		   Confirm this cannot happen for node 0. */
+		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
+		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
+	} else {
+		zones[ZONE_NORMAL] = end_pfn - start_pfn;
+	}
+
+	free_area_init_node(nodeid, NODE_DATA(nodeid), NULL, zones,
+			    start_pfn, NULL);
+}
+
+/* Set when the single-node fallback below was used. */
+int fake_node;
+
+/*
+ * Set up bootmem for all nodes: real K8 discovery first, then command
+ * line node emulation, then a single fake node covering all memory.
+ * Returns 0 for a real/emulated NUMA setup, -1 for the fallback.
+ */
+int __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+#ifdef CONFIG_K8_NUMA
+	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+		return 0;
+#endif
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+	if (!numa_off && emunodes > 0) {
+		struct node nodes[MAXNODE];
+		unsigned long nodesize;
+		int i;
+		/* Clamp before computing the node size so the fake nodes
+		   cover memory evenly and the printk is accurate. */
+		if (emunodes > MAXNODE)
+			emunodes = MAXNODE;
+		nodesize = (end_pfn << PAGE_SHIFT) / emunodes;
+		/* compute_hash_shift scans all MAXNODE entries via
+		   early_for_all_nodes; clear the array so unused slots
+		   don't contain stack garbage. */
+		memset(nodes, 0, sizeof(nodes));
+		printk(KERN_INFO "Faking %d nodes of size %ld MB\n", emunodes, nodesize>>20);
+		for (i = 0; i < emunodes; i++) {
+			unsigned long end = (i+1)*nodesize;
+			/* Last node takes the remainder. */
+			if (i == emunodes-1)
+				end = end_pfn << PAGE_SHIFT;
+			nodes[i].start = i * nodesize;
+			nodes[i].end = end;
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+		}
+		memnode_shift = compute_hash_shift(nodes, emunodes, nodes[i-1].end);
+		return 0;
+	}
+
+	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+	       start_pfn << PAGE_SHIFT,
+	       end_pfn << PAGE_SHIFT);
+	/* setup dummy node covering all memory */
+	fake_node = 1;
+	memnode_shift = 63;
+	memnodemap[0] = 0;
+	setup_node_bootmem(0, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	return -1;
+}
+
+/* Release all nodes' bootmem to the page allocator; returns pages freed. */
+unsigned long __init numa_free_all_bootmem(void)
+{
+	int i;
+	unsigned long pages = 0;
+	for_all_nodes(i) {
+		pages += free_all_bootmem_node(NODE_DATA(i));
+	}
+	return pages;
+}
+
+/* Per-node replacement for the generic paging_init. */
+void __init paging_init(void)
+{
+	int i;
+	for_all_nodes(i) {
+		setup_node_zones(i);
+	}
+}
+
+/* [numa=off] */
+/* [numa=emunodes] */
+__init int numa_setup(char *opt)
+{
+	if (!strncmp(opt,"off",3))
+		numa_off = 1;
+	if (isdigit(opt[0]))
+		emunodes = simple_strtoul(opt, NULL, 10);
+	return 1;
+}
+
+
.vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
__vsyscall_0 = LOADADDR(.vsyscall_0);
. = ALIGN(64);
- .vxtime_sequence : AT ((LOADADDR(.vsyscall_0) + SIZEOF(.vsyscall_0) + 63) & ~(63)) { *(.vxtime_sequence) }
- vxtime_sequence = LOADADDR(.vxtime_sequence);
+ .xtime_lock : AT ((LOADADDR(.vsyscall_0) + SIZEOF(.vsyscall_0) + 63) & ~(63)) { *(.xtime_lock) }
+ xtime_lock = LOADADDR(.xtime_lock);
. = ALIGN(16);
- .hpet : AT ((LOADADDR(.vxtime_sequence) + SIZEOF(.vxtime_sequence) + 15) & ~(15)) { *(.hpet) }
+ .hpet : AT ((LOADADDR(.xtime_lock) + SIZEOF(.xtime_lock) + 15) & ~(15)) { *(.hpet) }
hpet = LOADADDR(.hpet);
. = ALIGN(16);
.wall_jiffies : AT ((LOADADDR(.hpet) + SIZEOF(.hpet) + 15) & ~(15)) { *(.wall_jiffies) }
.sys_tz : AT ((LOADADDR(.wall_jiffies) + SIZEOF(.wall_jiffies) + 15) & ~(15)) { *(.sys_tz) }
sys_tz = LOADADDR(.sys_tz);
. = ALIGN(16);
- .jiffies : AT ((LOADADDR(.sys_tz) + SIZEOF(.sys_tz) + 15) & ~(15)) { *(.jiffies) }
+ .sysctl_vsyscall : AT ((LOADADDR(.sys_tz) + SIZEOF(.sys_tz) + 15) & ~(15)) { *(.sysctl_vsyscall) }
+ sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
+ . = ALIGN(16);
+ .jiffies : AT ((LOADADDR(.sysctl_vsyscall) + SIZEOF(.sysctl_vsyscall) + 15) & ~(15)) { *(.jiffies) }
jiffies = LOADADDR(.jiffies);
. = ALIGN(16);
.xtime : AT ((LOADADDR(.jiffies) + SIZEOF(.jiffies) + 15) & ~(15)) { *(.xtime) }
#define ACPI_ASM_MACROS
#define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() __cli()
-#define ACPI_ENABLE_IRQS() __sti()
+#define ACPI_DISABLE_IRQS() local_irq_disable()
+#define ACPI_ENABLE_IRQS() local_irq_enable()
#define ACPI_FLUSH_CPU_CACHE() wbinvd()
/*
extern int acpi_disabled;
+#define dmi_broken (0)
+#define BROKEN_ACPI_Sx 0x0001
+#define BROKEN_INIT_AFTER_S1 0x0002
+
#endif /*__KERNEL__*/
#endif /*_ASM_ACPI_H*/
return t;
}
-
-#define stack_current() \
-({ \
- struct thread_info *ti; \
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (~8191UL)); \
- ti->task; \
-})
-
-
#define current get_current()
#else
DESC_LDT, size);
}
+static inline void set_seg_base(unsigned cpu, int entry, void *base)
+{
+ struct desc_struct *d = &cpu_gdt_table[cpu][entry];
+ d->base0 = PTR_LOW(base);
+ d->base1 = PTR_MIDDLE(base);
+ d->base2 = PTR_HIGH(base);
+}
+
#define LDT_entry_a(info) \
((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
#define LDT_entry_b(info) \
-#ifndef _ASM_X8664_DMA_MAPPING_H
-#define _ASM_X8664_DMA_MAPPING_H
+#ifndef _X8664_DMA_MAPPING_H
+#define _X8664_DMA_MAPPING_H 1
#include <asm-generic/dma-mapping.h>
int type);
extern void setup_memory_region(void);
extern void contig_e820_setup(void);
-extern void e820_end_of_ram(void);
+extern unsigned long e820_end_of_ram(void);
extern void e820_reserve_resources(void);
extern void e820_print_map(char *who);
extern int e820_mapped(unsigned long start, unsigned long end, int type);
#define kernel_fpu_end() stts()
#define unlazy_fpu(tsk) do { \
- if (test_tsk_thread_flag(tsk, TIF_USEDFPU)) \
+ if ((tsk)->thread_info->flags & TIF_USEDFPU) \
save_init_fpu(tsk); \
} while (0)
-#define unlazy_current_fpu() do { \
- if (test_thread_flag(TIF_USEDFPU)) \
- save_init_fpu(tsk); \
-} while (0)
-
-
#define clear_fpu(tsk) do { \
if (test_tsk_thread_flag(tsk, TIF_USEDFPU)) { \
asm volatile("fwait"); \
{
asm volatile( "fxsave %0 ; fnclex"
: "=m" (tsk->thread.i387.fxsave));
- clear_tsk_thread_flag(tsk, TIF_USEDFPU);
+ tsk->thread_info->flags &= ~TIF_USEDFPU;
stts();
}
#ifndef _ASM_IO_H
#define _ASM_IO_H
+#include <linux/config.h>
+
/*
* This file contains the definitions for the x86 IO instructions
* inb/inw/inl/outb/outw/outl and the "string versions" of the same
/*
* Change "struct page" to physical address.
*/
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/mmzone.h>
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#else
#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
+#endif
extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
--- /dev/null
+#ifndef _ASM_MMSEGMENT_H
+#define _ASM_MMSEGMENT_H 1
+
+/*
+ * Address-limit value carried per task, wrapped in a struct so it gets
+ * a distinct type and cannot be mixed up with plain integers.
+ * NOTE(review): presumably consumed by get_fs()/set_fs() — confirm
+ * against asm/uaccess.h.
+ */
+typedef struct {
+	unsigned long seg;
+} mm_segment_t;
+
+#endif
--- /dev/null
+/* K8 NUMA support */
+/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
+/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
+#ifndef _ASM_X86_64_MMZONE_H
+#define _ASM_X86_64_MMZONE_H 1
+
+#include <linux/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+
+/* Compiled out by default; define to BUG_ON for address->node debugging. */
+#define VIRTUAL_BUG_ON(x)
+
+#include <asm/numnodes.h>
+#include <asm/smp.h>
+
+#define MAXNODE 8
+#define NODEMAPSIZE 0xff
+
+/* Simple perfect hash to map physical addresses to node numbers */
+extern int memnode_shift;
+extern u8 memnodemap[NODEMAPSIZE];
+extern int maxnode;
+
+extern struct pglist_data *node_data[];
+
+/* kern_addr_valid below hardcodes the same algorithm*/
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+	int nid;
+	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+	nid = memnodemap[addr >> memnode_shift];
+	VIRTUAL_BUG_ON(nid > maxnode);
+	return nid;
+}
+
+#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr))
+#define NODE_DATA(nid) (node_data[nid])
+
+#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+			   NODE_DATA(nid)->node_size)
+#define node_size(nid) (NODE_DATA(nid)->node_size)
+
+#define local_mapnr(kvaddr) \
+	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
+/*
+ * Same hash as phys_to_nid, but tolerant of addresses that map to an
+ * unpopulated slot (0xff) or fall outside the node's pfn range.
+ * Only touch node_data[] after verifying the slot is populated.
+ */
+#define kern_addr_valid(kvaddr) ({ \
+	int ok = 0; \
+	unsigned long index = __pa(kvaddr) >> memnode_shift; \
+	if (index < NODEMAPSIZE) { \
+		unsigned nodeid = memnodemap[index]; \
+		if (nodeid != 0xff) { \
+			unsigned long pfn = __pa(kvaddr) >> PAGE_SHIFT; \
+			unsigned long start_pfn = node_start_pfn(nodeid); \
+			ok = (pfn >= start_pfn) && \
+			     (pfn < start_pfn + node_size(nodeid)); \
+		} \
+	} \
+	ok; \
+})
+
+/* AK: this currently doesn't deal with invalid addresses. We'll see
+   if the 2.5 kernel doesn't pass them
+   (2.4 used to). */
+#define pfn_to_page(pfn) ({ \
+	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); \
+	((pfn) - node_start_pfn(nid)) + node_mem_map(nid); \
+})
+
+#define page_to_pfn(page) \
+	(long)(((page) - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn)
+
+/* AK: !DISCONTIGMEM just forces it to 1. Can't we too? */
+#define pfn_valid(pfn) ((pfn) < num_physpages)
+
+
+#endif
+#endif
extern int mp_current_pci_id;
extern unsigned long mp_lapic_addr;
extern int pic_mode;
-extern int using_apic_timer;
#ifdef CONFIG_ACPI_BOOT
extern void mp_register_lapic (u8 id, u8 enabled);
#endif /*CONFIG_X86_IO_APIC*/
#endif
+extern int using_apic_timer;
+
#endif
: "=a" (low), "=d" (high) \
: "c" (counter))
+/* Generic CPUID: fills all four result registers for leaf 'op'. */
+extern inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx)
+{
+	__asm__("cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (op));
+}
+
+/*
+ * CPUID functions returning a single datum.
+ * The unused result registers are listed as clobbers.
+ */
+extern inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax;
+
+	__asm__("cpuid"
+		: "=a" (eax)
+		: "0" (op)
+		: "bx", "cx", "dx");
+	return eax;
+}
+extern inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx;
+
+	__asm__("cpuid"
+		: "=a" (eax), "=b" (ebx)
+		: "0" (op)
+		: "cx", "dx" );
+	return ebx;
+}
+extern inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ecx;
+
+	__asm__("cpuid"
+		: "=a" (eax), "=c" (ecx)
+		: "0" (op)
+		: "bx", "dx" );
+	return ecx;
+}
+extern inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, edx;
+
+	__asm__("cpuid"
+		: "=a" (eax), "=d" (edx)
+		: "0" (op)
+		: "bx", "cx");
+	return edx;
+}
+
+
#endif
/* AMD/K8 specific MSRs */
--- /dev/null
+#ifndef _ASM_X8664_NUMA_H
+#define _ASM_X8664_NUMA_H 1
+
+/* NOTE(review): MAXNODE is also defined in asm/mmzone.h — keep in sync. */
+#define MAXNODE 8
+#define NODEMASK 0xff
+
+/* Physical address range belonging to one NUMA node. */
+struct node {
+	u64 start,end;
+};
+
+/* Iterate x over the node ids set in the nodes_present bitmask. */
+#define for_all_nodes(x) for ((x) = 0; (x) <= maxnode; (x)++) \
+				if ((1UL << (x)) & nodes_present)
+
+/* Early-boot variant: requires an array named `nodes' (struct node[MAXNODE])
+   in the calling scope; skips empty (start == end) entries. */
+#define early_for_all_nodes(n) \
+	for (n=0; n<MAXNODE;n++) if (nodes[n].start!=nodes[n].end)
+
+extern int compute_hash_shift(struct node *nodes, int numnodes, u64 maxmem);
+extern unsigned long nodes_present;
+
+/* Node memory must start on a maximum-order buddy allocator boundary. */
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+
+#endif
--- /dev/null
+#ifndef _ASM_X8664_NUMNODES_H
+#define _ASM_X8664_NUMNODES_H 1
+
+#include <linux/config.h>
+
+/* Maximum number of NUMA nodes this port supports. */
+#ifdef CONFIG_DISCONTIGMEM
+#define MAX_NUMNODES 8 /* APIC limit currently */
+#else
+#define MAX_NUMNODES 1
+#endif
+
+#endif
#ifndef _X86_64_PAGE_H
#define _X86_64_PAGE_H
+#include <linux/config.h>
+
/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT 12
#ifdef __ASSEMBLY__
#endif
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & (__PHYSICAL_MASK << PAGE_SHIFT))
-#define THREAD_SIZE (2*PAGE_SIZE)
+
+#define THREAD_ORDER 1
+#ifdef __ASSEMBLY__
+#define THREAD_SIZE (1 << (PAGE_SHIFT + THREAD_ORDER))
+#else
+#define THREAD_SIZE (1UL << (PAGE_SHIFT + THREAD_ORDER))
+#endif
#define CURRENT_MASK (~(THREAD_SIZE-1))
#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-/* See Documentation/x86_64/mm.txt for a description of the layout. */
+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
#define __START_KERNEL 0xffffffff80100000
#define __START_KERNEL_map 0xffffffff80000000
#define __PAGE_OFFSET 0x0000010000000000
__pa(v); })
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn) (mem_map + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
-#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif
+
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
} \
} while (0)
-
+/*
+ * AK: PDA read accesses should be neither volatile nor have an memory clobber.
+ * Unfortunately removing them causes all hell to break lose currently.
+ */
#define pda_from_op(op,field) ({ \
typedef typeof_field(struct x8664_pda, field) T__; T__ ret__; \
switch (sizeof_field(struct x8664_pda, field)) { \
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
{
- set_pmd(pmd, __pmd(_PAGE_TABLE |
- ((u64)(pte - mem_map) << PAGE_SHIFT)));
+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
}
extern __inline__ pmd_t *get_pmd(void)
}
#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
-#define __pmd_free_tlb(tlb,x) do { } while (0)
+#define __pmd_free_tlb(tlb,x) pmd_free(x)
#endif /* _X86_64_PGALLOC_H */
#define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0))
#define pte_same(a, b) ((a).pte == (b).pte)
+#define PML4_SIZE (1UL << PML4_SHIFT)
+#define PML4_MASK (~(PML4_SIZE-1))
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
/* PMD - Level 2 access */
#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
-#define pmd_page(pmd) (mem_map + ((pmd_val(pmd) & PTE_MASK)>>PAGE_SHIFT))
+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+
#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \
__pmd_offset(address))
#endif /* !__ASSEMBLY__ */
+#ifndef CONFIG_DISCONTIGMEM
#define kern_addr_valid(addr) (1)
+#endif
#define io_remap_page_range remap_page_range
#include <asm/msr.h>
#include <asm/current.h>
#include <asm/system.h>
+#include <asm/mmsegment.h>
#define TF_MASK 0x00000100
#define IF_MASK 0x00000200
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
-/*
- * Generic CPUID function
- * FIXME: This really belongs to msr.h
- */
-extern inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx)
-{
- __asm__("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (op));
-}
-
-/*
- * CPUID functions returning a single datum
- */
-extern inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax;
-
- __asm__("cpuid"
- : "=a" (eax)
- : "0" (op)
- : "bx", "cx", "dx");
- return eax;
-}
-extern inline unsigned int cpuid_ebx(unsigned int op)
-{
- unsigned int eax, ebx;
-
- __asm__("cpuid"
- : "=a" (eax), "=b" (ebx)
- : "0" (op)
- : "cx", "dx" );
- return ebx;
-}
-extern inline unsigned int cpuid_ecx(unsigned int op)
-{
- unsigned int eax, ecx;
-
- __asm__("cpuid"
- : "=a" (eax), "=c" (ecx)
- : "0" (op)
- : "bx", "dx" );
- return ecx;
-}
-extern inline unsigned int cpuid_edx(unsigned int op)
-{
- unsigned int eax, edx;
-
- __asm__("cpuid"
- : "=a" (eax), "=d" (edx)
- : "0" (op)
- : "bx", "cx");
- return edx;
-}
-
/*
* Intel CPU features in CR4
*/
:"ax");
}
-#if 0
-/*
- * Cyrix CPU configuration register indexes
- */
-#define CX86_CCR0 0xc0
-#define CX86_CCR1 0xc1
-#define CX86_CCR2 0xc2
-#define CX86_CCR3 0xc3
-#define CX86_CCR4 0xe8
-#define CX86_CCR5 0xe9
-#define CX86_CCR6 0xea
-#define CX86_CCR7 0xeb
-#define CX86_DIR0 0xfe
-#define CX86_DIR1 0xff
-#define CX86_ARR_BASE 0xc4
-#define CX86_RCR_BASE 0xdc
-
-/*
- * Cyrix CPU indexed register access macros
- */
-
-#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
-
-#define setCx86(reg, data) do { \
- outb((reg), 0x22); \
- outb((data), 0x23); \
-} while (0)
-
-#endif
-
/*
* Bus types
*/
struct i387_fxsave_struct fxsave;
};
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
struct tss_struct {
u32 reserved1;
u64 rsp0;
u16 reserved5;
u16 io_map_base;
u32 io_bitmap[IO_BITMAP_SIZE];
-} __attribute__((packed));
+} __attribute__((packed)) ____cacheline_aligned;
struct thread_struct {
unsigned long rsp0;
#define NMI_STACK 3
#define N_EXCEPTION_STACKS 3 /* hw limit: 7 */
#define EXCEPTION_STKSZ 1024
+#define EXCEPTION_STK_ORDER 0
#define start_thread(regs,new_rip,new_rsp) do { \
asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
__asm__ __volatile__("rep;nop": : :"memory");
}
+/* Stop speculative execution */
+extern inline void sync_core(void)
+{
+ int tmp;
+ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
+}
+
#define cpu_has_fpu 1
#define ARCH_HAS_PREFETCH
#define spin_lock_prefetch(x) prefetchw(x)
#define cpu_relax() rep_nop()
-
/*
* NSC/Cyrix CPU configuration register indexes
*/
outb((data), 0x23); \
} while (0)
+#define stack_current() \
+({ \
+ struct thread_info *ti; \
+ asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->task; \
+})
+
#endif /* __ASM_X86_64_PROCESSOR_H */
extern void do_softirq_thunk(void);
+extern int numa_setup(char *opt);
+
extern int setup_early_printk(char *);
extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
extern void free_bootmem_generic(unsigned long phys, unsigned len);
-extern unsigned long start_pfn, end_pfn, end_pfn_map;
+extern unsigned long end_pfn_map;
extern void show_stack(unsigned long * rsp);
extern void exception_table_check(void);
-extern int acpi_boot_init(char *);
+extern void acpi_reserve_bootmem(void);
+
+extern void swap_low_mappings(void);
extern int map_syscall32(struct mm_struct *mm, unsigned long address);
extern char *syscall32_page;
+void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
+
+extern unsigned long max_mapnr;
+extern unsigned long end_pfn;
+extern unsigned long table_start, table_end;
+
struct thread_struct;
+struct user_desc;
int do_set_thread_area(struct thread_struct *t, struct user_desc *u_info);
int do_get_thread_area(struct thread_struct *t, struct user_desc *u_info);
#define __USER_DS 0x2b /* 5*8+3 */
#define __USER_CS 0x33 /* 6*8+3 */
#define __USER32_DS __USER_DS
+#define __KERNEL16_CS (GDT_ENTRY_KERNELCS16 * 8)
#define GDT_ENTRY_TLS 1
#define GDT_ENTRY_TSS 8 /* needs two entries */
#define GDT_ENTRY_LDT 10
#define GDT_ENTRY_TLS_MIN 11
#define GDT_ENTRY_TLS_MAX 13
-#define GDT_ENTRY_LONGBASE 14
+/* 14 free */
+#define GDT_ENTRY_KERNELCS16 15
#define GDT_ENTRY_TLS_ENTRIES 3
extern void smp_send_reschedule_all(void);
extern void smp_invalidate_rcv(void); /* Process an NMI */
extern void (*mtrr_hook) (void);
-extern void zap_low_mappings (void);
+extern void zap_low_mappings(void);
+
+#define SMP_TRAMPOLINE_BASE 0x6000
/*
* On x86 all CPUs are mapped 1:1 to the APIC space.
extern volatile unsigned long cpu_callout_map;
#define cpu_possible(cpu) (cpu_callout_map & (1<<(cpu)))
+#define cpu_online(cpu) (cpu_online_map & (1<<(cpu)))
-extern inline int cpu_logical_map(int cpu)
-{
- return cpu;
-}
-extern inline int cpu_number_map(int cpu)
-{
- return cpu;
-}
+#define for_each_cpu(cpu, mask) \
+ for(mask = cpu_online_map; \
+ cpu = __ffs(mask), mask != 0; \
+ mask &= ~(1UL<<cpu))
-extern inline unsigned int num_online_cpus(void)
+extern inline int any_online_cpu(unsigned int mask)
{
- return hweight32(cpu_online_map);
-}
+ if (mask & cpu_online_map)
+ return __ffs(mask & cpu_online_map);
-extern inline int find_next_cpu(unsigned cpu)
-{
- unsigned long left = cpu_online_map >> (cpu+1);
- if (!left)
return -1;
- return ffz(~left) + cpu;
}
-extern inline int find_first_cpu(void)
+extern inline unsigned int num_online_cpus(void)
{
- return ffz(~cpu_online_map);
+ return hweight32(cpu_online_map);
}
-/* RED-PEN different from i386 */
-#define for_each_cpu(i) \
- for((i) = find_first_cpu(); (i)>=0; (i)=find_next_cpu(i))
-
static inline int num_booting_cpus(void)
{
return hweight32(cpu_callout_map);
extern volatile unsigned long cpu_callout_map;
-/*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
- */
-extern volatile int x86_apicid_to_cpu[NR_CPUS];
-extern volatile int x86_cpu_to_apicid[NR_CPUS];
-
-/*
- * This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
- */
-
#define smp_processor_id() read_pda(cpunumber)
-
extern __inline int hard_smp_processor_id(void)
{
/* we don't want to mark this access volatile - bad code generation */
return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
}
+extern int disable_apic;
+extern int slow_smp_processor_id(void);
+
+extern inline int safe_smp_processor_id(void)
+{
+ if (disable_apic)
+ return slow_smp_processor_id();
+ else
+ return hard_smp_processor_id();
+}
+
#define cpu_online(cpu) (cpu_online_map & (1<<(cpu)))
#endif /* !ASSEMBLY */
#ifndef CONFIG_SMP
#define stack_smp_processor_id() 0
+#define safe_smp_processor_id() 0
#define for_each_cpu(x) (x)=0;
#define cpu_logical_map(x) (x)
#else
#define stack_smp_processor_id() \
({ \
struct thread_info *ti; \
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (~8191UL)); \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
ti->cpu; \
})
#endif
typedef struct {
volatile unsigned int lock;
-#if CONFIG_DEBUG_SPINLOCK
+#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;
#endif
} spinlock_t;
/*
* This works. Despite all the confusion.
+ * (except on PPro SMP or if we are using OOSTORE)
+ * (PPro errata 66, 92)
*/
+
+#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+
#define spin_unlock_string \
- "movb $1,%0"
+ "movb $1,%0" \
+ :"=m" (lock->lock) : : "memory"
+
+
+static inline void _raw_spin_unlock(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if (lock->magic != SPINLOCK_MAGIC)
+ BUG();
+ if (!spin_is_locked(lock))
+ BUG();
+#endif
+ __asm__ __volatile__(
+ spin_unlock_string
+ );
+}
+
+#else
+
+#define spin_unlock_string \
+ "xchgb %b0, %1" \
+ :"=q" (oldval), "=m" (lock->lock) \
+ :"0" (oldval) : "memory"
+
+static inline void _raw_spin_unlock(spinlock_t *lock)
+{
+ char oldval = 1;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if (lock->magic != SPINLOCK_MAGIC)
+ BUG();
+ if (!spin_is_locked(lock))
+ BUG();
+#endif
+ __asm__ __volatile__(
+ spin_unlock_string
+ );
+}
+
+#endif
static inline int _raw_spin_trylock(spinlock_t *lock)
{
- signed char oldval;
+ char oldval;
__asm__ __volatile__(
"xchgb %b0,%1"
:"=q" (oldval), "=m" (lock->lock)
:"=m" (lock->lock) : : "memory");
}
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- if (lock->magic != SPINLOCK_MAGIC)
- BUG();
- if (!spin_is_locked(lock))
- BUG();
-#endif
- __asm__ __volatile__(
- spin_unlock_string
- :"=m" (lock->lock) : : "memory");
-}
/*
* Read-write spinlocks, allowing multiple readers
#define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0)
+#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
+
/*
* On x86, we implement read-write locks as a 32-bit counter
* with the high bit (sign) being the "contended" bit.
* Changed to use the same technique as rw semaphores. See
* semaphore.h for details. -ben
*/
-/* the spinlock helpers are in arch/x86_64/kernel/semaphore.S */
+/* the spinlock helpers are in arch/x86_64/kernel/semaphore.S */
-extern inline void _raw_read_lock(rwlock_t *rw)
+static inline void _raw_read_lock(rwlock_t *rw)
{
#ifdef CONFIG_DEBUG_SPINLOCK
if (rw->magic != RWLOCK_MAGIC)
return 0;
}
-#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
-
#endif /* __ASM_SPINLOCK_H */
#define loadsegment(seg,value) \
asm volatile("\n" \
"1:\t" \
- "movl %0,%%" #seg "\n" \
+ "movl %k0,%%" #seg "\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3:\t" \
".align 8\n\t" \
".quad 1b,3b\n" \
".previous" \
- : :"r" ((int)(value)))
+ : :"r" (value))
#define set_debug(value,register) \
__asm__("movq %0,%%db" #register \
asm volatile("movq %0,%%cr0" :: "r" (val));
}
+static inline unsigned long read_cr3(void)
+{
+ unsigned long cr3;
+ asm("movq %%cr3,%0" : "=r" (cr3));
+ return cr3;
+}
+
static inline unsigned long read_cr4(void)
{
unsigned long cr4;
#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-#include <asm/processor.h>
-#include <linux/config.h>
-#include <asm/pda.h>
-#endif
+#include <asm/page.h>
+#include <asm/types.h>
/*
* low level task data that entry.S needs immediate access to
* - this struct shares the supervisor stack pages
*/
#ifndef __ASSEMBLY__
+struct task_struct;
+struct exec_domain;
+#include <asm/mmsegment.h>
+
struct thread_info {
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
mm_segment_t addr_limit;
struct restart_block restart_block;
};
-
#endif
/*
#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
-/* how to get the thread information struct from C */
-
-#define THREAD_SIZE (2*PAGE_SIZE)
-
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
- ti = (void *)read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE;
- return ti;
-}
-
-static inline struct thread_info *stack_thread_info(void)
-{
- struct thread_info *ti;
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (~8191UL));
+ asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));
return ti;
}
/* thread information allocation */
-#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1))
-#define free_thread_info(ti) free_pages((unsigned long) (ti), 1)
+#define alloc_thread_info() \
+ ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
+#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
#define get_thread_info(ti) get_task_struct((ti)->task)
#define put_thread_info(ti) put_task_struct((ti)->task)
/* how to get the thread information struct from ASM */
/* only works on the process stack. otherwise get it via the PDA. */
#define GET_THREAD_INFO(reg) \
- movq $-8192, reg; \
+ movq $CURRENT_MASK, reg; \
andq %rsp, reg
#endif
/*
- * linux/include/asm-x8664/timex.h
+ * linux/include/asm-x86_64/timex.h
*
- * x8664 architecture timex specifications
+ * x86-64 architecture timex specifications
*/
#ifndef _ASMx8664_TIMEX_H
#define _ASMx8664_TIMEX_H
(1000000/CLOCK_TICK_FACTOR) / (CLOCK_TICK_RATE/CLOCK_TICK_FACTOR)) \
<< (SHIFT_SCALE-SHIFT_HZ)) / HZ)
-/*
- * Standard way to access the cycle counter on i586+ CPUs.
- * Currently only used on SMP.
- *
- * If you really have a SMP machine with i486 chips or older,
- * compile for that, and this will just always return zero.
- * That's ok, it just means that the nicer scheduling heuristics
- * won't work for you.
- *
- * We only use the low 32 bits, and we'd simply better make sure
- * that we reschedule before that wraps. Scheduling at least every
- * four billion cycles just basically sounds like a good idea,
- * regardless of how fast the machine is.
- */
typedef unsigned long long cycles_t;
extern cycles_t cacheflush_time;
#ifndef _ASM_X86_64_TOPOLOGY_H
#define _ASM_X86_64_TOPOLOGY_H
+#include <linux/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+
+/* Map the K8 CPU local memory controllers to a simple 1:1 CPU:NODE topology */
+
+extern int fake_node;
+extern unsigned long cpu_online_map;
+
+#define cpu_to_node(cpu) (fake_node ? 0 : (cpu))
+#define memblk_to_node(memblk) (fake_node ? 0 : (memblk))
+#define parent_node(node) (node)
+#define node_to_first_cpu(node) (fake_node ? 0 : (node))
+#define node_to_cpu_mask(node) (fake_node ? cpu_online_map : (1UL << (node)))
+#define node_to_memblk(node) (node)
+
+#define NODE_BALANCE_RATE 30 /* CHECKME */
+
+#endif
+
#include <asm-generic/topology.h>
-#endif /* _ASM_X86_64_TOPOLOGY_H */
+#endif
__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
#define __NR_semtimedop 220
__SYSCALL(__NR_semtimedop, sys_semtimedop)
+#define __NR_fadvise64 221
+__SYSCALL(__NR_fadvise64, sys_fadvise64)
-#define __NR_syscall_max __NR_semtimedop
+#define __NR_syscall_max __NR_fadvise64
#ifndef __NO_STUBS
/* user-visible error numbers are in the range -1 - -4095 */
#define _ASM_X86_64_VSYSCALL_H_
#include <linux/time.h>
+#include <linux/seqlock.h>
enum vsyscall_num {
__NR_vgettimeofday,
#define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16)))
#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16)))
+#define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
#define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16)))
-#define __section_vxtime_sequence __attribute__ ((unused, __section__ (".vxtime_sequence"), aligned(16)))
+#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(L1_CACHE_BYTES)))
+
struct hpet_data {
long address; /* base address */
#define hpet_writel(d,a) writel(d, fix_to_virt(FIX_HPET_BASE) + a)
/* vsyscall space (readonly) */
-extern long __vxtime_sequence[2];
extern struct hpet_data __hpet;
extern struct timespec __xtime;
extern volatile unsigned long __jiffies;
extern unsigned long __wall_jiffies;
extern struct timezone __sys_tz;
+extern seqlock_t __xtime_lock;
/* kernel space (writeable) */
-extern long vxtime_sequence[2];
extern struct hpet_data hpet;
extern unsigned long wall_jiffies;
extern struct timezone sys_tz;
+extern int sysctl_vsyscall;
+extern seqlock_t xtime_lock;
-#define vxtime_lock() do { vxtime_sequence[0]++; wmb(); } while(0)
-#define vxtime_unlock() do { wmb(); vxtime_sequence[1]++; } while (0)
+#define ARCH_HAVE_XTIME_LOCK 1
#endif /* __KERNEL__ */