unsigned int local_bh_count[NR_CPUS];
unsigned long hardirq_no[NR_CPUS];
-#define RTC_IRQ 8
-#ifdef CONFIG_RTC
-#define TIMER_IRQ 0 /* timer is the pit */
-#else
-#define TIMER_IRQ RTC_IRQ /* the timer is, in fact, the rtc */
-#endif
-
#if NR_IRQS > 64
# error Unable to handle more than 64 irq levels.
#endif
extern void srm_device_interrupt(unsigned long vector, struct pt_regs * regs);
extern void handle_irq(int irq, int ack, struct pt_regs * regs);
+
+#define RTC_IRQ 8
+#ifdef CONFIG_RTC
+#define TIMER_IRQ 0 /* timer is the pit */
+#else
+#define TIMER_IRQ RTC_IRQ /* timer is the rtc */
+#endif
+
i = ffz(~pld);
pld &= pld - 1; /* clear least bit set */
if (i == 7) { /* if ISA int */
+	/* Ruffian does not have the RTC connected to
+	   the CPU timer interrupt.  Instead, it uses the
+	   PIT connected to IRQ 0, so we must detect that
+	   case and route it to where we expect the timer
+	   interrupt to arrive. */
+
/* Copy this code from isa_device_interrupt because
we need to hook into int 0 for the timer. I
refuse to soil device_interrupt with ifdefs. */
if (j == 7 && !(inb(0x20) & 0x80)) {
/* It's only a passive release... */
} else if (j == 0) {
- handle_irq(8, -1, regs); /* fake it */
+ handle_irq(TIMER_IRQ, -1, regs);
ruffian_ack_irq(0);
} else {
handle_irq(j, j, regs);
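/*
 * Editorial sketch (not part of the patch): the dispatch loop above
 * walks the set bits of 'pld' lowest-first: ffz(~pld) yields the
 * index of the lowest set bit, and 'pld &= pld - 1' clears exactly
 * that bit.  A minimal standalone version of the idiom, with a
 * hypothetical per-bit handler:
 */
static void dispatch_pending(unsigned long pld)
{
	while (pld) {
		unsigned long i = ffz(~pld);	/* lowest set bit */
		pld &= pld - 1;			/* clear that bit */
		handle_one_irq(i);		/* hypothetical handler */
	}
}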
#include <linux/timex.h>
#include "proto.h"
-
-#ifdef CONFIG_RTC
-#define TIMER_IRQ 0 /* using pit for timer */
-#else
-#define TIMER_IRQ 8 /* using rtc for timer */
-#endif
+#include "irq.h"
static int set_rtc_mmss(unsigned long);
ALIGN
.globl ret_from_fork
ret_from_fork:
- GET_CURRENT(%ebx)
#ifdef __SMP__
- lock ; btrl $0, SYMBOL_NAME(scheduler_lock)
+ call SYMBOL_NAME(schedule_tail)
#endif /* __SMP__ */
+ GET_CURRENT(%ebx)
jmp ret_from_sys_call
/*
EXPORT_SYMBOL(__global_sti);
EXPORT_SYMBOL(__global_save_flags);
EXPORT_SYMBOL(__global_restore_flags);
-EXPORT_SYMBOL(smp_message_pass);
EXPORT_SYMBOL(mtrr_hook);
#endif
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = status | IRQ_REPLAY;
- send_IPI(APIC_DEST_SELF, IO_APIC_VECTOR(irq));
+ send_IPI_self(IO_APIC_VECTOR(irq));
}
}
/*
* The following vectors are part of the Linux architecture, there
* is no hardware IRQ pin equivalent for them, they are triggered
- * through the ICC by us (IPIs), via smp_message_pass():
+ * through the ICC by us (IPIs).
*/
BUILD_SMP_INTERRUPT(reschedule_interrupt)
BUILD_SMP_INTERRUPT(invalidate_interrupt)
}
p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
#ifdef __SMP__
- p += sprintf(p, "IPI: %10lu\n", ipi_count);
+ p += sprintf(p, "ERR: %10lu\n", ipi_count);
#endif
return p - buf;
}
*/
/* IPI for rescheduling */
- set_intr_gate(0x30, reschedule_interrupt);
+ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
- set_intr_gate(0x31, invalidate_interrupt);
+ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for CPU halt */
- set_intr_gate(0x40, stop_cpu_interrupt);
+ set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt);
/* self generated IPI for local APIC timer */
- set_intr_gate(0x41, apic_timer_interrupt);
+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI for MTRR control */
- set_intr_gate(0x50, mtrr_interrupt);
+ set_intr_gate(MTRR_CHANGE_VECTOR, mtrr_interrupt);
/* IPI vector for APIC spurious interrupts */
- set_intr_gate(0xff, spurious_interrupt);
+ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
#endif
request_region(0x20,0x20,"pic1");
request_region(0xa0,0x20,"pic2");
unsigned int depth; /* Disable depth for nested irq disables */
} irq_desc_t;
-#define IRQ0_TRAP_VECTOR 0x51
+/*
+ * Special IRQ vectors used by the SMP architecture:
+ *
+ * (some of the following vectors are 'rare'; they might be merged
+ * into a single vector to save vector space. TLB, reschedule and
+ * local APIC vectors are performance-critical.)
+ */
+#define RESCHEDULE_VECTOR 0x30
+#define INVALIDATE_TLB_VECTOR 0x31
+#define STOP_CPU_VECTOR 0x40
+#define LOCAL_TIMER_VECTOR 0x41
+#define MTRR_CHANGE_VECTOR 0x50
+
+/*
+ * First vector available to drivers: (vectors 0x51-0xfe)
+ */
+#define IRQ0_TRAP_VECTOR 0x51
+
+/*
+ * This IRQ should never happen, but we print a message nevertheless.
+ */
+#define SPURIOUS_APIC_VECTOR 0xff
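+
+/*
+ * (Editorial note, not from the patch: the local APIC derives an
+ * interrupt's priority class from the high nibble of its vector,
+ * which is one reason the performance-critical IPIs above sit in
+ * their own 0x30/0x40 groups; and on Pentium/P6-era APICs the low
+ * four bits of the spurious vector register are hardwired to 1,
+ * making 0xff a safe choice.)
+ */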
extern irq_desc_t irq_desc[NR_IRQS];
extern int irq_vector[NR_IRQS];
* Interrupt entry/exit code at both C and assembly level
*/
-void mask_irq(unsigned int irq);
-void unmask_irq(unsigned int irq);
-void disable_8259A_irq(unsigned int irq);
-int i8259A_irq_pending(unsigned int irq);
-void ack_APIC_irq(void);
-void setup_IO_APIC(void);
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
-void make_8259A_irq(unsigned int irq);
-void send_IPI(int dest, int vector);
-void init_pic_mode(void);
-void print_IO_APIC(void);
+extern void mask_irq(unsigned int irq);
+extern void unmask_irq(unsigned int irq);
+extern void disable_8259A_irq(unsigned int irq);
+extern int i8259A_irq_pending(unsigned int irq);
+extern void ack_APIC_irq(void);
+extern void setup_IO_APIC(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern void make_8259A_irq(unsigned int irq);
+extern void FASTCALL(send_IPI_self(int vector));
+extern void smp_send_mtrr(void);
+extern void init_pic_mode(void);
+extern void print_IO_APIC(void);
extern unsigned long long io_apic_irqs;
#include <asm/bitops.h>
#include <asm/atomic.h>
+#include <asm/hardirq.h>
+#include "irq.h"
+
#define MTRR_VERSION "1.26 (19981001)"
#define TRUE 1
/* Send a message to all other CPUs and wait for them to enter the
barrier */
atomic_set (&undone_count, smp_num_cpus - 1);
- smp_message_pass (MSG_ALL_BUT_SELF, MSG_MTRR_CHANGE, 0, 0);
+ smp_send_mtrr();
/* Wait for it to be done */
timeout = jiffies + JIFFIE_TIMEOUT;
while ( (atomic_read (&undone_count) > 0) &&
current->priority = 0;
current->counter = -100;
while(1) {
- if (current_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched)
+ if (current_cpu_data.hlt_works_ok && !hlt_counter &&
+ !current->need_resched)
__asm__("hlt");
- schedule();
- check_pgt_cache();
+ /*
+ * although we are an idle CPU, we do not want to
+ * get into the scheduler unnecessarily.
+ */
+ if (current->need_resched) {
+ schedule();
+ check_pgt_cache();
+ }
}
}
* hosts.
*
* (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net>
+ * (c) 1998 Ingo Molnar
+ *
* Supported by Caldera http://www.caldera.com.
* Much of the core SMP work is based on previous work by Thomas Radke, to
* whom a great many thanks are extended.
*
- * Thanks to Intel for making available several different Pentium and
- * Pentium Pro MP machines.
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
*
* This code is released under the GNU public license version 2 or
* later.
* Ingo Molnar : Added APIC timers, based on code
* from Jose Renau
* Alan Cox : Added EBDA scanning
+ * Ingo Molnar : various cleanups and rewrites
*/
#include <linux/config.h>
#include <asm/bitops.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <linux/io_trace.h>
#ifdef CONFIG_MTRR
# include <asm/mtrr.h>
return b;
}
+/*
+ * function prototypes:
+ */
+static void cache_APIC_registers (void);
+
+
static int smp_b_stepping = 0; /* Set if we find a B stepping CPU */
static int max_cpus = -1; /* Setup configured maximum number of CPUs to activate */
unsigned char boot_cpu_id = 0; /* Processor that is doing the boot up */
static int smp_activated = 0; /* Tripped once we need to start cross invalidating */
int apic_version[NR_CPUS]; /* APIC version number */
-static volatile int smp_commenced=0; /* Tripped when we start scheduling */
+volatile int smp_commenced=0; /* Tripped when we start scheduling */
unsigned long apic_retval; /* Just debugging the assembler.. */
-static volatile unsigned char smp_cpu_in_msg[NR_CPUS]; /* True if this processor is sending an IPI */
-
volatile unsigned long kernel_counter=0; /* Number of times the processor holds the lock */
volatile unsigned long syscall_count=0; /* Number of times the processor holds the syscall lock */
volatile unsigned long ipi_count; /* Number of IPIs delivered */
-volatile unsigned long smp_proc_in_lock[NR_CPUS] = {0,};/* for computing process time */
-volatile int smp_process_available=0;
-
const char lk_lockmsg[] = "lock from interrupt context at %p\n";
int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
{
- printk("Bad signature [%c%c%c%c].\n",
+ panic("SMP mptable: bad signature [%c%c%c%c]!\n",
mpc->mpc_signature[0],
mpc->mpc_signature[1],
mpc->mpc_signature[2],
}
if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
{
- printk("Checksum error.\n");
+ panic("SMP mptable: checksum error!\n");
return 1;
}
if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
/*
* We don't actually need to load the full TSS,
* basically just the stack pointer and the eip.
- *
- * Get the scheduler lock, because we're going
- * to release it as part of the "reschedule" return.
*/
- spin_lock(&scheduler_lock);
asm volatile(
"movl %0,%%esp\n\t"
printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
SMP_PRINTK(("Boot done.\n"));
+ cache_APIC_registers();
/*
* Here we can be sure that there is an IO-APIC in the system. Let's
* go and set it up:
smp_done:
}
-void send_IPI(int dest, int vector)
-{
- unsigned long cfg;
- unsigned long flags;
- __save_flags(flags);
- __cli();
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast' IPIs, CPU->CPU IPIs and self-IPIs.
+ */
- /*
- * prepare target chip field
- */
- cfg = apic_read(APIC_ICR2) & 0x00FFFFFF;
- apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(dest));
+/*
+ * Silly serialization to work around CPU bug in P5s.
+ * We can safely turn it off on a 686.
+ */
+#if defined(CONFIG_M686) && !defined(SMP_DEBUG)
+# define FORCE_APIC_SERIALIZATION 0
+#else
+# define FORCE_APIC_SERIALIZATION 1
+#endif
- cfg = apic_read(APIC_ICR);
- cfg &= ~0xFDFFF;
- cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|vector;
- cfg |= dest;
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
-
- apic_write(APIC_ICR, cfg);
- __restore_flags(flags);
-}
+static unsigned int cached_APIC_ICR;
+static unsigned int cached_APIC_ICR2;
/*
- * A non wait message cannot pass data or CPU source info. This current setup
- * is only safe because the kernel lock owner is the only person who can send
- * a message.
+ * Caches reserved bits; APIC reads are (mildly) expensive
+ * and force otherwise unnecessary CPU synchronization.
*
- * Wrapping this whole block in a spinlock is not the safe answer either. A
- * processor may get stuck with IRQs off waiting to send a message and thus
- * not replying to the person spinning for a reply.
- *
- * In the end flush tlb ought to be the NMI and a very short function
- * (to avoid the old IDE disk problems), and other messages sent with IRQs
- * enabled in a civilised fashion. That will also boost performance.
+ * (We could cache other APIC registers too, but these are the
+ * main ones used in real life.)
*/
+#define slow_ICR (apic_read(APIC_ICR) & ~0xFDFFF)
+#define slow_ICR2 (apic_read(APIC_ICR2) & 0x00FFFFFF)
-void smp_message_pass(int target, int msg, unsigned long data, int wait)
+void cache_APIC_registers (void)
{
- unsigned long cfg;
- unsigned long dest = 0;
- unsigned long target_map;
- int p=smp_processor_id();
- int irq;
- int ct=0;
+ cached_APIC_ICR = slow_ICR;
+ cached_APIC_ICR2 = slow_ICR2;
+ mb();
+}
+static inline unsigned int __get_ICR (void)
+{
+#if FORCE_APIC_SERIALIZATION
/*
- * During boot up send no messages
+ * Wait for the APIC to become ready - this should never occur. It's
+ * a debugging check really.
*/
-
- if (!smp_activated || !smp_commenced)
- return;
+ int count = 0;
+ unsigned int cfg;
+
+ IO_trace (IO_smp_wait_apic_start, 0, 0, 0, 0);
+ while (count < 1000)
+ {
+ cfg = slow_ICR;
+ if (!(cfg&(1<<12))) {
+ IO_trace (IO_smp_wait_apic_end, 0, 0, 0, 0);
+ if (count)
+ atomic_add(count, (atomic_t*)&ipi_count);
+ return cfg;
+ }
+ count++;
+ udelay(10);
+ }
+ printk("CPU #%d: previous IPI still not cleared after 10mS\n",
+ smp_processor_id());
+ return cfg;
+#else
+ return cached_APIC_ICR;
+#endif
+}
+static inline unsigned int __get_ICR2 (void)
+{
+#if FORCE_APIC_SERIALIZATION
+ return slow_ICR2;
+#else
+ return cached_APIC_ICR2;
+#endif
+}
- /*
- * Skip the reschedule if we are waiting to clear a
- * message at this time. The reschedule cannot wait
- * but is not critical.
- */
+static inline int __prepare_ICR (unsigned int shortcut, int vector)
+{
+ unsigned int cfg;
- switch (msg) {
- case MSG_RESCHEDULE:
- irq = 0x30;
- if (smp_cpu_in_msg[p])
- return;
- break;
+ cfg = __get_ICR();
+ cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|shortcut|vector;
- case MSG_INVALIDATE_TLB:
- /* make this a NMI some day */
- irq = 0x31;
- break;
+ return cfg;
+}
- case MSG_STOP_CPU:
- irq = 0x40;
- break;
+static inline int __prepare_ICR2 (unsigned int dest)
+{
+ unsigned int cfg;
- case MSG_MTRR_CHANGE:
- irq = 0x50;
- break;
+ cfg = __get_ICR2();
+ cfg |= SET_APIC_DEST_FIELD(dest);
- default:
- printk("Unknown SMP message %d\n", msg);
- return;
- }
+ return cfg;
+}
- /*
- * Sanity check we don't re-enter this across CPUs. Only the kernel
- * lock holder may send messages. For a STOP_CPU we are bringing the
- * entire box to the fastest halt we can. A reschedule carries
- * no data and can occur during a flush. Guess what panic
- * I got to notice this bug.
- */
-
- /*
- * We are busy.
- */
-
- smp_cpu_in_msg[p]++;
+static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ unsigned int cfg;
+/*
+ * Subtle. In the case of the 'never do double writes' workaround we
+ * have to lock out interrupts to be safe. Otherwise it's just one
+ * single atomic write to the APIC, no need for cli/sti.
+ */
+#if FORCE_APIC_SERIALIZATION
+ unsigned long flags;
-/* printk("SMP message pass #%d to %d of %d\n",
- p, msg, target);*/
+ __save_flags(flags);
+ __cli();
+#endif
/*
- * Wait for the APIC to become ready - this should never occur. It's
- * a debugging check really.
+ * No need to touch the target chip field
*/
-
- while (ct<1000)
- {
- cfg=apic_read(APIC_ICR);
- if (!(cfg&(1<<12)))
- break;
- ct++;
- udelay(10);
- }
- /*
- * Just pray... there is nothing more we can do
- */
-
- if (ct==1000)
- printk("CPU #%d: previous IPI still not cleared after 10mS\n", p);
+ cfg = __prepare_ICR(shortcut, vector);
/*
- * Set the target requirement
+ * Send the IPI. The write to APIC_ICR fires this off.
*/
+
+ IO_trace (IO_smp_send_ipi, shortcut, vector, cfg, 0);
- if (target==MSG_ALL_BUT_SELF)
- {
- dest=APIC_DEST_ALLBUT;
- target_map=cpu_present_map;
- cpu_callin_map[0]=(1<<p);
- }
- else if (target==MSG_ALL)
- {
- dest=APIC_DEST_ALLINC;
- target_map=cpu_present_map;
- cpu_callin_map[0]=0;
- }
- else
- {
- dest=0;
- target_map=(1<<target);
- cpu_callin_map[0]=0;
- }
+ apic_write(APIC_ICR, cfg);
+#if FORCE_APIC_SERIALIZATION
+ __restore_flags(flags);
+#endif
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+void send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+static inline void send_IPI_single(int dest, int vector)
+{
+ unsigned long cfg;
+#if FORCE_APIC_SERIALIZATION
+ unsigned long flags;
+
+ __save_flags(flags);
+ __cli();
+#endif
/*
- * Program the APIC to deliver the IPI
+ * prepare target chip field
*/
- send_IPI(dest,irq);
+ cfg = __prepare_ICR2(dest);
+ apic_write(APIC_ICR2, cfg);
/*
- * Spin waiting for completion
+ * program the ICR
*/
+ cfg = __prepare_ICR(0, vector);
- switch(wait)
- {
- int stuck;
- case 1:
- stuck = 50000000;
- while(cpu_callin_map[0]!=target_map) {
- --stuck;
- if (!stuck) {
- printk("stuck on target_map IPI wait\n");
- break;
- }
- }
- break;
- case 2:
- stuck = 50000000;
- /* Wait for invalidate map to clear */
- while (smp_invalidate_needed) {
- /* Take care of "crossing" invalidates */
- if (test_bit(p, &smp_invalidate_needed))
- clear_bit(p, &smp_invalidate_needed);
- --stuck;
- if (!stuck) {
- printk("stuck on smp_invalidate_needed IPI wait (CPU#%d)\n",p);
- break;
- }
- }
- break;
- }
-
/*
- * Record our completion
+ * Send the IPI. The write to APIC_ICR fires this off.
*/
+
+ IO_trace (IO_smp_send_ipi, dest, vector, cfg, 0);
- smp_cpu_in_msg[p]--;
+ apic_write(APIC_ICR, cfg);
+#if FORCE_APIC_SERIALIZATION
+ __restore_flags(flags);
+#endif
}
/*
- * This is fraught with deadlocks. Linus does a flush tlb at a whim
- * even with IRQs off. We have to avoid a pair of crossing flushes
- * or we are doomed. See the notes about smp_message_pass.
+ * This is fraught with deadlocks. The situation is probably not as
+ * bad as in the early days of SMP, so we might ease some of the
+ * paranoia here.
*/
void smp_flush_tlb(void)
{
+ int cpu = smp_processor_id();
+ int stuck;
unsigned long flags;
-/* printk("SMI-");*/
-
/*
- * The assignment is safe because it's volatile so the compiler cannot reorder it,
- * because the i586 has strict memory ordering and because only the kernel lock holder
- * may issue a tlb flush. If you break any one of those three change this to an atomic
- * bus locked or.
+ * The assignment is safe because it's volatile so the
+ * compiler cannot reorder it, because the i586 has
+ * strict memory ordering and because only the kernel
+ * lock holder may issue a tlb flush. If you break any
+ * one of those three change this to an atomic bus
+ * locked or.
*/
- smp_invalidate_needed=cpu_present_map;
+ smp_invalidate_needed = cpu_present_map;
/*
- * Processors spinning on the lock will see this IRQ late. The smp_invalidate_needed map will
- * ensure they don't do a spurious flush tlb or miss one.
+ * Processors spinning on some lock with IRQs disabled
+ * will see this IRQ late. The smp_invalidate_needed
+ * map will ensure they don't do a spurious flush tlb
+ * or miss one.
*/
__save_flags(flags);
__cli();
- smp_message_pass(MSG_ALL_BUT_SELF, MSG_INVALIDATE_TLB, 0L, 2);
+
+ IO_trace (IO_smp_message, 0, 0, 0, 0);
+
+ send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
/*
- * Flush the local TLB
+ * Spin waiting for completion
*/
-
- local_flush_tlb();
- __restore_flags(flags);
+ stuck = 50000000;
+ while (smp_invalidate_needed) {
+ /*
+ * Take care of "crossing" invalidates
+ */
+ if (test_bit(cpu, &smp_invalidate_needed))
+ clear_bit(cpu, &smp_invalidate_needed);
+ --stuck;
+ if (!stuck) {
+ printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
+ break;
+ }
+ }
/*
- * Completed.
+ * Flush the local TLB
*/
-
-/* printk("SMID\n");*/
+ local_flush_tlb();
+
+ __restore_flags(flags);
}
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+
void smp_send_reschedule(int cpu)
{
- unsigned long flags;
+ send_IPI_single(cpu, RESCHEDULE_VECTOR);
+}
- __save_flags(flags);
- __cli();
- smp_message_pass(cpu, MSG_RESCHEDULE, 0L, 0);
- __restore_flags(flags);
+/*
+ * this function sends a 'stop' IPI to all other CPUs in the system.
+ * it goes straight through.
+ */
+
+void smp_send_stop(void)
+{
+ send_IPI_allbutself(STOP_CPU_VECTOR);
+}
+
+/*
+ * this function sends a 'reload MTRR state' IPI to all other CPUs
+ * in the system. it goes straight through; completion processing
+ * is done at the mtrr.c level.
+ */
+
+void smp_send_mtrr(void)
+{
+ send_IPI_allbutself(MTRR_CHANGE_VECTOR);
}
/*
*/
asmlinkage void smp_reschedule_interrupt(void)
{
+ IO_trace (IO_smp_reschedule, current->need_resched,
+ current->priority, current->counter, 0);
+
ack_APIC_irq();
}
*/
asmlinkage void smp_invalidate_interrupt(void)
{
+ IO_trace (IO_smp_tlbflush,
+ atomic_read((atomic_t *)&smp_invalidate_needed), 0, 0, 0);
+
if (test_and_clear_bit(smp_processor_id(), &smp_invalidate_needed))
local_flush_tlb();
* Unfortunately the local APIC timer cannot be set up into NMI
* mode. With the IO APIC we can re-route the external timer
* interrupt and broadcast it as an NMI to all CPUs, so no pain.
- *
- * NOTE: this trap vector (0x41) and the gate in
- * BUILD_SMP_TIMER_INTERRUPT should be the same ;)
*/
tmp_value = apic_read(APIC_LVTT);
- lvtt1_value = APIC_LVT_TIMER_PERIODIC | 0x41;
+ lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
apic_write(APIC_LVTT , lvtt1_value);
/*
unsigned long cpu_hz; /* Detected as we calibrate the TSC */
+cycles_t cacheflush_time;
+
/* Number of usecs that the last interrupt was delayed */
static int delay_at_last_interrupt;
:"=a" (eax), "=d" (edx));
/* .. relative to previous jiffy (32 bits is enough) */
- edx = 0;
eax -= last_tsc_low; /* tsc_low delta */
/*
__asm__("mull %2"
:"=a" (eax), "=d" (edx)
- :"r" (fast_gettimeoffset_quotient),
- "0" (eax), "1" (edx));
+ :"g" (fast_gettimeoffset_quotient),
+ "0" (eax));
/* our adjusted time offset in microseconds */
- return edx + delay_at_last_interrupt;
+ return delay_at_last_interrupt + edx;
}
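/*
 * Editorial note on the math above (not from the patch):
 * fast_gettimeoffset_quotient is calibrated as roughly
 * 2^32 / (TSC ticks per usec), so "mull" leaves the 64-bit product
 * delta_tsc * quotient in edx:eax, and the high half (edx) is the
 * elapsed time in usec.  E.g. at 400 MHz the quotient is ~10737418,
 * and a 4000-cycle delta gives (4000 * 10737418) >> 32 ~= 10 usec.
 */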
/* This function must be called with interrupts disabled
{
extern volatile unsigned long lost_ticks;
unsigned long flags;
+ unsigned long usec, sec;
read_lock_irqsave(&xtime_lock, flags);
- *tv = xtime;
- tv->tv_usec += do_gettimeoffset();
- if (lost_ticks)
- tv->tv_usec += lost_ticks * (1000000/HZ);
+ usec = do_gettimeoffset();
+ {
+ unsigned long lost = lost_ticks;
+ if (lost)
+ usec += lost * (1000000 / HZ);
+ }
+ sec = xtime.tv_sec;
+ usec += xtime.tv_usec;
read_unlock_irqrestore(&xtime_lock, flags);
- while (tv->tv_usec >= 1000000) {
- tv->tv_usec -= 1000000;
- tv->tv_sec++;
+
+ while (usec >= 1000000) {
+ usec -= 1000000;
+ sec++;
}
+
+ tv->tv_sec = sec;
+ tv->tv_usec = usec;
}
void do_settimeofday(struct timeval *tv)
else
last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
}
-#if 0
- /* As we return to user mode fire off the other CPU schedulers.. this is
- basically because we don't yet share IRQ's around. This message is
- rigged to be safe on the 386 - basically it's a hack, so don't look
- closely for now.. */
- smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0);
-#endif
#ifdef CONFIG_MCA
if( MCA_bus ) {
printk("Detected %ld Hz processor.\n", cpu_hz);
}
}
+
+ /*
+ * Rough estimation for SMP scheduling, this is the number of
+ * cycles it takes for a fully memory-limited process to flush
+ * the SMP-local cache.
+ */
+ cacheflush_time = cpu_hz/10000;
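+ /*
+  * (Editorial check of the arithmetic, not from the patch:
+  * cpu_hz/10000 is the number of cycles in 100 usec, so a
+  * 400 MHz CPU gets cacheflush_time = 40000 cycles.)
+  */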
+
setup_x86_irq(0, &irq0);
}
. = ALIGN(4096);
__init_end = .;
+ . = ALIGN(32);
+ .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+
. = ALIGN(4096);
.data.page_aligned : { *(.data.idt) }
+
__bss_start = .; /* BSS */
.bss : {
*(.bss)
typedef struct { int counter; } atomic_t;
#endif
-#define ATOMIC_INIT(i) { (i) }
+#define ATOMIC_INIT(i) ( (atomic_t) { (i) } )
#define atomic_read(v) ((v)->counter)
#define atomic_set(v,i) ((v)->counter = (i))
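/*
 * Editorial sketch (not from the patch): the compound-literal form of
 * ATOMIC_INIT works both as a static initializer and as a plain
 * rvalue in assignments.  'refcount' below is hypothetical:
 *
 *	static atomic_t refcount = ATOMIC_INIT(1);
 *
 *	void reset_refcount(void) { refcount = ATOMIC_INIT(0); }
 */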
__EXTERN_INLINE void apecs_outb(unsigned char b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x00) = w;
__EXTERN_INLINE void apecs_outw(unsigned short b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x08) = w;
__EXTERN_INLINE void cia_outb(unsigned char b, unsigned long addr)
{
- unsigned int w = __kernel_insbl(b, addr & 3);
+ unsigned long w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x00) = w;
wmb();
}
__EXTERN_INLINE void cia_outw(unsigned short b, unsigned long addr)
{
- unsigned int w = __kernel_inswl(b, addr & 3);
+ unsigned long w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x08) = w;
wmb();
}
__EXTERN_INLINE void lca_outb(unsigned char b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x00) = w;
__EXTERN_INLINE void lca_outw(unsigned short b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x08) = w;
__EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
{
unsigned long msb;
- unsigned int w;
+ unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
__EXTERN_INLINE void lca_writew(unsigned short b, unsigned long addr)
{
unsigned long msb;
- unsigned int w;
+ unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
- unsigned int w;
+ unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x00) = w;
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
- unsigned int w;
+ unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x08) = w;
__EXTERN_INLINE void pyxis_outb(unsigned char b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x00) = w;
__EXTERN_INLINE void pyxis_outw(unsigned short b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x08) = w;
__EXTERN_INLINE void t2_outw(unsigned short b, unsigned long addr)
{
- unsigned int w;
+ unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + T2_IO + 0x08) = w;
#include <linux/config.h>
#include <asm/system.h>
-#include <asm/machvec.h>
/* We don't use IO slowdowns on the Alpha, but.. */
#define __SLOW_DOWN_IO do { } while (0)
#endif
#ifdef __KERNEL__
+#include <asm/machvec.h>
/*
* We try to avoid hae updates (thus the cache), but when we
* There are different chipsets to interface the Alpha CPUs to the world.
*/
+#ifdef __KERNEL__
#ifdef CONFIG_ALPHA_GENERIC
/* In a generic kernel, we always go through the machine vector. */
#undef __WANT_IO_DEF
#endif /* GENERIC */
+#endif /* __KERNEL__ */
/*
* The convention used for inb/outb etc. is that names starting with
extern void _writel(unsigned int b, unsigned long addr);
extern void _writeq(unsigned long b, unsigned long addr);
+#ifdef __KERNEL__
/*
* The platform header files may define some of these macros to use
* the inlined versions where appropriate. These macros may also be
# define outl_p outl
#endif
+#else
+
+/* Userspace declarations. */
+
+extern unsigned int inb (unsigned long port);
+extern unsigned int inw (unsigned long port);
+extern unsigned int inl (unsigned long port);
+extern void outb (unsigned char b,unsigned long port);
+extern void outw (unsigned short w,unsigned long port);
+extern void outl (unsigned int l,unsigned long port);
+extern unsigned long readb(unsigned long addr);
+extern unsigned long readw(unsigned long addr);
+extern unsigned long readl(unsigned long addr);
+extern void writeb(unsigned char b, unsigned long addr);
+extern void writew(unsigned short b, unsigned long addr);
+extern void writel(unsigned int b, unsigned long addr);
+
+#endif /* __KERNEL__ */
+
+#ifdef __KERNEL__
+
/*
* The "address" in IO memory space is not clearly either an integer or a
* pointer. We will accept both, thus the casts.
# define writeq(v,a) _writeq((v),(unsigned long)(a))
#endif
-#ifdef __KERNEL__
-
/*
* String version of IO memory access ops:
*/
{
bh_mask &= ~(1 << nr);
atomic_inc(&bh_mask_count[nr]);
+ synchronize_bh();
}
extern inline void enable_bh(int nr)
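/*
 * (Editorial note, not from the patch: the synchronize_bh() added
 * above makes disable_bh() wait for a bottom half that is currently
 * running on another CPU, so the handler is truly quiescent by the
 * time disable_bh() returns.)
 */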
#define __FINIT .previous
#define __INITDATA .section ".data.init",#alloc,#write
+#define __cacheline_aligned __attribute__ \
+ ((__section__ (".data.cacheline_aligned")))
+
#endif
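/*
 * Editorial sketch (not from the patch): __cacheline_aligned tags an
 * object so the linker gathers it into the .data.cacheline_aligned
 * section created in vmlinux.lds above, e.g.:
 *
 *	static struct my_stats stats[NR_CPUS] __cacheline_aligned;
 *
 * ('struct my_stats' is hypothetical; the scheduler's aligned_data
 * array later in this patch is a real user.)
 */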
extern void smp_callin(void);
extern void smp_boot_cpus(void);
extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial udelay numbers) */
-extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
-
-extern volatile unsigned long smp_proc_in_lock[NR_CPUS]; /* for computing process time */
-extern volatile int smp_process_available;
/*
* APIC handlers: Note according to the Intel specification update
* processes are run.
*/
-#define PROC_CHANGE_PENALTY 10 /* Schedule penalty */
+#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
-#define SMP_FROM_INT 1
-#define SMP_FROM_SYSCALL 2
#endif
#endif
--- /dev/null
+torvalds@penguin.transmeta.com
\ No newline at end of file
(1000000/CLOCK_TICK_FACTOR) / (CLOCK_TICK_RATE/CLOCK_TICK_FACTOR)) \
<< (SHIFT_SCALE-SHIFT_HZ)) / HZ)
+/*
+ * Standard way to access the cycle counter on i586+ CPUs.
+ * Currently only used on SMP.
+ */
+typedef unsigned long long cycles_t;
+
+extern cycles_t cacheflush_time;
+
+static inline cycles_t get_cycles (void)
+{
+ cycles_t value;
+
+ __asm__("rdtsc"
+ :"=a" (*(((int *)&value)+0)),
+ "=d" (*(((int *)&value)+1)));
+ return value;
+}
+
#endif
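/*
 * Editorial sketch (not from the patch): a typical use of
 * get_cycles() is timing a short code section.  'do_work' here is a
 * hypothetical callback:
 */
static inline unsigned long cycles_spent(void (*do_work)(void))
{
	cycles_t t0 = get_cycles();

	do_work();
	return (unsigned long)(get_cycles() - t0);
}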
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/times.h>
+#include <linux/timex.h>
#include <asm/system.h>
#include <asm/semaphore.h>
/* various fields */
long counter;
long priority;
+ cycles_t avg_slice;
/* SMP and runqueue state */
int has_cpu;
int processor;
*/
#define INIT_TASK \
/* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \
-/* counter */ DEF_PRIORITY,DEF_PRIORITY, \
+/* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \
/* SMP */ 0,0,0,-1, \
/* schedlink */ &init_task,&init_task, &init_task, &init_task, \
/* binfmt */ NULL, \
#include <asm/smp.h>
/*
- * main IPI interface, handles INIT, TLB flush, STOP, etc. (defined in asm header):
- *
- * extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
+ * main cross-CPU interfaces; these handle INIT, TLB flush, STOP, etc.
+ * (defined in asm header):
*/
+/*
+ * stops all CPUs but the current one:
+ */
+extern void smp_send_stop(void);
+
+/*
+ * sends a 'reschedule' event to another CPU:
+ */
+extern void FASTCALL(smp_send_reschedule(int cpu));
+
+
/*
* Boot processor call to load the other CPU's
*/
#define smp_num_cpus 1
#define smp_processor_id() 0
#define hard_smp_processor_id() 0
-#define smp_message_pass(t,m,d,w)
#define smp_threads_ready 1
#define kernel_lock()
#define cpu_logical_map(cpu) 0
*/
smp_init();
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ current->need_resched = 1;
cpu_idle(NULL);
}
unblank_console();
#ifdef __SMP__
- smp_message_pass(MSG_ALL_BUT_SELF, MSG_STOP_CPU, 0, 0);
+ smp_send_stop();
#endif
if (panic_timeout > 0)
{
* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
/*
void scheduling_functions_start_here(void) { }
-static inline void reschedule_idle(struct task_struct * p)
+#ifdef __SMP__
+static void reschedule_idle_slow(struct task_struct * p)
{
+/*
+ * (see reschedule_idle() for an explanation first ...)
+ *
+ * Pass #2
+ *
+ * We try to find another (idle) CPU for this woken-up process.
+ *
+ * On SMP, we mostly try to see if the CPU the task used
+ * to run on is idle.. but we will use another idle CPU too,
+ * at this point we already know that this CPU is not
+ * willing to reschedule in the near future.
+ *
+ * An idle CPU is definitely wasted, especially if this CPU is
+ * running long-timeslice processes. The following algorithm is
+ * pretty good at finding the best idle CPU to send this process
+ * to.
+ *
+ * [We can try to preempt low-priority processes on other CPUs in
+ * 2.3. Also we can try to use the avg_slice value to predict
+ * 'likely reschedule' events even on other CPUs.]
+ */
+ int best_cpu = p->processor, this_cpu = smp_processor_id();
+ struct task_struct **idle = task, *tsk, *target_tsk;
+ int i = smp_num_cpus;
+
+ target_tsk = NULL;
+ do {
+ tsk = *idle;
+ idle++;
+ if (tsk->has_cpu) {
+ if (tsk->processor == this_cpu)
+ continue;
+ target_tsk = tsk;
+ if (tsk->processor == best_cpu) {
+ /*
+ * bingo, we couldn't get a better
+ * CPU, activate it.
+ */
+ goto send; /* this one helps GCC ... */
+ }
+ }
+ } while (--i > 0);
/*
- * For SMP, we try to see if the CPU the task used
- * to run on is idle..
+ * found any idle CPU?
*/
-#if 0
+ if (target_tsk) {
+send:
+ target_tsk->need_resched = 1;
+ smp_send_reschedule(target_tsk->processor);
+ return;
+ }
+}
+#endif /* __SMP__ */
+
+static inline void reschedule_idle(struct task_struct * p)
+{
+
+ if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
+ current->need_resched = 1;
+ return;
+ }
+
+#ifdef __SMP__
/*
- * Disable this for now. Ingo has some interesting
- * code that looks too complex, and I have some ideas,
- * but in the meantime.. One problem is that "wakeup()"
- * can be (and is) called before we've even initialized
- * SMP completely, so..
+ * ("wakeup()" should not be called before we've initialized
+ * SMP completely. [Linus, is there any exception to this?]
+ * Basically a not-yet initialized SMP subsystem can be
+ * considered as a not-yet working scheduler; simply don't use
+ * it before it's up and running ...)
+ *
+ * SMP rescheduling is done in 2 passes:
+ * - pass #1: faster: 'quick decisions'
+ * - pass #2: slower: 'lets try and find another CPU'
*/
-#ifdef __SMP__
- int want_cpu = p->processor;
/*
- * Don't even try to find another CPU for us if the task
- * ran on this one before..
+ * Pass #1
+ *
+ * There are two metrics here:
+ *
+ * first, a 'cutoff' interval, currently ~250 usecs on
+ * x86 CPUs. If the current process has longer average
+ * timeslices than this, then we utilize the idle CPU.
+ *
+ * second, if the wakeup comes from a process context,
+ * then the two processes are 'related'. (they form a
+ * 'gang')
+ *
+ * An idle CPU is almost always a bad thing, thus we skip
+ * the idle-CPU utilization only if both these conditions
+ * are true. (ie. a 'process-gang' rescheduling with rather
+ * high frequency should stay on the same CPU).
+ *
+ * [We can switch to something more fine-grained in 2.3.]
*/
- if (want_cpu != smp_processor_id()) {
- struct task_struct **idle = task;
- int i = smp_num_cpus;
-
- do {
- struct task_struct *tsk = *idle;
- idle++;
- /* Something like this.. */
- if (tsk->has_cpu && tsk->processor == want_cpu) {
- tsk->need_resched = 1;
- smp_send_reschedule(want_cpu);
- return;
- }
- } while (--i > 0);
- }
-#endif
-#endif
- if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
- current->need_resched = 1;
+ if ((current->avg_slice < cacheflush_time) && !in_interrupt())
+ return;
+
+ reschedule_idle_slow(p);
+#endif /* __SMP__ */
}
/*
wake_up_process(p);
}
+int _PROC_CHANGE_PENALTY = 13;
+
/*
* This is the function that decides how desirable a process is..
* You can weigh different processes against each other depending
return timeout < 0 ? 0 : timeout;
}
+/*
+ * This one aligns per-CPU data on cacheline boundaries.
+ */
+static union {
+ struct schedule_data {
+ struct task_struct * prev;
+ long prevstate;
+ cycles_t last_schedule;
+ } schedule_data;
+ char __pad [L1_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
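+/*
+ * (Editorial note, not from the patch: the union pads each per-CPU
+ * schedule_data slot out to L1_CACHE_BYTES, so no two CPUs ever
+ * write to the same cache line.  This is the usual cure for false
+ * sharing.)
+ */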
+
+static inline void __schedule_tail (void)
+{
+#ifdef __SMP__
+ struct schedule_data * sched_data;
+
+ /*
+ * We might have switched CPUs:
+ */
+ sched_data = & aligned_data[smp_processor_id()].schedule_data;
+
+ /*
+ * Subtle. In the rare event that we got a wakeup to 'prev' just
+ * during the reschedule (this is possible, the scheduler is pretty
+ * parallel), we should do another reschedule in the next task's
+ * context. schedule() will do the right thing next time around.
+ * this is equivalent to 'delaying' the wakeup until the reschedule
+ * has finished.
+ */
+ if (sched_data->prev->state != sched_data->prevstate)
+ current->need_resched = 1;
+
+ /*
+ * Release the previous process ...
+ *
+ * We have dropped all locks, and we must make sure that we
+ * only mark the previous process as no longer having a CPU
+ * after all other state has been seen by other CPU's. Thus
+ * the memory barrier!
+ */
+ mb();
+ sched_data->prev->has_cpu = 0;
+#endif /* __SMP__ */
+}
+
+/*
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+void schedule_tail (void)
+{
+ __schedule_tail();
+}
+
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*/
asmlinkage void schedule(void)
{
+ struct schedule_data * sched_data;
struct task_struct * prev, * next;
int this_cpu;
prev = current;
this_cpu = prev->processor;
+ /*
+ * 'sched_data' is protected by the fact that we can run
+ * only one process per CPU.
+ */
+ sched_data = & aligned_data[this_cpu].schedule_data;
+
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
/* move an exhausted RR process to be last.. */
prev->need_resched = 0;
+
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
del_from_runqueue(prev);
case TASK_RUNNING:
}
+
+ sched_data->prevstate = prev->state;
+
{
struct task_struct * p = init_task.next_run;
/*
}
}
+ /*
+ * maintain the per-process 'average timeslice' value.
+ * (this has to be recalculated even if we reschedule to
+ * the same process.) Currently this is only used on SMP:
+ */
#ifdef __SMP__
- next->has_cpu = 1;
-#endif
+ {
+ cycles_t t, this_slice;
+
+ t = get_cycles();
+ this_slice = t - sched_data->last_schedule;
+ sched_data->last_schedule = t;
+
+ /*
+ * Simple, exponentially fading average calculation:
+ */
+ prev->avg_slice = this_slice + prev->avg_slice;
+ prev->avg_slice >>= 1;
+ }
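+ /*
+  * (Editorial note, not from the patch: the update above is
+  * avg = (avg + slice) / 2, an exponentially fading average
+  * in which the newest slice has weight 1/2, the previous
+  * one 1/4, and so on.)
+  */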
+
+ /*
+ * We drop the scheduler lock early (it's a global spinlock),
+ * thus we have to lock the previous process from getting
+ * rescheduled during switch_to().
+ */
+ prev->has_cpu = 1;
- if (prev != next) {
+ next->has_cpu = 1;
+ next->processor = this_cpu;
+ spin_unlock(&scheduler_lock);
+#endif /* __SMP__ */
+ if (prev != next) {
#ifdef __SMP__
- next->processor = this_cpu;
+ sched_data->prev = prev;
#endif
- kstat.context_swtch++;
+ kstat.context_swtch++;
get_mmu_context(next);
switch_to(prev,next);
- }
-
- spin_unlock(&scheduler_lock);
- /*
- * At this point "prev" is "current", as we just
- * switched into it (from an even more "previous"
- * prev)
- */
- reacquire_kernel_lock(prev);
+ __schedule_tail();
+ }
+
+ reacquire_kernel_lock(current);
return;
scheduling_in_interrupt:
*(int *)0 = 0;
}
-
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
/*