Import 2.1.133pre4

author Linus Torvalds <torvalds@linuxfoundation.org>

Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)

committer Linus Torvalds <torvalds@linuxfoundation.org>

Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)
author Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)
committer Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c

index 467a861d1478da1ed32cd3e7ac5b36c968f66d2c..144b3d2ca3245434e7694ca78ed7c26fd47b78b7 100644 (file)
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -38,13 +38,6 @@ unsigned int local_irq_count[NR_CPUS];
  unsigned int local_bh_count[NR_CPUS];
  unsigned long hardirq_no[NR_CPUS];
  
-#define RTC_IRQ    8
-#ifdef CONFIG_RTC
-#define TIMER_IRQ  0        /* timer is the pit */
-#else
-#define TIMER_IRQ  RTC_IRQ  /* the timer is, in fact, the rtc */
-#endif
-
  #if NR_IRQS > 64
  #  error Unable to handle more than 64 irq levels.
  #endif
diff --git a/arch/alpha/kernel/irq.h b/arch/alpha/kernel/irq.h

index c46d5df4e82bd6ce0e50f84a343dfe92ca07d26f..59034807adc4d03cd4d0aa8a25c3d403a8b2343c 100644 (file)
--- a/arch/alpha/kernel/irq.h
+++ b/arch/alpha/kernel/irq.h
@@ -21,3 +21,11 @@ extern void isa_device_interrupt(unsigned long vector, struct pt_regs * regs);
  extern void srm_device_interrupt(unsigned long vector, struct pt_regs * regs);
  
  extern void handle_irq(int irq, int ack, struct pt_regs * regs);
+
+#define RTC_IRQ    8
+#ifdef CONFIG_RTC
+#define TIMER_IRQ  0                    /* timer is the pit */
+#else
+#define TIMER_IRQ  RTC_IRQ              /* timer is the rtc */
+#endif
+
diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c

index a7ae730d86ce107d8a10fb587d9039b723e544d6..fab0448b4eb31a9d8471632bae643d862497f064 100644 (file)
--- a/arch/alpha/kernel/sys_ruffian.c
+++ b/arch/alpha/kernel/sys_ruffian.c
@@ -92,6 +92,12 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
                 i = ffz(~pld);
                 pld &= pld - 1; /* clear least bit set */
                 if (i == 7) { /* if ISA int */
+                       /* Ruffian does not have the RTC connected to
+                          the CPU timer interrupt.  Instead, it uses the
+                          PIT connected to IRQ 0.  So we must detect that
+                          and route it specifically to where we expect
+                          the timer interrupt to come in.  */
+
                         /* Copy this code from isa_device_interrupt because
                            we need to hook into int 0 for the timer.  I
                            refuse to soil device_interrupt with ifdefs.  */
@@ -107,7 +113,7 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
                         if (j == 7 && !(inb(0x20) & 0x80)) {
                                 /* It's only a passive release... */
                         } else if (j == 0) {
-                               handle_irq(8, -1, regs); /* fake it */
+                               handle_irq(TIMER_IRQ, -1, regs);
                                 ruffian_ack_irq(0);
                         } else {
                                 handle_irq(j, j, regs);
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c

index acbb76896d3dfdc8d7fa9dd8b0e9bef1addb81b5..5c51687abc4819c557a69a279787b6cbc5f3c381 100644 (file)
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -35,12 +35,7 @@
  #include <linux/timex.h>
  
  #include "proto.h"
-
-#ifdef CONFIG_RTC 
-#define TIMER_IRQ 0  /* using pit for timer */
-#else 
-#define TIMER_IRQ 8  /* using rtc for timer */
-#endif
+#include "irq.h"
  
  static int set_rtc_mmss(unsigned long);
  
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S

index d977acbab50ea4f9dd908831db8481f4572b3413..1b0ef412ac9b4c4b1e8ecb46e018eaf5049a8a62 100644 (file)
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -153,10 +153,10 @@ ENTRY(lcall7)
         ALIGN
         .globl  ret_from_fork
  ret_from_fork:
-       GET_CURRENT(%ebx)
  #ifdef __SMP__
-       lock ; btrl $0, SYMBOL_NAME(scheduler_lock)
+       call SYMBOL_NAME(schedule_tail)
  #endif /* __SMP__ */
+       GET_CURRENT(%ebx)
         jmp     ret_from_sys_call
  
  /*
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c

index e1833f43c83f15a528e7f511f33aac473ab00f6d..85352886fb852c884d74ce2cd81b7067538a3ec7 100644 (file)
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -83,7 +83,6 @@ EXPORT_SYMBOL(__global_cli);
  EXPORT_SYMBOL(__global_sti);
  EXPORT_SYMBOL(__global_save_flags);
  EXPORT_SYMBOL(__global_restore_flags);
-EXPORT_SYMBOL(smp_message_pass);
  EXPORT_SYMBOL(mtrr_hook);
  #endif
  
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c

index e24633850110dc18673756047397ad86e3bf117a..6e653e9272fa2bd726c5a222d8e2a1dc40eb5a9c 100644 (file)
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -953,7 +953,7 @@ static inline void self_IPI(unsigned int irq)
  
         if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
                 desc->status = status | IRQ_REPLAY;
-               send_IPI(APIC_DEST_SELF, IO_APIC_VECTOR(irq));
+               send_IPI_self(IO_APIC_VECTOR(irq));
         }
  }
  
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c

index 3844fa00c4a1d51773a8bcdb6d505d3209c33de9..435de9bc28ae1394dc23fbf9d05047c2affa4853 100644 (file)
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -189,7 +189,7 @@ BUILD_IRQ(60) BUILD_IRQ(61) BUILD_IRQ(62) BUILD_IRQ(63)
  /*
   * The following vectors are part of the Linux architecture, there
   * is no hardware IRQ pin equivalent for them, they are triggered
- * through the ICC by us (IPIs), via smp_message_pass():
+ * through the ICC by us (IPIs)
   */
  BUILD_SMP_INTERRUPT(reschedule_interrupt)
  BUILD_SMP_INTERRUPT(invalidate_interrupt)
@@ -297,7 +297,7 @@ int get_irq_list(char *buf)
         }
         p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
  #ifdef __SMP__
-       p += sprintf(p, "IPI: %10lu\n", ipi_count);
+       p += sprintf(p, "ERR: %10lu\n", ipi_count);
  #endif         
         return p - buf;
  }
@@ -989,22 +989,22 @@ __initfunc(void init_IRQ(void))
          */
  
         /* IPI for rescheduling */
-       set_intr_gate(0x30, reschedule_interrupt);
+       set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
  
         /* IPI for invalidation */
-       set_intr_gate(0x31, invalidate_interrupt);
+       set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
  
         /* IPI for CPU halt */
-       set_intr_gate(0x40, stop_cpu_interrupt);
+       set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt);
  
         /* self generated IPI for local APIC timer */
-       set_intr_gate(0x41, apic_timer_interrupt);
+       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
         /* IPI for MTRR control */
-       set_intr_gate(0x50, mtrr_interrupt);
+       set_intr_gate(MTRR_CHANGE_VECTOR, mtrr_interrupt);
  
         /* IPI vector for APIC spurious interrupts */
-       set_intr_gate(0xff, spurious_interrupt);
+       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
  #endif 
         request_region(0x20,0x20,"pic1");
         request_region(0xa0,0x20,"pic2");
diff --git a/arch/i386/kernel/irq.h b/arch/i386/kernel/irq.h

index aa6bfdcd112be6ec7cd877dfc915f3faf8c88964..7fe26e74e6ff497efcccb9afed7c058ccd09ed94 100644 (file)
--- a/arch/i386/kernel/irq.h
+++ b/arch/i386/kernel/irq.h
@@ -40,7 +40,28 @@ typedef struct {
         unsigned int depth;                     /* Disable depth for nested irq disables */
  } irq_desc_t;
  
-#define IRQ0_TRAP_VECTOR 0x51
+/*
+ * Special IRQ vectors used by the SMP architecture:
+ *
+ * (some of the following vectors are 'rare', they might be merged
+ *  into a single vector to save vector space. TLB, reschedule and
+ *  local APIC vectors are performance-critical.)
+ */
+#define RESCHEDULE_VECTOR      0x30
+#define INVALIDATE_TLB_VECTOR  0x31
+#define STOP_CPU_VECTOR                0x40
+#define LOCAL_TIMER_VECTOR     0x41
+#define MTRR_CHANGE_VECTOR     0x50
+
+/*
+ * First vector available to drivers: (vectors 0x51-0xfe)
+ */
+#define IRQ0_TRAP_VECTOR       0x51
+
+/*
+ * This IRQ should never happen, but we print a message nevertheless.
+ */
+#define SPURIOUS_APIC_VECTOR   0xff
  
  extern irq_desc_t irq_desc[NR_IRQS];
  extern int irq_vector[NR_IRQS];
@@ -56,17 +77,18 @@ extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
   * Interrupt entry/exit code at both C and assembly level
   */
  
-void mask_irq(unsigned int irq);
-void unmask_irq(unsigned int irq);
-void disable_8259A_irq(unsigned int irq);
-int i8259A_irq_pending(unsigned int irq);
-void ack_APIC_irq(void);
-void setup_IO_APIC(void);
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
-void make_8259A_irq(unsigned int irq);
-void send_IPI(int dest, int vector);
-void init_pic_mode(void);
-void print_IO_APIC(void);
+extern void mask_irq(unsigned int irq);
+extern void unmask_irq(unsigned int irq);
+extern void disable_8259A_irq(unsigned int irq);
+extern int i8259A_irq_pending(unsigned int irq);
+extern void ack_APIC_irq(void);
+extern void setup_IO_APIC(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern void make_8259A_irq(unsigned int irq);
+extern void FASTCALL(send_IPI_self(int vector));
+extern void smp_send_mtrr(void);
+extern void init_pic_mode(void);
+extern void print_IO_APIC(void);
  
  extern unsigned long long io_apic_irqs;
  
diff --git a/arch/i386/kernel/mtrr.c b/arch/i386/kernel/mtrr.c

index fd330ac294baff2eed29a7d51da931fdeedfda11..16c767b4a2977f4f7b15ba7b2ce0d721762328d6 100644 (file)
--- a/arch/i386/kernel/mtrr.c
+++ b/arch/i386/kernel/mtrr.c
@@ -164,6 +164,9 @@
  #include <asm/bitops.h>
  #include <asm/atomic.h>
  
+#include <asm/hardirq.h>
+#include "irq.h"
+
  #define MTRR_VERSION            "1.26 (19981001)"
  
  #define TRUE  1
@@ -612,7 +615,7 @@ static void do_all_cpus (void (*handler) (struct set_mtrr_context *ctxt,
      /*  Send a message to all other CPUs and wait for them to enter the
         barrier  */
      atomic_set (&undone_count, smp_num_cpus - 1);
-    smp_message_pass (MSG_ALL_BUT_SELF, MSG_MTRR_CHANGE, 0, 0);
+    smp_send_mtrr();
      /*  Wait for it to be done  */
      timeout = jiffies + JIFFIE_TIMEOUT;
      while ( (atomic_read (&undone_count) > 0) &&
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c

index 0841530cf5a217fd9b5e71a2c77d455888492137..b7d00ea493bc62e9c113b056fea16b70070fa088 100644 (file)
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -140,10 +140,17 @@ int cpu_idle(void *unused)
         current->priority = 0;
         current->counter = -100;
         while(1) {
-               if (current_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched)
+               if (current_cpu_data.hlt_works_ok && !hlt_counter &&
+                                !current->need_resched)
                         __asm__("hlt");
-               schedule();
-               check_pgt_cache();
+               /*
+                * although we are an idle CPU, we do not want to
+                * get into the scheduler unnecessarily.
+                */
+               if (current->need_resched) {
+                       schedule();
+                       check_pgt_cache();
+               }
         }
  }
  
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c

index c01eb35f5b73c6af37598edda1b90aed48741821..0105843ad9061a76705f4e0f53a59cf91929dad4 100644 (file)
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -3,12 +3,14 @@
   *     hosts.
   *
   *     (c) 1995 Alan Cox, CymruNET Ltd  <alan@cymru.net>
+ *     (c) 1998 Ingo Molnar
+ *
   *     Supported by Caldera http://www.caldera.com.
   *     Much of the core SMP work is based on previous work by Thomas Radke, to
   *     whom a great many thanks are extended.
   *
- *     Thanks to Intel for making available several different Pentium and
- *     Pentium Pro MP machines.
+ *     Thanks to Intel for making available several different Pentium,
+ *     Pentium Pro and Pentium-II/Xeon MP machines.
   *
   *     This code is released under the GNU public license version 2 or
   *     later.
@@ -26,6 +28,7 @@
   *             Ingo Molnar     :       Added APIC timers, based on code
   *                                     from Jose Renau
   *             Alan Cox        :       Added EBDA scanning
+ *             Ingo Molnar     :       various cleanups and rewrites
   */
  
  #include <linux/config.h>
@@ -41,6 +44,7 @@
  #include <asm/bitops.h>
  #include <asm/pgtable.h>
  #include <asm/io.h>
+#include <linux/io_trace.h>
  
  #ifdef CONFIG_MTRR
  #  include <asm/mtrr.h>
@@ -112,6 +116,12 @@ extern __inline int max(int a,int b)
         return b;
  }
  
+/*
+ * function prototypes:
+ */
+static void cache_APIC_registers (void);
+
+
  static int smp_b_stepping = 0;                         /* Set if we find a B stepping CPU                      */
  
  static int max_cpus = -1;                              /* Setup configured maximum number of CPUs to activate  */
@@ -131,19 +141,14 @@ unsigned long mp_ioapic_addr = 0xFEC00000;                /* Address of the I/O apic (not yet
  unsigned char boot_cpu_id = 0;                         /* Processor that is doing the boot up                  */
  static int smp_activated = 0;                          /* Tripped once we need to start cross invalidating     */
  int apic_version[NR_CPUS];                             /* APIC version number                                  */
-static volatile int smp_commenced=0;                   /* Tripped when we start scheduling                     */
+volatile int smp_commenced=0;                  /* Tripped when we start scheduling                     */
  unsigned long apic_retval;                             /* Just debugging the assembler..                       */
  
-static volatile unsigned char smp_cpu_in_msg[NR_CPUS]; /* True if this processor is sending an IPI             */
-
  volatile unsigned long kernel_counter=0;               /* Number of times the processor holds the lock         */
  volatile unsigned long syscall_count=0;                        /* Number of times the processor holds the syscall lock */
  
  volatile unsigned long ipi_count;                      /* Number of IPIs delivered                             */
  
-volatile unsigned long  smp_proc_in_lock[NR_CPUS] = {0,};/* for computing process time */
-volatile int smp_process_available=0;
-
  const char lk_lockmsg[] = "lock from interrupt context at %p\n"; 
  
  int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
@@ -245,7 +250,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
  
         if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
         {
-               printk("Bad signature [%c%c%c%c].\n",
+               panic("SMP mptable: bad signature [%c%c%c%c]!\n",
                         mpc->mpc_signature[0],
                         mpc->mpc_signature[1],
                         mpc->mpc_signature[2],
@@ -254,7 +259,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
         }
         if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
         {
-               printk("Checksum error.\n");
+               panic("SMP mptable: checksum error!\n");
                 return 1;
         }
         if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
@@ -760,11 +765,7 @@ void __init initialize_secondary(void)
         /*
          * We don't actually need to load the full TSS,
          * basically just the stack pointer and the eip.
-        *
-        * Get the scheduler lock, because we're going
-        * to release it as part of the "reschedule" return.
          */
-       spin_lock(&scheduler_lock);
  
         asm volatile(
                 "movl %0,%%esp\n\t"
@@ -1165,6 +1166,7 @@ void __init smp_boot_cpus(void)
                 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
         SMP_PRINTK(("Boot done.\n"));
  
+       cache_APIC_registers();
         /*
          * Here we can be sure that there is an IO-APIC in the system. Let's
          * go and set it up:
@@ -1175,257 +1177,280 @@ void __init smp_boot_cpus(void)
  smp_done:
  }
  
-void send_IPI(int dest, int vector)
-{
-       unsigned long cfg;
-       unsigned long flags;
  
-       __save_flags(flags);
-       __cli();
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
  
-       /*
-        * prepare target chip field
-        */
  
-       cfg = apic_read(APIC_ICR2) & 0x00FFFFFF;
-       apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(dest));
+/*
+ * Silly serialization to work around CPU bug in P5s.
+ * We can safely turn it off on a 686.
+ */
+#if defined(CONFIG_M686) & !defined(SMP_DEBUG)
+# define FORCE_APIC_SERIALIZATION 0
+#else
+# define FORCE_APIC_SERIALIZATION 1
+#endif
  
-       cfg = apic_read(APIC_ICR);
-       cfg &= ~0xFDFFF;
-       cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|vector;
-       cfg |= dest;
-       
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       
-       apic_write(APIC_ICR, cfg);
-       __restore_flags(flags);
-}
+static unsigned int cached_APIC_ICR;
+static unsigned int cached_APIC_ICR2;
  
  /*
- * A non wait message cannot pass data or CPU source info. This current setup
- * is only safe because the kernel lock owner is the only person who can send
- * a message.
+ * Caches reserved bits, APIC reads are (mildly) expensive
+ * and force otherwise unnecessary CPU synchronization.
   *
- * Wrapping this whole block in a spinlock is not the safe answer either. A
- * processor may get stuck with IRQs off waiting to send a message and thus
- * not replying to the person spinning for a reply.
- *
- * In the end flush tlb ought to be the NMI and a very short function
- * (to avoid the old IDE disk problems), and other messages sent with IRQs
- * enabled in a civilised fashion. That will also boost performance.
+ * (We could cache other APIC registers too, but these are the
+ * main ones used in RL.)
   */
+#define slow_ICR (apic_read(APIC_ICR) & ~0xFDFFF)
+#define slow_ICR2 (apic_read(APIC_ICR2) & 0x00FFFFFF)
  
-void smp_message_pass(int target, int msg, unsigned long data, int wait)
+void cache_APIC_registers (void)
  {
-       unsigned long cfg;
-       unsigned long dest = 0;
-       unsigned long target_map;
-       int p=smp_processor_id();
-       int irq;
-       int ct=0;
+       cached_APIC_ICR = slow_ICR;
+       cached_APIC_ICR2 = slow_ICR2;
+       mb();
+}
  
+static inline unsigned int __get_ICR (void)
+{
+#if FORCE_APIC_SERIALIZATION
         /*
-        *      During boot up send no messages
+        * Wait for the APIC to become ready - this should never occur. It's
+        * a debugging check really.
          */
-       
-       if (!smp_activated || !smp_commenced)
-               return;
+       int count = 0;
+       unsigned int cfg;
+
+       IO_trace (IO_smp_wait_apic_start, 0, 0, 0, 0);
+       while (count < 1000)
+       {
+               cfg = slow_ICR;
+               if (!(cfg&(1<<12))) {
+                       IO_trace (IO_smp_wait_apic_end, 0, 0, 0, 0);
+                       if (count)
+                               atomic_add(count, (atomic_t*)&ipi_count);
+                       return cfg;
+               }
+               count++;
+               udelay(10);
+       }
+       printk("CPU #%d: previous IPI still not cleared after 10mS\n",
+                       smp_processor_id());
+       return cfg;
+#else
+       return cached_APIC_ICR;
+#endif
+}
  
+static inline unsigned int __get_ICR2 (void)
+{
+#if FORCE_APIC_SERIALIZATION
+       return slow_ICR2;
+#else
+       return cached_APIC_ICR2;
+#endif
+}
  
-       /*
-        *      Skip the reschedule if we are waiting to clear a
-        *      message at this time. The reschedule cannot wait
-        *      but is not critical.
-        */
+static inline int __prepare_ICR (unsigned int shortcut, int vector)
+{
+       unsigned int cfg;
  
-       switch (msg) {
-               case MSG_RESCHEDULE:
-                       irq = 0x30;
-                       if (smp_cpu_in_msg[p])
-                               return;
-                       break;
+       cfg = __get_ICR();
+       cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|shortcut|vector;
  
-               case MSG_INVALIDATE_TLB:
-                       /* make this a NMI some day */
-                       irq = 0x31;
-                       break;
+       return cfg;
+}
  
-               case MSG_STOP_CPU:
-                       irq = 0x40;
-                       break;
+static inline int __prepare_ICR2 (unsigned int dest)
+{
+       unsigned int cfg;
  
-               case MSG_MTRR_CHANGE:
-                       irq = 0x50;
-                       break;
+       cfg = __get_ICR2();
+       cfg |= SET_APIC_DEST_FIELD(dest);
  
-               default:
-                       printk("Unknown SMP message %d\n", msg);
-                       return;
-       }
+       return cfg;
+}
  
-       /*
-        * Sanity check we don't re-enter this across CPUs.  Only the kernel
-        * lock holder may send messages.  For a STOP_CPU we are bringing the
-        * entire box to the fastest halt we can.  A reschedule carries
-        * no data and can occur during a flush.  Guess what panic
-        * I got to notice this bug.
-        */
-       
-       /*
-        *      We are busy.
-        */
-       
-       smp_cpu_in_msg[p]++;
+static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+       unsigned int cfg;
+/*
+ * Subtle. In the case of the 'never do double writes' workaround we
+ * have to lock out interrupts to be safe. Otherwise it's just one
+ * single atomic write to the APIC, no need for cli/sti.
+ */
+#if FORCE_APIC_SERIALIZATION
+       unsigned long flags;
  
-/*     printk("SMP message pass #%d to %d of %d\n",
-               p, msg, target);*/
+       __save_flags(flags);
+       __cli();
+#endif
  
         /*
-        * Wait for the APIC to become ready - this should never occur. It's
-        * a debugging check really.
+        * No need to touch the target chip field
          */
-       
-       while (ct<1000)
-       {
-               cfg=apic_read(APIC_ICR);
-               if (!(cfg&(1<<12)))
-                       break;
-               ct++;
-               udelay(10);
-       }
  
-       /*
-        *      Just pray... there is nothing more we can do
-        */
-       
-       if (ct==1000)
-               printk("CPU #%d: previous IPI still not cleared after 10mS\n", p);
+       cfg = __prepare_ICR(shortcut, vector);
  
         /*
-        *      Set the target requirement
+        * Send the IPI. The write to APIC_ICR fires this off.
          */
+
+       IO_trace (IO_smp_send_ipi, shortcut, vector, cfg, 0);
         
-       if (target==MSG_ALL_BUT_SELF)
-       {
-               dest=APIC_DEST_ALLBUT;
-               target_map=cpu_present_map;
-               cpu_callin_map[0]=(1<<p);
-       }
-       else if (target==MSG_ALL)
-       {
-               dest=APIC_DEST_ALLINC;
-               target_map=cpu_present_map;
-               cpu_callin_map[0]=0;
-       }
-       else
-       {
-               dest=0;
-               target_map=(1<<target);
-               cpu_callin_map[0]=0;
-       }
+       apic_write(APIC_ICR, cfg);
+#if FORCE_APIC_SERIALIZATION
+       __restore_flags(flags);
+#endif
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+       __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+       __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+void send_IPI_self(int vector)
+{
+       __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+static inline void send_IPI_single(int dest, int vector)
+{
+       unsigned long cfg;
+#if FORCE_APIC_SERIALIZATION
+       unsigned long flags;
+
+       __save_flags(flags);
+       __cli();
+#endif
  
         /*
-        * Program the APIC to deliver the IPI
+        * prepare target chip field
          */
  
-       send_IPI(dest,irq);
+       cfg = __prepare_ICR2(dest);
+       apic_write(APIC_ICR2, cfg);
  
         /*
-        * Spin waiting for completion
+        * program the ICR 
          */
+       cfg = __prepare_ICR(0, vector);
         
-       switch(wait)
-       {
-               int stuck;
-               case 1:
-                       stuck = 50000000;
-                       while(cpu_callin_map[0]!=target_map) {
-                               --stuck;
-                               if (!stuck) {
-                                       printk("stuck on target_map IPI wait\n");
-                                       break;
-                               }
-                       }
-                       break;
-               case 2:
-                       stuck = 50000000;
-                       /* Wait for invalidate map to clear */
-                       while (smp_invalidate_needed) {
-                               /* Take care of "crossing" invalidates */
-                               if (test_bit(p, &smp_invalidate_needed))
-                                       clear_bit(p, &smp_invalidate_needed);
-                               --stuck;
-                               if (!stuck) {
-                                       printk("stuck on smp_invalidate_needed IPI wait (CPU#%d)\n",p);
-                                       break;
-                               }
-                       }
-                       break;
-       }
-
         /*
-        *      Record our completion
+        * Send the IPI. The write to APIC_ICR fires this off.
          */
+
+       IO_trace (IO_smp_send_ipi, dest, vector, cfg, 0);
         
-       smp_cpu_in_msg[p]--;
+       apic_write(APIC_ICR, cfg);
+#if FORCE_APIC_SERIALIZATION
+       __restore_flags(flags);
+#endif
  }
  
  /*
- *     This is fraught with deadlocks. Linus does a flush tlb at a whim
- *     even with IRQs off. We have to avoid a pair of crossing flushes
- *     or we are doomed.  See the notes about smp_message_pass.
+ * This is fraught with deadlocks. The situation is probably not as
+ * bad as in the early days of SMP, so we might ease some of the
+ * paranoia here.
   */
  
  void smp_flush_tlb(void)
  {
+       int cpu = smp_processor_id();
+       int stuck;
         unsigned long flags;
  
-/*     printk("SMI-");*/
-
         /*
-        *      The assignment is safe because it's volatile so the compiler cannot reorder it,
-        *      because the i586 has strict memory ordering and because only the kernel lock holder
-        *      may issue a tlb flush. If you break any one of those three change this to an atomic
-        *      bus locked or.
+        * The assignment is safe because it's volatile so the
+        * compiler cannot reorder it, because the i586 has
+        * strict memory ordering and because only the kernel
+        * lock holder may issue a tlb flush. If you break any
+        * one of those three change this to an atomic bus
+        * locked or.
          */
  
-       smp_invalidate_needed=cpu_present_map;
+       smp_invalidate_needed = cpu_present_map;
  
         /*
-        *      Processors spinning on the lock will see this IRQ late. The smp_invalidate_needed map will
-        *      ensure they don't do a spurious flush tlb or miss one.
+        * Processors spinning on some lock with IRQs disabled
+        * will see this IRQ late. The smp_invalidate_needed
+        * map will ensure they don't do a spurious flush tlb
+        * or miss one.
          */
         
         __save_flags(flags);
         __cli();
-       smp_message_pass(MSG_ALL_BUT_SELF, MSG_INVALIDATE_TLB, 0L, 2);
+
+       IO_trace (IO_smp_message, 0, 0, 0, 0);
+
+       send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
  
         /*
-        *      Flush the local TLB
+        * Spin waiting for completion
          */
-       
-       local_flush_tlb();
  
-       __restore_flags(flags);
+       stuck = 50000000;
+       while (smp_invalidate_needed) {
+               /*
+                * Take care of "crossing" invalidates
+                */
+               if (test_bit(cpu, &smp_invalidate_needed))
+                       clear_bit(cpu, &smp_invalidate_needed);
+               --stuck;
+               if (!stuck) {
+                       printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
+                       break;
+               }
+       }
  
         /*
-        *      Completed.
+        *      Flush the local TLB
          */
-       
-/*     printk("SMID\n");*/
+       local_flush_tlb();
+
+       __restore_flags(flags);
  }
  
  
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+
  void smp_send_reschedule(int cpu)
  {
-       unsigned long flags;
+       send_IPI_single(cpu, RESCHEDULE_VECTOR);
+}
  
-       __save_flags(flags);
-       __cli();
-       smp_message_pass(cpu, MSG_RESCHEDULE, 0L, 0);
-       __restore_flags(flags);
+/*
+ * this function sends a 'stop' IPI to all other CPUs in the system.
+ * it goes straight through.
+ */
+
+void smp_send_stop(void)
+{
+       send_IPI_allbutself(STOP_CPU_VECTOR);
+}
+
+/*
+ * this function sends a 'reload MTRR state' IPI to all other CPUs
+ * in the system. it goes straight through, completion processing
+ * is done on the mtrr.c level.
+ */
+
+void smp_send_mtrr(void)
+{
+       send_IPI_allbutself(MTRR_CHANGE_VECTOR);
  }
  
  /*
@@ -1531,6 +1556,9 @@ void smp_apic_timer_interrupt(struct pt_regs * regs)
   */
  asmlinkage void smp_reschedule_interrupt(void)
  {
+       IO_trace (IO_smp_reschedule, current->need_resched,
+                        current->priority, current->counter, 0);
+
         ack_APIC_irq();
  }
  
@@ -1539,6 +1567,9 @@ asmlinkage void smp_reschedule_interrupt(void)
   */
  asmlinkage void smp_invalidate_interrupt(void)
  {
+       IO_trace (IO_smp_tlbflush,
+                atomic_read((atomic_t *)&smp_invalidate_needed), 0, 0, 0);
+
         if (test_and_clear_bit(smp_processor_id(), &smp_invalidate_needed))
                 local_flush_tlb();
  
@@ -1626,12 +1657,9 @@ void setup_APIC_timer(unsigned int clocks)
          * Unfortunately the local APIC timer cannot be set up into NMI
          * mode. With the IO APIC we can re-route the external timer
          * interrupt and broadcast it as an NMI to all CPUs, so no pain.
-        *
-        * NOTE: this trap vector (0x41) and the gate in
-        * BUILD_SMP_TIMER_INTERRUPT should be the same ;)
          */
         tmp_value = apic_read(APIC_LVTT);
-       lvtt1_value = APIC_LVT_TIMER_PERIODIC | 0x41;
+       lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
         apic_write(APIC_LVTT , lvtt1_value);
  
         /*
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c

index 30f0dac3f4f58747635cfbcc1942a443752b74ba..28a57ead740d21f2cf1abd335d1544a2aefb2bad 100644 (file)
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -72,6 +72,8 @@ extern int setup_x86_irq(int, struct irqaction *);
  
  unsigned long cpu_hz;  /* Detected as we calibrate the TSC */
  
+cycles_t cacheflush_time;
+
  /* Number of usecs that the last interrupt was delayed */
  static int delay_at_last_interrupt;
  
@@ -96,7 +98,6 @@ static unsigned long do_fast_gettimeoffset(void)
                 :"=a" (eax), "=d" (edx));
  
         /* .. relative to previous jiffy (32 bits is enough) */
-       edx = 0;
         eax -= last_tsc_low;    /* tsc_low delta */
  
         /*
@@ -110,11 +111,11 @@ static unsigned long do_fast_gettimeoffset(void)
  
         __asm__("mull %2"
                 :"=a" (eax), "=d" (edx)
-               :"r" (fast_gettimeoffset_quotient),
-                "0" (eax), "1" (edx));
+               :"g" (fast_gettimeoffset_quotient),
+                "0" (eax));
  
         /* our adjusted time offset in microseconds */
-       return edx + delay_at_last_interrupt;
+       return delay_at_last_interrupt + edx;
  }
  
  /* This function must be called with interrupts disabled 
@@ -240,17 +241,26 @@ void do_gettimeofday(struct timeval *tv)
  {
         extern volatile unsigned long lost_ticks;
         unsigned long flags;
+       unsigned long usec, sec;
  
         read_lock_irqsave(&xtime_lock, flags);
-       *tv = xtime;
-       tv->tv_usec += do_gettimeoffset();
-       if (lost_ticks)
-               tv->tv_usec += lost_ticks * (1000000/HZ);
+       usec = do_gettimeoffset();
+       {
+               unsigned long lost = lost_ticks;
+               if (lost)
+                       usec += lost * (1000000 / HZ);
+       }
+       sec = xtime.tv_sec;
+       usec += xtime.tv_usec;
         read_unlock_irqrestore(&xtime_lock, flags);
-       while (tv->tv_usec >= 1000000) {
-               tv->tv_usec -= 1000000;
-               tv->tv_sec++;
+
+       while (usec >= 1000000) {
+               usec -= 1000000;
+               sec++;
         }
+
+       tv->tv_sec = sec;
+       tv->tv_usec = usec;
  }
  
  void do_settimeofday(struct timeval *tv)
@@ -377,13 +387,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *reg
                 else
                         last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
         }
-#if 0
-       /* As we return to user mode fire off the other CPU schedulers.. this is 
-          basically because we don't yet share IRQ's around. This message is
-          rigged to be safe on the 386 - basically it's a hack, so don't look
-          closely for now.. */
-       smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0);
-#endif
             
  #ifdef CONFIG_MCA
         if( MCA_bus ) {
@@ -639,5 +642,13 @@ __initfunc(void time_init(void))
                         printk("Detected %ld Hz processor.\n", cpu_hz);
                 }
         }
+
+       /*
+        * Rough estimation for SMP scheduling, this is the number of
+        * cycles it takes for a fully memory-limited process to flush
+        * the SMP-local cache.
+        */
+       cacheflush_time = cpu_hz/10000;
+
         setup_x86_irq(0, &irq0);
  }
diff --git a/arch/i386/vmlinux.lds b/arch/i386/vmlinux.lds

index c23007bc889ed4ca008653d8b01820877836ca42..203b9a927d4dfae09d6025c351f2bce52c8816bd 100644 (file)
--- a/arch/i386/vmlinux.lds
+++ b/arch/i386/vmlinux.lds
@@ -45,9 +45,13 @@ SECTIONS
    . = ALIGN(4096);
    __init_end = .;
  
+  . = ALIGN(32);
+  .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+
    . = ALIGN(4096);
    .data.page_aligned : { *(.data.idt) }
  
+
    __bss_start = .;             /* BSS */
    .bss : {
         *(.bss)
diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h

index 16366d05581dac93fdc38fb5178daf5e7d70b6ff..2dccf35217eceabbffc1f28a587776c57afa1eed 100644 (file)
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -15,7 +15,7 @@ typedef struct { volatile int counter; } atomic_t;
  typedef struct { int counter; } atomic_t;
  #endif
  
-#define ATOMIC_INIT(i) { (i) }
+#define ATOMIC_INIT(i) ( (atomic_t) { (i) } )
  
  #define atomic_read(v)         ((v)->counter)
  #define atomic_set(v,i)                ((v)->counter = (i))
diff --git a/include/asm-alpha/core_apecs.h b/include/asm-alpha/core_apecs.h

index a8f0bd6efa9797463b0a11c58e7afcca35ce845b..3346346f9cb6bf99fed6ec2902c6825a79baf077 100644 (file)
--- a/include/asm-alpha/core_apecs.h
+++ b/include/asm-alpha/core_apecs.h
@@ -458,7 +458,7 @@ __EXTERN_INLINE unsigned int apecs_inb(unsigned long addr)
  
  __EXTERN_INLINE void apecs_outb(unsigned char b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_insbl(b, addr & 3);
         *(vuip) ((addr << 5) + APECS_IO + 0x00) = w;
@@ -473,7 +473,7 @@ __EXTERN_INLINE unsigned int apecs_inw(unsigned long addr)
  
  __EXTERN_INLINE void apecs_outw(unsigned short b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + APECS_IO + 0x08) = w;
diff --git a/include/asm-alpha/core_cia.h b/include/asm-alpha/core_cia.h

index 32fd81f2ed80258749154e500e8805069ec36922..bd3aad980e8f7fc939d03f4e69eda52a252fb9df 100644 (file)
--- a/include/asm-alpha/core_cia.h
+++ b/include/asm-alpha/core_cia.h
@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int cia_inb(unsigned long addr)
  
  __EXTERN_INLINE void cia_outb(unsigned char b, unsigned long addr)
  {
-       unsigned int w = __kernel_insbl(b, addr & 3);
+       unsigned long w = __kernel_insbl(b, addr & 3);
         *(vuip) ((addr << 5) + CIA_IO + 0x00) = w;
         wmb();
  }
@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned int cia_inw(unsigned long addr)
  
  __EXTERN_INLINE void cia_outw(unsigned short b, unsigned long addr)
  {
-       unsigned int w = __kernel_inswl(b, addr & 3);
+       unsigned long w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + CIA_IO + 0x08) = w;
         wmb();
  }
diff --git a/include/asm-alpha/core_lca.h b/include/asm-alpha/core_lca.h

index bce449fa4b9523873164ae00a0c934e4ae977003..63f25892459d7047462b2c1ffa26d3c560f5939c 100644 (file)
--- a/include/asm-alpha/core_lca.h
+++ b/include/asm-alpha/core_lca.h
@@ -262,7 +262,7 @@ __EXTERN_INLINE unsigned int lca_inb(unsigned long addr)
  
  __EXTERN_INLINE void lca_outb(unsigned char b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_insbl(b, addr & 3);
         *(vuip) ((addr << 5) + LCA_IO + 0x00) = w;
@@ -277,7 +277,7 @@ __EXTERN_INLINE unsigned int lca_inw(unsigned long addr)
  
  __EXTERN_INLINE void lca_outw(unsigned short b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + LCA_IO + 0x08) = w;
@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned long lca_readq(unsigned long addr)
  __EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
  {
         unsigned long msb;
-       unsigned int w;
+       unsigned long w;
  
         if (addr >= (1UL << 24)) {
                 msb = addr & 0xf8000000;
@@ -354,7 +354,7 @@ __EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
  __EXTERN_INLINE void lca_writew(unsigned short b, unsigned long addr)
  {
         unsigned long msb;
-       unsigned int w;
+       unsigned long w;
  
         if (addr >= (1UL << 24)) {
                 msb = addr & 0xf8000000;
diff --git a/include/asm-alpha/core_mcpcia.h b/include/asm-alpha/core_mcpcia.h

index 33e67b462f7c8248600ef8dab047ab698937ded0..ed51de2143d4699364fdfc4c454faf6c2ba07def 100644 (file)
--- a/include/asm-alpha/core_mcpcia.h
+++ b/include/asm-alpha/core_mcpcia.h
@@ -264,7 +264,7 @@ __EXTERN_INLINE void mcpcia_outb(unsigned char b, unsigned long in_addr)
  {
         unsigned long addr = in_addr & 0xffffffffUL;
         unsigned long hose = (in_addr >> 32) & 3;
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_insbl(b, addr & 3);
         *(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x00) = w;
@@ -283,7 +283,7 @@ __EXTERN_INLINE void mcpcia_outw(unsigned short b, unsigned long in_addr)
  {
         unsigned long addr = in_addr & 0xffffffffUL;
         unsigned long hose = (in_addr >> 32) & 3;
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x08) = w;
diff --git a/include/asm-alpha/core_pyxis.h b/include/asm-alpha/core_pyxis.h

index da80e501e60df62c282db619c1abd14a91fb677f..213adf4ba4330873d842bfe68f223dc71d7447c4 100644 (file)
--- a/include/asm-alpha/core_pyxis.h
+++ b/include/asm-alpha/core_pyxis.h
@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int pyxis_inb(unsigned long addr)
  
  __EXTERN_INLINE void pyxis_outb(unsigned char b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_insbl(b, addr & 3);
         *(vuip) ((addr << 5) + PYXIS_IO + 0x00) = w;
@@ -341,7 +341,7 @@ __EXTERN_INLINE unsigned int pyxis_inw(unsigned long addr)
  
  __EXTERN_INLINE void pyxis_outw(unsigned short b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + PYXIS_IO + 0x08) = w;
diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h

index 1f0984b383f8ecbdee80c78baafbb3f9d305f3b8..fdb0f82fe4f1379562f0318c77296cc6aae8b179 100644 (file)
--- a/include/asm-alpha/core_t2.h
+++ b/include/asm-alpha/core_t2.h
@@ -378,7 +378,7 @@ __EXTERN_INLINE unsigned int t2_inw(unsigned long addr)
  
  __EXTERN_INLINE void t2_outw(unsigned short b, unsigned long addr)
  {
-       unsigned int w;
+       unsigned long w;
  
         w = __kernel_inswl(b, addr & 3);
         *(vuip) ((addr << 5) + T2_IO + 0x08) = w;
diff --git a/include/asm-alpha/io.h b/include/asm-alpha/io.h

index 51d2af5968920e6f98ab422a463672dd1050568e..87e363fba6267f12b2493c2f807f6c966bca66c2 100644 (file)
--- a/include/asm-alpha/io.h
+++ b/include/asm-alpha/io.h
@@ -3,7 +3,6 @@
  
  #include <linux/config.h>
  #include <asm/system.h>
-#include <asm/machvec.h>
  
  /* We don't use IO slowdowns on the Alpha, but.. */
  #define __SLOW_DOWN_IO do { } while (0)
@@ -19,6 +18,7 @@
  #endif
  
  #ifdef __KERNEL__
+#include <asm/machvec.h>
  
  /*
   * We try to avoid hae updates (thus the cache), but when we
@@ -78,6 +78,7 @@ extern void _sethae (unsigned long addr);     /* cached version */
   * There are different chipsets to interface the Alpha CPUs to the world.
   */
  
+#ifdef __KERNEL__
  #ifdef CONFIG_ALPHA_GENERIC
  
  /* In a generic kernel, we always go through the machine vector.  */
@@ -147,6 +148,7 @@ extern void _sethae (unsigned long addr);   /* cached version */
  #undef __WANT_IO_DEF
  
  #endif /* GENERIC */
+#endif /* __KERNEL__ */
  
  /*
   * The convention used for inb/outb etc. is that names starting with
@@ -172,6 +174,7 @@ extern void         _writew(unsigned short b, unsigned long addr);
  extern void            _writel(unsigned int b, unsigned long addr);
  extern void            _writeq(unsigned long b, unsigned long addr);
  
+#ifdef __KERNEL__
  /*
   * The platform header files may define some of these macros to use
   * the inlined versions where appropriate.  These macros may also be
@@ -216,6 +219,27 @@ extern void                _writeq(unsigned long b, unsigned long addr);
  # define outl_p                outl
  #endif
  
+#else 
+
+/* Userspace declarations.  */
+
+extern unsigned int    inb (unsigned long port);
+extern unsigned int    inw (unsigned long port);
+extern unsigned int    inl (unsigned long port);
+extern void            outb (unsigned char b,unsigned long port);
+extern void            outw (unsigned short w,unsigned long port);
+extern void            outl (unsigned int l,unsigned long port);
+extern unsigned long   readb(unsigned long addr);
+extern unsigned long   readw(unsigned long addr);
+extern unsigned long   readl(unsigned long addr);
+extern void            writeb(unsigned char b, unsigned long addr);
+extern void            writew(unsigned short b, unsigned long addr);
+extern void            writel(unsigned int b, unsigned long addr);
+
+#endif /* __KERNEL__ */
+
+#ifdef __KERNEL__
+
  /*
   * The "address" in IO memory space is not clearly either an integer or a
   * pointer. We will accept both, thus the casts.
@@ -257,8 +281,6 @@ static inline void iounmap(void *addr)
  # define writeq(v,a)   _writeq((v),(unsigned long)(a))
  #endif
  
-#ifdef __KERNEL__
-
  /*
   * String version of IO memory access ops:
   */
diff --git a/include/asm-alpha/softirq.h b/include/asm-alpha/softirq.h

index 66aed19eba88a90323638d3e7bb7f8447ebdfb8f..41ccc29c96e51039b0f038d9b396b37af6eadd83 100644 (file)
--- a/include/asm-alpha/softirq.h
+++ b/include/asm-alpha/softirq.h
@@ -117,6 +117,7 @@ extern inline void disable_bh(int nr)
  {
         bh_mask &= ~(1 << nr);
         atomic_inc(&bh_mask_count[nr]);
+       synchronize_bh();
  }
  
  extern inline void enable_bh(int nr)
diff --git a/include/asm-i386/init.h b/include/asm-i386/init.h

index 83215545f95607480a73a88c14754eb82c1ee995..7618c005426a662c68578ed13a1316210ee3bae0 100644 (file)
--- a/include/asm-i386/init.h
+++ b/include/asm-i386/init.h
@@ -11,4 +11,7 @@
  #define __FINIT        .previous
  #define __INITDATA     .section        ".data.init",#alloc,#write
  
+#define __cacheline_aligned __attribute__ \
+                        ((__section__ (".data.cacheline_aligned")))
+
  #endif
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h

index e3b69926719df63f0737e6af8f25a184b0d8390b..dea282ad012f19d73e24d094d41db324ca2d6503 100644 (file)
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -185,10 +185,6 @@ extern inline int cpu_logical_map(int cpu)
  extern void smp_callin(void);
  extern void smp_boot_cpus(void);
  extern void smp_store_cpu_info(int id);                /* Store per CPU info (like the initial udelay numbers */
-extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
-
-extern volatile unsigned long smp_proc_in_lock[NR_CPUS]; /* for computing process time */
-extern volatile int smp_process_available;
  
  /*
   *     APIC handlers: Note according to the Intel specification update
@@ -237,9 +233,7 @@ extern __inline int hard_smp_processor_id(void)
   *     processes are run.
   */
   
-#define PROC_CHANGE_PENALTY    10              /* Schedule penalty */
+#define PROC_CHANGE_PENALTY    15              /* Schedule penalty */
  
-#define SMP_FROM_INT           1
-#define SMP_FROM_SYSCALL       2
  #endif
  #endif
diff --git a/include/asm-i386/system.h.lock~ b/include/asm-i386/system.h.lock~

new file mode 100644 (file)

index 0000000..860783b
--- /dev/null
+++ b/include/asm-i386/system.h.lock~
@@ -0,0 +1 @@
+torvalds@penguin.transmeta.com
+\ No newline at end of file
diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h

index c492e1b930249f21aa02143cd57e0b620e1df170..4cd811e9a2cf7040762741c0e7d2db4adebbffba 100644 (file)
--- a/include/asm-i386/timex.h
+++ b/include/asm-i386/timex.h
@@ -12,4 +12,22 @@
         (1000000/CLOCK_TICK_FACTOR) / (CLOCK_TICK_RATE/CLOCK_TICK_FACTOR)) \
                 << (SHIFT_SCALE-SHIFT_HZ)) / HZ)
  
+/*
+ * Standard way to access the cycle counter on i586+ CPUs.
+ * Currently only used on SMP.
+ */
+typedef unsigned long long cycles_t;
+
+extern cycles_t cacheflush_time;
+
+static inline cycles_t get_cycles (void)
+{
+       cycles_t value;
+
+       __asm__("rdtsc"
+               :"=a" (*(((int *)&value)+0)),
+                "=d" (*(((int *)&value)+1)));
+       return value;
+}
+
  #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 086cb1c95e06b18ab5cd1ecb00894eadbc5753a9..b091a8af5c440ea0f29120a606450934abc14d9f 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,6 +11,7 @@ extern unsigned long event;
  #include <linux/kernel.h>
  #include <linux/types.h>
  #include <linux/times.h>
+#include <linux/timex.h>
  
  #include <asm/system.h>
  #include <asm/semaphore.h>
@@ -219,6 +220,7 @@ struct task_struct {
  /* various fields */
         long counter;
         long priority;
+       cycles_t avg_slice;
  /* SMP and runqueue state */
         int has_cpu;
         int processor;
@@ -336,7 +338,7 @@ struct task_struct {
   */
  #define INIT_TASK \
  /* state etc */        { 0,0,0,KERNEL_DS,&default_exec_domain,0, \
-/* counter */  DEF_PRIORITY,DEF_PRIORITY, \
+/* counter */  DEF_PRIORITY,DEF_PRIORITY,0, \
  /* SMP */      0,0,0,-1, \
  /* schedlink */        &init_task,&init_task, &init_task, &init_task, \
  /* binfmt */   NULL, \
diff --git a/include/linux/smp.h b/include/linux/smp.h

index 5034c71f6ed6c38418b9d78125500b5cd69df073..80ea3056ffa915101affd8875f2d9904bbf8ade7 100644 (file)
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -11,11 +11,21 @@
  #include <asm/smp.h>
  
  /*
- * main IPI interface, handles INIT, TLB flush, STOP, etc. (defined in asm header):
- *
- * extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
+ * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
+ * (defined in asm header):
   */ 
  
+/*
+ * stops all CPUs but the current one:
+ */
+extern void smp_send_stop(void);
+
+/*
+ * sends a 'reschedule' event to another CPU:
+ */
+extern void FASTCALL(smp_send_reschedule(int cpu));
+
+
  /*
   * Boot processor call to load the other CPU's
   */
@@ -61,7 +71,6 @@ extern volatile int smp_msg_id;
  #define smp_num_cpus                   1
  #define smp_processor_id()             0
  #define hard_smp_processor_id()                0
-#define smp_message_pass(t,m,d,w)      
  #define smp_threads_ready              1
  #define kernel_lock()
  #define cpu_logical_map(cpu)           0
diff --git a/init/main.c b/init/main.c

index 9459d9fbc1a339e535cd1b12dd7b25962f124187..d901189cfd1e550784b215f82a6af72b3a1c5398 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -1177,6 +1177,7 @@ asmlinkage void __init start_kernel(void)
          */
         smp_init();
         kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+       current->need_resched = 1;
         cpu_idle(NULL);
  }
  
diff --git a/kernel/panic.c b/kernel/panic.c

index 5365b3015fbf90acb04a06ff5a14a35633f8de99..a7dbe450341c4370541b535493d1bc53cc65abbc 100644 (file)
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -50,7 +50,7 @@ NORET_TYPE void panic(const char * fmt, ...)
         unblank_console();
  
  #ifdef __SMP__
-       smp_message_pass(MSG_ALL_BUT_SELF, MSG_STOP_CPU, 0, 0);
+       smp_send_stop();
  #endif
         if (panic_timeout > 0)
         {
diff --git a/kernel/sched.c b/kernel/sched.c

index bd3746995d33e312735df69557609d367a6de26e..ad151d1198d5b04dc2614c5e9f8e6677443b08c9 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -12,6 +12,7 @@
   *  1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
   *             serialize accesses to xtime/lost_ticks).
   *                             Copyright (C) 1998  Andrea Arcangeli
+ *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
   */
  
  /*
@@ -96,47 +97,110 @@ struct kernel_stat kstat = { 0 };
  
  void scheduling_functions_start_here(void) { }
  
-static inline void reschedule_idle(struct task_struct * p)
+#ifdef __SMP__
+static void reschedule_idle_slow(struct task_struct * p)
  {
+/*
+ * (see reschedule_idle() for an explanation first ...)
+ *
+ * Pass #2
+ *
+ * We try to find another (idle) CPU for this woken-up process.
+ *
+ * On SMP, we mostly try to see if the CPU the task used
+ * to run on is idle.. but we will use another idle CPU too,
+ * at this point we already know that this CPU is not
+ * willing to reschedule in the near future.
+ *
+ * An idle CPU is definitely wasted, especially if this CPU is
+ * running long-timeslice processes. The following algorithm is
+ * pretty good at finding the best idle CPU to send this process
+ * to.
+ *
+ * [We can try to preempt low-priority processes on other CPUs in
+ * 2.3. Also we can try to use the avg_slice value to predict
+ * 'likely reschedule' events even on other CPUs.]
+ */
+       int best_cpu = p->processor, this_cpu = smp_processor_id();
+       struct task_struct **idle = task, *tsk, *target_tsk;
+       int i = smp_num_cpus;
+
+       target_tsk = NULL;
+       do {
+               tsk = *idle;
+               idle++;
+               if (tsk->has_cpu) {
+                       if (tsk->processor == this_cpu)
+                               continue;
+                       target_tsk = tsk;
+                       if (tsk->processor == best_cpu) {
+                               /*
+                                * bingo, we couldn't get a better
+                                * CPU, activate it.
+                                */
+                               goto send; /* this one helps GCC ... */
+                       }
+               }
+       } while (--i > 0);
  
         /*
-        * For SMP, we try to see if the CPU the task used
-        * to run on is idle..
+        * found any idle CPU?
          */
-#if 0
+       if (target_tsk) {
+send:
+               target_tsk->need_resched = 1;
+               smp_send_reschedule(target_tsk->processor);
+               return;
+       }
+}
+#endif /* __SMP__ */
+
+static inline void reschedule_idle(struct task_struct * p)
+{
+
+       if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
+               current->need_resched = 1;
+               return;
+       }
+
+#ifdef __SMP__
         /*
-        * Disable this for now. Ingo has some interesting
-        * code that looks too complex, and I have some ideas,
-        * but in the meantime.. One problem is that "wakeup()"
-        * can be (and is) called before we've even initialized
-        * SMP completely, so..
+        * ("wakeup()" should not be called before we've initialized
+        * SMP completely. [Linus, is there any exception to this?]
+        * Basically a not-yet initialized SMP subsystem can be
+        * considered as a not-yet working scheduler, simply don't use
+        * it before it's up and running ...)
+        *
+        * SMP rescheduling is done in 2 passes:
+        *  - pass #1: faster: 'quick decisions'
+        *  - pass #2: slower: 'lets try and find another CPU'
          */
-#ifdef __SMP__
-       int want_cpu = p->processor;
  
         /*
-        * Don't even try to find another CPU for us if the task
-        * ran on this one before..
+        * Pass #1
+        *
+        * There are two metrics here:
+        *
+        * first, a 'cutoff' interval, currently ~100 usecs on
+        * x86 CPUs. If the current process has longer average
+        * timeslices than this, then we utilize the idle CPU.
+        *
+        * second, if the wakeup comes from a process context,
+        * then the two processes are 'related'. (they form a
+        * 'gang')
+        *
+        * An idle CPU is almost always a bad thing, thus we skip
+        * the idle-CPU utilization only if both these conditions
+        * are true. (ie. a 'process-gang' rescheduling with rather
+        * high frequency should stay on the same CPU).
+        *
+        * [We can switch to something more finegrained in 2.3.]
          */
-       if (want_cpu != smp_processor_id()) {
-               struct task_struct **idle = task;
-               int i = smp_num_cpus;
-
-               do {
-                       struct task_struct *tsk = *idle;
-                       idle++;
-                       /* Something like this.. */
-                       if (tsk->has_cpu && tsk->processor == want_cpu) {
-                               tsk->need_resched = 1;
-                               smp_send_reschedule(want_cpu);
-                               return;
-                       }
-               } while (--i > 0);
-       }
-#endif
-#endif
-       if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
-               current->need_resched = 1;      
+       if ((current->avg_slice < cacheflush_time) && !in_interrupt())
+               return;
+
+       reschedule_idle_slow(p);
+#endif /* __SMP__ */
  }
  
  /*
@@ -244,6 +308,8 @@ static void process_timeout(unsigned long __data)
         wake_up_process(p);
  }
  
+int _PROC_CHANGE_PENALTY = 13;
+
  /*
   * This is the function that decides how desirable a process is..
   * You can weigh different processes against each other depending
@@ -488,6 +554,63 @@ signed long schedule_timeout(signed long timeout)
         return timeout < 0 ? 0 : timeout;
  }
  
+/*
+ * This one aligns per-CPU data on cacheline boundaries.
+ */
+static union {
+       struct schedule_data {
+               struct task_struct * prev;
+               long prevstate;
+               cycles_t last_schedule;
+       } schedule_data;
+       char __pad [L1_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+
+static inline void __schedule_tail (void)
+{
+#ifdef __SMP__
+       struct schedule_data * sched_data;
+
+       /*
+        * We might have switched CPUs:
+        */
+       sched_data = & aligned_data[smp_processor_id()].schedule_data;
+
+       /*
+        * Subtle. In the rare event that we got a wakeup to 'prev' just
+        * during the reschedule (this is possible, the scheduler is pretty
+        * parallel), we should do another reschedule in the next task's
+        * context. schedule() will do the right thing next time around.
+        * this is equivalent to 'delaying' the wakeup until the reschedule
+        * has finished.
+        */
+       if (sched_data->prev->state != sched_data->prevstate)
+               current->need_resched = 1;
+
+       /*
+        * Release the previous process ...
+        *
+        * We have dropped all locks, and we must make sure that we
+        * only mark the previous process as no longer having a CPU
+        * after all other state has been seen by other CPU's. Thus
+        * the memory barrier!
+        */
+       mb();
+       sched_data->prev->has_cpu = 0;
+#endif /* __SMP__ */
+}
+
+/*
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+void schedule_tail (void)
+{
+       __schedule_tail();
+}
+
  /*
   *  'schedule()' is the scheduler function. It's a very simple and nice
   * scheduler: it's not perfect, but certainly works for most things.
@@ -500,11 +623,18 @@ signed long schedule_timeout(signed long timeout)
   */
  asmlinkage void schedule(void)
  {
+       struct schedule_data * sched_data;
         struct task_struct * prev, * next;
         int this_cpu;
  
         prev = current;
         this_cpu = prev->processor;
+       /*
+        * 'sched_data' is protected by the fact that we can run
+        * only one process per CPU.
+        */
+       sched_data = & aligned_data[this_cpu].schedule_data;
+
         if (in_interrupt())
                 goto scheduling_in_interrupt;
         release_kernel_lock(prev, this_cpu);
@@ -519,6 +649,7 @@ asmlinkage void schedule(void)
  
         /* move an exhausted RR process to be last.. */
         prev->need_resched = 0;
+
         if (!prev->counter && prev->policy == SCHED_RR) {
                 prev->counter = prev->priority;
                 move_last_runqueue(prev);
@@ -534,6 +665,9 @@ asmlinkage void schedule(void)
                         del_from_runqueue(prev);
                 case TASK_RUNNING:
         }
+
+       sched_data->prevstate = prev->state;
+
         {
                 struct task_struct * p = init_task.next_run;
                 /*
@@ -580,27 +714,49 @@ asmlinkage void schedule(void)
                 }
         }
  
+       /*
+        * maintain the per-process 'average timeslice' value.
+        * (this has to be recalculated even if we reschedule to
+        * the same process) Currently this is only used on SMP:
+        */
  #ifdef __SMP__
-       next->has_cpu = 1;
-#endif
+       {
+               cycles_t t, this_slice;
+
+               t = get_cycles();
+               this_slice = t - sched_data->last_schedule;
+               sched_data->last_schedule = t;
+
+               /*
+                * Simple, exponentially fading average calculation:
+                */
+               prev->avg_slice = this_slice + prev->avg_slice;
+               prev->avg_slice >>= 1;
+       }
+
+       /*
+        * We drop the scheduler lock early (it's a global spinlock),
+        * thus we have to lock the previous process from getting
+        * rescheduled during switch_to().
+        */
+       prev->has_cpu = 1;
  
-       if (prev != next) {
+       next->has_cpu = 1;
+       next->processor = this_cpu;
+       spin_unlock(&scheduler_lock);
+#endif /* __SMP__ */
+       if (prev != next) {
  #ifdef __SMP__
-               next->processor = this_cpu;
+               sched_data->prev = prev;
  #endif
-               kstat.context_swtch++;
+               kstat.context_swtch++;
                 get_mmu_context(next);
                 switch_to(prev,next);
-       }
-
-       spin_unlock(&scheduler_lock);
  
-       /*
-        * At this point "prev" is "current", as we just
-        * switched into it (from an even more "previous"
-        * prev)
-        */
-       reacquire_kernel_lock(prev);
+               __schedule_tail();
+       }
+  
+       reacquire_kernel_lock(current);
         return;
  
  scheduling_in_interrupt:
@@ -608,7 +764,6 @@ scheduling_in_interrupt:
         *(int *)0 = 0;
  }
  
-
  rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
  
  /*
author	Linus Torvalds <torvalds@linuxfoundation.org>
	Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)
committer	Linus Torvalds <torvalds@linuxfoundation.org>
	Fri, 23 Nov 2007 20:17:44 +0000 (15:17 -0500)
arch/alpha/kernel/irq.c		patch \| blob \| history
arch/alpha/kernel/irq.h		patch \| blob \| history
arch/alpha/kernel/sys_ruffian.c		patch \| blob \| history
arch/alpha/kernel/time.c		patch \| blob \| history
arch/i386/kernel/entry.S		patch \| blob \| history
arch/i386/kernel/i386_ksyms.c		patch \| blob \| history
arch/i386/kernel/io_apic.c		patch \| blob \| history
arch/i386/kernel/irq.c		patch \| blob \| history
arch/i386/kernel/irq.h		patch \| blob \| history
arch/i386/kernel/mtrr.c		patch \| blob \| history
arch/i386/kernel/process.c		patch \| blob \| history
arch/i386/kernel/smp.c		patch \| blob \| history
arch/i386/kernel/time.c		patch \| blob \| history
arch/i386/vmlinux.lds		patch \| blob \| history
include/asm-alpha/atomic.h		patch \| blob \| history
include/asm-alpha/core_apecs.h		patch \| blob \| history
include/asm-alpha/core_cia.h		patch \| blob \| history
include/asm-alpha/core_lca.h		patch \| blob \| history
include/asm-alpha/core_mcpcia.h		patch \| blob \| history
include/asm-alpha/core_pyxis.h		patch \| blob \| history
include/asm-alpha/core_t2.h		patch \| blob \| history
include/asm-alpha/io.h		patch \| blob \| history
include/asm-alpha/softirq.h		patch \| blob \| history
include/asm-i386/init.h		patch \| blob \| history
include/asm-i386/smp.h		patch \| blob \| history
include/asm-i386/system.h.lock~	[new file with mode: 0644]	patch \| blob
include/asm-i386/timex.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/smp.h		patch \| blob \| history
init/main.c		patch \| blob \| history
kernel/panic.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history