merge to the -K3 scheduler.

author Ingo Molnar <mingo@earth2.(none)>

Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)

committer Ingo Molnar <mingo@earth2.(none)>

Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)
author Ingo Molnar <mingo@earth2.(none)>
Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)
committer Ingo Molnar <mingo@earth2.(none)>
Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S

index f006f47c4d18df2951797eb63200d994df9f8043..c51ed2871acc21431c549500fbfce275e1329df8 100644 (file)
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -190,9 +190,11 @@ ENTRY(lcall27)
  
  
  ENTRY(ret_from_fork)
+#if CONFIG_SMP
         pushl %ebx
         call SYMBOL_NAME(schedule_tail)
         addl $4, %esp
+#endif
         GET_THREAD_INFO(%ebx)
         jmp syscall_exit
  
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c

index 00830386641c6de821fa71048a83b84217d2fb10..2b8a35d2a940daff543d06de7d6c9a60cdcb0eac 100644 (file)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1119,7 +1119,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
         psinfo.pr_state = i;
         psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
         psinfo.pr_zomb = psinfo.pr_sname == 'Z';
-       psinfo.pr_nice = current->__nice;
+       psinfo.pr_nice = task_nice(current);
         psinfo.pr_flag = current->flags;
         psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
         psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
diff --git a/fs/proc/array.c b/fs/proc/array.c

index d1e5d9eede6c44829c825eec4430b4cb06a9cb01..a718316d7ffddabe9c227a4476201ef8545cb2c6 100644 (file)
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -336,12 +336,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
  
         /* scale priority and nice values from timeslices to -20..20 */
         /* to make it look like a "normal" Unix priority/nice value  */
-       priority = task->prio;
-       if (priority >= MAX_RT_PRIO)
-               priority -= MAX_RT_PRIO;
-       else
-               priority = priority-100;
-       nice = task->__nice;
+       priority = task_prio(task);
+       nice = task_nice(task);
  
         read_lock(&tasklist_lock);
         ppid = task->pid ? task->p_opptr->pid : 0;
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h

index 4853f679166ecc9002d7774917ca0003de975f85..7ff7555040e1f5c93c07cbd79e52396a5a8d8d93 100644 (file)
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -358,7 +358,7 @@ static __inline__ int find_next_zero_bit (void * addr, int size, int offset)
   * @offset: The bitnumber to start searching at
   * @size: The maximum size to search
   */
-static __inline__ int find_next_bit (void * addr, int size, int offset)
+static __inline__ int find_next_bit(void * addr, int size, int offset)
  {
         unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
         int set = 0, bit = offset & 31, res;
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h

index c8bc24cd9a7048884993b2ce3cc6adb54038bbd8..79df13ec9d6b40e729fbf8a6b8256e1953370c57 100644 (file)
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -8,14 +8,10 @@
  
  /*
   * Every architecture must define this function. It's the fastest
- * way of searching a 168-bit bitmap where the first 128 bits are
- * unlikely to be set. It's guaranteed that at least one of the 168
+ * way of searching a 140-bit bitmap where the first 100 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 140
   * bits is cleared.
   */
-#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
-# error update this function.
-#endif
-
  static inline int sched_find_first_bit(unsigned long *b)
  {
         if (unlikely(b[0]))
@@ -24,11 +20,9 @@ static inline int sched_find_first_bit(unsigned long *b)
                 return __ffs(b[1]) + 32;
         if (unlikely(b[2]))
                 return __ffs(b[2]) + 64;
-       if (unlikely(b[3]))
+       if (b[3])
                 return __ffs(b[3]) + 96;
-       if (b[4])
-               return __ffs(b[4]) + 128;
-       return __ffs(b[5]) + 32 + 128;
+       return __ffs(b[4]) + 128;
  }
  /*
   * possibly do the LDT unload here?
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 9007fe56c6ac8ab4da19f031f767037604ceaa30..9f34e057079af4d145c678f75fbcd93c98bf12f8 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -45,7 +45,8 @@
      thread_info:       &init_thread_info,                              \
      flags:             0,                                              \
      lock_depth:                -1,                                             \
-    __nice:            DEF_USER_NICE,                                  \
+    prio:              120,                                            \
+    static_prio:       120,                                            \
      policy:            SCHED_OTHER,                                    \
      cpus_allowed:      -1,                                             \
      mm:                        NULL,                                           \
diff --git a/include/linux/sched.h b/include/linux/sched.h

index dd6dc6983e044324c907e7240d063187ba1bcac6..f7e14aced6b9b75497dfa10c19dbedb92b13d451 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -150,7 +150,7 @@ extern void trap_init(void);
  extern void update_process_times(int user);
  extern void update_one_process(struct task_struct *p, unsigned long user,
                                unsigned long system, int cpu);
-extern void scheduler_tick(struct task_struct *p);
+extern void scheduler_tick(int user_tick, int system);
  extern void sched_task_migrated(struct task_struct *p);
  extern void smp_migrate_task(int cpu, task_t *task);
  extern unsigned long cache_decay_ticks;
@@ -241,18 +241,16 @@ struct task_struct {
  
         int lock_depth;         /* Lock depth */
  
-       int prio;
-       long __nice;
+       int prio, static_prio;
         list_t run_list;
         prio_array_t *array;
  
-       unsigned int time_slice;
-
         unsigned long sleep_avg;
         unsigned long sleep_timestamp;
  
         unsigned long policy;
         unsigned long cpus_allowed;
+       unsigned int time_slice;
  
         struct task_struct *next_task, *prev_task;
  
@@ -385,66 +383,12 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
   */
  #define _STK_LIM       (8*1024*1024)
  
-/*
- * RT priorites go from 0 to 99, but internally we max
- * them out at 128 to make it easier to search the
- * scheduler bitmap.
- */
-#define MAX_RT_PRIO            128
-/*
- * The lower the priority of a process, the more likely it is
- * to run. Priority of a process goes from 0 to 167. The 0-99
- * priority range is allocated to RT tasks, the 128-167 range
- * is for SCHED_OTHER tasks.
- */
-#define MAX_PRIO               (MAX_RT_PRIO + 40)
-
-/*
- * Scales user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ 128 ... 167 (MAX_PRIO-1) ]
- *
- * User-nice value of -20 == static priority 128, and
- * user-nice value 19 == static priority 167. The lower
- * the priority value, the higher the task's priority.
- */
-#define NICE_TO_PRIO(n)                (MAX_RT_PRIO + (n) + 20)
-#define DEF_USER_NICE          0
-
-/*
- * Default timeslice is 150 msecs, maximum is 300 msecs.
- * Minimum timeslice is 10 msecs.
- *
- * These are the 'tuning knobs' of the scheduler:
- */
-#define MIN_TIMESLICE          ( 10 * HZ / 1000)
-#define MAX_TIMESLICE          (300 * HZ / 1000)
-#define CHILD_FORK_PENALTY     95
-#define PARENT_FORK_PENALTY    100
-#define EXIT_WEIGHT            3
-#define PRIO_INTERACTIVE_RATIO 20
-#define PRIO_CPU_HOG_RATIO     60
-#define PRIO_BONUS_RATIO       70
-#define INTERACTIVE_DELTA      3
-#define MAX_SLEEP_AVG          (2*HZ)
-#define STARVATION_LIMIT       (2*HZ)
-
-#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
-#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
-
-/*
- * NICE_TO_TIMESLICE scales nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a process's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority process gets MIN_TIMESLICE worth of execution time.
- */
-
-#define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \
-       ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39)
-
  extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
  extern void set_user_nice(task_t *p, long nice);
+extern int task_prio(task_t *p);
+extern int task_nice(task_t *p);
+extern int idle_cpu(int cpu);
+
  asmlinkage long sys_sched_yield(void);
  #define yield() sys_sched_yield()
  
@@ -526,6 +470,7 @@ extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
                                                     signed long timeout));
  extern int FASTCALL(wake_up_process(struct task_struct * tsk));
  extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_exit(task_t * p));
  
  #define wake_up(x)                     __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
  #define wake_up_nr(x, nr)              __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
diff --git a/kernel/exit.c b/kernel/exit.c

index 2566c5c5ba5a313c128af5ac46f97b189b0fb176..4a36dd9ce05b0cb402c5234236cb82ca58e27828 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,8 +31,6 @@ int getrusage(struct task_struct *, int, struct rusage *);
  
  static void release_task(struct task_struct * p)
  {
-       unsigned long flags;
-
         if (p == current)
                 BUG();
  #ifdef CONFIG_SMP
@@ -46,25 +44,7 @@ static void release_task(struct task_struct * p)
         current->cmin_flt += p->min_flt + p->cmin_flt;
         current->cmaj_flt += p->maj_flt + p->cmaj_flt;
         current->cnswap += p->nswap + p->cnswap;
-       /*
-        * Potentially available timeslices are retrieved
-        * here - this way the parent does not get penalized
-        * for creating too many processes.
-        *
-        * (this cannot be used to artificially 'generate'
-        * timeslices, because any timeslice recovered here
-        * was given away by the parent in the first place.)
-        */
-       __save_flags(flags);
-       __cli();
-       current->time_slice += p->time_slice;
-       if (current->time_slice > MAX_TIMESLICE)
-               current->time_slice = MAX_TIMESLICE;
-       if (p->sleep_avg < current->sleep_avg)
-               current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
-                               p->sleep_avg) / (EXIT_WEIGHT + 1);
-       __restore_flags(flags);
-
+       sched_exit(p);
         p->pid = 0;
         put_task_struct(p);
  }
@@ -172,9 +152,8 @@ void reparent_to_init(void)
         current->exit_signal = SIGCHLD;
  
         current->ptrace = 0;
-       if ((current->policy == SCHED_OTHER) &&
-                       (current->__nice < DEF_USER_NICE))
-               set_user_nice(current, DEF_USER_NICE);
+       if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
+               set_user_nice(current, 0);
         /* cpus_allowed? */
         /* rt_priority? */
         /* signals? */
diff --git a/kernel/fork.c b/kernel/fork.c

index 94ccd2aead70a67c2f969f3affdcdbbffab0f0e8..3ba556b4770538be348c854b12372dce3ed521bf 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -749,14 +749,11 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start,
                  * runqueue lock is not a problem.
                  */
                 current->time_slice = 1;
-               scheduler_tick(current);
+               scheduler_tick(0, 0);
         }
         p->sleep_timestamp = jiffies;
         __restore_flags(flags);
  
-       if (p->policy == SCHED_OTHER)
-               p->prio = MAX_PRIO - 1 - ((MAX_PRIO - 1 - p->prio) * 1) / 3;
-
         /*
          * Ok, add it to the run-queues and make it
          * visible to the rest of the system.
diff --git a/kernel/ksyms.c b/kernel/ksyms.c

index 3f0b16ccf16b1a383ee091493355ea8e2e1ae135..8621163d04b373a54a992c5c5aaafcb53a9a6622 100644 (file)
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -455,6 +455,8 @@ EXPORT_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(schedule_timeout);
  EXPORT_SYMBOL(sys_sched_yield);
  EXPORT_SYMBOL(set_user_nice);
+EXPORT_SYMBOL(task_nice);
+EXPORT_SYMBOL_GPL(idle_cpu);
  EXPORT_SYMBOL(jiffies);
  EXPORT_SYMBOL(xtime);
  EXPORT_SYMBOL(do_gettimeofday);
diff --git a/kernel/sched.c b/kernel/sched.c

index c8b01155f1c06fdf830c6945cbdec2ce9b16c50b..08e1293ab92a8f7c20e2b42887594ab5ee23a083 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -20,8 +20,107 @@
  #include <linux/interrupt.h>
  #include <linux/completion.h>
  #include <asm/mmu_context.h>
+#include <linux/kernel_stat.h>
  
-#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
+/*
+ * Priority of a process goes from 0 to 139. The 0-99
+ * priority range is allocated to RT tasks, the 100-139
+ * range is for SCHED_OTHER tasks. Priority values are
+ * inverted: lower p->prio value means higher priority.
+ */
+#define MAX_RT_PRIO            100
+#define MAX_PRIO               (MAX_RT_PRIO + 40)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
+ * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * they expire.
+ */
+#define MIN_TIMESLICE          ( 10 * HZ / 1000)
+#define MAX_TIMESLICE          (300 * HZ / 1000)
+#define CHILD_PENALTY          95
+#define PARENT_PENALTY         100
+#define EXIT_WEIGHT            3
+#define PRIO_BONUS_RATIO       25
+#define INTERACTIVE_DELTA      2
+#define MAX_SLEEP_AVG          (2*HZ)
+#define STARVATION_LIMIT       (2*HZ)
+
+/*
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ *  priority range a task can explore, a value of '1' means the
+ *  task is rated interactive.)
+ *
+ * Ie. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavily CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
+ */
+
+#define SCALE(v1,v1_max,v2_max) \
+       (v1) * (v2_max) / (v1_max)
+
+#define DELTA(p) \
+       (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
+               INTERACTIVE_DELTA)
+
+#define TASK_INTERACTIVE(p) \
+       ((p)->prio <= (p)->static_prio - DELTA(p))
+
+/*
+ * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
+ * to time slice values.
+ *
+ * The higher a process's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority process gets MIN_TIMESLICE worth of execution time.
+ */
+
+#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
+       ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
+
+/*
+ * These are the runqueue data structures:
+ */
+
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
  
  typedef struct runqueue runqueue_t;
  
@@ -54,8 +153,7 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
  #define this_rq()              cpu_rq(smp_processor_id())
  #define task_rq(p)             cpu_rq((p)->thread_info->cpu)
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
-#define rt_task(p)             ((p)->policy != SCHED_OTHER)
-
+#define rt_task(p)             ((p)->prio < MAX_RT_PRIO)
  
  static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
  {
@@ -78,6 +176,7 @@ static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
         spin_unlock_irqrestore(&rq->lock, *flags);
         preempt_enable();
  }
+
  /*
   * Adding/removing a task to/from a priority array:
   */
@@ -97,66 +196,25 @@ static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
         p->array = array;
  }
  
-/*
- * A task is 'heavily interactive' if it either has reached the
- * bottom 25% of the SCHED_OTHER priority range, or if it is below
- * its default priority by at least 3 priority levels. In this
- * case we favor it by reinserting it on the active array,
- * even after it expired its current timeslice.
- *
- * A task is a 'CPU hog' if it's either in the upper 25% of the
- * SCHED_OTHER priority range, or if's not an interactive task.
- *
- * A task can get a priority bonus by being 'somewhat
- * interactive' - and it will get a priority penalty for
- * being a CPU hog.
- *
- */
-
-#define PRIO_INTERACTIVE \
-               (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_INTERACTIVE_RATIO/100)
-#define PRIO_CPU_HOG \
-               (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_CPU_HOG_RATIO/100)
-
-#define TASK_INTERACTIVE(p) \
-       (((p)->prio <= PRIO_INTERACTIVE) || \
-       (((p)->prio < PRIO_CPU_HOG) && \
-               ((p)->prio <= NICE_TO_PRIO((p)->__nice) - INTERACTIVE_DELTA)))
-
-/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
-               ((rq)->expired_timestamp && \
-               (jiffies - (rq)->expired_timestamp >= \
-                       STARVATION_LIMIT * ((rq)->nr_running) + 1))
-
  static inline int effective_prio(task_t *p)
  {
         int bonus, prio;
  
         /*
          * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
-        * into the -14 ... +14 bonus/penalty range.
+        * into the -5 ... 0 ... +5 bonus/penalty range.
          *
-        * We use 70% of the full 0...39 priority range so that:
+        * We use 25% of the full 0...39 priority range so that:
          *
-        * 1) nice +19 CPU hogs do not preempt nice 0 CPU hogs.
-        * 2) nice -20 interactive tasks do not get preempted by
-        *    nice 0 interactive tasks.
+        * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+        * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
          *
          * Both properties are important to certain workloads.
          */
         bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
                         MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
  
-       prio = NICE_TO_PRIO(p->__nice) - bonus;
+       prio = p->static_prio - bonus;
         if (prio < MAX_RT_PRIO)
                 prio = MAX_RT_PRIO;
         if (prio > MAX_PRIO-1)
@@ -191,7 +249,6 @@ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
         rq->nr_running--;
         dequeue_task(p, p->array);
         p->array = NULL;
-       p->sleep_timestamp = jiffies;
  }
  
  static inline void resched_task(task_t *p)
@@ -305,16 +362,20 @@ int wake_up_process(task_t * p)
  void wake_up_forked_process(task_t * p)
  {
         runqueue_t *rq;
-       
+
         preempt_disable();
         rq = this_rq();
  
         p->state = TASK_RUNNING;
         if (!rt_task(p)) {
-               p->sleep_avg = p->sleep_avg * CHILD_FORK_PENALTY / 100;
+               /*
+                * We decrease the sleep average of forking parents
+                * and children as well, to keep max-interactive tasks
+                * from forking tasks that are max-interactive.
+                */
+               current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
+               p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
                 p->prio = effective_prio(p);
-
-               current->sleep_avg = current->sleep_avg * PARENT_FORK_PENALTY / 100;
         }
         spin_lock_irq(&rq->lock);
         p->thread_info->cpu = smp_processor_id();
@@ -323,10 +384,37 @@ void wake_up_forked_process(task_t * p)
         preempt_enable();
  }
  
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many processes.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+void sched_exit(task_t * p)
+{
+       __cli();
+       current->time_slice += p->time_slice;
+       if (unlikely(current->time_slice > MAX_TIMESLICE))
+               current->time_slice = MAX_TIMESLICE;
+       __sti();
+       /*
+        * If the child was a (relative-) CPU hog then decrease
+        * the sleep_avg of the parent as well.
+        */
+       if (p->sleep_avg < current->sleep_avg)
+               current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
+                       p->sleep_avg) / (EXIT_WEIGHT + 1);
+}
+
+#if CONFIG_SMP
  asmlinkage void schedule_tail(task_t *prev)
  {
         spin_unlock_irq(&this_rq()->lock);
  }
+#endif
  
  static inline void context_switch(task_t *prev, task_t *next)
  {
@@ -403,6 +491,7 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq,
         }
         return nr_running;
  }
+
  /*
   * Current runqueue is empty, or rebalance tick: if there is an
   * inbalance (current runqueue is too short) then pull from
@@ -492,13 +581,13 @@ static void load_balance(runqueue_t *this_rq, int idle)
                 array = busiest->active;
  
  new_array:
-       /*
-        * Load-balancing does not affect RT tasks, so we start the
-        * searching at priority 128.
-        */
-       idx = MAX_RT_PRIO;
+       /* Start searching at priority 0: */
+       idx = 0;
  skip_bitmap:
-       idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+       if (!idx)
+               idx = sched_find_first_bit(array->bitmap);
+       else
+               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
         if (idx == MAX_PRIO) {
                 if (array == busiest->expired) {
                         array = busiest->active;
@@ -576,19 +665,44 @@ static inline void idle_tick(void)
  
  #endif
  
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks:
+ */
+#define EXPIRED_STARVING(rq) \
+               ((rq)->expired_timestamp && \
+               (jiffies - (rq)->expired_timestamp >= \
+                       STARVATION_LIMIT * ((rq)->nr_running) + 1))
+
  /*
   * This function gets called by the timer code, with HZ frequency.
   * We call it with interrupts disabled.
   */
-void scheduler_tick(task_t *p)
+void scheduler_tick(int user_tick, int system)
  {
+       int cpu = smp_processor_id();
         runqueue_t *rq = this_rq();
-#if CONFIG_SMP
-       unsigned long now = jiffies;
+       task_t *p = current;
  
-       if (p == rq->idle)
-               return idle_tick();
+       if (p == rq->idle) {
+               if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+                       kstat.per_cpu_system[cpu] += system;
+#if CONFIG_SMP
+               idle_tick();
  #endif
+               return;
+       }
+       if (TASK_NICE(p) > 0)
+               kstat.per_cpu_nice[cpu] += user_tick;
+       else
+               kstat.per_cpu_user[cpu] += user_tick;
+       kstat.per_cpu_system[cpu] += system;
+
         /* Task might have expired already, but not scheduled off yet */
         if (p->array != rq->active) {
                 set_tsk_need_resched(p);
@@ -601,7 +715,7 @@ void scheduler_tick(task_t *p)
                  * FIFO tasks have no timeslices.
                  */
                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
-                       p->time_slice = NICE_TO_TIMESLICE(p->__nice);
+                       p->time_slice = TASK_TIMESLICE(p);
                         set_tsk_need_resched(p);
  
                         /* put it at the end of the queue: */
@@ -624,7 +738,7 @@ void scheduler_tick(task_t *p)
                 dequeue_task(p, rq->active);
                 set_tsk_need_resched(p);
                 p->prio = effective_prio(p);
-               p->time_slice = NICE_TO_TIMESLICE(p->__nice);
+               p->time_slice = TASK_TIMESLICE(p);
  
                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                         if (!rq->expired_timestamp)
@@ -635,7 +749,7 @@ void scheduler_tick(task_t *p)
         }
  out:
  #if CONFIG_SMP
-       if (!(now % BUSY_REBALANCE_TICK))
+       if (!(jiffies % BUSY_REBALANCE_TICK))
                 load_balance(rq, 0);
  #endif
         spin_unlock(&rq->lock);
@@ -656,12 +770,12 @@ asmlinkage void schedule(void)
  
         if (unlikely(in_interrupt()))
                 BUG();
-
         preempt_disable();
         prev = current;
         rq = this_rq();
-       
+
         release_kernel_lock(prev, smp_processor_id());
+       prev->sleep_timestamp = jiffies;
         spin_lock_irq(&rq->lock);
  
  #ifdef CONFIG_PREEMPT
@@ -672,19 +786,17 @@ asmlinkage void schedule(void)
         if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
                 goto pick_next_task;
  #endif
-       
+
         switch (prev->state) {
-       case TASK_RUNNING:
-               prev->sleep_timestamp = jiffies;
-               break;
         case TASK_INTERRUPTIBLE:
                 if (unlikely(signal_pending(prev))) {
                         prev->state = TASK_RUNNING;
-                       prev->sleep_timestamp = jiffies;
                         break;
                 }
         default:
                 deactivate_task(prev, rq);
+       case TASK_RUNNING:
+               ;
         }
  #if CONFIG_SMP || CONFIG_PREEMPT
  pick_next_task:
@@ -905,6 +1017,8 @@ void set_cpus_allowed(task_t *p, unsigned long new_mask)
         new_mask &= cpu_online_map;
         if (!new_mask)
                 BUG();
+       if (p != current)
+               BUG();
  
         p->cpus_allowed = new_mask;
         /*
@@ -929,7 +1043,7 @@ void set_user_nice(task_t *p, long nice)
         prio_array_t *array;
         runqueue_t *rq;
  
-       if (p->__nice == nice)
+       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                 return;
         /*
          * We have to be careful, if called from sys_setpriority(),
@@ -937,13 +1051,13 @@ void set_user_nice(task_t *p, long nice)
          */
         rq = lock_task_rq(p, &flags);
         if (rt_task(p)) {
-               p->__nice = nice;
+               p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
         array = p->array;
         if (array)
                 dequeue_task(p, array);
-       p->__nice = nice;
+       p->static_prio = NICE_TO_PRIO(nice);
         p->prio = NICE_TO_PRIO(nice);
         if (array) {
                 enqueue_task(p, array);
@@ -951,8 +1065,7 @@ void set_user_nice(task_t *p, long nice)
                  * If the task is running and lowered its priority,
                  * or increased its priority then reschedule its CPU:
                  */
-               if ((nice < p->__nice) ||
-                               ((p->__nice < nice) && (p == rq->curr)))
+               if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
                         resched_task(rq->curr);
         }
  out_unlock:
@@ -985,7 +1098,7 @@ asmlinkage long sys_nice(int increment)
         if (increment > 40)
                 increment = 40;
  
-       nice = current->__nice + increment;
+       nice = PRIO_TO_NICE(current->static_prio) + increment;
         if (nice < -20)
                 nice = -20;
         if (nice > 19)
@@ -996,6 +1109,27 @@ asmlinkage long sys_nice(int increment)
  
  #endif
  
+/*
+ * This is the priority value as seen by users in /proc
+ *
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(task_t *p)
+{
+       return p->prio - 100;
+}
+
+int task_nice(task_t *p)
+{
+       return TASK_NICE(p);
+}
+
+int idle_cpu(int cpu)
+{
+       return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
  static inline task_t *find_process_by_pid(pid_t pid)
  {
         return pid ? find_task_by_pid(pid) : current;
@@ -1069,9 +1203,9 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param)
         p->policy = policy;
         p->rt_priority = lp.sched_priority;
         if (rt_task(p))
-               p->prio = 99-p->rt_priority;
+               p->prio = 99 - p->rt_priority;
         else
-               p->prio = NICE_TO_PRIO(p->__nice);
+               p->prio = p->static_prio;
         if (array)
                 activate_task(p, task_rq(p));
  
@@ -1186,7 +1320,6 @@ asmlinkage long sys_sched_yield(void)
  
         return 0;
  }
-
  asmlinkage long sys_sched_get_priority_max(int policy)
  {
         int ret = -EINVAL;
@@ -1232,7 +1365,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
         p = find_process_by_pid(pid);
         if (p)
                 jiffies_to_timespec(p->policy & SCHED_FIFO ?
-                                        0 : NICE_TO_TIMESLICE(p->__nice), &t);
+                                        0 : TASK_TIMESLICE(p), &t);
         read_unlock(&tasklist_lock);
         if (p)
                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
diff --git a/kernel/sys.c b/kernel/sys.c

index 15e7e3455ecf507a3189e238ae6acee08a8abc9b..56d483f984fd4c2536fbe59f75e86e469030abc2 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -221,7 +221,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
                 }
                 if (error == -ESRCH)
                         error = 0;
-               if (niceval < p->__nice && !capable(CAP_SYS_NICE))
+               if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
                         error = -EACCES;
                 else
                         set_user_nice(p, niceval);
@@ -250,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who)
                 long niceval;
                 if (!proc_sel(p, which, who))
                         continue;
-               niceval = 20 - p->__nice;
+               niceval = 20 - task_nice(p);
                 if (niceval > retval)
                         retval = niceval;
         }
diff --git a/kernel/timer.c b/kernel/timer.c

index da17ae4acdd8069e5c009c1abe191fa173a175f9..6b79d204588297ae8a2e94b5ae2403a93793d179 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -584,17 +584,7 @@ void update_process_times(int user_tick)
         int cpu = smp_processor_id(), system = user_tick ^ 1;
  
         update_one_process(p, user_tick, system, cpu);
-       if (p->pid) {
-               if (p->__nice > 0)
-                       kstat.per_cpu_nice[cpu] += user_tick;
-               else
-                       kstat.per_cpu_user[cpu] += user_tick;
-               kstat.per_cpu_system[cpu] += system;
-       } else {
-               if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-                       kstat.per_cpu_system[cpu] += system;
-       }
-       scheduler_tick(p);
+       scheduler_tick(user_tick, system);
  }
  
  /*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 8c476d491dcdc9dee15d7f08c11dc2d61b88bb4c..30e761117982aa3526144d725ceffafa83b51f2f 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -82,7 +82,7 @@ static int badness(struct task_struct *p)
          * Niced processes are most likely less important, so double
          * their badness points.
          */
-       if (p->__nice > 0)
+       if (task_nice(p) > 0)
                 points *= 2;
  
         /*
@@ -146,7 +146,7 @@ void oom_kill_task(struct task_struct *p)
          * all the memory it needs. That way it should be able to
          * exit() and clear out its resources quickly...
          */
-       p->time_slice = 2 * MAX_TIMESLICE;
+       p->time_slice = HZ;
         p->flags |= PF_MEMALLOC | PF_MEMDIE;
  
         /* This process has hardware access, be more careful. */
author	Ingo Molnar <mingo@earth2.(none)>
	Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)
committer	Ingo Molnar <mingo@earth2.(none)>
	Mon, 11 Feb 2002 21:47:52 +0000 (22:47 +0100)
arch/i386/kernel/entry.S		patch \| blob \| history
fs/binfmt_elf.c		patch \| blob \| history
fs/proc/array.c		patch \| blob \| history
include/asm-i386/bitops.h		patch \| blob \| history
include/asm-i386/mmu_context.h		patch \| blob \| history
include/linux/init_task.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/ksyms.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sys.c		patch \| blob \| history
kernel/timer.c		patch \| blob \| history
mm/oom_kill.c		patch \| blob \| history