extern void update_process_times(int user);
extern void update_one_process(struct task_struct *p, unsigned long user,
unsigned long system, int cpu);
-extern void scheduler_tick(struct task_struct *p);
+extern void scheduler_tick(int user_tick, int system);
extern void sched_task_migrated(struct task_struct *p);
extern void smp_migrate_task(int cpu, task_t *task);
extern unsigned long cache_decay_ticks;
int lock_depth; /* Lock depth */
- int prio;
- long __nice;
+ int prio, static_prio;
list_t run_list;
prio_array_t *array;
- unsigned int time_slice;
-
unsigned long sleep_avg;
unsigned long sleep_timestamp;
unsigned long policy;
unsigned long cpus_allowed;
+ unsigned int time_slice;
struct task_struct *next_task, *prev_task;
*/
#define _STK_LIM (8*1024*1024)
-/*
- * RT priorites go from 0 to 99, but internally we max
- * them out at 128 to make it easier to search the
- * scheduler bitmap.
- */
-#define MAX_RT_PRIO 128
-/*
- * The lower the priority of a process, the more likely it is
- * to run. Priority of a process goes from 0 to 167. The 0-99
- * priority range is allocated to RT tasks, the 128-167 range
- * is for SCHED_OTHER tasks.
- */
-#define MAX_PRIO (MAX_RT_PRIO + 40)
-
-/*
- * Scales user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ 128 ... 167 (MAX_PRIO-1) ]
- *
- * User-nice value of -20 == static priority 128, and
- * user-nice value 19 == static priority 167. The lower
- * the priority value, the higher the task's priority.
- */
-#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
-#define DEF_USER_NICE 0
-
-/*
- * Default timeslice is 150 msecs, maximum is 300 msecs.
- * Minimum timeslice is 10 msecs.
- *
- * These are the 'tuning knobs' of the scheduler:
- */
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (300 * HZ / 1000)
-#define CHILD_FORK_PENALTY 95
-#define PARENT_FORK_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_INTERACTIVE_RATIO 20
-#define PRIO_CPU_HOG_RATIO 60
-#define PRIO_BONUS_RATIO 70
-#define INTERACTIVE_DELTA 3
-#define MAX_SLEEP_AVG (2*HZ)
-#define STARVATION_LIMIT (2*HZ)
-
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
- * NICE_TO_TIMESLICE scales nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a process's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority process gets MIN_TIMESLICE worth of execution time.
- */
-
-#define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39)
-
extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
extern void set_user_nice(task_t *p, long nice);
+extern int task_prio(task_t *p);
+extern int task_nice(task_t *p);
+extern int idle_cpu(int cpu);
+
asmlinkage long sys_sched_yield(void);
#define yield() sys_sched_yield()
signed long timeout));
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_exit(task_t * p));
#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <asm/mmu_context.h>
+#include <linux/kernel_stat.h>
-#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
+/*
+ * Priority of a process goes from 0 to 139. The 0-99
+ * priority range is allocated to RT tasks, the 100-139
+ * range is for SCHED_OTHER tasks. Priority values are
+ * inverted: lower p->prio value means higher priority.
+ *
+ * MAX_RT_PRIO is the first non-RT priority value; MAX_PRIO
+ * (140) is one past the largest valid priority value.
+ */
+#define MAX_RT_PRIO 100
+#define MAX_PRIO (MAX_RT_PRIO + 40)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
+ * and back.
+ *
+ * NICE_TO_PRIO() and PRIO_TO_NICE() take plain integer values;
+ * TASK_NICE() takes a task pointer and converts its static_prio.
+ */
+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ *
+ * USER_PRIO() takes a priority value, TASK_USER_PRIO() takes a
+ * task pointer and uses its static_prio. MAX_USER_PRIO evaluates
+ * to 40, i.e. the number of user priority levels.
+ */
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
+ * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * they expire.
+ *
+ * The timeslice limits are converted from msecs to timer ticks
+ * via HZ; the sleep average and starvation limits are expressed
+ * in ticks as well.
+ */
+#define MIN_TIMESLICE ( 10 * HZ / 1000) /* smallest timeslice, in ticks */
+#define MAX_TIMESLICE (300 * HZ / 1000) /* largest timeslice, in ticks */
+#define CHILD_PENALTY 95 /* percent of its sleep_avg a freshly forked child keeps */
+#define PARENT_PENALTY 100 /* percent of its sleep_avg the forking parent keeps */
+#define EXIT_WEIGHT 3 /* weight of the parent's own sleep_avg when an exiting child's is averaged in */
+#define PRIO_BONUS_RATIO 25 /* percent of the user priority range used for the sleep bonus/penalty */
+#define INTERACTIVE_DELTA 2 /* constant offset added in DELTA() */
+#define MAX_SLEEP_AVG (2*HZ) /* upper end of the sleep_avg range scaled by effective_prio() */
+#define STARVATION_LIMIT (2*HZ) /* per-running-task factor of the EXPIRED_STARVING() deadline */
+
+/*
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
+ * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ * priority range a task can explore, a value of '1' means the
+ * task is rated interactive.)
+ *
+ * Ie. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavily CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
+ */
+
+/*
+ * SCALE(v1, v1_max, v2_max): rescale v1 from a range of width
+ * v1_max to a range of width v2_max, i.e. v1 * v2_max / v1_max.
+ * With integer operands the division truncates.
+ *
+ * The whole expansion is parenthesized so that the macro keeps
+ * its meaning when used inside a larger expression.
+ */
+#define SCALE(v1,v1_max,v2_max) \
+	((v1) * (v2_max) / (v1_max))
+
+#define DELTA(p) /* p: task pointer; nice-dependent interactivity threshold */ \
+ (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + /* nice scaled onto the bonus range: nice * 10 / 40 with the current knobs */ \
+ INTERACTIVE_DELTA)
+
+#define TASK_INTERACTIVE(p) /* p: task pointer */ \
+ ((p)->prio <= (p)->static_prio - DELTA(p)) /* true when prio is numerically at least DELTA(p) lower than static_prio */
+
+/*
+ * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
+ * to time slice values.
+ *
+ * The higher a process's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority process gets MIN_TIMESLICE worth of execution time.
+ *
+ * The argument is a task pointer; the result is derived from its
+ * static_prio and is in the same unit as MIN_TIMESLICE and
+ * MAX_TIMESLICE. A static_prio of MAX_PRIO-1 (nice 19) yields
+ * MIN_TIMESLICE, a static_prio of MAX_RT_PRIO (nice -20) yields
+ * MAX_TIMESLICE; values in between are interpolated linearly
+ * using integer division.
+ */
+
+#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
+ ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
+
+/*
+ * These are the runqueue data structures:
+ *
+ * BITMAP_SIZE is the number of longs needed to hold MAX_PRIO+1
+ * bits: the bit count is rounded up to whole bytes, and the byte
+ * count is rounded up to whole longs.
+ */
+
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
typedef struct runqueue runqueue_t;
#define this_rq() cpu_rq(smp_processor_id())
#define task_rq(p) cpu_rq((p)->thread_info->cpu)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define rt_task(p) ((p)->policy != SCHED_OTHER)
-
+#define rt_task(p) ((p)->prio < MAX_RT_PRIO) /* nonzero when p's current prio lies in the RT range, i.e. below MAX_RT_PRIO */
static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
{
spin_unlock_irqrestore(&rq->lock, *flags);
preempt_enable();
}
+
/*
* Adding/removing a task to/from a priority array:
*/
p->array = array;
}
-/*
- * A task is 'heavily interactive' if it either has reached the
- * bottom 25% of the SCHED_OTHER priority range, or if it is below
- * its default priority by at least 3 priority levels. In this
- * case we favor it by reinserting it on the active array,
- * even after it expired its current timeslice.
- *
- * A task is a 'CPU hog' if it's either in the upper 25% of the
- * SCHED_OTHER priority range, or if's not an interactive task.
- *
- * A task can get a priority bonus by being 'somewhat
- * interactive' - and it will get a priority penalty for
- * being a CPU hog.
- *
- */
-
-#define PRIO_INTERACTIVE \
- (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_INTERACTIVE_RATIO/100)
-#define PRIO_CPU_HOG \
- (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_CPU_HOG_RATIO/100)
-
-#define TASK_INTERACTIVE(p) \
- (((p)->prio <= PRIO_INTERACTIVE) || \
- (((p)->prio < PRIO_CPU_HOG) && \
- ((p)->prio <= NICE_TO_PRIO((p)->__nice) - INTERACTIVE_DELTA)))
-
-/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
- ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))
-
static inline int effective_prio(task_t *p)
{
int bonus, prio;
/*
* Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -14 ... +14 bonus/penalty range.
+ * into the -5 ... 0 ... +5 bonus/penalty range.
*
- * We use 70% of the full 0...39 priority range so that:
+ * We use 25% of the full 0...39 priority range so that:
*
- * 1) nice +19 CPU hogs do not preempt nice 0 CPU hogs.
- * 2) nice -20 interactive tasks do not get preempted by
- * nice 0 interactive tasks.
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
*
* Both properties are important to certain workloads.
*/
bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
- prio = NICE_TO_PRIO(p->__nice) - bonus;
+ prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
rq->nr_running--;
dequeue_task(p, p->array);
p->array = NULL;
- p->sleep_timestamp = jiffies;
}
static inline void resched_task(task_t *p)
void wake_up_forked_process(task_t * p)
{
runqueue_t *rq;
-
+
preempt_disable();
rq = this_rq();
p->state = TASK_RUNNING;
if (!rt_task(p)) {
- p->sleep_avg = p->sleep_avg * CHILD_FORK_PENALTY / 100;
+ /*
+ * We decrease the sleep average of forking parents
+ * and children as well, to keep max-interactive tasks
+ * from forking tasks that are max-interactive.
+ */
+ current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
+ p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
p->prio = effective_prio(p);
-
- current->sleep_avg = current->sleep_avg * PARENT_FORK_PENALTY / 100;
}
spin_lock_irq(&rq->lock);
p->thread_info->cpu = smp_processor_id();
preempt_enable();
}
+/*
+ * sched_exit - fold an exiting child's scheduling state into 'current'.
+ *
+ * Whatever timeslice the child 'p' still holds is handed to the
+ * current task, so that a parent is not penalized for creating
+ * many processes. The result is capped at MAX_TIMESLICE. (This
+ * cannot be used to 'generate' timeslices artificially, because
+ * any timeslice recovered here was given away by the parent in
+ * the first place.)
+ *
+ * If the child's sleep_avg is lower than the current task's, i.e.
+ * the child was a (relative-) CPU hog, the current task's
+ * sleep_avg is pulled towards it: the new value is the weighted
+ * mean of EXIT_WEIGHT parts of the old value and one part of the
+ * child's value.
+ */
+void sched_exit(task_t * p)
+{
+	unsigned int refund;
+
+	/* The timeslice hand-over runs with local interrupts disabled. */
+	__cli();
+	refund = current->time_slice + p->time_slice;
+	current->time_slice = unlikely(refund > MAX_TIMESLICE) ?
+					MAX_TIMESLICE : refund;
+	__sti();
+
+	/* Nothing more to do unless the child slept less than we did. */
+	if (p->sleep_avg >= current->sleep_avg)
+		return;
+
+	current->sleep_avg = (p->sleep_avg +
+			EXIT_WEIGHT * current->sleep_avg) / (EXIT_WEIGHT + 1);
+}
+
+#if CONFIG_SMP
asmlinkage void schedule_tail(task_t *prev)
{
spin_unlock_irq(&this_rq()->lock);
}
+#endif
static inline void context_switch(task_t *prev, task_t *next)
{
}
return nr_running;
}
+
/*
* Current runqueue is empty, or rebalance tick: if there is an
* inbalance (current runqueue is too short) then pull from
array = busiest->active;
new_array:
- /*
- * Load-balancing does not affect RT tasks, so we start the
- * searching at priority 128.
- */
- idx = MAX_RT_PRIO;
+ /* Start searching at priority 0: */
+ idx = 0;
skip_bitmap:
- idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+ if (!idx)
+ idx = sched_find_first_bit(array->bitmap);
+ else
+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
if (idx == MAX_PRIO) {
if (array == busiest->expired) {
array = busiest->active;
#endif
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * increasing number of running tasks:
+ *
+ * The argument is a runqueue pointer. A zero expired_timestamp never
+ * counts as starving; otherwise the jiffies elapsed since
+ * expired_timestamp are compared against
+ * STARVATION_LIMIT * nr_running + 1.
+ */
+#define EXPIRED_STARVING(rq) \
+ ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * ((rq)->nr_running) + 1))
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(task_t *p)
+void scheduler_tick(int user_tick, int system)
{
+ int cpu = smp_processor_id();
runqueue_t *rq = this_rq();
-#if CONFIG_SMP
- unsigned long now = jiffies;
+ task_t *p = current;
- if (p == rq->idle)
- return idle_tick();
+ if (p == rq->idle) {
+ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+ kstat.per_cpu_system[cpu] += system;
+#if CONFIG_SMP
+ idle_tick();
#endif
+ return;
+ }
+ if (TASK_NICE(p) > 0)
+ kstat.per_cpu_nice[cpu] += user_tick;
+ else
+ kstat.per_cpu_user[cpu] += user_tick;
+ kstat.per_cpu_system[cpu] += system;
+
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
set_tsk_need_resched(p);
* FIFO tasks have no timeslices.
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = NICE_TO_TIMESLICE(p->__nice);
+ p->time_slice = TASK_TIMESLICE(p);
set_tsk_need_resched(p);
/* put it at the end of the queue: */
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
- p->time_slice = NICE_TO_TIMESLICE(p->__nice);
+ p->time_slice = TASK_TIMESLICE(p);
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
if (!rq->expired_timestamp)
}
out:
#if CONFIG_SMP
- if (!(now % BUSY_REBALANCE_TICK))
+ if (!(jiffies % BUSY_REBALANCE_TICK))
load_balance(rq, 0);
#endif
spin_unlock(&rq->lock);
if (unlikely(in_interrupt()))
BUG();
-
preempt_disable();
prev = current;
rq = this_rq();
-
+
release_kernel_lock(prev, smp_processor_id());
+ prev->sleep_timestamp = jiffies;
spin_lock_irq(&rq->lock);
#ifdef CONFIG_PREEMPT
if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
goto pick_next_task;
#endif
-
+
switch (prev->state) {
- case TASK_RUNNING:
- prev->sleep_timestamp = jiffies;
- break;
case TASK_INTERRUPTIBLE:
if (unlikely(signal_pending(prev))) {
prev->state = TASK_RUNNING;
- prev->sleep_timestamp = jiffies;
break;
}
default:
deactivate_task(prev, rq);
+ case TASK_RUNNING:
+ ;
}
#if CONFIG_SMP || CONFIG_PREEMPT
pick_next_task:
new_mask &= cpu_online_map;
if (!new_mask)
BUG();
+ if (p != current)
+ BUG();
p->cpus_allowed = new_mask;
/*
prio_array_t *array;
runqueue_t *rq;
- if (p->__nice == nice)
+ if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
/*
* We have to be careful, if called from sys_setpriority(),
*/
rq = lock_task_rq(p, &flags);
if (rt_task(p)) {
- p->__nice = nice;
+ p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
array = p->array;
if (array)
dequeue_task(p, array);
- p->__nice = nice;
+ p->static_prio = NICE_TO_PRIO(nice);
p->prio = NICE_TO_PRIO(nice);
if (array) {
enqueue_task(p, array);
* If the task is running and lowered its priority,
* or increased its priority then reschedule its CPU:
*/
- if ((nice < p->__nice) ||
- ((p->__nice < nice) && (p == rq->curr)))
+ if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
resched_task(rq->curr);
}
out_unlock:
if (increment > 40)
increment = 40;
- nice = current->__nice + increment;
+ nice = PRIO_TO_NICE(current->static_prio) + increment;
if (nice < -20)
nice = -20;
if (nice > 19)
#endif
+/*
+ * This is the priority value as seen by users in /proc.
+ *
+ * The internal priority is reported relative to MAX_RT_PRIO (100):
+ * RT tasks (prio 0 ... 99) show up as -100 ... -1, normal tasks
+ * (prio 100 ... 139) show up as 0 ... 39.
+ */
+int task_prio(task_t *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/* Return the user-nice value that corresponds to p's static priority. */
+int task_nice(task_t *p)
+{
+	return PRIO_TO_NICE(p->static_prio);
+}
+
+/* Return nonzero when the task currently running on 'cpu' is that runqueue's idle task. */
+int idle_cpu(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+
+	return rq->curr == rq->idle;
+}
+
static inline task_t *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_pid(pid) : current;
p->policy = policy;
p->rt_priority = lp.sched_priority;
if (rt_task(p))
- p->prio = 99-p->rt_priority;
+ p->prio = 99 - p->rt_priority;
else
- p->prio = NICE_TO_PRIO(p->__nice);
+ p->prio = p->static_prio;
if (array)
activate_task(p, task_rq(p));
return 0;
}
-
asmlinkage long sys_sched_get_priority_max(int policy)
{
int ret = -EINVAL;
p = find_process_by_pid(pid);
if (p)
jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : NICE_TO_TIMESLICE(p->__nice), &t);
+ 0 : TASK_TIMESLICE(p), &t);
read_unlock(&tasklist_lock);
if (p)
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;