patch-2.2.0-pre1 linux/kernel/sched.c
- Lines: 409
- Date: Mon Dec 28 10:54:09 1998
- Orig file: v2.1.132/linux/kernel/sched.c
- Orig date: Fri Nov 27 13:09:30 1998
diff -u --recursive --new-file v2.1.132/linux/kernel/sched.c linux/kernel/sched.c
@@ -7,6 +7,12 @@
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
/*
@@ -91,47 +97,111 @@
void scheduling_functions_start_here(void) { }
-static inline void reschedule_idle(struct task_struct * p)
+#ifdef __SMP__
+static void reschedule_idle_slow(struct task_struct * p)
{
+/*
+ * (see reschedule_idle() for an explanation first ...)
+ *
+ * Pass #2
+ *
+ * We try to find another (idle) CPU for this woken-up process.
+ *
+ * On SMP, we mostly try to see if the CPU the task used
+ * to run on is idle, but we will use another idle CPU too;
+ * at this point we already know that this CPU is not
+ * willing to reschedule in the near future.
+ *
+ * An idle CPU is definitely wasted, especially if this CPU is
+ * running long-timeslice processes. The following algorithm is
+ * pretty good at finding the best idle CPU to send this process
+ * to.
+ *
+ * [We can try to preempt low-priority processes on other CPUs in
+ * 2.3. Also we can try to use the avg_slice value to predict
+ * 'likely reschedule' events even on other CPUs.]
+ */
+ int best_cpu = p->processor, this_cpu = smp_processor_id();
+ struct task_struct **idle = task, *tsk, *target_tsk;
+ int i = smp_num_cpus;
+
+ target_tsk = NULL;
+ do {
+ tsk = *idle;
+ idle++;
+ if (tsk->has_cpu) {
+ if (tsk->processor == this_cpu)
+ continue;
+ target_tsk = tsk;
+ if (tsk->processor == best_cpu) {
+ /*
+ * bingo, we couldn't get a better
+ * CPU, activate it.
+ */
+ goto send; /* this one helps GCC ... */
+ }
+ }
+ } while (--i > 0);
/*
- * For SMP, we try to see if the CPU the task used
- * to run on is idle..
+ * found any idle CPU?
*/
-#if 0
+ if (target_tsk) {
+send:
+ target_tsk->need_resched = 1;
+ smp_send_reschedule(target_tsk->processor);
+ return;
+ }
+}
+#endif /* __SMP__ */
+
+static inline void reschedule_idle(struct task_struct * p)
+{
+
+ if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
+ current->need_resched = 1;
+ return;
+ }
+
+#ifdef __SMP__
/*
- * Disable this for now. Ingo has some interesting
- * code that looks too complex, and I have some ideas,
- * but in the meantime.. One problem is that "wakeup()"
- * can be (and is) called before we've even initialized
- * SMP completely, so..
+ * ("wakeup()" should not be called before we've initialized
+ * SMP completely.
+ * Basically a not-yet initialized SMP subsystem can be
+ * considered as a not-yet working scheduler, simply dont use
+ * it before it's up and running ...)
+ *
+ * SMP rescheduling is done in 2 passes:
+ * - pass #1: faster: 'quick decisions'
+ * - pass #2: slower: 'lets try and find another CPU'
*/
-#ifdef __SMP__
- int want_cpu = p->processor;
/*
- * Don't even try to find another CPU for us if the task
- * ran on this one before..
+ * Pass #1
+ *
+ * There are two metrics here:
+ *
+ * first, a 'cutoff' interval, currently 0-200 usecs on
+ * x86 CPUs, depending on the size of the 'SMP-local cache'.
+ * If the current process has longer average timeslices than
+ * this, then we utilize the idle CPU.
+ *
+ * second, if the wakeup comes from a process context,
+ * then the two processes are 'related'. (they form a
+ * 'gang')
+ *
+ * An idle CPU is almost always a bad thing, thus we skip
+ * the idle-CPU utilization only if both these conditions
+ * are true. (i.e. a 'process-gang' rescheduling with rather
+ * high frequency should stay on the same CPU).
+ *
+ * [We can switch to something more finegrained in 2.3.]
*/
- if (want_cpu != smp_processor_id()) {
- struct task_struct **idle = task;
- int i = smp_num_cpus;
-
- do {
- struct task_struct *tsk = *idle;
- idle++;
- /* Something like this.. */
- if (tsk->has_cpu && tsk->processor == want_cpu) {
- tsk->need_resched = 1;
- smp_send_reschedule(want_cpu);
- return;
- }
- } while (--i > 0);
- }
-#endif
-#endif
- if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
- current->need_resched = 1;
+ if ((current->avg_slice < cacheflush_time) && !in_interrupt())
+ return;
+
+ reschedule_idle_slow(p);
+#endif /* __SMP__ */
}
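
The Pass #2 loop in reschedule_idle_slow() above relies on a layout detail
its comment does not spell out: it only walks the first smp_num_cpus entries
of the task[] array, so the has_cpu test can identify idle CPUs only if those
entries are the per-CPU idle tasks. A minimal sketch of that scan in
isolation, reusing the patch's identifiers (the helper name is hypothetical
and the fragment is not meant to compile stand-alone):

	/* Return the idle task of an idle CPU, preferring best_cpu,
	 * or NULL if no other CPU is idle. */
	static struct task_struct * find_idle_cpu(int best_cpu, int this_cpu)
	{
		struct task_struct **idle = task, *tsk, *target = NULL;
		int i = smp_num_cpus;

		do {
			tsk = *idle++;
			if (tsk->has_cpu && tsk->processor != this_cpu) {
				target = tsk;
				if (tsk->processor == best_cpu)
					break;	/* the old CPU is idle: best case */
			}
		} while (--i > 0);
		return target;
	}
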
/*
@@ -149,6 +219,7 @@
init_task.next_run = p;
p->next_run = next;
next->prev_run = p;
+ nr_running++;
}
static inline void del_from_runqueue(struct task_struct * p)
@@ -227,7 +298,6 @@
if (!p->next_run) {
add_to_runqueue(p);
reschedule_idle(p);
- nr_running++;
}
spin_unlock_irqrestore(&runqueue_lock, flags);
}
@@ -437,23 +507,6 @@
struct timer_list timer;
unsigned long expire;
- /*
- * PARANOID.
- */
- if (current->state == TASK_UNINTERRUPTIBLE)
- {
- printk(KERN_WARNING "schedule_timeout: task not interrutible "
- "from %p\n", __builtin_return_address(0));
- /*
- * We don' t want to interrupt a not interruptible task
- * risking to cause corruption. Better a a deadlock ;-).
- */
- timeout = MAX_SCHEDULE_TIMEOUT;
- }
-
- /*
- * Here we start for real.
- */
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
@@ -501,6 +554,63 @@
}
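
For readers who have not used the schedule_timeout() interface this hunk
touches, the typical caller pattern looks roughly like this (a sketch with a
hypothetical function name, not code from this diff; the caller sets its own
state first, and the return value is 0 if the timeout expired, or the number
of jiffies left if the task was woken earlier):

	static signed long wait_up_to_one_second(void)
	{
		signed long remaining;

		current->state = TASK_INTERRUPTIBLE;
		remaining = schedule_timeout(HZ);	/* sleep at most HZ jiffies */
		return remaining;
	}
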
/*
+ * This one aligns per-CPU data on cacheline boundaries.
+ */
+static union {
+ struct schedule_data {
+ struct task_struct * prev;
+ long prevstate;
+ cycles_t last_schedule;
+ } schedule_data;
+ char __pad [L1_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+
+static inline void __schedule_tail (void)
+{
+#ifdef __SMP__
+ struct schedule_data * sched_data;
+
+ /*
+ * We might have switched CPUs:
+ */
+ sched_data = & aligned_data[smp_processor_id()].schedule_data;
+
+ /*
+ * Subtle. In the rare event that we got a wakeup to 'prev' just
+ * during the reschedule (this is possible, the scheduler is pretty
+ * parallel), we should do another reschedule in the next task's
+ * context. schedule() will do the right thing next time around.
+ * This is equivalent to 'delaying' the wakeup until the reschedule
+ * has finished.
+ */
+ if (sched_data->prev->state != sched_data->prevstate)
+ current->need_resched = 1;
+
+ /*
+ * Release the previous process ...
+ *
+ * We have dropped all locks, and we must make sure that we
+ * only mark the previous process as no longer having a CPU
+ * after all other state has been seen by other CPU's. Thus
+ * the write memory barrier!
+ */
+ wmb();
+ sched_data->prev->has_cpu = 0;
+#endif /* __SMP__ */
+}
+
+/*
+ * schedule_tail() is called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+void schedule_tail (void)
+{
+ __schedule_tail();
+}
+
+/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*
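
The aligned_data[] union added above is worth noting as a pattern: each CPU's
schedule_data slot is padded to L1_CACHE_BYTES and the array is
__cacheline_aligned, so two CPUs writing their own slots never fight over the
same cache line (no false sharing). The same trick in general form, as a
hypothetical example that is not part of this patch:

	static union {
		struct {
			unsigned long value;	/* touched by one CPU only */
		} data;
		char __pad [L1_CACHE_BYTES];	/* pad the slot to a full cache line */
	} per_cpu_counter [NR_CPUS] __cacheline_aligned;

	/* each CPU updates only its own slot, e.g.:
	 *	per_cpu_counter[smp_processor_id()].data.value++;
	 */
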
@@ -512,11 +622,18 @@
*/
asmlinkage void schedule(void)
{
+ struct schedule_data * sched_data;
struct task_struct * prev, * next;
int this_cpu;
prev = current;
this_cpu = prev->processor;
+ /*
+ * 'sched_data' is protected by the fact that we can run
+ * only one process per CPU.
+ */
+ sched_data = & aligned_data[this_cpu].schedule_data;
+
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
@@ -531,6 +648,7 @@
/* move an exhausted RR process to be last.. */
prev->need_resched = 0;
+
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
@@ -546,6 +664,9 @@
del_from_runqueue(prev);
case TASK_RUNNING:
}
+
+ sched_data->prevstate = prev->state;
+
{
struct task_struct * p = init_task.next_run;
/*
@@ -592,25 +713,49 @@
}
}
+ /*
+ * Maintain the per-process 'average timeslice' value.
+ * (This has to be recalculated even if we reschedule to
+ * the same process.) Currently this is only used on SMP:
+ */
#ifdef __SMP__
- next->has_cpu = 1;
- next->processor = this_cpu;
-#endif
+ {
+ cycles_t t, this_slice;
- if (prev != next) {
- kstat.context_swtch++;
- get_mmu_context(next);
- switch_to(prev,next);
- }
+ t = get_cycles();
+ this_slice = t - sched_data->last_schedule;
+ sched_data->last_schedule = t;
- spin_unlock(&scheduler_lock);
+ /*
+ * Simple, exponentially fading average calculation:
+ */
+ prev->avg_slice = this_slice + prev->avg_slice;
+ prev->avg_slice >>= 1;
+ }
/*
- * At this point "prev" is "current", as we just
- * switched into it (from an even more "previous"
- * prev)
+ * We drop the scheduler lock early (it's a global spinlock),
+ * thus we have to keep the previous process from getting
+ * rescheduled during switch_to().
*/
- reacquire_kernel_lock(prev);
+ prev->has_cpu = 1;
+
+ next->has_cpu = 1;
+ next->processor = this_cpu;
+ spin_unlock(&scheduler_lock);
+#endif /* __SMP__ */
+ if (prev != next) {
+#ifdef __SMP__
+ sched_data->prev = prev;
+#endif
+ kstat.context_swtch++;
+ get_mmu_context(next);
+ switch_to(prev,next);
+
+ __schedule_tail();
+ }
+
+ reacquire_kernel_lock(current);
return;
scheduling_in_interrupt:
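
The 'exponentially fading average' computed above halves the weight of older
timeslices on every reschedule: avg_new = (this_slice + avg_old) / 2. A tiny
standalone illustration (hypothetical helper, values are cycle counts):

	static unsigned long fading_avg(unsigned long avg, unsigned long this_slice)
	{
		avg = this_slice + avg;	/* same update as prev->avg_slice above */
		avg >>= 1;
		return avg;	/* starting from 0, slices 100, 100, 400 give 50, 75, 237 */
	}
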
@@ -618,7 +763,6 @@
*(int *)0 = 0;
}
-
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
/*
@@ -1189,13 +1333,21 @@
volatile unsigned long lost_ticks = 0;
static unsigned long lost_ticks_system = 0;
+/*
+ * This spinlock protects us from races in SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
static inline void update_times(void)
{
unsigned long ticks;
- unsigned long flags;
- save_flags(flags);
- cli();
+ /*
+ * update_times() is run from the raw timer_bh handler, so we
+ * know that local irqs are enabled and we don't need to
+ * save/restore the interrupt flags of the local CPU here. -arca
+ */
+ write_lock_irq(&xtime_lock);
ticks = lost_ticks;
lost_ticks = 0;
@@ -1206,12 +1358,12 @@
calc_load(ticks);
update_wall_time(ticks);
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);
update_process_times(ticks, system);
} else
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);
}
static void timer_bh(void)
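
update_times() above is the write side of the new xtime_lock; readers of
xtime (the gettimeofday-style paths in the per-arch time code) take the read
lock so they never see a half-updated timeval. A reader-side sketch
(hypothetical helper; it uses the irqsave variant because, unlike timer_bh,
a random caller cannot assume local interrupts are enabled):

	static void snapshot_xtime(struct timeval * tv)
	{
		unsigned long flags;

		read_lock_irqsave(&xtime_lock, flags);
		*tv = xtime;			/* consistent sec/usec pair */
		read_unlock_irqrestore(&xtime_lock, flags);
	}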