diff -urN linux-2.4.19-pre9/Documentation/Configure.help linux/Documentation/Configure.help
--- linux-2.4.19-pre9/Documentation/Configure.help	Tue May 28 16:14:36 2002
+++ linux/Documentation/Configure.help	Tue May 28 19:06:47 2002
@@ -3904,6 +3904,38 @@
   you have use for it; the module is called binfmt_misc.o. If you
   don't know what to answer at this point, say Y.
 
+Maximum User Real-Time Priority
+CONFIG_MAX_USER_RT_PRIO
+  The maximum user real-time priority. Tasks with priorities from
+  zero through one less than this value are scheduled as real-time.
+  To the application, a higher priority value implies a higher
+  priority task.
+
+  The minimum allowed value is 100 and the maximum allowed value
+  is an (arbitrary) 1000. Values specified outside this range are
+  clamped to it at compile time. The default is 100.
+  Setting this higher than 100 is safe but will result in slightly
+  more processing overhead in the scheduler.
+
+  Unless you are doing specialized real-time computing and require
+  a much larger range than usual, the default is fine.
+
+Maximum Kernel Real-Time Priority
+CONFIG_MAX_RT_PRIO
+  The difference between the maximum real-time priority and the
+  maximum user real-time priority.  Usually this value is zero,
+  which sets the maximum real-time priority to the same as the
+  maximum user real-time priority.  Setting this higher,
+  however, will allow kernel threads to set their priority to a
+  value higher than any user task. This is safe, but will result
+  in slightly more processing overhead in the scheduler.
+
+  This value can be at most 200.  The default is zero, i.e. the
+  maximum priority and maximum user priority are the same.
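+
+  For example, a value of 50 together with a maximum user real-time
+  priority of 100 gives kernel threads access to real-time priorities
+  0 through 149, while user tasks remain limited to 0 through 99.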
+
+  Unless you are doing specialized real-time programming with
+  kernel threads, the default is fine.
+
 Kernel support for JAVA binaries
 CONFIG_BINFMT_JAVA
   If you say Y here, the kernel will load and execute Java J-code
diff -urN linux-2.4.19-pre9/Documentation/sched-coding.txt linux/Documentation/sched-coding.txt
--- linux-2.4.19-pre9/Documentation/sched-coding.txt	Wed Dec 31 16:00:00 1969
+++ linux/Documentation/sched-coding.txt	Tue May 28 19:06:47 2002
@@ -0,0 +1,126 @@
+     Reference for various scheduler-related methods in the O(1) scheduler
+		Robert Love <rml@tech9.net>, MontaVista Software
+
+
+Note most of these methods are local to kernel/sched.c - this is by design.
+The scheduler is meant to be self-contained and abstracted away.  This document
+is primarily for understanding the scheduler, not interfacing to it.  Some of
+the discussed interfaces, however, are general process/scheduling methods.
+They are typically defined in include/linux/sched.h.
+
+
+Main Scheduling Methods
+-----------------------
+
+void load_balance(runqueue_t *this_rq, int idle)
+	Attempts to pull tasks from one cpu to another to balance cpu usage,
+	if needed.  This method is called explicitly if the runqueues are
+	imbalanced, or periodically by the timer tick.  Prior to calling,
+	the current runqueue must be locked and interrupts disabled.
+
+void schedule()
+	The main scheduling function.  Upon return, the highest priority
+	process will be active.
+
+
+Locking
+-------
+
+Each runqueue has its own lock, rq->lock.  When multiple runqueues need
+to be locked, lock acquires must be ordered by ascending &runqueue value.
+
+A specific runqueue is locked via
+
+	task_rq_lock(task_t pid, unsigned long *flags)
+
+which disables preemption, disables interrupts, and locks the runqueue pid is
+running on.  Likewise,
+
+	task_rq_unlock(task_t pid, unsigned long *flags)
+
+unlocks the runqueue pid is running on, restores interrupts to their previous
+state, and reenables preemption.
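+
+For example (an illustrative sketch; check kernel/sched.c for the exact
+signatures and return values):
+
+	unsigned long flags;
+
+	/* p is a task obtained elsewhere by the caller */
+	task_rq_lock(p, &flags);
+	/* p's runqueue is now locked and local interrupts are disabled */
+	task_rq_unlock(p, &flags);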
+
+The routines
+
+	double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+
+and
+
+	double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+
+safely lock and unlock, respectively, the two specified runqueues.  They do
+not, however, disable and restore interrupts.  Users are required to do so
+manually before and after calls.
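+
+A typical calling sequence looks like this (an illustrative sketch; the
+two runqueue pointers are assumed to come from the caller):
+
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(this_rq, busiest);
+	/* ... move tasks between the two runqueues ... */
+	double_rq_unlock(this_rq, busiest);
+	local_irq_restore(flags);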
+
+
+Values
+------
+
+MAX_PRIO
+	The maximum priority of the system, stored in the task as task->prio.
+	A lower prio value means a higher priority.  Normal (non-RT)
+	priorities range from MAX_RT_PRIO to (MAX_PRIO - 1).
+MAX_RT_PRIO
+	The maximum real-time priority of the system.  Valid RT priorities
+	range from 0 to (MAX_RT_PRIO - 1).
+MAX_USER_RT_PRIO
+	The maximum real-time priority that is exported to user-space.  Should
+	always be equal to or less than MAX_RT_PRIO.  Setting it lower allows
+	kernel threads to have higher priorities than any user-space task.
+MIN_TIMESLICE
+MAX_TIMESLICE
+	Respectively, the minimum and maximum timeslices (quanta) of a process.
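+
+As an example, with the default configuration (CONFIG_MAX_USER_RT_PRIO=100,
+CONFIG_MAX_RT_PRIO=0) these work out to MAX_RT_PRIO = MAX_USER_RT_PRIO = 100
+and MAX_PRIO = 140: RT tasks use prio values 0 through 99 and normal tasks
+use 100 through 139.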
+
+Data
+----
+
+struct runqueue
+	The main per-CPU runqueue data structure.
+struct task_struct
+	The main per-process data structure.
+
+
+General Methods
+---------------
+
+cpu_rq(cpu)
+	Returns the runqueue of the specified cpu.
+this_rq()
+	Returns the runqueue of the current cpu.
+task_rq(pid)
+	Returns the runqueue which holds the specified pid.
+cpu_curr(cpu)
+	Returns the task currently running on the given cpu.
+rt_task(pid)
+	Returns true if pid is real-time, false if not.
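+
+For example, one could check whether the task currently running on this
+CPU is real-time (an illustrative sketch only):
+
+	task_t *curr = cpu_curr(smp_processor_id());
+
+	if (rt_task(curr))
+		printk("CPU%d runs a real-time task\n", smp_processor_id());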
+
+
+Process Control Methods
+-----------------------
+
+void set_user_nice(task_t *p, long nice)
+	Sets the "nice" value of task p to the given value.
+int setscheduler(pid_t pid, int policy, struct sched_param *param)
+	Sets the scheduling policy and parameters for the given pid.
+void set_cpus_allowed(task_t *p, unsigned long new_mask)
+	Sets a given task's CPU affinity and migrates it to a proper cpu.
+	Callers must have a valid reference to the task and ensure the task
+	does not exit prematurely.  No locks can be held during the call.
+	(An example appears at the end of this document.)
+set_task_state(tsk, state_value)
+	Sets the given task's state to the given value.
+set_current_state(state_value)
+	Sets the current task's state to the given value.
+void set_tsk_need_resched(struct task_struct *tsk)
+	Sets need_resched in the given task.
+void clear_tsk_need_resched(struct task_struct *tsk)
+	Clears need_resched in the given task.
+void set_need_resched()
+	Sets need_resched in the current task.
+void clear_need_resched()
+	Clears need_resched in the current task.
+int need_resched()
+	Returns true if need_resched is set in the current task, false
+	otherwise.
+yield()
+	Places the current process at the end of the runqueue and calls
+	schedule().
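+
+As an example, a kernel thread could combine several of the above calls
+(an illustrative sketch; the CPU mask is a bitmask in which bit N stands
+for CPU N):
+
+	set_cpus_allowed(current, 1UL << 0);	/* bind to CPU 0 */
+	set_user_nice(current, 19);		/* lowest priority */
+	yield();				/* give up the CPU once */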
diff -urN linux-2.4.19-pre9/Documentation/sched-design.txt linux/Documentation/sched-design.txt
--- linux-2.4.19-pre9/Documentation/sched-design.txt	Wed Dec 31 16:00:00 1969
+++ linux/Documentation/sched-design.txt	Tue May 28 19:06:47 2002
@@ -0,0 +1,165 @@
+		   Goals, Design and Implementation of the
+		      new ultra-scalable O(1) scheduler
+
+
+  This is an edited version of an email Ingo Molnar sent to
+  lkml on 4 Jan 2002.  It describes the goals, design, and
+  implementation of Ingo's new ultra-scalable O(1) scheduler.
+  Last Updated: 18 April 2002.
+
+
+Goal
+====
+
+The main goal of the new scheduler is to keep all the good things we know
+and love about the current Linux scheduler:
+
+ - good interactive performance even during high load: if the user
+   types or clicks then the system must react instantly and must execute
+   the user tasks smoothly, even during considerable background load.
+
+ - good scheduling/wakeup performance with 1-2 runnable processes.
+
+ - fairness: no process should stay without any timeslice for any
+   unreasonable amount of time. No process should get an unjustly high
+   amount of CPU time.
+
+ - priorities: less important tasks can be started with lower priority,
+   more important tasks with higher priority.
+
+ - SMP efficiency: no CPU should stay idle if there is work to do.
+
+ - SMP affinity: processes which run on one CPU should stay affine to
+   that CPU. Processes should not bounce between CPUs too frequently.
+
+ - plus additional scheduler features: RT scheduling, CPU binding.
+
+and the goal is also to add a few new things:
+
+ - fully O(1) scheduling. Are you tired of the recalculation loop
+   blowing the L1 cache away every now and then? Do you think the goodness
+   loop is taking a bit too long to finish if there are lots of runnable
+   processes? This new scheduler takes no prisoners: wakeup(), schedule(),
+   the timer interrupt are all O(1) algorithms. There is no recalculation
+   loop. There is no goodness loop either.
+
+ - 'perfect' SMP scalability. With the new scheduler there is no 'big'
+   runqueue_lock anymore - it's all per-CPU runqueues and locks - two
+   tasks on two separate CPUs can wake up, schedule and context-switch
+   completely in parallel, without any interlocking. All
+   scheduling-relevant data is structured for maximum scalability.
+
+ - better SMP affinity. The old scheduler has a particular weakness that
+   causes random bouncing of tasks between CPUs when higher priority or
+   interactive tasks are present; this was observed and reported by many
+   people. The reason is that the timeslice recalculation loop first needs
+   every currently running task to consume its timeslice. But when this
+   happens on eg. an 8-way system, then this property starves an
+   increasing number of CPUs from executing any process. Once the last
+   task that has a timeslice left has finished using up that timeslice,
+   the recalculation loop is triggered and other CPUs can start executing
+   tasks again - after having idled around for a number of timer ticks.
+   The more CPUs, the worse this effect.
+
+   Furthermore, this same effect causes the bouncing as well:
+   whenever there is such a 'timeslice squeeze' of the global runqueue,
+   idle processors start executing tasks which are not affine to that CPU.
+   (because the affine tasks have finished off their timeslices already.)
+
+   The new scheduler solves this problem by distributing timeslices on a
+   per-CPU basis, without having any global synchronization or
+   recalculation.
+
+ - batch scheduling. A significant proportion of computing-intensive tasks
+   benefit from batch-scheduling, where timeslices are long and processes
+   are roundrobin scheduled. The new scheduler does such batch-scheduling
+   of the lowest priority tasks - so nice +19 jobs will get
+   'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
+   in essence SCHED_IDLE, from an interactiveness point of view.
+
+ - handle extreme loads more smoothly, without breakdown and scheduling
+   storms.
+
+ - O(1) RT scheduling. For those RT folks who are paranoid about the
+   O(nr_running) property of the goodness loop and the recalculation loop.
+
+ - run fork()ed children before the parent. Andrea has pointed out the
+   advantages of this a few months ago, but patches for this feature
+   do not work with the old scheduler as well as they should,
+   because idle processes often steal the new child before the fork()ing
+   CPU gets to execute it.
+
+
+Design
+======
+
+the core of the new scheduler consists of the following mechanisms:
+
+ - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active'
+   array and an 'expired' array. The active array contains all tasks that
+   are affine to this CPU and have timeslices left. The expired array
+   contains all tasks which have used up their timeslices - but this array
+   is kept sorted as well. The active and expired arrays are not accessed
+   directly; they are accessed through two pointers in the per-CPU runqueue
+   structure. If all active tasks are used up then we 'switch' the two
+   pointers and from now on the ready-to-go (former-) expired array is the
+   active array - and the empty active array serves as the new collector
+   for expired tasks.
+
+ - there is a 64-bit bitmap cache for array indices. Finding the highest
+   priority task is thus a matter of two x86 BSFL bit-search instructions.
+
+the split-array solution enables us to have an arbitrary number of active
+and expired tasks, and the recalculation of timeslices can be done
+immediately when the timeslice expires. Because the arrays are always
+   accessed through the pointers in the runqueue, switching the two arrays can
+be done very quickly.
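+
+put together, picking the next task to run looks roughly like this (an
+illustrative sketch - the prio_array field names are assumed here, see
+kernel/sched.c for the real code):
+
+	/* rq is the local runqueue; nr_active, bitmap, queue are assumed fields */
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/* the active array is empty: swap in the expired array */
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+	}
+	idx = sched_find_first_bit(array->bitmap);
+	next = list_entry(array->queue[idx].next, task_t, run_list);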
+
+this is a hybrid priority-list approach coupled with roundrobin
+scheduling and the array-switch method of distributing timeslices.
+
+ - there is a per-task 'load estimator'.
+
+one of the toughest things to get right is good interactive feel during
+heavy system load. While playing with various scheduler variants i found
+that the best interactive feel is achieved not by 'boosting' interactive
+tasks, but by 'punishing' tasks that want to use more CPU time than there
+is available. This method is also much easier to do in an O(1) fashion.
+
+to establish the actual 'load' the task contributes to the system, a
+complex-looking but pretty accurate method is used: there is a 4-entry
+'history' ringbuffer of the task's activities during the last 4 seconds.
+This ringbuffer is operated without much overhead. The entries tell the
+scheduler a pretty accurate load-history of the task: whether it has used
+up more or less CPU time during the past N seconds. [the size '4' and the
+interval of 4x 1 seconds was found by lots of experimentation - this part is
+flexible and can be changed in both directions.]
+
+the penalty a task gets for generating more load than the CPU can handle
+is a priority decrease - there is a maximum amount to this penalty
+relative to its static priority, so even fully CPU-bound tasks will
+observe each other's priorities, and will share the CPU accordingly.
+
+the SMP load-balancer can be extended/switched with additional parallel
+computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
+can be supported easily by changing the load-balancer. Right now it's
+tuned for my SMP systems.
+
+i skipped the prev->mm == next->mm advantage - no workload i know of shows
+any sensitivity to this. It can be added back by sacrificing O(1)
+schedule() [the current and one-lower priority list can be searched for a
+that->mm == current->mm condition], but it costs a fair number of cycles
+during a number of important workloads, so i wanted to avoid this as much
+as possible.
+
+- the SMP idle-task startup code was still racy and the new scheduler
+triggered this. So i streamlined the idle-setup code a bit. We do not call
+into schedule() before all processors have started up fully and all idle
+threads are in place.
+
+- the patch also cleans up a number of aspects of sched.c - moves code
+into other areas of the kernel where it's appropriate, and simplifies
+certain code paths and data constructs. As a result, the new scheduler's
+code is smaller than the old one.
+
+	Ingo
diff -urN linux-2.4.19-pre9/arch/alpha/config.in linux/arch/alpha/config.in
--- linux-2.4.19-pre9/arch/alpha/config.in	Tue May 28 16:14:11 2002
+++ linux/arch/alpha/config.in	Tue May 28 19:06:48 2002
@@ -273,6 +273,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    choice 'Kernel core (/proc/kcore) format' \
 	"ELF		CONFIG_KCORE_ELF	\
diff -urN linux-2.4.19-pre9/arch/alpha/mm/fault.c linux/arch/alpha/mm/fault.c
--- linux-2.4.19-pre9/arch/alpha/mm/fault.c	Tue May 28 16:14:11 2002
+++ linux/arch/alpha/mm/fault.c	Tue May 28 19:06:48 2002
@@ -196,8 +196,7 @@
  */
 out_of_memory:
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/arm/config.in linux/arch/arm/config.in
--- linux-2.4.19-pre9/arch/arm/config.in	Tue May 28 16:14:24 2002
+++ linux/arch/arm/config.in	Tue May 28 19:06:48 2002
@@ -427,6 +427,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 comment 'At least one math emulation must be selected'
 tristate 'NWFPE math emulation' CONFIG_FPE_NWFPE
 dep_tristate 'FastFPE math emulation (experimental)' CONFIG_FPE_FASTFPE $CONFIG_EXPERIMENTAL
diff -urN linux-2.4.19-pre9/arch/arm/mm/fault-common.c linux/arch/arm/mm/fault-common.c
--- linux-2.4.19-pre9/arch/arm/mm/fault-common.c	Tue May 28 16:14:25 2002
+++ linux/arch/arm/mm/fault-common.c	Tue May 28 19:06:48 2002
@@ -225,8 +225,7 @@
 	 * If we are out of memory for pid1,
 	 * sleep for a while and retry
 	 */
-	tsk->policy |= SCHED_YIELD;
-	schedule();
+	yield();
 	goto survive;
 
 check_stack:
diff -urN linux-2.4.19-pre9/arch/cris/config.in linux/arch/cris/config.in
--- linux-2.4.19-pre9/arch/cris/config.in	Tue May 28 16:14:31 2002
+++ linux/arch/cris/config.in	Tue May 28 19:06:48 2002
@@ -29,6 +29,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 
 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
 
diff -urN linux-2.4.19-pre9/arch/i386/config.in linux/arch/i386/config.in
--- linux-2.4.19-pre9/arch/i386/config.in	Tue May 28 16:14:10 2002
+++ linux/arch/i386/config.in	Tue May 28 19:06:48 2002
@@ -261,6 +261,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    choice 'Kernel core (/proc/kcore) format' \
 	"ELF		CONFIG_KCORE_ELF	\
diff -urN linux-2.4.19-pre9/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- linux-2.4.19-pre9/arch/i386/kernel/entry.S	Tue May 28 16:14:10 2002
+++ linux/arch/i386/kernel/entry.S	Tue May 28 19:06:48 2002
@@ -77,7 +77,7 @@
 exec_domain	= 16
 need_resched	= 20
 tsk_ptrace	= 24
-processor	= 52
+cpu		= 32
 
 ENOSYS = 38
 
@@ -176,9 +176,11 @@
 
 
 ENTRY(ret_from_fork)
+#if CONFIG_SMP
 	pushl %ebx
 	call SYMBOL_NAME(schedule_tail)
 	addl $4, %esp
+#endif
 	GET_CURRENT(%ebx)
 	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
 	jne tracesys_exit
diff -urN linux-2.4.19-pre9/arch/i386/kernel/process.c linux/arch/i386/kernel/process.c
--- linux-2.4.19-pre9/arch/i386/kernel/process.c	Tue May 28 16:14:10 2002
+++ linux/arch/i386/kernel/process.c	Tue May 28 19:06:48 2002
@@ -124,15 +124,12 @@
 void cpu_idle (void)
 {
 	/* endless idle loop with no priority at all */
-	init_idle();
-	current->nice = 20;
-	current->counter = -100;
 
 	while (1) {
 		void (*idle)(void) = pm_idle;
 		if (!idle)
 			idle = default_idle;
-		while (!current->need_resched)
+		if (!current->need_resched)
 			idle();
 		schedule();
 		check_pgt_cache();
@@ -697,15 +694,17 @@
 	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
 
 	/*
-	 * Restore %fs and %gs.
+	 * Restore %fs and %gs if needed.
 	 */
-	loadsegment(fs, next->fs);
-	loadsegment(gs, next->gs);
+	if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
+		loadsegment(fs, next->fs);
+		loadsegment(gs, next->gs);
+	}
 
 	/*
 	 * Now maybe reload the debug registers
 	 */
-	if (next->debugreg[7]){
+	if (unlikely(next->debugreg[7])) {
 		loaddebug(next, 0);
 		loaddebug(next, 1);
 		loaddebug(next, 2);
@@ -715,7 +714,7 @@
 		loaddebug(next, 7);
 	}
 
-	if (prev->ioperm || next->ioperm) {
+	if (unlikely(prev->ioperm || next->ioperm)) {
 		if (next->ioperm) {
 			/*
 			 * 4 cachelines copy ... not good, but not that
diff -urN linux-2.4.19-pre9/arch/i386/kernel/setup.c linux/arch/i386/kernel/setup.c
--- linux-2.4.19-pre9/arch/i386/kernel/setup.c	Tue May 28 16:14:10 2002
+++ linux/arch/i386/kernel/setup.c	Tue May 28 19:06:48 2002
@@ -3010,9 +3010,10 @@
 	load_TR(nr);
 	load_LDT(&init_mm);
 
-	/*
-	 * Clear all 6 debug registers:
-	 */
+	/* Clear %fs and %gs. */
+	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+
+	/* Clear all 6 debug registers: */
 
 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
 
diff -urN linux-2.4.19-pre9/arch/i386/kernel/smp.c linux/arch/i386/kernel/smp.c
--- linux-2.4.19-pre9/arch/i386/kernel/smp.c	Tue May 28 16:14:10 2002
+++ linux/arch/i386/kernel/smp.c	Tue May 28 19:06:48 2002
@@ -489,13 +489,23 @@
  * it goes straight through and wastes no time serializing
  * anything. Worst case is that we lose a reschedule ...
  */
-
 void smp_send_reschedule(int cpu)
 {
 	send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
 }
 
 /*
+ * this function sends a reschedule IPI to all (other) CPUs.
+ * This should only be used if some 'global' task became runnable,
+ * such as an RT task, that must be handled now. The first CPU
+ * that manages to grab the task will run it.
+ */
+void smp_send_reschedule_all(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
+/*
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
diff -urN linux-2.4.19-pre9/arch/i386/kernel/smpboot.c linux/arch/i386/kernel/smpboot.c
--- linux-2.4.19-pre9/arch/i386/kernel/smpboot.c	Tue May 28 16:14:10 2002
+++ linux/arch/i386/kernel/smpboot.c	Tue May 28 19:06:48 2002
@@ -308,14 +308,14 @@
 			if (tsc_values[i] < avg)
 				realdelta = -realdelta;
 
-			printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
-				i, realdelta);
+			printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
 		}
 
 		sum += delta;
 	}
 	if (!buggy)
 		printk("passed.\n");
+		;
 }
 
 static void __init synchronize_tsc_ap (void)
@@ -365,7 +365,7 @@
 	 * (This works even if the APIC is not enabled.)
 	 */
 	phys_id = GET_APIC_ID(apic_read(APIC_ID));
-	cpuid = current->processor;
+	cpuid = cpu();
 	if (test_and_set_bit(cpuid, &cpu_online_map)) {
 		printk("huh, phys CPU#%d, CPU#%d already present??\n",
 					phys_id, cpuid);
@@ -435,6 +435,7 @@
 	 */
  	smp_store_cpu_info(cpuid);
 
+	disable_APIC_timer();
 	/*
 	 * Allow the master to continue.
 	 */
@@ -465,6 +466,7 @@
 	smp_callin();
 	while (!atomic_read(&smp_commenced))
 		rep_nop();
+	enable_APIC_timer();
 	/*
 	 * low-memory mappings have been cleared, flush them from
 	 * the local TLBs too.
@@ -803,16 +805,13 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
-	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
+	init_idle(idle, cpu);
 
 	map_cpu_to_boot_apicid(cpu, apicid);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 
-	del_from_runqueue(idle);
 	unhash_process(idle);
-	init_tasks[cpu] = idle;
 
 	/* start_eip had better be page-aligned! */
 	start_eip = setup_trampoline();
@@ -925,6 +924,7 @@
 }
 
 cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
 
 static void smp_tune_scheduling (void)
 {
@@ -958,9 +958,13 @@
 		cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
 	}
 
+	cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
+
 	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
 		(long)cacheflush_time/(cpu_khz/1000),
 		((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
+	printk("task migration cache decay timeout: %ld msecs.\n",
+		(cache_decay_ticks + 1) * 1000 / HZ);
 }
 
 /*
@@ -1023,8 +1027,7 @@
 	map_cpu_to_boot_apicid(0, boot_cpu_apicid);
 
 	global_irq_holder = 0;
-	current->processor = 0;
-	init_idle();
+	current->cpu = 0;
 	smp_tune_scheduling();
 
 	/*
diff -urN linux-2.4.19-pre9/arch/i386/mm/fault.c linux/arch/i386/mm/fault.c
--- linux-2.4.19-pre9/arch/i386/mm/fault.c	Tue May 28 16:14:10 2002
+++ linux/arch/i386/mm/fault.c	Tue May 28 19:06:48 2002
@@ -86,8 +86,7 @@
 
 out_of_memory:
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		goto survive;
 	}
 	goto bad_area;
@@ -336,8 +335,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (tsk->pid == 1) {
-		tsk->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/ia64/config.in linux/arch/ia64/config.in
--- linux-2.4.19-pre9/arch/ia64/config.in	Tue May 28 16:14:27 2002
+++ linux/arch/ia64/config.in	Tue May 28 19:06:48 2002
@@ -103,6 +103,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
 tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC
 
diff -urN linux-2.4.19-pre9/arch/ia64/mm/fault.c linux/arch/ia64/mm/fault.c
--- linux-2.4.19-pre9/arch/ia64/mm/fault.c	Tue May 28 16:14:27 2002
+++ linux/arch/ia64/mm/fault.c	Tue May 28 19:06:48 2002
@@ -196,8 +196,7 @@
   out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/m68k/config.in linux/arch/m68k/config.in
--- linux-2.4.19-pre9/arch/m68k/config.in	Tue May 28 16:14:20 2002
+++ linux/arch/m68k/config.in	Tue May 28 19:06:48 2002
@@ -92,6 +92,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    choice 'Kernel core (/proc/kcore) format' \
 	"ELF		CONFIG_KCORE_ELF	\
diff -urN linux-2.4.19-pre9/arch/m68k/mm/fault.c linux/arch/m68k/mm/fault.c
--- linux-2.4.19-pre9/arch/m68k/mm/fault.c	Tue May 28 16:14:21 2002
+++ linux/arch/m68k/mm/fault.c	Tue May 28 19:12:10 2002
@@ -181,8 +181,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/mips/config.in linux/arch/mips/config.in
--- linux-2.4.19-pre9/arch/mips/config.in	Tue May 28 16:14:12 2002
+++ linux/arch/mips/config.in	Tue May 28 19:06:48 2002
@@ -461,6 +461,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 
 if [ "$CONFIG_DECSTATION" = "y" ]; then
     bool 'TURBOchannel support' CONFIG_TC
diff -urN linux-2.4.19-pre9/arch/mips/mm/fault.c linux/arch/mips/mm/fault.c
--- linux-2.4.19-pre9/arch/mips/mm/fault.c	Tue May 28 16:14:12 2002
+++ linux/arch/mips/mm/fault.c	Tue May 28 19:12:10 2002
@@ -211,8 +211,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (tsk->pid == 1) {
-		tsk->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/mips64/config.in linux/arch/mips64/config.in
--- linux-2.4.19-pre9/arch/mips64/config.in	Tue May 28 16:14:29 2002
+++ linux/arch/mips64/config.in	Tue May 28 19:06:48 2002
@@ -181,6 +181,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 
 if [ "$CONFIG_ARC32" = "y" ]; then
    bool 'ARC console support' CONFIG_ARC_CONSOLE
diff -urN linux-2.4.19-pre9/arch/mips64/mm/fault.c linux/arch/mips64/mm/fault.c
--- linux-2.4.19-pre9/arch/mips64/mm/fault.c	Tue May 28 16:14:30 2002
+++ linux/arch/mips64/mm/fault.c	Tue May 28 19:12:10 2002
@@ -240,8 +240,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (tsk->pid == 1) {
-		tsk->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/parisc/config.in linux/arch/parisc/config.in
--- linux-2.4.19-pre9/arch/parisc/config.in	Tue May 28 16:14:31 2002
+++ linux/arch/parisc/config.in	Tue May 28 19:06:48 2002
@@ -65,6 +65,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 tristate 'Kernel support for SOM binaries' CONFIG_BINFMT_SOM
 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
 tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC
diff -urN linux-2.4.19-pre9/arch/ppc/config.in linux/arch/ppc/config.in
--- linux-2.4.19-pre9/arch/ppc/config.in	Tue May 28 16:14:18 2002
+++ linux/arch/ppc/config.in	Tue May 28 19:06:48 2002
@@ -159,6 +159,8 @@
 bool 'Sysctl support' CONFIG_SYSCTL
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 
 # only elf supported, a.out is not -- Cort
 if [ "$CONFIG_PROC_FS" = "y" ]; then
diff -urN linux-2.4.19-pre9/arch/ppc/mm/fault.c linux/arch/ppc/mm/fault.c
--- linux-2.4.19-pre9/arch/ppc/mm/fault.c	Tue May 28 16:14:17 2002
+++ linux/arch/ppc/mm/fault.c	Tue May 28 19:06:48 2002
@@ -197,8 +197,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/s390/config.in linux/arch/s390/config.in
--- linux-2.4.19-pre9/arch/s390/config.in	Tue May 28 16:14:30 2002
+++ linux/arch/s390/config.in	Tue May 28 19:06:48 2002
@@ -49,6 +49,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 define_bool CONFIG_KCORE_ELF y
 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
 tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC
diff -urN linux-2.4.19-pre9/arch/s390/mm/fault.c linux/arch/s390/mm/fault.c
--- linux-2.4.19-pre9/arch/s390/mm/fault.c	Tue May 28 16:14:31 2002
+++ linux/arch/s390/mm/fault.c	Tue May 28 19:06:48 2002
@@ -290,8 +290,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (tsk->pid == 1) {
-		tsk->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/s390x/config.in linux/arch/s390x/config.in
--- linux-2.4.19-pre9/arch/s390x/config.in	Tue May 28 16:14:32 2002
+++ linux/arch/s390x/config.in	Tue May 28 19:06:48 2002
@@ -50,6 +50,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 define_bool CONFIG_KCORE_ELF y
 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
 tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC
diff -urN linux-2.4.19-pre9/arch/s390x/mm/fault.c linux/arch/s390x/mm/fault.c
--- linux-2.4.19-pre9/arch/s390x/mm/fault.c	Tue May 28 16:14:32 2002
+++ linux/arch/s390x/mm/fault.c	Tue May 28 19:06:48 2002
@@ -290,8 +290,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (tsk->pid == 1) {
-		tsk->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/sh/config.in linux/arch/sh/config.in
--- linux-2.4.19-pre9/arch/sh/config.in	Tue May 28 16:14:26 2002
+++ linux/arch/sh/config.in	Tue May 28 19:06:48 2002
@@ -205,6 +205,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    choice 'Kernel core (/proc/kcore) format' \
 	"ELF		CONFIG_KCORE_ELF	\
diff -urN linux-2.4.19-pre9/arch/sh/mm/fault.c linux/arch/sh/mm/fault.c
--- linux-2.4.19-pre9/arch/sh/mm/fault.c	Tue May 28 16:14:27 2002
+++ linux/arch/sh/mm/fault.c	Tue May 28 19:06:48 2002
@@ -207,8 +207,7 @@
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	if (current->pid == 1) {
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
diff -urN linux-2.4.19-pre9/arch/sparc/config.in linux/arch/sparc/config.in
--- linux-2.4.19-pre9/arch/sparc/config.in	Tue May 28 16:14:12 2002
+++ linux/arch/sparc/config.in	Tue May 28 19:06:48 2002
@@ -65,6 +65,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    define_bool CONFIG_KCORE_ELF y
 fi
diff -urN linux-2.4.19-pre9/arch/sparc64/config.in linux/arch/sparc64/config.in
--- linux-2.4.19-pre9/arch/sparc64/config.in	Tue May 28 16:14:21 2002
+++ linux/arch/sparc64/config.in	Tue May 28 19:06:48 2002
@@ -64,6 +64,8 @@
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL
+int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100
+int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0
 if [ "$CONFIG_PROC_FS" = "y" ]; then
    define_bool CONFIG_KCORE_ELF y
 fi
diff -urN linux-2.4.19-pre9/drivers/block/loop.c linux/drivers/block/loop.c
--- linux-2.4.19-pre9/drivers/block/loop.c	Tue May 28 16:13:38 2002
+++ linux/drivers/block/loop.c	Tue May 28 19:06:48 2002
@@ -573,9 +573,6 @@
 	flush_signals(current);
 	spin_unlock_irq(&current->sigmask_lock);
 
-	current->policy = SCHED_OTHER;
-	current->nice = -20;
-
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_bound;
 	atomic_inc(&lo->lo_pending);
diff -urN linux-2.4.19-pre9/drivers/char/drm-4.0/ffb_drv.c linux/drivers/char/drm-4.0/ffb_drv.c
--- linux-2.4.19-pre9/drivers/char/drm-4.0/ffb_drv.c	Tue May 28 16:13:43 2002
+++ linux/drivers/char/drm-4.0/ffb_drv.c	Tue May 28 19:06:48 2002
@@ -710,8 +710,7 @@
 		/* Contention */
 		atomic_inc(&dev->total_sleeps);
 		current->state = TASK_INTERRUPTIBLE;
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
diff -urN linux-2.4.19-pre9/drivers/char/drm-4.0/tdfx_drv.c linux/drivers/char/drm-4.0/tdfx_drv.c
--- linux-2.4.19-pre9/drivers/char/drm-4.0/tdfx_drv.c	Tue May 28 16:13:43 2002
+++ linux/drivers/char/drm-4.0/tdfx_drv.c	Tue May 28 19:06:48 2002
@@ -554,7 +554,6 @@
 					lock.context, current->pid, j,
 					dev->lock.lock_time, jiffies);
                                 current->state = TASK_INTERRUPTIBLE;
-				current->policy |= SCHED_YIELD;
                                 schedule_timeout(DRM_LOCK_SLICE-j);
 				DRM_DEBUG("jiffies=%d\n", jiffies);
                         }
@@ -578,10 +577,7 @@
 
                                 /* Contention */
                         atomic_inc(&dev->total_sleeps);
-#if 1
-			current->policy |= SCHED_YIELD;
-#endif
-                        schedule();
+			yield();
                         if (signal_pending(current)) {
                                 ret = -ERESTARTSYS;
                                 break;
@@ -604,8 +600,7 @@
                    when dev->last_context == lock.context
                    NOTE WE HOLD THE LOCK THROUGHOUT THIS
                    TIME! */
-		current->policy |= SCHED_YIELD;
-	        schedule();
+		yield();
 	        current->state = TASK_RUNNING;
 	        remove_wait_queue(&dev->context_wait, &entry);
 	        if (signal_pending(current)) {
diff -urN linux-2.4.19-pre9/drivers/char/mwave/mwavedd.c linux/drivers/char/mwave/mwavedd.c
--- linux-2.4.19-pre9/drivers/char/mwave/mwavedd.c	Tue May 28 16:13:41 2002
+++ linux/drivers/char/mwave/mwavedd.c	Tue May 28 19:06:48 2002
@@ -279,7 +279,6 @@
 			pDrvData->IPCs[ipcnum].bIsHere = FALSE;
 			pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
 	#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
-			current->nice = -20;	/* boost to provide priority timing */
 	#else
 			current->priority = 0x28;	/* boost to provide priority timing */
 	#endif
diff -urN linux-2.4.19-pre9/drivers/ide/ataraid.c linux/drivers/ide/ataraid.c
--- linux-2.4.19-pre9/drivers/ide/ataraid.c	Tue May 28 16:13:56 2002
+++ linux/drivers/ide/ataraid.c	Tue May 28 19:06:48 2002
@@ -123,8 +123,7 @@
 		ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
 		if (!ptr) {
 			__set_current_state(TASK_RUNNING);
-	                current->policy |= SCHED_YIELD;
-	                schedule();             
+			yield();
 		}
 	}
 	return ptr;
@@ -139,8 +138,7 @@
 		ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
 		if (!ptr) {
 			__set_current_state(TASK_RUNNING);
-	                current->policy |= SCHED_YIELD;
-	                schedule();             
+			yield();
 		}
 	}
 	return ptr;
diff -urN linux-2.4.19-pre9/drivers/md/md.c linux/drivers/md/md.c
--- linux-2.4.19-pre9/drivers/md/md.c	Tue May 28 16:14:08 2002
+++ linux/drivers/md/md.c	Tue May 28 19:06:48 2002
@@ -2936,8 +2936,6 @@
 	 * bdflush, otherwise bdflush will deadlock if there are too
 	 * many dirty RAID5 blocks.
 	 */
-	current->policy = SCHED_OTHER;
-	current->nice = -20;
 	md_unlock_kernel();
 
 	complete(thread->event);
@@ -3391,11 +3389,6 @@
 	       "(but not more than %d KB/sec) for reconstruction.\n",
 	       sysctl_speed_limit_max);
 
-	/*
-	 * Resync has low priority.
-	 */
-	current->nice = 19;
-
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
@@ -3473,16 +3466,13 @@
 		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
 		if (currspeed > sysctl_speed_limit_min) {
-			current->nice = 19;
-
 			if ((currspeed > sysctl_speed_limit_max) ||
 					!is_mddev_idle(mddev)) {
 				current->state = TASK_INTERRUPTIBLE;
 				md_schedule_timeout(HZ/4);
 				goto repeat;
 			}
-		} else
-			current->nice = -20;
+		}
 	}
 	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
 	err = 0;
diff -urN linux-2.4.19-pre9/drivers/net/slip.c linux/drivers/net/slip.c
--- linux-2.4.19-pre9/drivers/net/slip.c	Tue May 28 16:13:28 2002
+++ linux/drivers/net/slip.c	Tue May 28 19:06:48 2002
@@ -1393,10 +1393,8 @@
 		/* First of all: check for active disciplines and hangup them.
 		 */
 		do {
-			if (busy) {
-				current->counter = 0;
-				schedule();
-			}
+			if (busy)
+				sys_sched_yield();
 
 			busy = 0;
 			local_bh_disable();
diff -urN linux-2.4.19-pre9/fs/binfmt_elf.c linux/fs/binfmt_elf.c
--- linux-2.4.19-pre9/fs/binfmt_elf.c	Tue May 28 16:12:53 2002
+++ linux/fs/binfmt_elf.c	Tue May 28 19:06:48 2002
@@ -1143,7 +1143,7 @@
 	psinfo.pr_state = i;
 	psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
 	psinfo.pr_zomb = psinfo.pr_sname == 'Z';
-	psinfo.pr_nice = current->nice;
+	psinfo.pr_nice = task_nice(current);
 	psinfo.pr_flag = current->flags;
 	psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
 	psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
diff -urN linux-2.4.19-pre9/fs/buffer.c linux/fs/buffer.c
--- linux-2.4.19-pre9/fs/buffer.c	Tue May 28 16:12:53 2002
+++ linux/fs/buffer.c	Tue May 28 19:06:48 2002
@@ -736,9 +736,8 @@
 	wakeup_bdflush();
 	try_to_free_pages(zone, GFP_NOFS, 0);
 	run_task_queue(&tq_disk);
-	current->policy |= SCHED_YIELD;
 	__set_current_state(TASK_RUNNING);
-	schedule();
+	sys_sched_yield();
 }
 
 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
diff -urN linux-2.4.19-pre9/fs/jbd/journal.c linux/fs/jbd/journal.c
--- linux-2.4.19-pre9/fs/jbd/journal.c	Tue May 28 16:13:02 2002
+++ linux/fs/jbd/journal.c	Tue May 28 19:06:48 2002
@@ -462,8 +462,7 @@
 			printk (KERN_NOTICE __FUNCTION__
 				": ENOMEM at get_unused_buffer_head, "
 				"trying again.\n");
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 		}
 	} while (!new_bh);
 	/* keep subsequent assertions sane */
@@ -1543,8 +1542,7 @@
 			last_warning = jiffies;
 		}
 		
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 	}
 }
 
@@ -1602,8 +1600,7 @@
 			last_warning = jiffies;
 		}
 		while (ret == 0) {
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
 		}
 	}
diff -urN linux-2.4.19-pre9/fs/jbd/revoke.c linux/fs/jbd/revoke.c
--- linux-2.4.19-pre9/fs/jbd/revoke.c	Tue May 28 16:13:02 2002
+++ linux/fs/jbd/revoke.c	Tue May 28 19:06:48 2002
@@ -137,8 +137,7 @@
 	if (!journal_oom_retry)
 		return -ENOMEM;
 	jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
-	current->policy |= SCHED_YIELD;
-	schedule();
+	yield();
 	goto repeat;
 }
 
diff -urN linux-2.4.19-pre9/fs/jbd/transaction.c linux/fs/jbd/transaction.c
--- linux-2.4.19-pre9/fs/jbd/transaction.c	Tue May 28 16:13:02 2002
+++ linux/fs/jbd/transaction.c	Tue May 28 19:06:48 2002
@@ -1379,8 +1379,7 @@
 		do {
 			old_handle_count = transaction->t_handle_count;
 			set_current_state(TASK_RUNNING);
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 		} while (old_handle_count != transaction->t_handle_count);
 	}
 
diff -urN linux-2.4.19-pre9/fs/jffs2/background.c linux/fs/jffs2/background.c
--- linux-2.4.19-pre9/fs/jffs2/background.c	Tue May 28 16:13:01 2002
+++ linux/fs/jffs2/background.c	Tue May 28 19:06:48 2002
@@ -106,9 +106,6 @@
 
         sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
 
-	/* FIXME in the 2.2 backport */
-	current->nice = 10;
-
 	for (;;) {
 		spin_lock_irq(&current->sigmask_lock);
 		siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
diff -urN linux-2.4.19-pre9/fs/locks.c linux/fs/locks.c
--- linux-2.4.19-pre9/fs/locks.c	Tue May 28 16:12:53 2002
+++ linux/fs/locks.c	Tue May 28 19:06:48 2002
@@ -445,8 +445,7 @@
 			/* Let the blocked process remove waiter from the
 			 * block list when it gets scheduled.
 			 */
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 		} else {
 			/* Remove waiter from the block list, because by the
 			 * time it wakes up blocker won't exist any more.
diff -urN linux-2.4.19-pre9/fs/nfs/pagelist.c linux/fs/nfs/pagelist.c
--- linux-2.4.19-pre9/fs/nfs/pagelist.c	Tue May 28 16:12:53 2002
+++ linux/fs/nfs/pagelist.c	Tue May 28 19:06:49 2002
@@ -96,8 +96,7 @@
 			continue;
 		if (signalled() && (server->flags & NFS_MOUNT_INTR))
 			return ERR_PTR(-ERESTARTSYS);
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 	}
 
 	/* Initialize the request struct. Initially, we assume a
diff -urN linux-2.4.19-pre9/fs/pipe.c linux/fs/pipe.c
--- linux-2.4.19-pre9/fs/pipe.c	Tue May 28 16:12:53 2002
+++ linux/fs/pipe.c	Tue May 28 19:06:49 2002
@@ -115,7 +115,7 @@
 		 * writers synchronously that there is more
 		 * room.
 		 */
-		wake_up_interruptible_sync(PIPE_WAIT(*inode));
+		wake_up_interruptible(PIPE_WAIT(*inode));
 		if (!PIPE_EMPTY(*inode))
 			BUG();
 		goto do_more_read;
@@ -215,7 +215,7 @@
 			 * is going to give up this CPU, so it doesnt have
 			 * to do idle reschedules.
 			 */
-			wake_up_interruptible_sync(PIPE_WAIT(*inode));
+			wake_up_interruptible(PIPE_WAIT(*inode));
 			PIPE_WAITING_WRITERS(*inode)++;
 			pipe_wait(inode);
 			PIPE_WAITING_WRITERS(*inode)--;
diff -urN linux-2.4.19-pre9/fs/proc/array.c linux/fs/proc/array.c
--- linux-2.4.19-pre9/fs/proc/array.c	Tue May 28 16:12:53 2002
+++ linux/fs/proc/array.c	Tue May 28 19:06:49 2002
@@ -335,9 +335,8 @@
 
 	/* scale priority and nice values from timeslices to -20..20 */
 	/* to make it look like a "normal" Unix priority/nice value  */
-	priority = task->counter;
-	priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
-	nice = task->nice;
+	priority = task_prio(task);
+	nice = task_nice(task);
 
 	read_lock(&tasklist_lock);
 	ppid = task->pid ? task->p_opptr->pid : 0;
@@ -387,7 +386,7 @@
 		task->nswap,
 		task->cnswap,
 		task->exit_signal,
-		task->processor);
+		task->cpu);
 	if(mm)
 		mmput(mm);
 	return res;
diff -urN linux-2.4.19-pre9/fs/proc/proc_misc.c linux/fs/proc/proc_misc.c
--- linux-2.4.19-pre9/fs/proc/proc_misc.c	Tue May 28 16:12:53 2002
+++ linux/fs/proc/proc_misc.c	Tue May 28 19:09:34 2002
@@ -107,11 +107,11 @@
 	a = avenrun[0] + (FIXED_1/200);
 	b = avenrun[1] + (FIXED_1/200);
 	c = avenrun[2] + (FIXED_1/200);
-	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
+	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
 		LOAD_INT(a), LOAD_FRAC(a),
 		LOAD_INT(b), LOAD_FRAC(b),
 		LOAD_INT(c), LOAD_FRAC(c),
-		nr_running, nr_threads, last_pid);
+		nr_running(), nr_threads, last_pid);
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
@@ -123,7 +123,7 @@
 	int len;
 
 	uptime = jiffies;
-	idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
+	idle = init_task.times.tms_utime + init_task.times.tms_stime;
 
 	/* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
 	   that would overflow about every five days at HZ == 100.
@@ -346,10 +346,10 @@
 	}
 
 	proc_sprintf(page, &off, &len,
-		"\nctxt %u\n"
+		"\nctxt %lu\n"
 		"btime %lu\n"
 		"processes %lu\n",
-		kstat.context_swtch,
+		nr_context_switches(),
 		xtime.tv_sec - jif / HZ,
 		total_forks);
 
diff -urN linux-2.4.19-pre9/fs/reiserfs/buffer2.c linux/fs/reiserfs/buffer2.c
--- linux-2.4.19-pre9/fs/reiserfs/buffer2.c	Tue May 28 16:13:01 2002
+++ linux/fs/reiserfs/buffer2.c	Tue May 28 19:06:49 2002
@@ -33,8 +33,7 @@
 			buffer_journal_dirty(bh) ? ' ' : '!');
     }
     run_task_queue(&tq_disk);
-    current->policy |= SCHED_YIELD;
-    schedule();
+    yield();
   }
   if (repeat_counter > 30000000) {
     reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
@@ -52,11 +51,11 @@
 struct buffer_head  * reiserfs_bread (struct super_block *super, int n_block, int n_size) 
 {
     struct buffer_head  *result;
-    PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
+    PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
 
     result = bread (super -> s_dev, n_block, n_size);
     PROC_INFO_INC( super, breads );
-    PROC_EXP( if( kstat.context_swtch != ctx_switches ) 
+    PROC_EXP( if( nr_context_switches() != ctx_switches ) 
 	      PROC_INFO_INC( super, bread_miss ) );
     return result;
 }
diff -urN linux-2.4.19-pre9/fs/reiserfs/journal.c linux/fs/reiserfs/journal.c
--- linux-2.4.19-pre9/fs/reiserfs/journal.c	Tue May 28 16:13:01 2002
+++ linux/fs/reiserfs/journal.c	Tue May 28 19:06:49 2002
@@ -151,8 +151,7 @@
   }
   bn = allocate_bitmap_node(p_s_sb) ;
   if (!bn) {
-    current->policy |= SCHED_YIELD ;
-    schedule() ;
+    yield();
     goto repeat ;
   }
   return bn ;
diff -urN linux-2.4.19-pre9/fs/ufs/truncate.c linux/fs/ufs/truncate.c
--- linux-2.4.19-pre9/fs/ufs/truncate.c	Tue May 28 16:12:54 2002
+++ linux/fs/ufs/truncate.c	Tue May 28 19:06:49 2002
@@ -448,10 +448,7 @@
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
 		run_task_queue(&tq_disk);
-		current->policy |= SCHED_YIELD;
-		schedule ();
-
-
+		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
 	if (offset) {
diff -urN linux-2.4.19-pre9/include/asm-i386/bitops.h linux/include/asm-i386/bitops.h
--- linux-2.4.19-pre9/include/asm-i386/bitops.h	Tue May 28 16:13:07 2002
+++ linux/include/asm-i386/bitops.h	Tue May 28 19:06:49 2002
@@ -6,6 +6,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/compiler.h>
 
 /*
  * These have to be done with inline assembly: that way the bit-setting
@@ -75,6 +76,14 @@
 		:"=m" (ADDR)
 		:"Ir" (nr));
 }
+
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+	__asm__ __volatile__(
+		"btrl %1,%0"
+		:"=m" (ADDR)
+		:"Ir" (nr));
+}
 #define smp_mb__before_clear_bit()	barrier()
 #define smp_mb__after_clear_bit()	barrier()
 
@@ -284,6 +293,34 @@
 }
 
 /**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+static __inline__ int find_first_bit(void * addr, unsigned size)
+{
+	int d0, d1;
+	int res;
+
+	/* This looks at memory. Mark it volatile to tell gcc not to move it around */
+	__asm__ __volatile__(
+		"xorl %%eax,%%eax\n\t"
+		"repe; scasl\n\t"
+		"jz 1f\n\t"
+		"leal -4(%%edi),%%edi\n\t"
+		"bsfl (%%edi),%%eax\n"
+		"1:\tsubl %%ebx,%%edi\n\t"
+		"shll $3,%%edi\n\t"
+		"addl %%edi,%%eax"
+		:"=a" (res), "=&c" (d0), "=&D" (d1)
+		:"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
+	return res;
+}
+
+/**
  * find_next_zero_bit - find the first zero bit in a memory region
  * @addr: The address to base the search on
  * @offset: The bitnumber to start searching at
@@ -296,7 +333,7 @@
 	
 	if (bit) {
 		/*
-		 * Look for zero in first byte
+		 * Look for zero in the first 32 bits.
 		 */
 		__asm__("bsfl %1,%0\n\t"
 			"jne 1f\n\t"
@@ -317,6 +354,39 @@
 }
 
 /**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+static __inline__ int find_next_bit (void * addr, int size, int offset)
+{
+	unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
+	int set = 0, bit = offset & 31, res;
+	
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 32 bits:
+		 */
+		__asm__("bsfl %1,%0\n\t"
+			"jne 1f\n\t"
+			"movl $32, %0\n"
+			"1:"
+			: "=r" (set)
+			: "r" (*p >> bit));
+		if (set < (32 - bit))
+			return set + offset;
+		set = 32 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
+	return (offset + set + res);
+}
+
+/**
  * ffz - find first zero in word.
  * @word: The word to search
  *
@@ -330,8 +400,41 @@
 	return word;
 }
 
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static __inline__ unsigned long __ffs(unsigned long word)
+{
+	__asm__("bsfl %1,%0"
+		:"=r" (word)
+		:"rm" (word));
+	return word;
+}
+
 #ifdef __KERNEL__
 
+/*
+ * Every architecture must define this function. It's the fastest
+ * way of searching a 140-bit bitmap where the first 100 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 140
+ * bits is cleared.
+ */
+static inline int _sched_find_first_bit(unsigned long *b)
+{
+	if (unlikely(b[0]))
+		return __ffs(b[0]);
+	if (unlikely(b[1]))
+		return __ffs(b[1]) + 32;
+	if (unlikely(b[2]))
+		return __ffs(b[2]) + 64;
+	if (b[3])
+		return __ffs(b[3]) + 96;
+	return __ffs(b[4]) + 128;
+}
+
 /**
  * ffs - find first bit set
  * @x: the word to search
diff -urN linux-2.4.19-pre9/include/asm-i386/mmu_context.h linux/include/asm-i386/mmu_context.h
--- linux-2.4.19-pre9/include/asm-i386/mmu_context.h	Tue May 28 16:13:07 2002
+++ linux/include/asm-i386/mmu_context.h	Tue May 28 19:06:49 2002
@@ -27,13 +27,13 @@
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
 {
-	if (prev != next) {
+	if (likely(prev != next)) {
 		/* stop flush ipis for the previous mm */
 		clear_bit(cpu, &prev->cpu_vm_mask);
 		/*
 		 * Re-load LDT if necessary
 		 */
-		if (prev->context.segments != next->context.segments)
+		if (unlikely(prev->context.segments != next->context.segments))
 			load_LDT(next);
 #ifdef CONFIG_SMP
 		cpu_tlbstate[cpu].state = TLBSTATE_OK;
diff -urN linux-2.4.19-pre9/include/asm-i386/pgalloc.h linux/include/asm-i386/pgalloc.h
--- linux-2.4.19-pre9/include/asm-i386/pgalloc.h	Tue May 28 16:13:07 2002
+++ linux/include/asm-i386/pgalloc.h	Tue May 28 19:06:49 2002
@@ -224,6 +224,7 @@
 {
 	struct mm_struct *active_mm;
 	int state;
+	char __cacheline_padding[24];
 };
 extern struct tlb_state cpu_tlbstate[NR_CPUS];
 
diff -urN linux-2.4.19-pre9/include/asm-i386/smp.h linux/include/asm-i386/smp.h
--- linux-2.4.19-pre9/include/asm-i386/smp.h	Tue May 28 16:13:07 2002
+++ linux/include/asm-i386/smp.h	Tue May 28 19:06:49 2002
@@ -63,6 +63,7 @@
 extern void smp_flush_tlb(void);
 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
 extern void smp_send_reschedule(int cpu);
+extern void smp_send_reschedule_all(void);
 extern void smp_invalidate_rcv(void);		/* Process an NMI */
 extern void (*mtrr_hook) (void);
 extern void zap_low_mappings (void);
@@ -104,7 +105,7 @@
  * so this is correct in the x86 case.
  */
 
-#define smp_processor_id() (current->processor)
+#define smp_processor_id() (current->cpu)
 
 static __inline int hard_smp_processor_id(void)
 {
@@ -122,17 +123,5 @@
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
-/*
- *	This magic constant controls our willingness to transfer
- *	a process across CPUs. Such a transfer incurs misses on the L1
- *	cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
- *	gut feeling is this will vary by board in value. For a board
- *	with separate L2 cache it probably depends also on the RSS, and
- *	for a board with shared L2 cache it ought to decay fast as other
- *	processes are run.
- */
- 
-#define PROC_CHANGE_PENALTY	15		/* Schedule penalty */
-
 #endif
 #endif
diff -urN linux-2.4.19-pre9/include/linux/kernel_stat.h linux/include/linux/kernel_stat.h
--- linux-2.4.19-pre9/include/linux/kernel_stat.h	Tue May 28 16:13:04 2002
+++ linux/include/linux/kernel_stat.h	Tue May 28 19:06:49 2002
@@ -29,9 +29,10 @@
 #if !defined(CONFIG_ARCH_S390)
 	unsigned int irqs[NR_CPUS][NR_IRQS];
 #endif
-	unsigned int context_swtch;
 };
 
+extern unsigned long nr_context_switches(void);
+
 extern struct kernel_stat kstat;
 
 #if !defined(CONFIG_ARCH_S390)
diff -urN linux-2.4.19-pre9/include/linux/list.h linux/include/linux/list.h
--- linux-2.4.19-pre9/include/linux/list.h	Tue May 28 16:13:02 2002
+++ linux/include/linux/list.h	Tue May 28 19:06:49 2002
@@ -19,6 +19,8 @@
 	struct list_head *next, *prev;
 };
 
+typedef struct list_head list_t;
+
 #define LIST_HEAD_INIT(name) { &(name), &(name) }
 
 #define LIST_HEAD(name) \
diff -urN linux-2.4.19-pre9/include/linux/sched.h linux/include/linux/sched.h
--- linux-2.4.19-pre9/include/linux/sched.h	Tue May 28 16:13:02 2002
+++ linux/include/linux/sched.h	Tue May 28 19:12:10 2002
@@ -6,6 +6,7 @@
 extern unsigned long event;
 
 #include <linux/config.h>
+#include <linux/compiler.h>
 #include <linux/binfmts.h>
 #include <linux/threads.h>
 #include <linux/kernel.h>
@@ -21,7 +22,7 @@
 #include <asm/mmu.h>
 
 #include <linux/smp.h>
-#include <linux/tty.h>
+//#include <linux/tty.h>
 #include <linux/sem.h>
 #include <linux/signal.h>
 #include <linux/securebits.h>
@@ -73,10 +74,11 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+extern int nr_threads;
 extern int last_pid;
+extern unsigned long nr_running(void);
 
-#include <linux/fs.h>
+//#include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/param.h>
 #include <linux/resource.h>
@@ -119,12 +121,6 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 
-/*
- * This is an additional bit set when we want to
- * yield the CPU for one re-schedule..
- */
-#define SCHED_YIELD		0x10
-
 struct sched_param {
 	int sched_priority;
 };
@@ -142,17 +138,21 @@
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
+typedef struct task_struct task_t;
+
 extern void sched_init(void);
-extern void init_idle(void);
+extern void init_idle(task_t *idle, int cpu);
 extern void show_state(void);
 extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
-extern void update_one_process(struct task_struct *p, unsigned long user,
+extern void update_one_process(task_t *p, unsigned long user,
 			       unsigned long system, int cpu);
+extern void scheduler_tick(int user_tick, int system);
+extern void migration_init(void);
+extern unsigned long cache_decay_ticks;
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
@@ -164,6 +164,51 @@
 extern int current_is_keventd(void);
 
 /*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are
+ * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
+ * are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ *
+ * Both values are configurable at compile-time.
+ */
+
+#if CONFIG_MAX_USER_RT_PRIO < 100
+#define MAX_USER_RT_PRIO	100
+#elif CONFIG_MAX_USER_RT_PRIO > 1000
+#define MAX_USER_RT_PRIO	1000
+#else
+#define MAX_USER_RT_PRIO	CONFIG_MAX_USER_RT_PRIO
+#endif
+
+#if CONFIG_MAX_RT_PRIO < 0
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#elif CONFIG_MAX_RT_PRIO > 200
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 200)
+#else
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + CONFIG_MAX_RT_PRIO)
+#endif
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+
+/*
+ * The maximum RT priority is configurable.  If the resulting
+ * bitmap is 160 bits, we can use a hand-coded routine which
+ * is optimal.  Otherwise, we fall back on a generic routine for
+ * finding the first set bit from an arbitrarily-sized bitmap.
+ */
+#if MAX_PRIO < 160 && MAX_PRIO > 127
+#define sched_find_first_bit(map)	_sched_find_first_bit(map)
+#else
+#define sched_find_first_bit(map)	find_first_bit(map, MAX_PRIO)
+#endif
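
To see how the clamps above resolve in practice, here is a minimal user-space sketch (not part of the patch), assuming CONFIG_MAX_USER_RT_PRIO=100 and CONFIG_MAX_RT_PRIO=0:

	#include <stdio.h>

	int main(void)
	{
		int max_user_rt_prio = 100;		/* clamped to the 100 floor */
		int max_rt_prio = max_user_rt_prio;	/* CONFIG_MAX_RT_PRIO == 0 */
		int max_prio = max_rt_prio + 40;

		printf("RT priorities:          0..%d\n", max_rt_prio - 1);
		printf("SCHED_OTHER priorities: %d..%d\n", max_rt_prio, max_prio - 1);
		/*
		 * max_prio is 140 here; 127 < 140 < 160, so the hand-coded
		 * _sched_find_first_bit() variant is selected above.
		 */
		return 0;
	}

With those assumed values MAX_PRIO is 140: the real-time range is 0..99 and the SCHED_OTHER range is 100..139.
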
+
+/*
  * The default fd array needs to be at least BITS_PER_LONG,
  * as this is the granularity returned by copy_fdset().
  */
@@ -284,6 +329,8 @@
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
+typedef struct prio_array prio_array_t;
+
 struct task_struct {
 	/*
 	 * offsets of these are hardcoded elsewhere - touch with care
@@ -301,35 +348,26 @@
 
 	int lock_depth;		/* Lock depth */
 
-/*
- * offset 32 begins here on 32-bit platforms. We keep
- * all fields in a single cacheline that are needed for
- * the goodness() loop in schedule().
- */
-	long counter;
-	long nice;
-	unsigned long policy;
-	struct mm_struct *mm;
-	int processor;
 	/*
-	 * cpus_runnable is ~0 if the process is not running on any
-	 * CPU. It's (1 << cpu) if it's running on a CPU. This mask
-	 * is updated under the runqueue lock.
-	 *
-	 * To determine whether a process might run on a CPU, this
-	 * mask is AND-ed with cpus_allowed.
+	 * offset 32 begins here on 32-bit platforms.
 	 */
-	unsigned long cpus_runnable, cpus_allowed;
-	/*
-	 * (only the 'next' pointer fits into the cacheline, but
-	 * that's just fine.)
-	 */
-	struct list_head run_list;
-	unsigned long sleep_time;
+	unsigned int cpu;
+	int prio, static_prio;
+	list_t run_list;
+	prio_array_t *array;
+
+	unsigned long sleep_avg;
+	unsigned long sleep_timestamp;
+
+	unsigned long policy;
+	unsigned long cpus_allowed;
+	unsigned int time_slice;
+
+	task_t *next_task, *prev_task;
 
-	struct task_struct *next_task, *prev_task;
-	struct mm_struct *active_mm;
+	struct mm_struct *mm, *active_mm;
 	struct list_head local_pages;
+
 	unsigned int allocation_order, nr_local_pages;
 
 /* task state */
@@ -351,12 +389,12 @@
 	 * older sibling, respectively.  (p->father can be replaced with 
 	 * p->p_pptr->pid)
 	 */
-	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+	task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
 	struct list_head thread_group;
 
 	/* PID hash table linkage. */
-	struct task_struct *pidhash_next;
-	struct task_struct **pidhash_pprev;
+	task_t *pidhash_next;
+	task_t **pidhash_pprev;
 
 	wait_queue_head_t wait_chldexit;	/* for wait4() */
 	struct completion *vfork_done;		/* for vfork() */
@@ -454,9 +492,15 @@
  */
 #define _STK_LIM	(8*1024*1024)
 
-#define DEF_COUNTER	(10*HZ/100)	/* 100 ms time slice */
-#define MAX_COUNTER	(20*HZ/100)
-#define DEF_NICE	(0)
+#if CONFIG_SMP
+extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
+#else
+#define set_cpus_allowed(p, new_mask)	do { } while (0)
+#endif
+
+extern void set_user_nice(task_t *p, long nice);
+extern int task_prio(task_t *p);
+extern int task_nice(task_t *p);
 
 asmlinkage long sys_sched_yield(void);
 #define yield()	sys_sched_yield()
@@ -478,14 +522,14 @@
     addr_limit:		KERNEL_DS,					\
     exec_domain:	&default_exec_domain,				\
     lock_depth:		-1,						\
-    counter:		DEF_COUNTER,					\
-    nice:		DEF_NICE,					\
+    prio:		MAX_PRIO-20,					\
+    static_prio:	MAX_PRIO-20,					\
     policy:		SCHED_OTHER,					\
+    cpus_allowed:	-1,						\
     mm:			NULL,						\
     active_mm:		&init_mm,					\
-    cpus_runnable:	-1,						\
-    cpus_allowed:	-1,						\
     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
+    time_slice:		HZ,						\
     next_task:		&tsk,						\
     prev_task:		&tsk,						\
     p_opptr:		&tsk,						\
@@ -519,24 +563,24 @@
 #endif
 
 union task_union {
-	struct task_struct task;
+	task_t task;
 	unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
 };
 
 extern union task_union init_task_union;
 
 extern struct   mm_struct init_mm;
-extern struct task_struct *init_tasks[NR_CPUS];
+extern task_t *init_tasks[NR_CPUS];
 
 /* PID hashing. (shouldnt this be dynamic?) */
 #define PIDHASH_SZ (4096 >> 2)
-extern struct task_struct *pidhash[PIDHASH_SZ];
+extern task_t *pidhash[PIDHASH_SZ];
 
 #define pid_hashfn(x)	((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
 
-static inline void hash_pid(struct task_struct *p)
+static inline void hash_pid(task_t *p)
 {
-	struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
+	task_t **htable = &pidhash[pid_hashfn(p->pid)];
 
 	if((p->pidhash_next = *htable) != NULL)
 		(*htable)->pidhash_pprev = &p->pidhash_next;
@@ -544,16 +588,16 @@
 	p->pidhash_pprev = htable;
 }
 
-static inline void unhash_pid(struct task_struct *p)
+static inline void unhash_pid(task_t *p)
 {
 	if(p->pidhash_next)
 		p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
 	*p->pidhash_pprev = p->pidhash_next;
 }
 
-static inline struct task_struct *find_task_by_pid(int pid)
+static inline task_t *find_task_by_pid(int pid)
 {
-	struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
+	task_t *p, **htable = &pidhash[pid_hashfn(pid)];
 
 	for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
 		;
@@ -561,19 +605,6 @@
 	return p;
 }
 
-#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
-
-static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
-{
-	tsk->processor = cpu;
-	tsk->cpus_runnable = 1UL << cpu;
-}
-
-static inline void task_release_cpu(struct task_struct *tsk)
-{
-	tsk->cpus_runnable = ~0UL;
-}
-
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(uid_t);
 extern void free_uid(struct user_struct *);
@@ -593,54 +624,50 @@
 #define CURRENT_TIME (xtime.tv_sec)
 
 extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
-extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
 extern void FASTCALL(sleep_on(wait_queue_head_t *q));
 extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
 				      signed long timeout));
 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
 						    signed long timeout));
-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process(task_t * p));
+extern void FASTCALL(wake_up_forked_process(task_t * p));
 
 #define wake_up(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
 #define wake_up_nr(x, nr)		__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
 #define wake_up_all(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
-#define wake_up_sync(x)			__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
-#define wake_up_sync_nr(x, nr)		__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
 #define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
 #define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
 #define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
-#define wake_up_interruptible_sync(x)	__wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
-#define wake_up_interruptible_sync_nr(x) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)
 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
 
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
 
 extern void proc_caches_init(void);
-extern void flush_signals(struct task_struct *);
-extern void flush_signal_handlers(struct task_struct *);
+extern void flush_signals(task_t *);
+extern void flush_signal_handlers(task_t *);
 extern void sig_exit(int, int, struct siginfo *);
 extern int dequeue_signal(sigset_t *, siginfo_t *);
 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
 			      sigset_t *mask);
 extern void unblock_all_signals(void);
-extern int send_sig_info(int, struct siginfo *, struct task_struct *);
-extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int send_sig_info(int, struct siginfo *, task_t *);
+extern int force_sig_info(int, struct siginfo *, task_t *);
 extern int kill_pg_info(int, struct siginfo *, pid_t);
 extern int kill_sl_info(int, struct siginfo *, pid_t);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern void notify_parent(struct task_struct *, int);
-extern void do_notify_parent(struct task_struct *, int);
-extern void force_sig(int, struct task_struct *);
-extern int send_sig(int, struct task_struct *, int);
+extern void notify_parent(task_t *, int);
+extern void do_notify_parent(task_t *, int);
+extern void force_sig(int, task_t *);
+extern int send_sig(int, task_t *, int);
 extern int kill_pg(pid_t, int, int);
 extern int kill_sl(pid_t, int, int);
 extern int kill_proc(pid_t, int, int);
 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
 
-static inline int signal_pending(struct task_struct *p)
+static inline int signal_pending(task_t *p)
 {
 	return (p->sigpending != 0);
 }
@@ -679,7 +706,7 @@
    This is required every time the blocked sigset_t changes.
    All callers should have t->sigmask_lock.  */
 
-static inline void recalc_sigpending(struct task_struct *t)
+static inline void recalc_sigpending(task_t *t)
 {
 	t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
 }
@@ -786,16 +813,17 @@
 extern int expand_fdset(struct files_struct *, int nr);
 extern void free_fdset(fd_set *, int);
 
-extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
 extern void flush_thread(void);
 extern void exit_thread(void);
 
-extern void exit_mm(struct task_struct *);
-extern void exit_files(struct task_struct *);
-extern void exit_sighand(struct task_struct *);
+extern void exit_mm(task_t *);
+extern void exit_files(task_t *);
+extern void exit_sighand(task_t *);
 
 extern void reparent_to_init(void);
 extern void daemonize(void);
+extern task_t *child_reaper;
 
 extern int do_execve(char *, char **, char **, struct pt_regs *);
 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
@@ -804,6 +832,9 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+extern void wait_task_inactive(task_t * p);
+extern void kick_if_running(task_t * p);
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -885,27 +916,12 @@
 	for (task = next_thread(current) ; task != current ; task = next_thread(task))
 
 #define next_thread(p) \
-	list_entry((p)->thread_group.next, struct task_struct, thread_group)
+	list_entry((p)->thread_group.next, task_t, thread_group)
 
 #define thread_group_leader(p)	(p->pid == p->tgid)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
-
-static inline int task_on_runqueue(struct task_struct *p)
+static inline void unhash_process(task_t *p)
 {
-	return (p->run_list.next != NULL);
-}
-
-static inline void unhash_process(struct task_struct *p)
-{
-	if (task_on_runqueue(p))
-		out_of_line_bug();
 	write_lock_irq(&tasklist_lock);
 	nr_threads--;
 	unhash_pid(p);
@@ -915,12 +931,12 @@
 }
 
 /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
-static inline void task_lock(struct task_struct *p)
+static inline void task_lock(task_t *p)
 {
 	spin_lock(&p->alloc_lock);
 }
 
-static inline void task_unlock(struct task_struct *p)
+static inline void task_unlock(task_t *p)
 {
 	spin_unlock(&p->alloc_lock);
 }
@@ -944,6 +960,31 @@
 	return res;
 }
 
+static inline void set_need_resched(void)
+{
+	current->need_resched = 1;
+}
+
+static inline void clear_need_resched(void)
+{
+	current->need_resched = 0;
+}
+
+static inline void set_tsk_need_resched(task_t *tsk)
+{
+	tsk->need_resched = 1;
+}
+
+static inline void clear_tsk_need_resched(task_t *tsk)
+{
+	tsk->need_resched = 0;
+}
+
+static inline int need_resched(void)
+{
+	return unlikely(current->need_resched);
+}
+
 #endif /* __KERNEL__ */
 
 #endif
diff -urN linux-2.4.19-pre9/include/linux/smp.h linux/include/linux/smp.h
--- linux-2.4.19-pre9/include/linux/smp.h	Tue May 28 16:13:05 2002
+++ linux/include/linux/smp.h	Tue May 28 19:06:49 2002
@@ -86,6 +86,14 @@
 #define cpu_number_map(cpu)			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
 #define cpu_online_map				1
+static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_all(void) { }
 
 #endif
+
+/*
+ * Common definitions:
+ */
+#define cpu()					smp_processor_id()
+
 #endif
diff -urN linux-2.4.19-pre9/init/do_mounts.c linux/init/do_mounts.c
--- linux-2.4.19-pre9/init/do_mounts.c	Tue May 28 16:13:02 2002
+++ linux/init/do_mounts.c	Tue May 28 19:12:10 2002
@@ -774,10 +774,8 @@
 
 	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 	if (pid > 0) {
-		while (pid != wait(&i)) {
-			current->policy |= SCHED_YIELD;
-			schedule();
-		}
+		while (pid != wait(&i))
+			yield();
 	}
 
 	sys_mount("..", ".", NULL, MS_MOVE, NULL);
diff -urN linux-2.4.19-pre9/init/main.c linux/init/main.c
--- linux-2.4.19-pre9/init/main.c	Tue May 28 16:13:02 2002
+++ linux/init/main.c	Tue May 28 19:06:49 2002
@@ -288,8 +288,6 @@
 extern void setup_arch(char **);
 extern void cpu_idle(void);
 
-unsigned long wait_init_idle;
-
 #ifndef CONFIG_SMP
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -298,34 +296,24 @@
 	APIC_init_uniprocessor();
 }
 #else
-#define smp_init()	do { } while (0)
+#define smp_init()      do { } while (0)
 #endif
 
 #else
 
-
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
 {
 	/* Get other processors into their bootup holding patterns. */
 	smp_boot_cpus();
-	wait_init_idle = cpu_online_map;
-	clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
 
 	smp_threads_ready=1;
 	smp_commence();
-
-	/* Wait for the other cpus to set up their idle processes */
-	printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
-	while (wait_init_idle) {
-		cpu_relax();
-		barrier();
-	}
-	printk("All processors have done init_idle\n");
 }
 
 #endif
 
+
 /*
  * We need to finalize in a non-__init function or else race conditions
  * between the root thread and the init thread may cause start_kernel to
@@ -337,9 +325,8 @@
 {
 	kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	unlock_kernel();
-	current->need_resched = 1;
- 	cpu_idle();
-} 
+	cpu_idle();
+}
 
 /*
  *	Activate the first processor.
@@ -424,14 +411,18 @@
 	ipc_init();
 #endif
 	check_bugs();
+
 	printk("POSIX conformance testing by UNIFIX\n");
 
-	/* 
-	 *	We count on the initial thread going ok 
-	 *	Like idlers init is an unlocked kernel thread, which will
-	 *	make syscalls (and thus be locked).
+	init_idle(current, smp_processor_id());
+	/*
+	 *      We count on the initial thread going ok
+	 *      Like idlers init is an unlocked kernel thread, which will
+	 *      make syscalls (and thus be locked).
 	 */
 	smp_init();
+
+	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }
 
@@ -460,6 +451,10 @@
  */
 static void __init do_basic_setup(void)
 {
+	/* Start the per-CPU migration threads */
+#if CONFIG_SMP
+	migration_init();
+#endif
 
 	/*
 	 * Tell the world that we're going to be the grim
diff -urN linux-2.4.19-pre9/kernel/capability.c linux/kernel/capability.c
--- linux-2.4.19-pre9/kernel/capability.c	Tue May 28 16:13:02 2002
+++ linux/kernel/capability.c	Tue May 28 19:06:49 2002
@@ -8,6 +8,8 @@
 #include <linux/mm.h>
 #include <asm/uaccess.h>
 
+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+
 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
 
 /* Note: never hold tasklist_lock while spinning for this one */
diff -urN linux-2.4.19-pre9/kernel/exit.c linux/kernel/exit.c
--- linux-2.4.19-pre9/kernel/exit.c	Tue May 28 16:13:02 2002
+++ linux/kernel/exit.c	Tue May 28 19:06:49 2002
@@ -28,49 +28,22 @@
 
 static void release_task(struct task_struct * p)
 {
-	if (p != current) {
+	if (p == current)
+		BUG();
 #ifdef CONFIG_SMP
-		/*
-		 * Wait to make sure the process isn't on the
-		 * runqueue (active on some other CPU still)
-		 */
-		for (;;) {
-			task_lock(p);
-			if (!task_has_cpu(p))
-				break;
-			task_unlock(p);
-			do {
-				cpu_relax();
-				barrier();
-			} while (task_has_cpu(p));
-		}
-		task_unlock(p);
+	wait_task_inactive(p);
 #endif
-		atomic_dec(&p->user->processes);
-		free_uid(p->user);
-		unhash_process(p);
-
-		release_thread(p);
-		current->cmin_flt += p->min_flt + p->cmin_flt;
-		current->cmaj_flt += p->maj_flt + p->cmaj_flt;
-		current->cnswap += p->nswap + p->cnswap;
-		/*
-		 * Potentially available timeslices are retrieved
-		 * here - this way the parent does not get penalized
-		 * for creating too many processes.
-		 *
-		 * (this cannot be used to artificially 'generate'
-		 * timeslices, because any timeslice recovered here
-		 * was given away by the parent in the first place.)
-		 */
-		current->counter += p->counter;
-		if (current->counter >= MAX_COUNTER)
-			current->counter = MAX_COUNTER;
-		p->pid = 0;
-		free_task_struct(p);
-	} else {
-		printk("task releasing itself\n");
-	}
+	atomic_dec(&p->user->processes);
+	free_uid(p->user);
+	unhash_process(p);
+
+	release_thread(p);
+	current->cmin_flt += p->min_flt + p->cmin_flt;
+	current->cmaj_flt += p->maj_flt + p->cmaj_flt;
+	current->cnswap += p->nswap + p->cnswap;
+	sched_exit(p);
+	p->pid = 0;
+	free_task_struct(p);
 }
 
 /*
@@ -150,6 +123,79 @@
 	return retval;
 }
 
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * Various pieces of task state, such as scheduling policy and priority, may
+ * have been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+	write_lock_irq(&tasklist_lock);
+
+	/* Reparent to init */
+	REMOVE_LINKS(current);
+	current->p_pptr = child_reaper;
+	current->p_opptr = child_reaper;
+	SET_LINKS(current);
+
+	/* Set the exit signal to SIGCHLD so we signal init on exit */
+	current->exit_signal = SIGCHLD;
+
+	current->ptrace = 0;
+	if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
+		set_user_nice(current, 0);
+	/* cpus_allowed? */
+	/* rt_priority? */
+	/* signals? */
+	current->cap_effective = CAP_INIT_EFF_SET;
+	current->cap_inheritable = CAP_INIT_INH_SET;
+	current->cap_permitted = CAP_FULL_SET;
+	current->keep_capabilities = 0;
+	memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+	current->user = INIT_USER;
+
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ *	Put all the gunge required to become a kernel thread without
+ *	attached user resources in one place where it belongs.
+ */
+
+void daemonize(void)
+{
+	struct fs_struct *fs;
+
+
+	/*
+	 * If we were started as result of loading a module, close all of the
+	 * user space pages.  We don't need them, and if we didn't close them
+	 * they would be locked into memory.
+	 */
+	exit_mm(current);
+
+	current->session = 1;
+	current->pgrp = 1;
+	current->tty = NULL;
+
+	/* Become as one with the init task */
+
+	exit_fs(current);	/* current->fs->count--; */
+	fs = init_task.fs;
+	current->fs = fs;
+	atomic_inc(&fs->count);
+ 	exit_files(current);
+	current->files = init_task.files;
+	atomic_inc(&current->files->count);
+}
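
For context, a 2.4-style kernel thread would typically use these helpers roughly as follows; this is an illustrative sketch only (the wait queue, thread name and clone flags are made up for the example):

	static DECLARE_WAIT_QUEUE_HEAD(my_wait);	/* hypothetical */

	static int my_kthread(void *unused)
	{
		daemonize();
		strcpy(current->comm, "mythread");

		while (!signal_pending(current))
			interruptible_sleep_on_timeout(&my_wait, HZ);
		return 0;
	}

	/* started elsewhere with:
	 *	kernel_thread(my_kthread, NULL, CLONE_FS | CLONE_FILES);
	 */
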
+
 /*
  * When we die, we re-parent all our children.
  * Try to give them to another thread in our process
diff -urN linux-2.4.19-pre9/kernel/fork.c linux/kernel/fork.c
--- linux-2.4.19-pre9/kernel/fork.c	Tue May 28 16:13:02 2002
+++ linux/kernel/fork.c	Tue May 28 19:06:49 2002
@@ -30,7 +30,6 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
@@ -38,6 +37,8 @@
 
 struct task_struct *pidhash[PIDHASH_SZ];
 
+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -578,6 +579,7 @@
 	    struct pt_regs *regs, unsigned long stack_size)
 {
 	int retval;
+	unsigned long flags;
 	struct task_struct *p;
 	struct completion vfork;
 
@@ -638,8 +640,7 @@
 	if (p->pid == 0 && current->pid != 0)
 		goto bad_fork_cleanup;
 
-	p->run_list.next = NULL;
-	p->run_list.prev = NULL;
+	INIT_LIST_HEAD(&p->run_list);
 
 	p->p_cptr = NULL;
 	init_waitqueue_head(&p->wait_chldexit);
@@ -665,14 +666,15 @@
 #ifdef CONFIG_SMP
 	{
 		int i;
-		p->cpus_runnable = ~0UL;
-		p->processor = current->processor;
+
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
-			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
+			p->per_cpu_utime[cpu_logical_map(i)] =
+				p->per_cpu_stime[cpu_logical_map(i)] = 0;
 		spin_lock_init(&p->sigmask_lock);
 	}
 #endif
+	p->array = NULL;
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
@@ -706,15 +708,27 @@
 	p->pdeath_signal = 0;
 
 	/*
-	 * "share" dynamic priority between parent and child, thus the
-	 * total amount of dynamic priorities in the system doesnt change,
-	 * more scheduling fairness. This is only important in the first
-	 * timeslice, on the long run the scheduling behaviour is unchanged.
-	 */
-	p->counter = (current->counter + 1) >> 1;
-	current->counter >>= 1;
-	if (!current->counter)
-		current->need_resched = 1;
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	__save_flags(flags);
+	__cli();
+	if (!current->time_slice)
+		BUG();
+	p->time_slice = (current->time_slice + 1) >> 1;
+	current->time_slice >>= 1;
+	if (!current->time_slice) {
+		/*
+		 * This case is rare, it happens when the parent has only
+		 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->time_slice = 1;
+		scheduler_tick(0,0);
+	}
+	p->sleep_timestamp = jiffies;
+	__restore_flags(flags);
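
For example (hypothetical numbers, not from the patch), a parent with 11 ticks left keeps 5 and hands 6 to the child, so the sum is unchanged; only a parent that is down to a single tick hits the scheduler_tick() special case handled just above:

	#include <stdio.h>

	int main(void)
	{
		unsigned int parent = 11, child;	/* hypothetical ticks */

		child = (parent + 1) >> 1;	/* 6 */
		parent >>= 1;			/* 5 */
		printf("child=%u parent=%u\n", child, parent);
		return 0;
	}
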
 
 	/*
 	 * Ok, add it to the run-queues and make it
@@ -750,11 +764,16 @@
 
 	if (p->ptrace & PT_PTRACED)
 		send_sig(SIGSTOP, p, 1);
-
-	wake_up_process(p);		/* do this last */
+	wake_up_forked_process(p);	/* do this last */
 	++total_forks;
 	if (clone_flags & CLONE_VFORK)
 		wait_for_completion(&vfork);
+	else
+		/*
+		 * Let the child process run first, to avoid most of the
+		 * COW overhead when the child exec()s afterwards.
+		 */
+		current->need_resched = 1;
 
 fork_out:
 	return retval;
diff -urN linux-2.4.19-pre9/kernel/ksyms.c linux/kernel/ksyms.c
--- linux-2.4.19-pre9/kernel/ksyms.c	Tue May 28 16:13:02 2002
+++ linux/kernel/ksyms.c	Tue May 28 19:06:49 2002
@@ -434,7 +434,6 @@
 /* process management */
 EXPORT_SYMBOL(complete_and_exit);
 EXPORT_SYMBOL(__wake_up);
-EXPORT_SYMBOL(__wake_up_sync);
 EXPORT_SYMBOL(wake_up_process);
 EXPORT_SYMBOL(sleep_on);
 EXPORT_SYMBOL(sleep_on_timeout);
@@ -443,6 +442,10 @@
 EXPORT_SYMBOL(schedule);
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(sys_sched_yield);
+EXPORT_SYMBOL(set_user_nice);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+#endif
 EXPORT_SYMBOL(jiffies);
 EXPORT_SYMBOL(xtime);
 EXPORT_SYMBOL(do_gettimeofday);
@@ -454,6 +457,7 @@
 
 EXPORT_SYMBOL(kstat);
 EXPORT_SYMBOL(nr_running);
+EXPORT_SYMBOL(nr_context_switches);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -urN linux-2.4.19-pre9/kernel/printk.c linux/kernel/printk.c
--- linux-2.4.19-pre9/kernel/printk.c	Tue May 28 16:13:02 2002
+++ linux/kernel/printk.c	Tue May 28 19:06:49 2002
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>			/* For in_interrupt() */
 #include <linux/config.h>
+#include <linux/delay.h>
 
 #include <asm/uaccess.h>
 
diff -urN linux-2.4.19-pre9/kernel/ptrace.c linux/kernel/ptrace.c
--- linux-2.4.19-pre9/kernel/ptrace.c	Tue May 28 16:13:02 2002
+++ linux/kernel/ptrace.c	Tue May 28 19:06:49 2002
@@ -31,20 +31,7 @@
 		if (child->state != TASK_STOPPED)
 			return -ESRCH;
 #ifdef CONFIG_SMP
-		/* Make sure the child gets off its CPU.. */
-		for (;;) {
-			task_lock(child);
-			if (!task_has_cpu(child))
-				break;
-			task_unlock(child);
-			do {
-				if (child->state != TASK_STOPPED)
-					return -ESRCH;
-				barrier();
-				cpu_relax();
-			} while (task_has_cpu(child));
-		}
-		task_unlock(child);
+		wait_task_inactive(child);
 #endif		
 	}
 
diff -urN linux-2.4.19-pre9/kernel/sched.c linux/kernel/sched.c
--- linux-2.4.19-pre9/kernel/sched.c	Tue May 28 16:13:02 2002
+++ linux/kernel/sched.c	Tue May 28 19:06:49 2002
@@ -3,342 +3,297 @@
  *
  *  Kernel scheduler and related syscalls
  *
- *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *              make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
- *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *  		hybrid priority-list and round-robin design with
+ *  		an array-switch method of distributing timeslices
+ *  		and per-CPU runqueues.  Additional code by Davide
+ *  		Libenzi, Robert Love, and Rusty Russell.
  */
 
-/*
- * 'sched.c' is the main kernel file. It contains scheduling primitives
- * (sleep_on, wakeup, schedule etc) as well as a number of simple system
- * call functions (type getpid()), which just extract a field from
- * current-task
- */
-
-#include <linux/config.h>
 #include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/nmi.h>
 #include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include <linux/completion.h>
-#include <linux/prefetch.h>
-#include <linux/compiler.h>
-
+#include <linux/init.h>
 #include <asm/uaccess.h>
+#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
-
-extern void timer_bh(void);
-extern void tqueue_bh(void);
-extern void immediate_bh(void);
+#include <linux/kernel_stat.h>
 
 /*
- * scheduler variables
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
  */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
-
-extern void mem_use(void);
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
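
A quick worked example of the conversion macros, again assuming MAX_RT_PRIO resolves to 100 (a sketch, not part of the patch):

	#include <stdio.h>

	#define MAX_RT_PRIO		100	/* assumed configuration */
	#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
	#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

	int main(void)
	{
		static const int nice[] = { -20, 0, 19 };
		unsigned int i;

		for (i = 0; i < sizeof(nice)/sizeof(nice[0]); i++)
			printf("nice %3d -> prio %d -> user prio %d\n",
			       nice[i], NICE_TO_PRIO(nice[i]),
			       USER_PRIO(NICE_TO_PRIO(nice[i])));
		return 0;
	}

So nice -20 maps to static priority 100 (user priority 0), nice 0 to 120 (20) and nice +19 to 139 (39), with MAX_USER_PRIO evaluating to 40.
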
 
 /*
- * Scheduling quanta.
+ * These are the 'tuning knobs' of the scheduler:
  *
- * NOTE! The unix "nice" value influences how long a process
- * gets. The nice value ranges from -20 to +19, where a -20
- * is a "high-priority" task, and a "+10" is a low-priority
- * task.
- *
- * We want the time-slice to be around 50ms or so, so this
- * calculation depends on the value of HZ.
+ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
+ * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * they expire.
  */
-#if HZ < 200
-#define TICK_SCALE(x)	((x) >> 2)
-#elif HZ < 400
-#define TICK_SCALE(x)	((x) >> 1)
-#elif HZ < 800
-#define TICK_SCALE(x)	(x)
-#elif HZ < 1600
-#define TICK_SCALE(x)	((x) << 1)
-#else
-#define TICK_SCALE(x)	((x) << 2)
-#endif
-
-#define NICE_TO_TICKS(nice)	(TICK_SCALE(20-(nice))+1)
-
+#define MIN_TIMESLICE		( 10 * HZ / 1000)
+#define MAX_TIMESLICE		(300 * HZ / 1000)
+#define CHILD_PENALTY		95
+#define PARENT_PENALTY		100
+#define EXIT_WEIGHT		3
+#define PRIO_BONUS_RATIO	25
+#define INTERACTIVE_DELTA	2
+#define MAX_SLEEP_AVG		(2*HZ)
+#define STARVATION_LIMIT	(2*HZ)
 
 /*
- *	Init task must be ok at boot for the ix86 as we will check its signals
- *	via the SMP irq return path.
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ *  priority range a task can explore, a value of '1' means the
+ *  task is rated interactive.)
+ *
+ * I.e. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavy CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
  */
- 
-struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
+
+#define SCALE(v1,v1_max,v2_max) \
+	(v1) * (v2_max) / (v1_max)
+
+#define DELTA(p) \
+	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
+		INTERACTIVE_DELTA)
+
+#define TASK_INTERACTIVE(p) \
+	((p)->prio <= (p)->static_prio - DELTA(p))
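
The cut-off can be checked numerically; the following sketch (not part of the patch) hard-codes the assumed values MAX_USER_PRIO=40, PRIO_BONUS_RATIO=25 and INTERACTIVE_DELTA=2, so SCALE(nice, 40, 10) is nice*10/40:

	#include <stdio.h>

	#define SCALE(v1,v1_max,v2_max)	((v1) * (v2_max) / (v1_max))
	#define DELTA(nice)		(SCALE((nice), 40, 10) + 2)

	int main(void)
	{
		printf("DELTA(-20)=%d DELTA(0)=%d DELTA(19)=%d\n",
		       DELTA(-20), DELTA(0), DELTA(19));	/* -3 2 6 */
		return 0;
	}

DELTA is -3, +2 and +6 for nice -20, 0 and +19 respectively, which matches the interactivity rows quoted above for those nice levels.
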
 
 /*
- * The tasklist_lock protects the linked list of processes.
- *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
+ * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
+ * to time slice values.
  *
- * If both locks are to be concurrently held, the runqueue_lock
- * nests inside the tasklist_lock.
- *
- * task->alloc_lock nests inside tasklist_lock.
+ * The higher a process's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority process gets MIN_TIMESLICE worth of execution time.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
+#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
+	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
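
Plugging in numbers makes the macro concrete; the sketch below (not part of the patch) assumes HZ=100 and MAX_RT_PRIO=100, so MAX_PRIO is 140:

	#include <stdio.h>

	#define HZ		100	/* assumed */
	#define MAX_PRIO	140	/* assumed: MAX_RT_PRIO == 100 */
	#define MIN_TIMESLICE	( 10 * HZ / 1000)
	#define MAX_TIMESLICE	(300 * HZ / 1000)
	#define TIMESLICE(sp)	(MIN_TIMESLICE + \
		((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(sp))/39))

	int main(void)
	{
		/* static priorities 100, 120, 139 == nice -20, 0, +19 */
		printf("%d %d %d ticks\n",
		       TIMESLICE(100), TIMESLICE(120), TIMESLICE(139));
		return 0;
	}

It prints 30, 15 and 1 ticks: 300 ms for nice -20, the 150 ms default for nice 0 and 10 ms for nice +19.
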
 
 /*
- * We align per-CPU scheduling data on cacheline boundaries,
- * to prevent cacheline ping-pong.
+ * These are the runqueue data structures:
  */
-static union {
-	struct schedule_data {
-		struct task_struct * curr;
-		cycles_t last_schedule;
-	} schedule_data;
-	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
-
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 
-struct kernel_stat kstat;
-extern struct task_struct *child_reaper;
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 
-#ifdef CONFIG_SMP
+typedef struct runqueue runqueue_t;
 
-#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
-#define can_schedule(p,cpu) \
-	((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
-
-#else
-
-#define idle_task(cpu) (&init_task)
-#define can_schedule(p,cpu) (1)
-
-#endif
-
-void scheduling_functions_start_here(void) { }
+struct prio_array {
+	int nr_active;
+	unsigned long bitmap[BITMAP_SIZE];
+	list_t queue[MAX_PRIO];
+};
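
For the assumed MAX_PRIO of 140, BITMAP_SIZE can be evaluated directly (a sketch, not part of the patch):

	#include <stdio.h>

	#define MAX_PRIO	140	/* assumed configuration */
	#define BITMAP_SIZE	((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))

	int main(void)
	{
		printf("%lu longs (%lu bits)\n",
		       (unsigned long) BITMAP_SIZE,
		       (unsigned long) (BITMAP_SIZE * sizeof(long) * 8));
		return 0;
	}

It prints 5 longs (160 bits) on a 32-bit machine and 3 longs (192 bits) on a 64-bit one, which is why the 160-bit hand-coded sched_find_first_bit() case covers the default 32-bit configuration.
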
 
 /*
- * This is the function that decides how desirable a process is..
- * You can weigh different processes against each other depending
- * on what CPU they've run on lately etc to try to handle cache
- * and TLB miss penalties.
+ * This is the main, per-CPU runqueue data structure.
  *
- * Return values:
- *	 -1000: never select this
- *	     0: out of time, recalculate counters (but it might still be
- *		selected)
- *	   +ve: "goodness" value (the larger, the better)
- *	 +1000: realtime process, select this.
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the process migration code) must
+ * order their lock acquisitions by ascending &runqueue address.
  */
-
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
-{
-	int weight;
-
-	/*
-	 * select the current process after every other
-	 * runnable process, but before the idle thread.
-	 * Also, dont trigger a counter recalculation.
-	 */
-	weight = -1;
-	if (p->policy & SCHED_YIELD)
-		goto out;
-
-	/*
-	 * Non-RT process - normal case first.
-	 */
-	if (p->policy == SCHED_OTHER) {
-		/*
-		 * Give the process a first-approximation goodness value
-		 * according to the number of clock-ticks it has left.
-		 *
-		 * Don't do any other calculations if the time slice is
-		 * over..
-		 */
-		weight = p->counter;
-		if (!weight)
-			goto out;
-			
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
-
-		/* .. and a slight advantage to the current MM */
-		if (p->mm == this_mm || !p->mm)
-			weight += 1;
-		weight += 20 - p->nice;
-		goto out;
+struct runqueue {
+	spinlock_t lock;
+	spinlock_t frozen;
+	unsigned long nr_running, nr_switches, expired_timestamp;
+	task_t *curr, *idle;
+	prio_array_t *active, *expired, arrays[2];
+	int prev_nr_running[NR_CPUS];
+	task_t *migration_thread;
+	list_t migration_queue;
+} ____cacheline_aligned;
+
+static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
+
+#define cpu_rq(cpu)		(runqueues + (cpu))
+#define this_rq()		cpu_rq(smp_processor_id())
+#define task_rq(p)		cpu_rq((p)->cpu)
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock_irqsave(&rq->lock, *flags);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock_irqrestore(&rq->lock, *flags);
+		goto repeat_lock_task;
 	}
+	return rq;
+}
 
-	/*
-	 * Realtime process, select the first one on the
-	 * runqueue (taking priorities within processes
-	 * into account).
-	 */
-	weight = 1000 + p->rt_priority;
-out:
-	return weight;
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+{
+	spin_unlock_irqrestore(&rq->lock, *flags);
 }
 
 /*
- * the 'goodness value' of replacing a process on a given CPU.
- * positive value means 'replace', zero or negative means 'dont'.
+ * Adding/removing a task to/from a priority array:
  */
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
-	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+	array->nr_active--;
+	list_del(&p->run_list);
+	if (list_empty(array->queue + p->prio))
+		__clear_bit(p->prio, array->bitmap);
 }
 
-/*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
- */
-static FASTCALL(void reschedule_idle(struct task_struct * p));
+static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
 
-static void reschedule_idle(struct task_struct * p)
+static inline int effective_prio(task_t *p)
 {
-#ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
-
-	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
-			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
-			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
-			return;
-		}
-	}
+	int bonus, prio;
 
 	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
+	 * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+	 * into the -5 ... 0 ... +5 bonus/penalty range.
+	 *
+	 * We use 25% of the full 0...39 priority range so that:
+	 *
+	 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+	 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+	 *
+	 * Both properties are important to certain workloads.
+	 */
+	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
+			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		prio = MAX_PRIO-1;
+	return prio;
+}
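
With the assumed knob values (MAX_USER_PRIO=40, PRIO_BONUS_RATIO=25, MAX_SLEEP_AVG=2*HZ), the bonus term spans exactly -5..+5. A minimal check of the arithmetic (not part of the patch):

	#include <stdio.h>

	#define HZ		100	/* assumed */
	#define MAX_SLEEP_AVG	(2*HZ)

	static int bonus(long sleep_avg)
	{
		return (int)(40*25*sleep_avg/MAX_SLEEP_AVG/100 - 40*25/100/2);
	}

	int main(void)
	{
		printf("%d %d %d\n", bonus(0), bonus(HZ), bonus(2*HZ));	/* -5 0 5 */
		return 0;
	}

So a task that never sleeps ends up five priority levels worse (numerically higher) than its static priority, a task with a full sleep average ends up five levels better, and sleep_avg == HZ is the neutral point.
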
+
+static inline void activate_task(task_t *p, runqueue_t *rq)
+{
+	unsigned long sleep_time = jiffies - p->sleep_timestamp;
+	prio_array_t *array = rq->active;
+
+	if (!rt_task(p) && sleep_time) {
 		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
+		 * This code gives a bonus to interactive tasks. We update
+		 * an 'average sleep time' value here, based on
+		 * sleep_timestamp. The more time a task spends sleeping,
+		 * the higher the average gets - and the higher the priority
+		 * boost gets as well.
 		 */
-		if (tsk == idle_task(cpu)) {
-#if defined(__i386__) && defined(CONFIG_SMP)
-                        /*
-			 * Check if two siblings are idle in the same
-			 * physical package. Use them if found.
-			 */
-			if (smp_num_siblings == 2) {
-				if (cpu_curr(cpu_sibling_map[cpu]) == 
-			            idle_task(cpu_sibling_map[cpu])) {
-					oldest_idle = last_schedule(cpu);
-					target_tsk = tsk;
-					break;
-				}
-				
-                        }
-#endif		
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
-
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
-				}
-			}
-		}
-	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
-		}
-		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+		p->sleep_avg += sleep_time;
+		if (p->sleep_avg > MAX_SLEEP_AVG)
+			p->sleep_avg = MAX_SLEEP_AVG;
+		p->prio = effective_prio(p);
 	}
-	return;
-		
+	enqueue_task(p, array);
+	rq->nr_running++;
+}
 
-#else /* UP */
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk;
-
-	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
-		tsk->need_resched = 1;
-#endif
+static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dequeue_task(p, p->array);
+	p->array = NULL;
 }
 
-/*
- * Careful!
- *
- * This has to add the process to the _beginning_ of the
- * run-queue, not the end. See the comment about "This is
- * subtle" in the scheduler proper..
- */
-static inline void add_to_runqueue(struct task_struct * p)
+static inline void resched_task(task_t *p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	int need_resched;
+
+	need_resched = p->need_resched;
+	wmb();
+	set_tsk_need_resched(p);
+	if (!need_resched && (p->cpu != smp_processor_id()))
+		smp_send_reschedule(p->cpu);
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+#ifdef CONFIG_SMP
+
+/*
+ * Wait for a process to unschedule. This is used by the exit() and
+ * ptrace() code.
+ */
+void wait_task_inactive(task_t * p)
 {
-	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	unsigned long flags;
+	runqueue_t *rq;
+
+repeat:
+	rq = task_rq(p);
+	while (unlikely(rq->curr == p)) {
+		cpu_relax();
+		barrier();
+	}
+	rq = task_rq_lock(p, &flags);
+	if (unlikely(rq->curr == p)) {
+		task_rq_unlock(rq, &flags);
+		goto repeat;
+	}
+	task_rq_unlock(rq, &flags);
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
+/*
+ * Kick the remote CPU if the task is running currently,
+ * this code is used by the signal code to signal tasks
+ * which are in user-mode as quickly as possible.
+ *
+ * (Note that we do this lockless - if the task does anything
+ * while the message is in flight then it will notice the
+ * sigpending condition anyway.)
+ */
+void kick_if_running(task_t * p)
 {
-	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	if (p == task_rq(p)->curr)
+		resched_task(p);
 }
+#endif
 
 /*
  * Wake up a process. Put it on the run-queue if it's not
@@ -348,413 +303,547 @@
  * "current->state = TASK_RUNNING" to mark yourself runnable
  * without the overhead of this.
  */
-static inline int try_to_wake_up(struct task_struct * p, int synchronous)
+static int try_to_wake_up(task_t * p)
 {
 	unsigned long flags;
 	int success = 0;
+	runqueue_t *rq;
 
-	/*
-	 * We want the common case fall through straight, thus the goto.
-	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
+	rq = task_rq_lock(p, &flags);
 	p->state = TASK_RUNNING;
-	if (task_on_runqueue(p))
-		goto out;
-	add_to_runqueue(p);
-	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
-		reschedule_idle(p);
-	success = 1;
-out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	if (!p->array) {
+		activate_task(p, rq);
+		if (p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+		success = 1;
+	}
+	task_rq_unlock(rq, &flags);
 	return success;
 }
 
-inline int wake_up_process(struct task_struct * p)
+int wake_up_process(task_t * p)
 {
-	return try_to_wake_up(p, 0);
+	return try_to_wake_up(p);
 }
 
-static void process_timeout(unsigned long __data)
+void wake_up_forked_process(task_t * p)
 {
-	struct task_struct * p = (struct task_struct *) __data;
+	runqueue_t *rq;
+
+	rq = this_rq();
+	spin_lock_irq(&rq->lock);
 
-	wake_up_process(p);
+	p->state = TASK_RUNNING;
+	if (!rt_task(p)) {
+		/*
+		 * We decrease the sleep average of forking parents
+		 * and children as well, to keep max-interactive tasks
+		 * from forking tasks that are max-interactive.
+		 */
+		current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
+		p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+		p->prio = effective_prio(p);
+	}
+	p->cpu = smp_processor_id();
+	activate_task(p, rq);
+	spin_unlock_irq(&rq->lock);
 }
 
-/**
- * schedule_timeout - sleep until timeout
- * @timeout: timeout value in jiffies
- *
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
- * pass before the routine returns. The routine will return 0
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * in jiffies will be returned, or 0 if the timer expired in time
- *
- * The current task state is guaranteed to be TASK_RUNNING when this 
- * routine returns.
- *
- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
- * the CPU away without a bound on the timeout. In this case the return
- * value will be %MAX_SCHEDULE_TIMEOUT.
- *
- * In all cases the return value is guaranteed to be non-negative.
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many processes.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
  */
-signed long schedule_timeout(signed long timeout)
+void sched_exit(task_t * p)
 {
-	struct timer_list timer;
-	unsigned long expire;
-
-	switch (timeout)
-	{
-	case MAX_SCHEDULE_TIMEOUT:
-		/*
-		 * These two special cases are useful to be comfortable
-		 * in the caller. Nothing more. We could take
-		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
-		 * but I' d like to return a valid offset (>=0) to allow
-		 * the caller to do everything it want with the retval.
-		 */
-		schedule();
-		goto out;
-	default:
-		/*
-		 * Another bit of PARANOID. Note that the retval will be
-		 * 0 since no piece of kernel is supposed to do a check
-		 * for a negative retval of schedule_timeout() (since it
-		 * should never happens anyway). You just have the printk()
-		 * that will tell you if something is gone wrong and where.
-		 */
-		if (timeout < 0)
-		{
-			printk(KERN_ERR "schedule_timeout: wrong timeout "
-			       "value %lx from %p\n", timeout,
-			       __builtin_return_address(0));
-			current->state = TASK_RUNNING;
-			goto out;
-		}
-	}
+	__cli();
+	current->time_slice += p->time_slice;
+	if (unlikely(current->time_slice > MAX_TIMESLICE))
+		current->time_slice = MAX_TIMESLICE;
+	__sti();
+	/*
+	 * If the child was a (relative-) CPU hog then decrease
+	 * the sleep_avg of the parent as well.
+	 */
+	if (p->sleep_avg < current->sleep_avg)
+		current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
+			p->sleep_avg) / (EXIT_WEIGHT + 1);
+}
 
-	expire = timeout + jiffies;
+#if CONFIG_SMP
+asmlinkage void schedule_tail(task_t *prev)
+{
+	spin_unlock_irq(&this_rq()->frozen);
+}
+#endif
 
-	init_timer(&timer);
-	timer.expires = expire;
-	timer.data = (unsigned long) current;
-	timer.function = process_timeout;
+static inline void context_switch(task_t *prev, task_t *next)
+{
+	struct mm_struct *mm = next->mm;
+	struct mm_struct *oldmm = prev->active_mm;
 
-	add_timer(&timer);
-	schedule();
-	del_timer_sync(&timer);
+	prepare_to_switch();
 
-	timeout = expire - jiffies;
+	if (unlikely(!mm)) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next, smp_processor_id());
+	} else
+		switch_mm(oldmm, mm, next, smp_processor_id());
+
+	if (unlikely(!prev->mm)) {
+		prev->active_mm = NULL;
+		mmdrop(oldmm);
+	}
 
- out:
-	return timeout < 0 ? 0 : timeout;
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
 }
 
-/*
- * schedule_tail() is getting called from the fork return path. This
- * cleans up all remaining scheduler things, without impacting the
- * common case.
- */
-static inline void __schedule_tail(struct task_struct *prev)
+unsigned long nr_running(void)
 {
-#ifdef CONFIG_SMP
-	int policy;
-
-	/*
-	 * prev->policy can be written from here only before `prev'
-	 * can be scheduled (before setting prev->cpus_runnable to ~0UL).
-	 * Of course it must also be read before allowing prev
-	 * to be rescheduled, but since the write depends on the read
-	 * to complete, wmb() is enough. (the spin_lock() acquired
-	 * before setting cpus_runnable is not enough because the spin_lock()
-	 * common code semantics allows code outside the critical section
-	 * to enter inside the critical section)
-	 */
-	policy = prev->policy;
-	prev->policy = policy & ~SCHED_YIELD;
-	wmb();
+	unsigned long i, sum = 0;
 
-	/*
-	 * fast path falls through. We have to clear cpus_runnable before
-	 * checking prev->state to avoid a wakeup race. Protect against
-	 * the task exiting early.
-	 */
-	task_lock(prev);
-	task_release_cpu(prev);
-	mb();
-	if (prev->state == TASK_RUNNING)
-		goto needs_resched;
+	for (i = 0; i < smp_num_cpus; i++)
+		sum += cpu_rq(cpu_logical_map(i))->nr_running;
 
-out_unlock:
-	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
-	return;
+	return sum;
+}
 
-	/*
-	 * Slow path - we 'push' the previous process and
-	 * reschedule_idle() will attempt to find a new
-	 * processor for it. (but it might preempt the
-	 * current process as well.) We must take the runqueue
-	 * lock and re-check prev->state to be correct. It might
-	 * still happen that this process has a preemption
-	 * 'in progress' already - but this is not a problem and
-	 * might happen in other circumstances as well.
-	 */
-needs_resched:
-	{
-		unsigned long flags;
+unsigned long nr_context_switches(void)
+{
+	unsigned long i, sum = 0;
 
-		/*
-		 * Avoid taking the runqueue lock in cases where
-		 * no preemption-check is necessery:
-		 */
-		if ((prev == idle_task(smp_processor_id())) ||
-						(policy & SCHED_YIELD))
-			goto out_unlock;
+	for (i = 0; i < smp_num_cpus; i++)
+		sum += cpu_rq(cpu_logical_map(i))->nr_switches;
 
-		spin_lock_irqsave(&runqueue_lock, flags);
-		if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
-			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
-		goto out_unlock;
-	}
-#else
-	prev->policy &= ~SCHED_YIELD;
-#endif /* CONFIG_SMP */
+	return sum;
 }
 
-asmlinkage void schedule_tail(struct task_struct *prev)
+#if CONFIG_SMP
+/*
+ * Lock the busiest runqueue as well, this_rq is locked already.
+ * Recalculate nr_running if we have to drop the runqueue lock.
+ */
+static inline unsigned int double_lock_balance(runqueue_t *this_rq,
+	runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
 {
-	__schedule_tail(prev);
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock(&this_rq->lock);
+			/* Need to recalculate nr_running */
+			if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+				nr_running = this_rq->nr_running;
+			else
+				nr_running = this_rq->prev_nr_running[this_cpu];
+		} else
+			spin_lock(&busiest->lock);
+	}
+	return nr_running;
 }
 
 /*
- *  'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
+ * Current runqueue is empty, or rebalance tick: if there is an
+ * imbalance (current runqueue is too short) then pull from
+ * busiest runqueue(s).
  *
- * The goto is "interesting".
- *
- *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
- * information in task[0] is never used.
+ * We call this with the current runqueue locked,
+ * irqs disabled.
  */
-asmlinkage void schedule(void)
+static void load_balance(runqueue_t *this_rq, int idle)
 {
-	struct schedule_data * sched_data;
-	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
-	int this_cpu, c;
-
-
-	spin_lock_prefetch(&runqueue_lock);
-
-	BUG_ON(!current->active_mm);
-need_resched_back:
-	prev = current;
-	this_cpu = prev->processor;
-
-	if (unlikely(in_interrupt())) {
-		printk("Scheduling in interrupt\n");
-		BUG();
-	}
-
-	release_kernel_lock(prev, this_cpu);
+	int imbalance, nr_running, load, max_load,
+		idx, i, this_cpu = smp_processor_id();
+	task_t *next = this_rq->idle, *tmp;
+	runqueue_t *busiest, *rq_src;
+	prio_array_t *array;
+	list_t *head, *curr;
 
 	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
+	 * We search all runqueues to find the most busy one.
+	 * We do this locklessly to reduce cache-bouncing overhead;
+	 * we re-check the 'best' source CPU later on, with the
+	 * lock held.
+	 *
+	 * We fend off statistical fluctuations in runqueue lengths by
+	 * saving the runqueue length during the previous load-balancing
+	 * operation and using the smaller of the current and saved lengths.
+	 * Only if a runqueue stays long for a while do we recognize it
+	 * and pull tasks from it.
+	 *
+	 * The 'current runqueue length' is a statistical maximum variable,
+	 * for that one we take the longer one - to avoid fluctuations in
+	 * the other direction. So for a load-balance to happen it needs a
+	 * stable long runqueue on the target CPU and a stable short
+	 * runqueue on the local CPU.
+	 *
+	 * We make an exception if this CPU is about to become idle - in
+	 * that case we are less picky about moving a task across CPUs and
+	 * take what can be taken.
 	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
+	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+		nr_running = this_rq->nr_running;
+	else
+		nr_running = this_rq->prev_nr_running[this_cpu];
 
-	spin_lock_irq(&runqueue_lock);
+	busiest = NULL;
+	max_load = 1;
+	for (i = 0; i < smp_num_cpus; i++) {
+		int logical = cpu_logical_map(i);
 
-	/* move an exhausted RR process to be last.. */
-	if (unlikely(prev->policy == SCHED_RR))
-		if (!prev->counter) {
-			prev->counter = NICE_TO_TICKS(prev->nice);
-			move_last_runqueue(prev);
+		rq_src = cpu_rq(logical);
+		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[logical]))
+			load = rq_src->nr_running;
+		else
+			load = this_rq->prev_nr_running[logical];
+		this_rq->prev_nr_running[logical] = rq_src->nr_running;
+
+		if ((load > max_load) && (rq_src != this_rq)) {
+			busiest = rq_src;
+			max_load = load;
 		}
-
-	switch (prev->state) {
-		case TASK_INTERRUPTIBLE:
-			if (signal_pending(prev)) {
-				prev->state = TASK_RUNNING;
-				break;
-			}
-		default:
-			del_from_runqueue(prev);
-		case TASK_RUNNING:;
 	}
-	prev->need_resched = 0;
 
+	if (likely(!busiest))
+		return;
+
+	imbalance = (max_load - nr_running) / 2;
+
+	/* It needs at least a ~25% imbalance to trigger balancing. */
+	if (!idle && (imbalance < (max_load + 3)/4))
+		return;
+
+	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
 	/*
-	 * this is the scheduler proper:
+	 * Make sure nothing changed since we checked the
+	 * runqueue length.
 	 */
+	if (busiest->nr_running <= nr_running + 1)
+		goto out_unlock;
 
-repeat_schedule:
 	/*
-	 * Default process to select..
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
 	 */
-	next = idle_task(this_cpu);
-	c = -1000;
-	list_for_each(tmp, &runqueue_head) {
-		p = list_entry(tmp, struct task_struct, run_list);
-		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
-			if (weight > c)
-				c = weight, next = p;
+	if (busiest->expired->nr_active)
+		array = busiest->expired;
+	else
+		array = busiest->active;
+
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx == MAX_PRIO) {
+		if (array == busiest->expired) {
+			array = busiest->active;
+			goto new_array;
+		}
+		goto out_unlock;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	/*
+	 * We do not migrate tasks that:
+	 * 1) are running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 3) are cache-hot on their current CPU.
+	 */
+
+#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
+	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
+		((p) != (rq)->curr) &&					\
+			((p)->cpus_allowed & (1 << (this_cpu))))
+
+	if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+		curr = curr->next;
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+	next = tmp;
+	/*
+	 * take the task out of the other runqueue and
+	 * put it into this one:
+	 */
+	dequeue_task(next, array);
+	busiest->nr_running--;
+	next->cpu = this_cpu;
+	this_rq->nr_running++;
+	enqueue_task(next, this_rq->active);
+	if (next->prio < current->prio)
+		set_need_resched();
+	if (!idle && --imbalance) {
+		if (array == busiest->expired) {
+			array = busiest->active;
+			goto new_array;
 		}
 	}
+out_unlock:
+	spin_unlock(&busiest->lock);
+}
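
For illustration, the queue-length filtering above boils down to taking the
maximum of the current and previously sampled length for the local runqueue
and the minimum for remote runqueues, so that only a persistent imbalance is
acted on.  A minimal sketch of that idea (not part of the patch; plain
integers stand in for the nr_running / prev_nr_running[] fields):

	/* Local queue: measured pessimistically long (max of the two samples). */
	static inline int local_queue_len(int curr, int prev, int idle)
	{
		return (idle || curr > prev) ? curr : prev;
	}

	/* Remote queue: measured conservatively short (min of the two samples). */
	static inline int remote_queue_len(int curr, int prev, int idle)
	{
		return (idle || curr < prev) ? curr : prev;
	}

With these filtered lengths, a remote queue of length 8 against a local length
of 4 gives imbalance = (8 - 4) / 2 = 2, which just meets the ~25% trigger
threshold of (8 + 3) / 4 = 2.
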
+
+/*
+ * One of the idle or busy rebalancing paths gets run on every timer
+ * tick, on every CPU. Our balancing frequency and aggressiveness
+ * depend on whether the CPU is idle or not.
+ *
+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * systems with HZ=100, every 10 msecs.)
+ */
+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
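
These macros rely on the GCC "?:" extension so that the interval never rounds
down to zero ticks: with HZ=1000, IDLE_REBALANCE_TICK is HZ/1000 = 1 tick
(1 ms), while with HZ=100 the division truncates to 0 and the "?:" falls back
to 1 tick (10 ms), which is the case the comment above mentions.  An
equivalent spelling without the extension (illustrative only):

	#define IDLE_REBALANCE_TICK ((HZ/1000) ? (HZ/1000) : 1)
	#define BUSY_REBALANCE_TICK ((HZ/4) ? (HZ/4) : 1)
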
 
-	/* Do we need to re-calculate counters? */
-	if (unlikely(!c)) {
-		struct task_struct *p;
-
-		spin_unlock_irq(&runqueue_lock);
-		read_lock(&tasklist_lock);
-		for_each_task(p)
-			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
-		read_unlock(&tasklist_lock);
-		spin_lock_irq(&runqueue_lock);
-		goto repeat_schedule;
+static inline void idle_tick(void)
+{
+	if (jiffies % IDLE_REBALANCE_TICK)
+		return;
+	spin_lock(&this_rq()->lock);
+	load_balance(this_rq(), 1);
+	spin_unlock(&this_rq()->lock);
+}
+
+#endif
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * an increasing number of running tasks:
+ */
+#define EXPIRED_STARVING(rq) \
+		((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(int user_tick, int system)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	task_t *p = current;
+
+	if (p == rq->idle) {
+		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+			kstat.per_cpu_system[cpu] += system;
+#if CONFIG_SMP
+		idle_tick();
+#endif
+		return;
 	}
+	if (TASK_NICE(p) > 0)
+		kstat.per_cpu_nice[cpu] += user_tick;
+	else
+		kstat.per_cpu_user[cpu] += user_tick;
+	kstat.per_cpu_system[cpu] += system;
 
+	/* Task might have expired already, but not scheduled off yet */
+	if (p->array != rq->active) {
+		set_tsk_need_resched(p);
+		return;
+	}
+	spin_lock(&rq->lock);
+	if (unlikely(rt_task(p))) {
+		/*
+		 * RR tasks need a special form of timeslice management.
+		 * FIFO tasks have no timeslices.
+		 */
+		if ((p->policy == SCHED_RR) && !--p->time_slice) {
+			p->time_slice = TASK_TIMESLICE(p);
+			set_tsk_need_resched(p);
+
+			/* put it at the end of the queue: */
+			dequeue_task(p, rq->active);
+			enqueue_task(p, rq->active);
+		}
+		goto out;
+	}
 	/*
-	 * from this point on nothing can prevent us from
-	 * switching to the next task, save this fact in
-	 * sched_data.
-	 */
-	sched_data->curr = next;
-	task_set_cpu(next, this_cpu);
-	spin_unlock_irq(&runqueue_lock);
-
-	if (unlikely(prev == next)) {
-		/* We won't go through the normal tail, so do this by hand */
-		prev->policy &= ~SCHED_YIELD;
-		goto same_process;
+	 * The task was running during this tick - update the
+	 * time slice counter and the sleep average. Note: we
+	 * do not update a process's priority until it either
+	 * goes to sleep or uses up its timeslice. This makes
+	 * it possible for interactive tasks to use up their
+	 * timeslices at their highest priority levels.
+	 */
+	if (p->sleep_avg)
+		p->sleep_avg--;
+	if (!--p->time_slice) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		p->prio = effective_prio(p);
+		p->time_slice = TASK_TIMESLICE(p);
+
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+			if (!rq->expired_timestamp)
+				rq->expired_timestamp = jiffies;
+			enqueue_task(p, rq->expired);
+		} else
+			enqueue_task(p, rq->active);
 	}
+out:
+#if CONFIG_SMP
+	if (!(jiffies % BUSY_REBALANCE_TICK))
+		load_balance(rq, 0);
+#endif
+	spin_unlock(&rq->lock);
+}
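
The expiry decision at the end of scheduler_tick() can be read as a single
predicate: a task that used up its timeslice goes back into the active array
only if it is classified as interactive and the expired array is not starving.
A sketch of that decision (not part of the patch; it reuses the
TASK_INTERACTIVE() and EXPIRED_STARVING() helpers from above):

	/* Which array should a task land in once its timeslice runs out? */
	static inline prio_array_t *expiry_target(runqueue_t *rq, task_t *p)
	{
		if (TASK_INTERACTIVE(p) && !EXPIRED_STARVING(rq))
			return rq->active;	/* keep interactive tasks responsive */
		return rq->expired;	/* everybody else waits for the array switch */
	}

In addition, scheduler_tick() records rq->expired_timestamp when the first
task lands in the expired array, which is what feeds EXPIRED_STARVING().
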
 
-#ifdef CONFIG_SMP
- 	/*
- 	 * maintain the per-process 'last schedule' value.
- 	 * (this has to be recalculated even if we reschedule to
- 	 * the same process) Currently this is only used on SMP,
-	 * and it's approximate, so we do not have to maintain
-	 * it while holding the runqueue spinlock.
- 	 */
- 	sched_data->last_schedule = get_cycles();
+void scheduling_functions_start_here(void) { }
 
-	/*
-	 * We drop the scheduler lock early (it's a global spinlock),
-	 * thus we have to lock the previous process from getting
-	 * rescheduled during switch_to().
-	 */
+/*
+ * 'schedule()' is the main scheduler function.
+ */
+asmlinkage void schedule(void)
+{
+	task_t *prev, *next;
+	runqueue_t *rq;
+	prio_array_t *array;
+	list_t *queue;
+	int idx;
 
-#endif /* CONFIG_SMP */
+	if (unlikely(in_interrupt()))
+		BUG();
 
-	kstat.context_swtch++;
-	/*
-	 * there are 3 processes which are affected by a context switch:
-	 *
-	 * prev == .... ==> (last => next)
-	 *
-	 * It's the 'much more previous' 'prev' that is on next's stack,
-	 * but prev is set to (the just run) 'last' process by switch_to().
-	 * This might sound slightly confusing but makes tons of sense.
-	 */
-	prepare_to_switch();
-	{
-		struct mm_struct *mm = next->mm;
-		struct mm_struct *oldmm = prev->active_mm;
-		if (!mm) {
-			BUG_ON(next->active_mm);
-			next->active_mm = oldmm;
-			atomic_inc(&oldmm->mm_count);
-			enter_lazy_tlb(oldmm, next, this_cpu);
-		} else {
-			BUG_ON(next->active_mm != mm);
-			switch_mm(oldmm, mm, next, this_cpu);
-		}
+need_resched:
+	prev = current;
+	rq = this_rq();
+
+	release_kernel_lock(prev, smp_processor_id());
+	prev->sleep_timestamp = jiffies;
+	spin_lock_irq(&rq->lock);
 
-		if (!prev->mm) {
-			prev->active_mm = NULL;
-			mmdrop(oldmm);
+	switch (prev->state) {
+	case TASK_INTERRUPTIBLE:
+		if (unlikely(signal_pending(prev))) {
+			prev->state = TASK_RUNNING;
+			break;
 		}
+	default:
+		deactivate_task(prev, rq);
+	case TASK_RUNNING:
+		;
+	}
+#if CONFIG_SMP
+pick_next_task:
+#endif
+	if (unlikely(!rq->nr_running)) {
+#if CONFIG_SMP
+		load_balance(rq, 1);
+		if (rq->nr_running)
+			goto pick_next_task;
+#endif
+		next = rq->idle;
+		rq->expired_timestamp = 0;
+		goto switch_tasks;
 	}
 
-	/*
-	 * This just switches the register state and the
-	 * stack.
-	 */
-	switch_to(prev, next, prev);
-	__schedule_tail(prev);
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+switch_tasks:
+	prefetch(next);
+	clear_tsk_need_resched(prev);
+
+	if (likely(prev != next)) {
+		rq->nr_switches++;
+		rq->curr = next;
+		spin_lock(&rq->frozen);
+		spin_unlock(&rq->lock);
+
+		context_switch(prev, next);
+
+		/*
+		 * The runqueue pointer might be from another CPU
+		 * if the new task was last running on a different
+		 * CPU - thus re-load it.
+		 */
+		mb();
+		rq = this_rq();
+		spin_unlock_irq(&rq->frozen);
+	} else {
+		spin_unlock_irq(&rq->lock);
+	}
 
-same_process:
 	reacquire_kernel_lock(current);
-	if (current->need_resched)
-		goto need_resched_back;
+	if (need_resched())
+		goto need_resched;
 	return;
 }
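
The next-task selection above is what makes the scheduler O(1): when the
active array runs empty the two arrays are swapped, and a bitmap search finds
the first non-empty priority list.  A condensed sketch of just that selection
(not part of the patch; it assumes at least one runnable task and the same
runqueue fields used above):

	static inline task_t *pick_next(runqueue_t *rq)
	{
		prio_array_t *array = rq->active;
		int idx;

		if (!array->nr_active) {
			/* Active array is empty: swap in the expired array. */
			rq->active = rq->expired;
			rq->expired = array;
			array = rq->active;
		}
		idx = sched_find_first_bit(array->bitmap);
		return list_entry(array->queue[idx].next, task_t, run_list);
	}
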
 
 /*
- * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
- * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
- * non-exclusive tasks and one exclusive task.
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
- * in this (rare) case, and we handle it by contonuing to scan the queue.
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
-			 	     int nr_exclusive, const int sync)
+static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+				    int nr_exclusive)
 {
 	struct list_head *tmp;
-	struct task_struct *p;
+	unsigned int state;
+	wait_queue_t *curr;
+	task_t *p;
 
-	CHECK_MAGIC_WQHEAD(q);
-	WQ_CHECK_LIST_HEAD(&q->task_list);
-	
-	list_for_each(tmp,&q->task_list) {
-		unsigned int state;
-                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
-
-		CHECK_MAGIC(curr->__magic);
+	list_for_each(tmp, &q->task_list) {
+		curr = list_entry(tmp, wait_queue_t, task_list);
 		p = curr->task;
 		state = p->state;
-		if (state & mode) {
-			WQ_NOTE_WAKER(curr);
-			if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+		if ((state & mode) && try_to_wake_up(p) &&
+			((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive))
 				break;
-		}
 	}
 }
 
-void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
+void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 {
-	if (q) {
-		unsigned long flags;
-		wq_read_lock_irqsave(&q->lock, flags);
-		__wake_up_common(q, mode, nr, 0);
-		wq_read_unlock_irqrestore(&q->lock, flags);
-	}
-}
+	unsigned long flags;
 
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
-{
-	if (q) {
-		unsigned long flags;
-		wq_read_lock_irqsave(&q->lock, flags);
-		__wake_up_common(q, mode, nr, 1);
-		wq_read_unlock_irqrestore(&q->lock, flags);
-	}
+	if (unlikely(!q))
+		return;
+
+	wq_read_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive);
+	wq_read_unlock_irqrestore(&q->lock, flags);
 }
 
 void complete(struct completion *x)
@@ -763,7 +852,7 @@
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done++;
-	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 
@@ -852,6 +941,41 @@
 
 void scheduling_functions_end_here(void) { }
 
+void set_user_nice(task_t *p, long nice)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	if (rt_task(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio = NICE_TO_PRIO(nice);
+	if (array) {
+		enqueue_task(p, array);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, &flags);
+}
+
 #ifndef __alpha__
 
 /*
@@ -862,7 +986,7 @@
 
 asmlinkage long sys_nice(int increment)
 {
-	long newprio;
+	long nice;
 
 	/*
 	 *	Setpriority might change our priority at the same moment.
@@ -878,32 +1002,46 @@
 	if (increment > 40)
 		increment = 40;
 
-	newprio = current->nice + increment;
-	if (newprio < -20)
-		newprio = -20;
-	if (newprio > 19)
-		newprio = 19;
-	current->nice = newprio;
+	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+	set_user_nice(current, nice);
 	return 0;
 }
 
 #endif
 
-static inline struct task_struct *find_process_by_pid(pid_t pid)
+/*
+ * This is the priority value as seen by users in /proc
+ *
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(task_t *p)
+{
+	return p->prio - MAX_USER_RT_PRIO;
+}
+
+int task_nice(task_t *p)
 {
-	struct task_struct *tsk = current;
+	return TASK_NICE(p);
+}
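
As a worked example of the exported values: setscheduler() below sets an RT
task's prio to MAX_USER_RT_PRIO-1 - rt_priority, so task_prio() reports
-1 - rt_priority for real-time tasks (this example assumes the default
MAX_USER_RT_PRIO of 100):

	/*
	 * A SCHED_FIFO task at rt_priority 99:
	 *
	 *	p->prio      == MAX_USER_RT_PRIO-1 - 99 == 0
	 *	task_prio(p) == 0 - MAX_USER_RT_PRIO    == -100
	 */
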
 
-	if (pid)
-		tsk = find_task_by_pid(pid);
-	return tsk;
+static inline task_t *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_pid(pid) : current;
 }
 
-static int setscheduler(pid_t pid, int policy, 
-			struct sched_param *param)
+static int setscheduler(pid_t pid, int policy, struct sched_param *param)
 {
 	struct sched_param lp;
-	struct task_struct *p;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
 	int retval;
+	task_t *p;
 
 	retval = -EINVAL;
 	if (!param || pid < 0)
@@ -917,14 +1055,19 @@
 	 * We play safe to avoid deadlocks.
 	 */
 	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
 
 	p = find_process_by_pid(pid);
 
 	retval = -ESRCH;
 	if (!p)
-		goto out_unlock;
-			
+		goto out_unlock_tasklist;
+
+	/*
+	 * To be able to change p->policy safely, the appropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &flags);
+
 	if (policy < 0)
 		policy = p->policy;
 	else {
@@ -933,42 +1076,48 @@
 				policy != SCHED_OTHER)
 			goto out_unlock;
 	}
-	
+
 	/*
-	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
-	 * priority for SCHED_OTHER is 0.
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
 	 */
 	retval = -EINVAL;
-	if (lp.sched_priority < 0 || lp.sched_priority > 99)
+	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
 		goto out_unlock;
 	if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 		goto out_unlock;
 
 	retval = -EPERM;
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) && 
+	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
+	array = p->array;
+	if (array)
+		deactivate_task(p, task_rq(p));
 	retval = 0;
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
-	if (task_on_runqueue(p))
-		move_first_runqueue(p);
-
-	current->need_resched = 1;
+	if (policy != SCHED_OTHER)
+		p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+	else
+		p->prio = p->static_prio;
+	if (array)
+		activate_task(p, task_rq(p));
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
+	task_rq_unlock(rq, &flags);
+out_unlock_tasklist:
 	read_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
 }
 
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 				      struct sched_param *param)
 {
 	return setscheduler(pid, policy, param);
@@ -981,7 +1130,7 @@
 
 asmlinkage long sys_sched_getscheduler(pid_t pid)
 {
-	struct task_struct *p;
+	task_t *p;
 	int retval;
 
 	retval = -EINVAL;
@@ -992,7 +1141,7 @@
 	read_lock(&tasklist_lock);
 	p = find_process_by_pid(pid);
 	if (p)
-		retval = p->policy & ~SCHED_YIELD;
+		retval = p->policy;
 	read_unlock(&tasklist_lock);
 
 out_nounlock:
@@ -1001,7 +1150,7 @@
 
 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 {
-	struct task_struct *p;
+	task_t *p;
 	struct sched_param lp;
 	int retval;
 
@@ -1032,42 +1181,40 @@
 
 asmlinkage long sys_sched_yield(void)
 {
+	runqueue_t *rq;
+	prio_array_t *array;
+
+	rq = this_rq();
+
 	/*
-	 * Trick. sched_yield() first counts the number of truly 
-	 * 'pending' runnable processes, then returns if it's
-	 * only the current processes. (This test does not have
-	 * to be atomic.) In threaded applications this optimization
-	 * gets triggered quite often.
+	 * Decrease the yielding task's priority by one, to avoid
+	 * livelocks. This priority loss is temporary; it is recovered
+	 * once the current timeslice expires.
+	 *
+	 * If the priority is already MAX_PRIO-1 then we still
+	 * round-robin the task within its priority list.
 	 */
+	spin_lock_irq(&rq->lock);
+	array = current->array;
+	/*
+	 * If the task is already at the maximum priority value (or is
+	 * an RT task) then just requeue it at the end of its list:
+	 */
+	if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
+		list_del(&current->run_list);
+		list_add_tail(&current->run_list, array->queue + current->prio);
+	} else {
+		list_del(&current->run_list);
+		if (list_empty(array->queue + current->prio))
+			__clear_bit(current->prio, array->bitmap);
+		current->prio++;
+		list_add_tail(&current->run_list, array->queue + current->prio);
+		__set_bit(current->prio, array->bitmap);
+	}
+	spin_unlock(&rq->lock);
 
-	int nr_pending = nr_running;
-
-#if CONFIG_SMP
-	int i;
+	schedule();
 
-	// Subtract non-idle processes running on other CPUs.
-	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
-	}
-#else
-	// on UP this process is on the runqueue as well
-	nr_pending--;
-#endif
-	if (nr_pending) {
-		/*
-		 * This process can only be rescheduled by us,
-		 * so this is safe without any locking.
-		 */
-		if (current->policy == SCHED_OTHER)
-			current->policy |= SCHED_YIELD;
-		current->need_resched = 1;
-
-		spin_lock_irq(&runqueue_lock);
-		move_last_runqueue(current);
-		spin_unlock_irq(&runqueue_lock);
-	}
 	return 0;
 }
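
The later hunks in this patch replace the old "current->policy |= SCHED_YIELD;
schedule();" idiom with yield().  That helper is defined outside this hunk
(presumably in the include/linux/sched.h part of the change); a rough
equivalent, written out only for illustration and not taken from the patch,
would be:

	static inline void yield_sketch(void)
	{
		set_current_state(TASK_RUNNING);
		sys_sched_yield();
	}
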
 
@@ -1078,7 +1225,7 @@
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
-		ret = 99;
+		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_OTHER:
 		ret = 0;
@@ -1105,7 +1252,7 @@
 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
 {
 	struct timespec t;
-	struct task_struct *p;
+	task_t *p;
 	int retval = -EINVAL;
 
 	if (pid < 0)
@@ -1115,8 +1262,8 @@
 	read_lock(&tasklist_lock);
 	p = find_process_by_pid(pid);
 	if (p)
-		jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
-				    &t);
+		jiffies_to_timespec(p->policy & SCHED_FIFO ?
+					 0 : TASK_TIMESLICE(p), &t);
 	read_unlock(&tasklist_lock);
 	if (p)
 		retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -1124,14 +1271,14 @@
 	return retval;
 }
 
-static void show_task(struct task_struct * p)
+static void show_task(task_t * p)
 {
 	unsigned long free = 0;
 	int state;
 	static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
 
 	printk("%-13.13s ", p->comm);
-	state = p->state ? ffz(~p->state) + 1 : 0;
+	state = p->state ? __ffs(p->state) + 1 : 0;
 	if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
 		printk(stat_nam[state]);
 	else
@@ -1172,7 +1319,7 @@
 		printk(" (NOTLB)\n");
 
 	{
-		extern void show_trace_task(struct task_struct *tsk);
+		extern void show_trace_task(task_t *tsk);
 		show_trace_task(p);
 	}
 }
@@ -1194,7 +1341,7 @@
 
 void show_state(void)
 {
-	struct task_struct *p;
+	task_t *p;
 
 #if (BITS_PER_LONG == 32)
 	printk("\n"
@@ -1217,128 +1364,274 @@
 	read_unlock(&tasklist_lock);
 }
 
-/**
- * reparent_to_init() - Reparent the calling kernel thread to the init task.
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to init so that
- * it is correctly cleaned up on exit.
+/*
+ * double_rq_lock - safely lock two runqueues
  *
- * The various task state such as scheduling policy and priority may have
- * been inherited fro a user process, so we reset them to sane values here.
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+{
+	if (rq1 == rq2)
+		spin_lock(&rq1->lock);
+	else {
+		if (rq1 < rq2) {
+			spin_lock(&rq1->lock);
+			spin_lock(&rq2->lock);
+		} else {
+			spin_lock(&rq2->lock);
+			spin_lock(&rq1->lock);
+		}
+	}
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
  *
- * NOTE that reparent_to_init() gives the caller full capabilities.
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
  */
-void reparent_to_init(void)
+static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 {
-	struct task_struct *this_task = current;
+	spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		spin_unlock(&rq2->lock);
+}
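
A usage sketch for the pair (not part of the patch): interrupts must be
disabled by the caller, and the address ordering inside double_rq_lock() is
what prevents an AB-BA deadlock when two CPUs balance against each other.

	/* Read two queue lengths consistently (illustrative only). */
	static int combined_nr_running(runqueue_t *rq1, runqueue_t *rq2)
	{
		unsigned long flags;
		int sum;

		local_irq_save(flags);	/* double_rq_lock() leaves irqs alone */
		double_rq_lock(rq1, rq2);
		sum = rq1->nr_running + rq2->nr_running;
		double_rq_unlock(rq1, rq2);
		local_irq_restore(flags);
		return sum;
	}
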
 
-	write_lock_irq(&tasklist_lock);
+void init_idle(task_t *idle, int cpu)
+{
+	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu);
+	unsigned long flags;
 
-	/* Reparent to init */
-	REMOVE_LINKS(this_task);
-	this_task->p_pptr = child_reaper;
-	this_task->p_opptr = child_reaper;
-	SET_LINKS(this_task);
+	__save_flags(flags);
+	__cli();
+	double_rq_lock(idle_rq, rq);
+
+	idle_rq->curr = idle_rq->idle = idle;
+	deactivate_task(idle, rq);
+	idle->array = NULL;
+	idle->prio = MAX_PRIO;
+	idle->state = TASK_RUNNING;
+	idle->cpu = cpu;
+	double_rq_unlock(idle_rq, rq);
+	set_tsk_need_resched(idle);
+	__restore_flags(flags);
+}
 
-	/* Set the exit signal to SIGCHLD so we signal init on exit */
-	this_task->exit_signal = SIGCHLD;
+extern void init_timervecs(void);
+extern void timer_bh(void);
+extern void tqueue_bh(void);
+extern void immediate_bh(void);
 
-	/* We also take the runqueue_lock while altering task fields
-	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+void __init sched_init(void)
+{
+	runqueue_t *rq;
+	int i, j, k;
 
-	this_task->ptrace = 0;
-	this_task->nice = DEF_NICE;
-	this_task->policy = SCHED_OTHER;
-	/* cpus_allowed? */
-	/* rt_priority? */
-	/* signals? */
-	this_task->cap_effective = CAP_INIT_EFF_SET;
-	this_task->cap_inheritable = CAP_INIT_INH_SET;
-	this_task->cap_permitted = CAP_FULL_SET;
-	this_task->keep_capabilities = 0;
-	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
-	this_task->user = INIT_USER;
+	for (i = 0; i < NR_CPUS; i++) {
+		runqueue_t *rq = cpu_rq(i);
+		prio_array_t *array;
+
+		rq->active = rq->arrays;
+		rq->expired = rq->arrays + 1;
+		spin_lock_init(&rq->lock);
+		spin_lock_init(&rq->frozen);
+		INIT_LIST_HEAD(&rq->migration_queue);
+
+		for (j = 0; j < 2; j++) {
+			array = rq->arrays + j;
+			for (k = 0; k < MAX_PRIO; k++) {
+				INIT_LIST_HEAD(array->queue + k);
+				__clear_bit(k, array->bitmap);
+			}
+			// delimiter for bitsearch
+			__set_bit(MAX_PRIO, array->bitmap);
+		}
+	}
+	/*
+	 * We have to do a little magic to get the first
+	 * process right in SMP mode.
+	 */
+	rq = this_rq();
+	rq->curr = current;
+	rq->idle = current;
+	wake_up_process(current);
 
-	spin_unlock(&runqueue_lock);
-	write_unlock_irq(&tasklist_lock);
+	init_timervecs();
+	init_bh(TIMER_BH, timer_bh);
+	init_bh(TQUEUE_BH, tqueue_bh);
+	init_bh(IMMEDIATE_BH, immediate_bh);
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current, smp_processor_id());
 }
 
+#if CONFIG_SMP
+
 /*
- *	Put all the gunge required to become a kernel thread without
- *	attached user resources in one place where it belongs.
+ * This is how migration works:
+ *
+ * 1) we queue a migration_req_t structure in the source CPU's
+ *    runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ *    thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ *    task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 6) migration thread up()s the semaphore.
+ * 7) we wake up and the migration is done.
  */
 
-void daemonize(void)
+typedef struct {
+	list_t list;
+	task_t *task;
+	struct semaphore sem;
+} migration_req_t;
+
+/*
+ * Change a given task's CPU affinity. Migrate the process to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely.  The
+ * call is not atomic; no spinlocks may be held.
+ */
+void set_cpus_allowed(task_t *p, unsigned long new_mask)
 {
-	struct fs_struct *fs;
+	unsigned long flags;
+	migration_req_t req;
+	runqueue_t *rq;
 
+	new_mask &= cpu_online_map;
+	if (!new_mask)
+		BUG();
 
+	rq = task_rq_lock(p, &flags);
+	p->cpus_allowed = new_mask;
 	/*
-	 * If we were started as result of loading a module, close all of the
-	 * user space pages.  We don't need them, and if we didn't close them
-	 * they would be locked into memory.
+	 * Can the task run on the task's current CPU? If not then
+	 * migrate the process off to a proper CPU.
 	 */
-	exit_mm(current);
+	if (new_mask & (1UL << p->cpu)) {
+		task_rq_unlock(rq, &flags);
+		return;
+	}
 
-	current->session = 1;
-	current->pgrp = 1;
-	current->tty = NULL;
+	/*
+	 * If the task is not on a runqueue, then it is safe to
+	 * simply update the task's cpu field.
+	 */
+	if (!p->array) {
+		p->cpu = __ffs(p->cpus_allowed);
+		task_rq_unlock(rq, &flags);
+		return;
+	}
 
-	/* Become as one with the init task */
+	init_MUTEX_LOCKED(&req.sem);
+	req.task = p;
+	list_add(&req.list, &rq->migration_queue);
+	task_rq_unlock(rq, &flags);
+	wake_up_process(rq->migration_thread);
 
-	exit_fs(current);	/* current->fs->count--; */
-	fs = init_task.fs;
-	current->fs = fs;
-	atomic_inc(&fs->count);
- 	exit_files(current);
-	current->files = init_task.files;
-	atomic_inc(&current->files->count);
+	down(&req.sem);
 }
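
Typical usage, as seen later in this patch from the migration threads and
ksoftirqd, is simply:

	/* Bind the current thread to the first logical CPU. */
	set_cpus_allowed(current, 1UL << cpu_logical_map(0));
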
 
-extern unsigned long wait_init_idle;
-
-void __init init_idle(void)
+static int migration_thread(void * bind_cpu)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
+	int cpu = cpu_logical_map((int) (long) bind_cpu);
+	struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
+	runqueue_t *rq;
+	int ret;
 
-	if (current != &init_task && task_on_runqueue(current)) {
-		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
-			smp_processor_id(), current->pid);
-		del_from_runqueue(current);
+	daemonize();
+	sigfillset(&current->blocked);
+	set_fs(KERNEL_DS);
+	/*
+	 * The first migration thread is started on CPU #0. This one can
+	 * migrate the other migration threads to their destination CPUs.
+	 */
+	if (cpu != 0) {
+		while (!cpu_rq(cpu_logical_map(0))->migration_thread)
+			yield();
+		set_cpus_allowed(current, 1UL << cpu);
 	}
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
-	clear_bit(current->processor, &wait_init_idle);
-}
+	printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
+	ret = setscheduler(0, SCHED_FIFO, &param);
 
-extern void init_timervecs (void);
+	rq = this_rq();
+	rq->migration_thread = current;
 
-void __init sched_init(void)
-{
-	/*
-	 * We have to do a little magic to get the first
-	 * process right in SMP mode.
-	 */
-	int cpu = smp_processor_id();
-	int nr;
+	sprintf(current->comm, "migration_CPU%d", smp_processor_id());
 
-	init_task.processor = cpu;
+	for (;;) {
+		runqueue_t *rq_src, *rq_dest;
+		struct list_head *head;
+		int cpu_src, cpu_dest;
+		migration_req_t *req;
+		unsigned long flags;
+		task_t *p;
 
-	for(nr = 0; nr < PIDHASH_SZ; nr++)
-		pidhash[nr] = NULL;
+		spin_lock_irqsave(&rq->lock, flags);
+		head = &rq->migration_queue;
+		current->state = TASK_INTERRUPTIBLE;
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&rq->lock, flags);
+			schedule();
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+		p = req->task;
+		cpu_dest = __ffs(p->cpus_allowed);
+		rq_dest = cpu_rq(cpu_dest);
+repeat:
+		cpu_src = p->cpu;
+		rq_src = cpu_rq(cpu_src);
+
+		local_irq_save(flags);
+		double_rq_lock(rq_src, rq_dest);
+		if (p->cpu != cpu_src) {
+			double_rq_unlock(rq_src, rq_dest);
+			local_irq_restore(flags);
+			goto repeat;
+		}
+		if (rq_src == rq) {
+			p->cpu = cpu_dest;
+			if (p->array) {
+				deactivate_task(p, rq_src);
+				activate_task(p, rq_dest);
+			}
+		}
+		double_rq_unlock(rq_src, rq_dest);
+		local_irq_restore(flags);
 
-	init_timervecs();
+		up(&req->sem);
+	}
+}
 
-	init_bh(TIMER_BH, timer_bh);
-	init_bh(TQUEUE_BH, tqueue_bh);
-	init_bh(IMMEDIATE_BH, immediate_bh);
+void __init migration_init(void)
+{
+	int cpu;
 
-	/*
-	 * The boot idle thread does lazy MMU switching as well:
-	 */
-	atomic_inc(&init_mm.mm_count);
-	enter_lazy_tlb(&init_mm, current, cpu);
+	current->cpus_allowed = 1UL << cpu_logical_map(0);
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (kernel_thread(migration_thread, (void *) (long) cpu,
+				CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+			BUG();
+	}
+	current->cpus_allowed = -1L;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
+		while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
+			schedule_timeout(2);
 }
+
+#endif /* CONFIG_SMP */
diff -urN linux-2.4.19-pre9/kernel/signal.c linux/kernel/signal.c
--- linux-2.4.19-pre9/kernel/signal.c	Tue May 28 16:13:02 2002
+++ linux/kernel/signal.c	Tue May 28 19:06:49 2002
@@ -507,12 +507,9 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
-	if (task_has_cpu(t) && t->processor != smp_processor_id())
-		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
-#endif /* CONFIG_SMP */
-
+	if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
+		kick_if_running(t);
+#endif
 	if (t->state & TASK_INTERRUPTIBLE) {
 		wake_up_process(t);
 		return;
diff -urN linux-2.4.19-pre9/kernel/softirq.c linux/kernel/softirq.c
--- linux-2.4.19-pre9/kernel/softirq.c	Tue May 28 16:13:02 2002
+++ linux/kernel/softirq.c	Tue May 28 19:06:49 2002
@@ -259,10 +259,9 @@
 
 	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		current->state = TASK_RUNNING;
-		do {
-			current->policy |= SCHED_YIELD;
-			schedule();
-		} while (test_bit(TASKLET_STATE_SCHED, &t->state));
+		do
+			sys_sched_yield();
+		while (test_bit(TASKLET_STATE_SCHED, &t->state));
 	}
 	tasklet_unlock_wait(t);
 	clear_bit(TASKLET_STATE_SCHED, &t->state);
@@ -365,13 +364,13 @@
 	int cpu = cpu_logical_map(bind_cpu);
 
 	daemonize();
-	current->nice = 19;
+	set_user_nice(current, 19);
 	sigfillset(&current->blocked);
 
 	/* Migrate to the right CPU */
-	current->cpus_allowed = 1UL << cpu;
-	while (smp_processor_id() != cpu)
-		schedule();
+	set_cpus_allowed(current, 1UL << cpu);
+	if (cpu() != cpu)
+		BUG();
 
 	sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
 
@@ -396,7 +395,7 @@
 	}
 }
 
-static __init int spawn_ksoftirqd(void)
+__init int spawn_ksoftirqd(void)
 {
 	int cpu;
 
@@ -405,10 +404,8 @@
 				  CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
 			printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
 		else {
-			while (!ksoftirqd_task(cpu_logical_map(cpu))) {
-				current->policy |= SCHED_YIELD;
-				schedule();
-			}
+			while (!ksoftirqd_task(cpu_logical_map(cpu)))
+				sys_sched_yield();
 		}
 	}
 
diff -urN linux-2.4.19-pre9/kernel/sys.c linux/kernel/sys.c
--- linux-2.4.19-pre9/kernel/sys.c	Tue May 28 16:13:02 2002
+++ linux/kernel/sys.c	Tue May 28 19:06:49 2002
@@ -220,10 +220,10 @@
 		}
 		if (error == -ESRCH)
 			error = 0;
-		if (niceval < p->nice && !capable(CAP_SYS_NICE))
+		if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
 			error = -EACCES;
 		else
-			p->nice = niceval;
+			set_user_nice(p, niceval);
 	}
 	read_unlock(&tasklist_lock);
 
@@ -249,7 +249,7 @@
 		long niceval;
 		if (!proc_sel(p, which, who))
 			continue;
-		niceval = 20 - p->nice;
+		niceval = 20 - task_nice(p);
 		if (niceval > retval)
 			retval = niceval;
 	}
diff -urN linux-2.4.19-pre9/kernel/timer.c linux/kernel/timer.c
--- linux-2.4.19-pre9/kernel/timer.c	Tue May 28 16:13:02 2002
+++ linux/kernel/timer.c	Tue May 28 19:06:49 2002
@@ -25,6 +25,8 @@
 
 #include <asm/uaccess.h>
 
+struct kernel_stat kstat;
+
 /*
  * Timekeeping variables
  */
@@ -598,18 +600,7 @@
 	int cpu = smp_processor_id(), system = user_tick ^ 1;
 
 	update_one_process(p, user_tick, system, cpu);
-	if (p->pid) {
-		if (--p->counter <= 0) {
-			p->counter = 0;
-			p->need_resched = 1;
-		}
-		if (p->nice > 0)
-			kstat.per_cpu_nice[cpu] += user_tick;
-		else
-			kstat.per_cpu_user[cpu] += user_tick;
-		kstat.per_cpu_system[cpu] += system;
-	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-		kstat.per_cpu_system[cpu] += system;
+	scheduler_tick(user_tick, system);
 }
 
 /*
@@ -810,6 +801,89 @@
 
 #endif
 
+static void process_timeout(unsigned long __data)
+{
+	wake_up_process((task_t *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this 
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long schedule_timeout(signed long timeout)
+{
+	struct timer_list timer;
+	unsigned long expire;
+
+	switch (timeout)
+	{
+	case MAX_SCHEDULE_TIMEOUT:
+		/*
+		 * This special case is there purely for the caller's
+		 * convenience. We could have taken MAX_SCHEDULE_TIMEOUT
+		 * from one of the negative values, but we would rather
+		 * return a valid offset (>= 0) so the caller can do
+		 * whatever it wants with the return value.
+		 */
+		schedule();
+		goto out;
+	default:
+		/*
+		 * Another bit of paranoia. Note that the retval will be
+		 * 0, since no piece of kernel code is supposed to check
+		 * for a negative retval from schedule_timeout() (it
+		 * should never happen anyway). The printk() below will
+		 * tell you if something has gone wrong, and where.
+		 */
+		if (timeout < 0)
+		{
+			printk(KERN_ERR "schedule_timeout: wrong timeout "
+			       "value %lx from %p\n", timeout,
+			       __builtin_return_address(0));
+			current->state = TASK_RUNNING;
+			goto out;
+		}
+	}
+
+	expire = timeout + jiffies;
+
+	init_timer(&timer);
+	timer.expires = expire;
+	timer.data = (unsigned long) current;
+	timer.function = process_timeout;
+
+	add_timer(&timer);
+	schedule();
+	del_timer_sync(&timer);
+
+	timeout = expire - jiffies;
+
+ out:
+	return timeout < 0 ? 0 : timeout;
+}
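
A typical interruptible caller, matching the kernel-doc above (illustrative
only, not part of the patch):

	/* Sleep for up to one second; returns 0 on timeout, else the jiffies left. */
	static signed long sleep_one_second(void)
	{
		set_current_state(TASK_INTERRUPTIBLE);
		return schedule_timeout(HZ);
	}
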
+
 /* Thread ID - the internal kernel "pid" */
 asmlinkage long sys_gettid(void)
 {
@@ -856,4 +930,3 @@
 	}
 	return 0;
 }
-
diff -urN linux-2.4.19-pre9/mm/highmem.c linux/mm/highmem.c
--- linux-2.4.19-pre9/mm/highmem.c	Tue May 28 16:13:02 2002
+++ linux/mm/highmem.c	Tue May 28 19:06:49 2002
@@ -355,9 +355,8 @@
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
 
-	current->policy |= SCHED_YIELD;
 	__set_current_state(TASK_RUNNING);
-	schedule();
+	yield();
 	goto repeat_alloc;
 }
 
@@ -393,9 +392,8 @@
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
 
-	current->policy |= SCHED_YIELD;
 	__set_current_state(TASK_RUNNING);
-	schedule();
+	yield();
 	goto repeat_alloc;
 }
 
diff -urN linux-2.4.19-pre9/mm/oom_kill.c linux/mm/oom_kill.c
--- linux-2.4.19-pre9/mm/oom_kill.c	Tue May 28 16:13:02 2002
+++ linux/mm/oom_kill.c	Tue May 28 19:06:49 2002
@@ -82,7 +82,7 @@
 	 * Niced processes are most likely less important, so double
 	 * their badness points.
 	 */
-	if (p->nice > 0)
+	if (task_nice(p) > 0)
 		points *= 2;
 
 	/*
@@ -146,7 +146,7 @@
 	 * all the memory it needs. That way it should be able to
 	 * exit() and clear out its resources quickly...
 	 */
-	p->counter = 5 * HZ;
+	p->time_slice = HZ;
 	p->flags |= PF_MEMALLOC | PF_MEMDIE;
 
 	/* This process has hardware access, be more careful. */
@@ -188,8 +188,7 @@
 	 * killing itself before someone else gets the chance to ask
 	 * for more memory.
 	 */
-	current->policy |= SCHED_YIELD;
-	schedule();
+	yield();
 	return;
 }
 
diff -urN linux-2.4.19-pre9/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.4.19-pre9/mm/page_alloc.c	Tue May 28 16:13:02 2002
+++ linux/mm/page_alloc.c	Tue May 28 19:06:49 2002
@@ -410,9 +410,8 @@
 		return NULL;
 
 	/* Yield for kswapd, and try again */
-	current->policy |= SCHED_YIELD;
 	__set_current_state(TASK_RUNNING);
-	schedule();
+	yield();
 	goto rebalance;
 }
 
diff -urN linux-2.4.19-pre9/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c
--- linux-2.4.19-pre9/net/ipv4/tcp_output.c	Tue May 28 16:13:24 2002
+++ linux/net/ipv4/tcp_output.c	Tue May 28 19:06:49 2002
@@ -1010,8 +1010,7 @@
 			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
 			if (skb)
 				break;
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 		}
 
 		/* Reserve space for headers and prepare control bits. */
diff -urN linux-2.4.19-pre9/net/sched/sch_generic.c linux/net/sched/sch_generic.c
--- linux-2.4.19-pre9/net/sched/sch_generic.c	Tue May 28 16:13:27 2002
+++ linux/net/sched/sch_generic.c	Tue May 28 19:06:49 2002
@@ -475,10 +475,8 @@
 
 	dev_watchdog_down(dev);
 
-	while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
-		current->policy |= SCHED_YIELD;
-		schedule();
-	}
+	while (test_bit(__LINK_STATE_SCHED, &dev->state))
+		yield();
 
 	spin_unlock_wait(&dev->xmit_lock);
 }
diff -urN linux-2.4.19-pre9/net/socket.c linux/net/socket.c
--- linux-2.4.19-pre9/net/socket.c	Tue May 28 16:13:23 2002
+++ linux/net/socket.c	Tue May 28 19:06:49 2002
@@ -147,8 +147,7 @@
 	while (atomic_read(&net_family_lockct) != 0) {
 		spin_unlock(&net_family_lock);
 
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 
 		spin_lock(&net_family_lock);
 	}
diff -urN linux-2.4.19-pre9/net/sunrpc/sched.c linux/net/sunrpc/sched.c
--- linux-2.4.19-pre9/net/sunrpc/sched.c	Tue May 28 16:13:26 2002
+++ linux/net/sunrpc/sched.c	Tue May 28 19:06:49 2002
@@ -770,8 +770,7 @@
 		}
 		if (flags & RPC_TASK_ASYNC)
 			return NULL;
-		current->policy |= SCHED_YIELD;
-		schedule();
+		yield();
 	} while (!signalled());
 
 	return NULL;
@@ -1112,8 +1111,7 @@
 		__rpc_schedule();
 		if (all_tasks) {
 			dprintk("rpciod_killall: waiting for tasks to exit\n");
-			current->policy |= SCHED_YIELD;
-			schedule();
+			yield();
 		}
 	}
 
@@ -1183,8 +1181,7 @@
 	 * wait briefly before checking the process id.
 	 */
 	current->sigpending = 0;
-	current->policy |= SCHED_YIELD;
-	schedule();
+	yield();
 	/*
 	 * Display a message if we're going to wait longer.
 	 */
diff -urN linux-2.4.19-pre9/net/unix/af_unix.c linux/net/unix/af_unix.c
--- linux-2.4.19-pre9/net/unix/af_unix.c	Tue May 28 16:13:23 2002
+++ linux/net/unix/af_unix.c	Tue May 28 19:06:49 2002
@@ -565,10 +565,8 @@
 				      addr->hash)) {
 		write_unlock(&unix_table_lock);
 		/* Sanity yield. It is unusual case, but yet... */
-		if (!(ordernum&0xFF)) {
-			current->policy |= SCHED_YIELD;
-			schedule();
-		}
+		if (!(ordernum&0xFF))
+			yield();
 		goto retry;
 	}
 	addr->hash ^= sk->type;
