diff --git a/pkgs/misc/emulators/wine/default.nix b/pkgs/misc/emulators/wine/default.nix index 8106390674eb..fc9b97d5af9f 100644 --- a/pkgs/misc/emulators/wine/default.nix +++ b/pkgs/misc/emulators/wine/default.nix @@ -7,11 +7,11 @@ assert stdenv.isLinux; let lib = import ../../../lib/default.nix; in stdenv.mkDerivation { - name = "wine-0.9.46"; + name = "wine-0.9.49"; src = fetchurl { - url = mirror://sourceforge/wine/wine-0.9.46.tar.bz2; - sha256 = "0c5fapw38bivipi8yzci3swxyhl9g67dpicqzslwmffwbi9y9z3i"; + url = mirror://sourceforge/wine/wine-0.9.49.tar.bz2; + sha256 = "d41edd08cf7fd21d7350a633995107533a25f925c8859995d3a6fc131f54b3c1"; }; buildInputs = [ diff --git a/pkgs/os-specific/linux/bridge_utils/default.nix b/pkgs/os-specific/linux/bridge_utils/default.nix new file mode 100644 index 000000000000..3fb80b7484a0 --- /dev/null +++ b/pkgs/os-specific/linux/bridge_utils/default.nix @@ -0,0 +1,19 @@ +args: +args.stdenv.mkDerivation { + name = "bridge-utils-1.2"; + + src = args.fetchurl { + url = http://mirror/sourceforge/bridge/bridge-utils-1.2.tar.gz; + sha256 = "0jg3z51c2c34byg4zi39j9g4b66js5kanjhid77hpa0jdfmryfy9"; + }; + + buildInputs =(with args; [autoconf automake]); + + preConfigure="autoreconf"; + + meta = { + description = "http://sourceforge.net/projects/bridge/"; + homepage = [ "http://www.linux-foundation.org/en/Net:Bridge/" "http://sourceforge.net/projects/bridge/" ]; + license = "GPL"; + }; +} diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix new file mode 100644 index 000000000000..55748d3e9f54 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix @@ -0,0 +1,84 @@ +{ stdenv, fetchurl, perl, mktemp, module_init_tools + + # A list of patches to apply to the kernel. Each element of this list + # should be an attribute set {name, patch} where `name' is a + # symbolic name and `patch' is the actual patch. The patch may + # optionally be compressed with gzip or bzip2. +, kernelPatches ? [] + +, # Whether to build a User-Mode Linux kernel. + userModeLinux ? false + +, # Allows you to set your own kernel version suffix (e.g., + # "-my-kernel"). + localVersion ? "" + +, # Your own kernel configuration file, if you don't want to use the + # default. + kernelConfig ? null + +, # A list of additional statements to be appended to the + # configuration file. + extraConfig ? [] +}: + +assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; + +let + + lib = import ../../../lib; + + version = "2.6.21"; + +in + +stdenv.mkDerivation { + name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; + builder = ./builder.sh; + + src = fetchurl { + url = "http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.21.tar.bz2"; + sha256 = "f187b12d70e0a48ce81f0472dfe9504fb5f0f966be339ac9d57dd2b991a74942"; + }; + + patches = map (p: p.patch) kernelPatches; + extraConfig = + let addNewlines = map (s: "\n" + s + "\n"); + configFromPatches = + map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; + in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + + config = + if kernelConfig != null then kernelConfig else + if userModeLinux then ./config-2.6.21-uml else + if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else + abort "No kernel configuration for your platform!"; + + buildInputs = [perl mktemp]; + + arch = + if userModeLinux then "um" else + if stdenv.system == "i686-linux" then "i386" else + if stdenv.system == "x86_64-linux" then "x86_64" else + abort "Platform ${stdenv.system} is not supported."; + + makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; + + inherit module_init_tools; + + allowLocalVersion = false; # don't allow patches to set a suffix + inherit localVersion; # but do allow the user to set one. + + meta = { + description = + (if userModeLinux then + "User-Mode Linux" + else + "The Linux kernel") + + (if kernelPatches == [] then "" else + " (with patches: " + + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) + + ")"); + }; +} diff --git a/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 new file mode 100644 index 000000000000..0bf63f5aca37 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 @@ -0,0 +1,5040 @@ +Index: linux-2.6.21-ck1/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/Makefile 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Makefile 2007-05-04 12:21:37.000000000 +1000 +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 21 +-EXTRAVERSION = ++EXTRAVERSION = -ck1 + NAME = Nocturnal Monster Puppy + + # *DOCUMENTATION* +Index: linux-2.6.21-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/workqueue.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/workqueue.c 2007-05-04 12:10:54.000000000 +1000 +@@ -355,8 +355,6 @@ static int worker_thread(void *__cwq) + if (!cwq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); +Index: linux-2.6.21-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/proc/array.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/proc/array.c 2007-05-04 12:10:54.000000000 +1000 +@@ -165,7 +165,6 @@ static inline char * task_state(struct t + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ static inline char * task_state(struct t + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.21-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/init_task.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/init_task.h 2007-05-04 12:24:19.000000000 +1000 +@@ -102,13 +102,15 @@ extern struct group_info init_groups; + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -135,6 +137,7 @@ extern struct group_info init_groups; + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.21-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sched.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sched.h 2007-05-04 12:24:19.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -149,8 +154,7 @@ extern unsigned long weighted_cpuload(co + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -522,14 +526,19 @@ struct signal_struct { + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -740,6 +749,22 @@ extern unsigned int max_cache_size; + + #endif /* CONFIG_SMP */ + ++/* ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload ++ */ ++static inline int above_background_load(void) ++{ ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) ++ return 1; ++ } ++ return 0; ++} + + struct io_context; /* See blkdev.h */ + struct cpuset; +@@ -788,13 +813,6 @@ struct mempolicy; + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -814,20 +832,33 @@ struct task_struct { + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. ++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned long policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. */ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -992,6 +1023,7 @@ struct task_struct { + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1156,8 +1188,10 @@ static inline void put_task_struct(struc + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ ++#define PF_NONSLEEP 0x40000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +Index: linux-2.6.21-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sched.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sched.c 2007-05-04 12:24:22.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. 
+ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -52,6 +53,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -83,126 +85,85 @@ unsigned long long __attribute__((weak)) + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. +- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) +- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. +- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-/* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. ++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. 
++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++struct rq; + + /* + * These are the runqueue data structures: + */ +- + struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; ++ ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); ++ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; ++ ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -234,14 +195,28 @@ struct rq { + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. ++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -579,12 +554,9 @@ static inline struct rq *this_rq_lock(vo + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). 
Even though a task may be queued and dequeued multiple +@@ -682,71 +654,304 @@ sched_info_switch(struct task_struct *pr + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. + */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. ++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. + */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. 
++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. ++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. + */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; + +-static inline int __normal_prio(struct task_struct *p) ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. 
++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif ++ ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. ++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. 
++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. ++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -759,20 +964,29 @@ static inline int __normal_prio(struct t + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. ++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -781,12 +995,19 @@ static void set_load_weight(struct task_ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -814,28 +1035,38 @@ static inline void dec_nr_running(struct + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. 
Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. + */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. ++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -852,111 +1083,41 @@ static int effective_prio(struct task_st + } + + /* +- * __activate_task - move a task to the runqueue. ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. + */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- struct prio_array *target = rq->active; ++ int nice = TASK_NICE(p), rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); +-} +- +-/* +- * __activate_idle_task - move idle task to the _front_ of runqueue. +- */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +-{ +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ if (!rt_task(p)) { ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ } ++ return MS_TO_US(rr); + } + +-/* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: +- */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; +- +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. 
+- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } +- +- return effective_prio(p); ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) + */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -977,32 +1138,9 @@ static void activate_task(struct task_st + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1012,8 +1150,7 @@ out: + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1095,7 +1232,7 @@ migrate_task(struct task_struct *p, int + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1126,7 +1263,7 @@ void wait_task_inactive(struct task_stru + repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. 
*/ +- if (unlikely(p->array || task_running(rq, p))) { ++ if (unlikely(task_queued(p) || task_running(rq, p))) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); +@@ -1391,6 +1528,31 @@ static inline int wake_idle(int cpu, str + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. ++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1422,7 +1584,7 @@ static int try_to_wake_up(struct task_st + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1515,7 +1677,7 @@ out_set_cpu: + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1524,25 +1686,9 @@ out_set_cpu: + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; + +- +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1551,15 +1697,22 @@ out_activate: + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1577,7 +1730,6 @@ int fastcall wake_up_state(struct task_s + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. 
+@@ -1605,7 +1757,6 @@ void fastcall sched_fork(struct task_str + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1617,30 +1768,31 @@ void fastcall sched_fork(struct task_str + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. +- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1662,38 +1814,16 @@ void fastcall wake_up_new_task(struct ta + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, ¤t->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1710,19 +1840,16 @@ void fastcall wake_up_new_task(struct ta + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1737,23 +1864,17 @@ void fastcall wake_up_new_task(struct ta + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well. +- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2085,23 +2206,17 @@ void sched_exec(void) + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2144,7 +2259,16 @@ int can_migrate_task(struct task_struct + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2160,7 +2284,7 @@ static int move_tasks(struct rq *this_rq + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2187,31 +2311,29 @@ static int move_tasks(struct rq *this_rq + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + ++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2233,11 +2355,22 @@ skip_queue: + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. ++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2250,6 +2383,13 @@ skip_queue: + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3013,11 +3153,36 @@ EXPORT_PER_CPU_SYMBOL(kstat); + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. 
+ */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. ++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3038,27 +3203,6 @@ unsigned long long current_sched_time(co + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3073,7 +3217,7 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3131,87 +3275,94 @@ void account_steal_time(struct task_stru + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). ++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. 
++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. + */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. +- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue. +- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. 
Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. + */ + void scheduler_tick(void) + { +@@ -3220,10 +3371,14 @@ void scheduler_tick(void) + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (p != rq->idle) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) +@@ -3269,10 +3424,80 @@ EXPORT_SYMBOL(sub_preempt_count); + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) ++{ ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the ++ * idleprio array and if it isn't already active ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array. 
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3281,13 +3506,11 @@ static inline int interactive_sleep(enum + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3323,18 +3546,6 @@ need_resched_nonpreemptible: + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + +@@ -3345,8 +3556,10 @@ need_resched_nonpreemptible: + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3356,59 +3569,29 @@ need_resched_nonpreemptible: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. 
+- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } +- } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -3844,29 +4027,22 @@ EXPORT_SYMBOL(sleep_on_timeout); + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! 
+- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -3875,8 +4051,8 @@ void rt_mutex_setprio(struct task_struct + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -3885,8 +4061,7 @@ void rt_mutex_setprio(struct task_struct + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -3907,26 +4082,27 @@ void set_user_nice(struct task_struct *p + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -3996,7 +4172,7 @@ asmlinkage long sys_nice(int increment) + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 0, value goes from 0 to +39. + */ + int task_prio(const struct task_struct *p) + { +@@ -4043,19 +4219,14 @@ static inline struct task_struct *find_p + /* Actually do priority change: must hold rq lock. */ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4069,19 +4240,27 @@ static void __setscheduler(struct task_s + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. 
++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4116,6 +4295,31 @@ recheck: + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4124,6 +4328,11 @@ recheck: + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. */ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4144,12 +4353,12 @@ recheck: + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4159,14 +4368,15 @@ recheck: + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4433,41 +4643,34 @@ asmlinkage long sys_sched_getaffinity(pi + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) +- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. 
+- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4619,6 +4822,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4643,6 +4848,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4676,8 +4883,8 @@ long sys_sched_rr_get_interval(pid_t pid + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + out_nounlock: +@@ -4771,10 +4978,10 @@ void __cpuinit init_idle(struct task_str + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -4893,7 +5100,7 @@ static int __migrate_task(struct task_st + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -4904,8 +5111,7 @@ static int __migrate_task(struct task_st + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5194,7 +5400,7 @@ migration_call(struct notifier_block *nf + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -6706,6 +6912,13 @@ void __init sched_init_smp(void) + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. 
++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -6727,6 +6940,16 @@ void __init sched_init(void) + { + int i, j, k; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -6734,12 +6957,20 @@ void __init sched_init(void) + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -6752,16 +6983,16 @@ void __init sched_init(void) + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } +- } + ++ } + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -6815,24 +7046,24 @@ EXPORT_SYMBOL(__might_sleep); + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + read_lock_irq(&tasklist_lock); + for_each_process(p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && !iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.21-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/kernel.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/kernel.txt 2007-05-04 12:10:55.000000000 +1000 +@@ -25,6 +25,9 @@ show up in /proc/sys/kernel: + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ show up in /proc/sys/kernel: + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ Default value is "/sbin/hotplug". 
+ + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). ++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ rebooting. ??? + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.21-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sysctl.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sysctl.c 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,12 +71,17 @@ extern int suid_dumpable; + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; + extern int sysctl_drop_caches; + extern int percpu_pagelist_fraction; + extern int compat_log; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -159,6 +165,14 @@ int sysctl_legacy_va_layout; + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. 
*/ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -499,6 +513,47 @@ static ctl_table kern_table[] = { + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -607,12 +662,6 @@ static ctl_table kern_table[] = { + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. */ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, +@@ -693,16 +742,32 @@ static ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -859,6 +924,16 @@ static ctl_table vm_table[] = { + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.21-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/pipe.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/pipe.c 2007-05-04 12:10:54.000000000 +1000 +@@ -41,12 +41,7 @@ void pipe_wait(struct pipe_inode_info *p + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, 
&wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.21-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sched-design.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sched-design.txt 2007-05-04 12:10:54.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ certain code paths and data constructs. + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. ++The design has strict enough a design and accounting that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness at all costs determined by nice level, ++yet to maintain as good interactivity as can be allowed within the ++constraints of strict fairness. ++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(ie. its nice level). If the task uses up its quota it has its priority ++decremented to the next level determined by a priority matrix. Once every ++runtime quota has been consumed of every priority level, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is straight ++forward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. 
It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -1 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix" and will be ++given a p->time_slice equal to the p->quota, and has its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets the a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array. ++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanosconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice is below zero then the ++recalc_task_prio is readjusted and the task rescheduled. ++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependant on nice level. In the ++following table, a zero represents a slot where the task may run. ++ ++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but ++slightly higher latencies. 
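[Editorial aside, not part of the -ck1 patch itself: the dithered table above can be regenerated in userspace with the same arithmetic the patch adds to sched_init() when it builds prio_matrix. PRIO_RANGE is assumed to be 40 (MAX_PRIO - MAX_RT_PRIO), and a printed 0 marks a slot the task may run in, matching the table's convention.]

/*
 * Illustrative sketch: rebuild the nice-level priority matrix using the
 * same formula as the prio_matrix loop added to sched_init() by this patch.
 */
#include <stdio.h>

#define PRIO_RANGE 40	/* assumed: MAX_PRIO - MAX_RT_PRIO */

int main(void)
{
	int matrix[PRIO_RANGE][PRIO_RANGE];
	int i, j, k;

	for (i = 0; i < PRIO_RANGE; i++) {
		/* start with every slot disabled (1 == may not run) */
		for (k = 0; k < PRIO_RANGE; k++)
			matrix[i][k] = 1;
		/* clear slots at evenly dithered intervals, as in sched_init() */
		j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
		for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
			matrix[i][PRIO_RANGE - 1 - (k / PRIO_RANGE)] = 0;
	}

	for (i = 0; i < PRIO_RANGE; i++) {
		printf("nice %3d ", i - 20);
		for (k = 0; k < PRIO_RANGE; k++)
			printf("%d", matrix[i][k]);
		printf("\n");
	}
	return 0;
}

[Running this prints the same pattern as the table, which makes it easy to see how a row gains slots as the nice value decreases.]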
++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration and not outside bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration it can take before a task can wait before it get scheduled is ++determined by the position of its first slot on the matrix. ++ ++In the following examples, these are _worst case scenarios_ and would rarely ++occur, but can be modelled nonetheless to determine the maximum possible ++latency. ++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or ++20 and 40ms respectively (on uniprocessor at 1000HZ). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline based design. The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remains mandatory to ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. Most fair scheduling designs end up penalising such tasks ++indirectly giving them less than their fair possible share because of the ++sleep, and have to use a mechanism of bonusing their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether the ++task that's waiting on a runqueue only intends to run for a short period and ++then sleep again after than runqueue wait. 
Furthermore, all such designs rely ++on a period of time to pass to accumulate some form of statistic on the task ++before deciding on how much to give them preference. The shorter this period, ++the more rapidly bursts of cpu ruin the interactive tasks behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little if any of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up a their full quota for that epoch, ++whether part of a quota remains or a full quota. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. This means that two tasks behaving completely differently ++from fully cpu bound to waking/sleeping extremely frequently will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones. ++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled. Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. 
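[Editorial aside, not part of the patch: the worst-case figures quoted in the "Modelling deadline behaviour" section above follow directly from the stated formula, nr_tasks * max_duration + nice_difference * rr_interval. The sketch below assumes the default uniprocessor RR_INTERVAL of 8ms and reuses the section's own example inputs.]

/*
 * Illustrative sketch: reproduce the worst-case latency examples from the
 * "Modelling deadline behaviour" section, RR_INTERVAL assumed to be 8ms.
 */
#include <stdio.h>

#define RR_INTERVAL_MS 8

/* nr_tasks * max_duration + nice_difference * rr_interval */
static int worst_case_ms(int nr_tasks, int slots, int nice_difference)
{
	return nr_tasks * slots * RR_INTERVAL_MS +
		nice_difference * RR_INTERVAL_MS;
}

int main(void)
{
	/* two nice 0 tasks: 1 * 19 * RR_INTERVAL + 0 = 152ms */
	printf("nice 0 behind nice 0:  %dms\n", worst_case_ms(1, 19, 0));
	/* nice 0 behind a nice 10 task: 1 * 10 * RR_INTERVAL + 0 = 80ms */
	printf("nice 0 behind nice 10: %dms\n", worst_case_ms(1, 10, 0));
	/* nice 10 behind a nice 0 task: 1 * 19 * RR_INTERVAL + 8ms = 160ms */
	printf("nice 10 behind nice 0: %dms\n", worst_case_ms(1, 19, 1));
	return 0;
}

[Note that on SMP the patch scales rr_interval in sched_init_smp() by 1 + ilog2(num_online_cpus()), so the absolute worst-case numbers grow accordingly.]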
++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.21-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/softirq.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/softirq.c 2007-05-04 12:10:54.000000000 +1000 +@@ -488,7 +488,7 @@ void __init softirq_init(void) + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.21-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/fork.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/fork.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1060,6 +1060,7 @@ static struct task_struct *copy_process( + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); +Index: linux-2.6.21-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/mutex.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/mutex.c 2007-05-04 12:24:19.000000000 +1000 +@@ -60,6 +60,16 @@ EXPORT_SYMBOL(__mutex_init); + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock( + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ __mutex_lock_interruptible_slowpath(atom + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ static inline int __mutex_trylock_slowpa + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.21-ck1/block/cfq-iosched.c +=================================================================== +--- linux-2.6.21-ck1.orig/block/cfq-iosched.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/block/cfq-iosched.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1258,10 +1258,12 @@ static void cfq_init_prio_data(struct cf + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = 
IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.21-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/ioprio.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/ioprio.h 2007-05-04 12:24:19.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ static inline int task_ioprio(struct tas + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.21-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/vm.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/vm.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,8 @@ Currently, these files are in /proc/sys/ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,13 @@ Currently, these files are in /proc/sys/ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +89,27 @@ for swap because we only cluster swap da + + ============================================================== + ++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -205,3 +229,14 @@ rather than killing rogue processes, set + + The default value is 0. + ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. 
When the virtual ++memory subsystem has been extremely idle for at least 5 seconds it will start ++copying back pages from swap into the swapcache and keep a copy in swap. In ++practice it can take many minutes before the vm is idle enough. ++ ++The default value is 1. ++ +Index: linux-2.6.21-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/swap.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/swap.h 2007-05-04 12:24:20.000000000 +1000 +@@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ extern int rotate_reclaimable_page(struc + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ extern void free_pages_and_swap_cache(st + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.21-ck1/init/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/init/Kconfig 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/init/Kconfig 2007-05-04 12:24:20.000000000 +1000 +@@ -101,6 +101,28 @@ config SWAP + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. ++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. 
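[Editorial aside, not part of the patch: a minimal userspace sketch for inspecting the tunables this patch documents, assuming a running kernel that carries it. The /proc/sys paths follow the ctl_table procnames added to kernel/sysctl.c; on an unpatched kernel the files simply will not exist.]

/*
 * Illustrative sketch: read the scheduler and vm sysctls added by this patch.
 */
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("%s: not available\n", path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/rr_interval");
	show("/proc/sys/kernel/interactive");
	show("/proc/sys/kernel/iso_cpu");
	show("/proc/sys/kernel/iso_period");
	show("/proc/sys/vm/swap_prefetch");
	show("/proc/sys/vm/mapped");
	return 0;
}

[Writing a new value to the same paths adjusts the tunables at runtime, within the ranges enforced by the proc_dointvec_minmax handlers registered in kernel/sysctl.c.]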
++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.21-ck1/mm/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/mm/Makefile 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/Makefile 2007-05-04 12:24:20.000000000 +1000 +@@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.21-ck1/mm/swap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed); + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ void __pagevec_lru_add_active(struct pag + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. 
++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.21-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.21-ck1/mm/swap_prefetch.c 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,581 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2006 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There ++ * needs to be at least this duration of idle time meaning in practice it can ++ * be much longer ++ */ ++#define PREFETCH_DELAY (HZ * 5) ++ ++/* sysctl - enable/disable swap prefetching */ ++int swap_prefetch __read_mostly = 1; ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. ++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ int wakeup; ++ ++ if (!swap_prefetch) ++ return; ++ ++ wakeup = 0; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * We limit the number of entries to 2/3 of physical ram. ++ * Once the number of entries exceeds this we start removing ++ * the least recently used entries. 
++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ /* ++ * If this is the first entry, kprefetchd needs to be ++ * (re)started. ++ */ ++ if (!swapped.count) ++ wakeup = 1; ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ /* Do the wakeup outside the lock to shorten lock hold time. */ ++ if (wakeup) ++ wake_up_process(kprefetchd_task); ++ ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. ++ */ ++void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ if (list_empty(&swapped.list)) ++ return; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del_init(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ unsigned long last_free; ++ /* Free ram after a cycle of prefetching */ ++ unsigned long current_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long prefetch_watermark; ++ /* Maximum amount we will prefetch to */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ nodemask_t prefetch_nodes; ++ /* Which nodes are currently suited to prefetching */ ++ unsigned long prefetched_pages; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. ++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ struct page *page; ++ ++ read_lock_irq(&swapper_space.tree_lock); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irq(&swapper_space.tree_lock); ++ if (page) { ++ remove_from_swapped_list(entry.val); ++ goto out; ++ } ++ ++ /* ++ * Get a new page to read from swap. 
We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) { ++ ret = TRICKLE_DELAY; ++ goto out_release; ++ } ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. ++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[z->zone_pgdat->node_id]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static int prefetch_suitable(void) ++{ ++ unsigned long limit; ++ struct zone *z; ++ int node, ret = 0, test_pagestate = 0; ++ ++ /* Purposefully racy */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * get_page_state and above_background_load are expensive so we only ++ * perform them every SWAP_CLUSTER_MAX prefetched_pages. ++ * We test to see if we're above_background_load as disk activity ++ * even at low priority can cause interrupt induced scheduling ++ * latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { ++ if (above_background_load()) ++ goto out; ++ test_pagestate = 1; ++ } ++ ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. ++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = z->zone_pgdat->node_id; ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. 
++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ if (!test_pagestate) ++ continue; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ++ /* ++ * >2/3 of the ram on this node is mapped, slab, swapcache or ++ * dirty, we need to leave some free for pagecache. ++ */ ++ limit = node_page_state(node, NR_FILE_PAGES); ++ limit += node_page_state(node, NR_SLAB_RECLAIMABLE); ++ limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); ++ limit += node_page_state(node, NR_FILE_DIRTY); ++ limit += node_page_state(node, NR_UNSTABLE_NFS); ++ limit += total_swapcache_pages; ++ if (limit > ns->prefetch_watermark) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ } ++ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ goto out; ++ ++ /* Survived all that? Hooray we can prefetch! */ ++ ret = 1; ++out: ++ return ret; ++} ++ ++/* ++ * Get previous swapped entry when iterating over all entries. swapped.lock ++ * should be held and we should already ensure that entry exists. ++ */ ++static inline struct swapped_entry *prev_swapped_entry ++ (struct swapped_entry *entry) ++{ ++ return list_entry(entry->swapped_list.prev->prev, ++ struct swapped_entry, swapped_list); ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. ++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, or the list is empty or we have ++ * iterated over all entries ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ /* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups ++ */ ++ if (!swap_prefetch || laptop_mode) ++ return ret; ++ ++ examine_free_limits(); ++ entry = NULL; ++ ++ for ( ; ; ) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ if (!prefetch_suitable()) ++ break; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (list_empty(&swapped.list)) { ++ ret = TRICKLE_FAILED; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ if (!entry) { ++ /* ++ * This sets the entry for the first iteration. It ++ * also is a safeguard against the entry disappearing ++ * while the lock is not held. 
++ */ ++ entry = list_entry(swapped.list.prev, ++ struct swapped_entry, swapped_list); ++ } else if (entry->swapped_list.prev == swapped.list.next) { ++ /* ++ * If we have iterated over all entries and there are ++ * still entries that weren't swapped out there may ++ * be a reason we could not swap them back in so ++ * delay attempting further prefetching. ++ */ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ node = get_swap_entry_node(entry); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. ++ */ ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ continue; ++ } ++ swp_entry = entry->swp_entry; ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ break; ++ } ++ ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, ¶m); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); ++ ++ /* kprefetchd has nothing to do until it is woken up the first time */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ ++ do { ++ try_to_freeze(); ++ ++ /* ++ * TRICKLE_FAILED implies no entries left - we do not schedule ++ * a wakeup, and further delay the next one. ++ */ ++ if (trickle_swap() == TRICKLE_FAILED) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ } ++ clear_last_prefetch_free(); ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ } while (!kthread_should_stop()); ++ ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * Set max number of entries to 2/3 the size of physical ram as we ++ * only ever prefetch to consume 2/3 of the ram. 
++ */ ++ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; ++ ++ for_each_zone(zone) { ++ unsigned long present; ++ struct node_stats *ns; ++ int idx; ++ ++ present = zone->present_pages; ++ if (!present) ++ continue; ++ ++ ns = &sp_stat.node[zone->zone_pgdat->node_id]; ++ ns->prefetch_watermark += present / 3 * 2; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.21-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap_state.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap_state.c 2007-05-04 12:24:20.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (!error) { ++ remove_from_swapped_list(entry.val); + page_cache_get(page); + SetPageLocked(page); + SetPageSwapCache(page); +@@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + + if (!swap_duplicate(entry)) { ++ remove_from_swapped_list(entry.val); + INC_CACHE_INFO(noent_race); + return -ENOENT; + } +@@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e + struct page *found_page, *new_page = NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. Since this is normally +Index: linux-2.6.21-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/vmscan.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/vmscan.c 2007-05-04 12:24:21.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ struct scan_control { + * whole list at once. */ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ struct shrinker { + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. 
+ */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -424,6 +427,7 @@ int remove_mapping(struct address_space + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; ++ add_to_swapped_list(page); + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); +@@ -807,10 +811,14 @@ static void shrink_active_list(unsigned + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. + */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -959,6 +967,41 @@ static unsigned long shrink_zone(int pri + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1015,7 +1058,8 @@ static unsigned long shrink_zones(int pr + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1023,15 +1067,20 @@ unsigned long try_to_free_pages(struct z + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1044,7 +1093,7 @@ unsigned long try_to_free_pages(struct z + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1074,7 +1123,7 @@ unsigned long try_to_free_pages(struct z + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? don't OOM, then */ +@@ -1124,9 +1173,9 @@ out: + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1134,7 +1183,7 @@ static unsigned long balance_pgdat(pg_da + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1142,6 +1191,8 @@ static unsigned long balance_pgdat(pg_da + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1149,9 +1200,9 @@ loop_again: + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1167,15 +1218,22 @@ loop_again: + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1202,14 +1260,18 @@ loop_again: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1242,7 +1304,7 @@ loop_again: + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1276,6 +1338,8 @@ out: + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1325,6 +1389,8 @@ static int kswapd(void *p) + + try_to_freeze(); + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1335,6 +1401,7 @@ static int kswapd(void *p) + */ + order = new_order; + } else { ++ set_user_nice(tsk, 0); + schedule(); + order = pgdat->kswapd_max_order; + } +@@ -1348,9 +1415,10 @@ static int kswapd(void *p) + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1362,7 +1430,9 @@ void wakeup_kswapd(struct zone *zone, in + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1381,6 +1451,8 @@ static unsigned long shrink_all_zones(un + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1440,7 +1512,7 @@ unsigned long shrink_all_memory(unsigned + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1475,7 +1547,7 @@ unsigned long shrink_all_memory(unsigned + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1539,20 +1611,57 @@ static int __devinit cpu_callback(struct + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
+ */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1623,7 +1732,7 @@ static int __zone_reclaim(struct zone *z + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.21-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mm_inline.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mm_inline.h 2007-05-04 12:24:20.000000000 +1000 +@@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.21-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.21-ck1/include/linux/swap-prefetch.h 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,55 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void remove_from_swapped_list(const unsigned long index); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void remove_from_swapped_list(const unsigned long __unused) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.21-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sysctl.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sysctl.h 2007-05-04 12:24:20.000000000 +1000 +@@ -190,7 +190,7 @@ enum + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + 
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_MAPPED=19, /* percent mapped min while evicting cache */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.21-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mmzone.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mmzone.h 2007-05-04 12:24:21.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -178,7 +179,7 @@ enum zone_type { + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -449,6 +450,7 @@ typedef struct pglist_data { + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -465,7 +467,7 @@ typedef struct pglist_data { + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.21-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/page_alloc.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/page_alloc.c 2007-05-04 12:24:20.000000000 +1000 +@@ -1277,7 +1277,7 @@ restart: + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1341,7 +1341,7 @@ nofail_alloc: + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1597,6 +1597,7 @@ void show_free_areas(void) + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1608,6 +1609,7 @@ void show_free_areas(void) + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3146,6 +3148,7 @@ void setup_per_zone_pages_min(void) + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.21-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/buffer.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/fs/buffer.c 
2007-05-04 12:24:20.000000000 +1000 +@@ -363,7 +363,7 @@ static void free_more_memory(void) + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.21-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/filemap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/filemap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -466,6 +466,16 @@ int add_to_page_cache_lru(struct page *p + return ret; + } + ++int add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -836,6 +846,34 @@ static void shrink_readahead_size_eio(st + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1044,8 +1082,19 @@ no_cached_page: + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. ++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.21-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/filesystems/proc.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/filesystems/proc.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -1325,6 +1325,14 @@ To free pagecache, dentries and inodes: + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. 
++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.21-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/Kconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/Kconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -546,7 +546,7 @@ endchoice + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -565,14 +565,14 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.21-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/Kconfig.hz 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/Kconfig.hz 2007-05-04 12:24:21.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ choice + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ choice + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ choice + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.21-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -214,10 +214,10 @@ CONFIG_MTRR=y + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.21-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -178,10 +178,10 @@ CONFIG_PHYSICAL_START=0x200000 + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_REORDER is not set + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y +Index: linux-2.6.21-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/jiffies.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/jiffies.h 2007-05-04 12:24:21.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + #elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.21-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/inet_timewait_sock.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/inet_timewait_sock.h 2007-05-04 12:24:21.000000000 +1000 +@@ -38,8 +38,8 @@ struct inet_hashinfo; + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ struct inet_hashinfo; + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-2.6.21-ck1/include/net/pkt_sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/pkt_sched.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/pkt_sched.h 2007-05-04 12:24:21.000000000 +1000 +@@ -78,8 +78,14 @@ typedef long psched_tdiff_t; + #define PSCHED_JSCALE 12 + #elif HZ >= 384 && HZ < 768 + #define PSCHED_JSCALE 11 +-#elif HZ >= 768 ++#elif HZ >= 768 && HZ < 1536 + #define PSCHED_JSCALE 10 ++#elif HZ >= 1536 && HZ < 3072 ++#define PSCHED_JSCALE 9 ++#elif HZ >= 3072 && HZ < 6144 ++#define PSCHED_JSCALE 8 ++#else ++#define PSCHED_JSCALE 7 + #endif + + #define PSCHED_GET_TIME(stamp) ((stamp) = (get_jiffies_64()<loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.21-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/kernel/smpboot.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/kernel/smpboot.c 2007-05-04 12:24:21.000000000 +1000 +@@ -1134,7 +1134,7 @@ static void __init smp_boot_cpus(unsigne + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.21-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/nfsd/stats.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/nfsd/stats.h 2007-05-04 12:24:21.000000000 +1000 +@@ -35,8 +35,8 @@ struct nfsd_stats { + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-2.6.21-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/kernel/setup.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/kernel/setup.c 2007-05-04 12:24:22.000000000 +1000 +@@ -1053,7 +1053,7 @@ static int show_cpuinfo(struct seq_file + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); diff --git 
a/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 new file mode 100644 index 000000000000..81fa14e2abe4 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 @@ -0,0 +1,5167 @@ +Index: linux-2.6.22-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -129,7 +134,7 @@ + extern unsigned long nr_active(void); + extern unsigned long nr_iowait(void); + extern unsigned long weighted_cpuload(const int cpu); +- ++extern int above_background_load(void); + + /* + * Task state bitmask. NOTE! These bits are also +@@ -150,8 +155,7 @@ + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -537,14 +541,19 @@ + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -809,13 +818,6 @@ + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -835,20 +837,33 @@ + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. ++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned int policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. 
*/ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -1013,6 +1028,7 @@ + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1181,9 +1197,11 @@ + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ + #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ ++#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +@@ -1253,7 +1271,7 @@ + #endif + + extern void set_user_nice(struct task_struct *p, long nice); +-extern int task_prio(const struct task_struct *p); ++extern int task_prio(struct task_struct *p); + extern int task_nice(const struct task_struct *p); + extern int can_nice(const struct task_struct *p, const int nice); + extern int task_curr(const struct task_struct *p); +Index: linux-2.6.22-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -53,8 +54,9 @@ + #include + #include + #include +- ++#include + #include ++ + #include + + /* +@@ -84,147 +86,85 @@ + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. +- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) 
+- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. +- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-#ifdef CONFIG_SMP +-/* +- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) +- * Since cpu_power is a 'constant', we can use a reciprocal divide. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. ++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. 
It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. ++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ +-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +-{ +- return reciprocal_divide(load, sg->reciprocal_cpu_power); +-} ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-/* +- * Each time a sched group cpu_power is changed, +- * we must compute its reciprocal value +- */ +-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +-{ +- sg->__cpu_power += val; +- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +-} +-#endif ++struct rq; + + /* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++ * These are the runqueue data structures: + */ ++struct prio_array { ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); + +-/* +- * These are the runqueue data structures: +- */ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; + +-struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -260,14 +200,28 @@ + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. 
++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -606,12 +560,9 @@ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple +@@ -709,71 +660,304 @@ + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. + */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. 
++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. + */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. ++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. 
++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. + */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; ++ ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. ++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif + +-static inline int __normal_prio(struct task_struct *p) ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. 
If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. ++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. ++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. 
++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -786,20 +970,29 @@ + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. ++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -808,12 +1001,19 @@ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -841,28 +1041,38 @@ + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. + */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. 
++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -878,112 +1088,70 @@ + return p->prio; + } + +-/* +- * __activate_task - move a task to the runqueue. +- */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int nice_quota_ms(int nice) + { +- struct prio_array *target = rq->active; ++ unsigned int rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ return rr; + } + ++#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE) ++ + /* +- * __activate_idle_task - move idle task to the _front_ of runqueue. ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload + */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++int above_background_load(void) + { +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT) ++ return 1; ++ } ++ return 0; + } + + /* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. + */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; ++ unsigned int quota; + +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. 
+- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } ++ if (rt_task(p)) ++ quota = rr_interval; ++ else ++ quota = nice_quota_ms(TASK_NICE(p)); ++ return MS_TO_US(quota); ++} + +- return effective_prio(p); ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) ++{ ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) + */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -1004,32 +1172,9 @@ + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1039,8 +1184,7 @@ + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1133,7 +1277,7 @@ + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. 
+ */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1159,7 +1303,6 @@ + { + unsigned long flags; + struct rq *rq; +- struct prio_array *array; + int running; + + repeat: +@@ -1192,7 +1335,6 @@ + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); +- array = p->array; + task_rq_unlock(rq, &flags); + + /* +@@ -1215,7 +1357,7 @@ + * running right now), it's preempted, and we should + * yield - it could be a while. + */ +- if (unlikely(array)) { ++ if (unlikely(task_queued(p))) { + yield(); + goto repeat; + } +@@ -1294,6 +1436,25 @@ + } + + /* ++ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) ++ * Since cpu_power is a 'constant', we can use a reciprocal divide. ++ */ ++static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) ++{ ++ return reciprocal_divide(load, sg->reciprocal_cpu_power); ++} ++ ++/* ++ * Each time a sched group cpu_power is changed, ++ * we must compute its reciprocal value ++ */ ++static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) ++{ ++ sg->__cpu_power += val; ++ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); ++} ++ ++/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +@@ -1490,6 +1651,31 @@ + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. ++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1521,7 +1707,7 @@ + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1614,7 +1800,7 @@ + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1623,25 +1809,9 @@ + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; +- + +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1650,15 +1820,22 @@ + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1676,7 +1853,6 @@ + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -1704,7 +1880,6 @@ + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1716,30 +1891,31 @@ + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. +- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1761,38 +1937,16 @@ + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, ¤t->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1809,19 +1963,16 @@ + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1836,23 +1987,17 @@ + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well. +- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2184,23 +2329,17 @@ + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2243,7 +2382,16 @@ + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2259,7 +2407,7 @@ + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2286,31 +2434,29 @@ + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + ++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2332,11 +2478,22 @@ + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. ++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2349,6 +2506,13 @@ + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3297,11 +3461,36 @@ + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. 
+ */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. ++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3322,27 +3511,6 @@ + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3357,7 +3525,7 @@ + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3415,87 +3583,94 @@ + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). ++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. ++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. 
+ */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. +- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue. +- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. 
*/ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. + */ + void scheduler_tick(void) + { +@@ -3505,10 +3680,14 @@ + int idle_at_tick = idle_cpu(cpu); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (!idle_at_tick) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + rq->idle_at_tick = idle_at_tick; +@@ -3554,10 +3733,80 @@ + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the ++ * idleprio array and if it isn't already active ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array. 
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3566,13 +3815,11 @@ + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3608,18 +3855,6 @@ + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + +@@ -3630,8 +3865,10 @@ + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3641,59 +3878,29 @@ + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. 
+- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; +- } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -4129,29 +4336,22 @@ + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! 
+- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -4160,8 +4360,8 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -4170,8 +4370,7 @@ + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -4192,26 +4391,27 @@ + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -4281,11 +4481,23 @@ + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 1, value goes from 0 to +79. Values higher than ++ * 39 indicate task is on the expired array. This is done ++ * lockless and may rarely return an active instead of ++ * expired value. + */ +-int task_prio(const struct task_struct *p) ++int task_prio(struct task_struct *p) + { +- return p->prio - MAX_RT_PRIO; ++ int prio = p->prio - MAX_RT_PRIO; ++ ++ if (task_queued(p)) { ++ struct rq *rq = task_rq(p); ++ struct prio_array *array = p->array; ++ ++ if (rq && rq->expired == array) ++ prio += PRIO_RANGE; ++ } ++ return prio; + } + + /** +@@ -4328,19 +4540,14 @@ + /* Actually do priority change: must hold rq lock. 
*/ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4354,19 +4561,36 @@ + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; ++ unsigned long rlim_rtprio = 0; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4385,14 +4609,6 @@ + */ + if (!capable(CAP_SYS_NICE)) { + if (is_rt_policy(policy)) { +- unsigned long rlim_rtprio; +- unsigned long flags; +- +- if (!lock_task_sighand(p, &flags)) +- return -ESRCH; +- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; +- unlock_task_sighand(p, &flags); +- + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; +@@ -4401,6 +4617,31 @@ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4409,6 +4650,11 @@ + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. 
*/ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4429,12 +4675,12 @@ + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4444,14 +4690,15 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4718,41 +4965,34 @@ + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) +- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. +- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4902,6 +5142,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4926,6 +5168,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4959,8 +5203,8 @@ + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; + out_nounlock: +@@ -5056,10 +5300,10 @@ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -5178,7 +5422,7 @@ + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -5189,8 +5433,7 @@ + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5487,7 +5730,7 @@ + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -7013,6 +7256,13 @@ + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. ++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -7035,6 +7285,16 @@ + int i, j, k; + int highest_cpu = 0; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -7042,12 +7302,20 @@ + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -7060,17 +7328,16 @@ + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } + highest_cpu = i; + } +- + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -7125,25 +7392,25 @@ + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + read_lock_irq(&tasklist_lock); + + 
do_each_thread(g, p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && !iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.22-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,6 +71,7 @@ + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; +@@ -78,6 +80,10 @@ + extern int compat_log; + extern int maps_protect; + extern int sysctl_stat_interval; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -161,6 +167,14 @@ + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. */ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -501,6 +515,47 @@ + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -619,14 +674,16 @@ + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. 
*/ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, +@@ -705,16 +762,24 @@ + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -882,6 +947,32 @@ + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_delay", ++ .data = &swap_prefetch_delay, ++ .maxlen = sizeof(swap_prefetch_delay), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_sleep", ++ .data = &swap_prefetch_sleep, ++ .maxlen = sizeof(swap_prefetch_sleep), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.22-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. 
There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. ++The design and accounting are strict enough that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness at all costs determined by nice level, ++yet to maintain as good interactivity as can be allowed within the ++constraints of strict fairness. ++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(i.e. its nice level). If the task uses up its quota it has its priority ++decremented to the next level determined by a priority matrix. Once its ++runtime quota has been consumed at every priority level, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is ++straightforward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -1 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix", will be ++given a p->time_slice equal to the p->quota, and will have its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array.
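
A minimal userspace sketch of the quota walk just described (an illustration only, not code from this patch; the struct fields and helper names stand in for p->bitmap, p->prio, p->time_slice and p->quota, and for simplicity the task may use every one of the 40 slots, as a nice -20 task would -- the priority matrix described below restricts which slots other nice levels may use):

    #include <stdio.h>
    #include <stdbool.h>

    #define PRIO_RANGE 40          /* 40 dynamic priority slots per major rotation */
    #define RR_INTERVAL_MS 8       /* default rr_interval */

    struct task {
        bool used[PRIO_RANGE];     /* stands in for p->bitmap */
        int prio;                  /* current dynamic priority slot */
        int time_slice_ms;         /* stands in for p->time_slice */
        int quota_ms;              /* stands in for p->quota */
    };

    /* Next slot this task has not yet drawn quota from during this epoch. */
    static int next_unused_slot(const struct task *t)
    {
        for (int i = t->prio; i < PRIO_RANGE; i++)
            if (!t->used[i])
                return i;
        return -1;                 /* bitmap exhausted */
    }

    int main(void)
    {
        struct task t = { .prio = 0, .quota_ms = RR_INTERVAL_MS };

        for (;;) {
            int slot = next_unused_slot(&t);
            if (slot < 0) {
                printf("all levels used: clear bitmap, requeue on expired array\n");
                break;
            }
            t.prio = slot;
            t.used[slot] = true;
            t.time_slice_ms = t.quota_ms;   /* fresh quota at the new level */
            printf("run at slot %2d for %d ms\n", slot, t.time_slice_ms);
        }
        return 0;
    }
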
++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice drops below zero then the ++task's priority is readjusted via recalc_task_prio and the task is rescheduled. ++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependent on nice level. In the ++following table, a zero represents a slot where the task may run. ++ ++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from their priority bitmap, giving them the same cpu share as SCHED_NORMAL but ++slightly higher latencies. ++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration and not outside bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration a task can wait before it gets scheduled is ++determined by the position of its first slot on the matrix. ++ ++The following examples are _worst case scenarios_ which would rarely ++occur, but they can be modelled nonetheless to determine the maximum possible ++latency.
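
The bound just described can be written as a small helper; the worked examples that follow plug concrete numbers into it. This is an illustration only, not code from this patch, and the parameter names are mine:

    #include <stdio.h>

    /* Worst-case wait before a task runs again: the other runnable tasks
     * each burn their slots for one rr_interval, plus the offset of the
     * waiting task's first slot in the matrix. */
    static long worst_case_wait_ms(long nr_other_tasks, long slots_for_nice,
                                   long first_slot_offset, long rr_interval_ms)
    {
        return nr_other_tasks * slots_for_nice * rr_interval_ms +
               first_slot_offset * rr_interval_ms;
    }

    int main(void)
    {
        /* First example below: one other nice 0 task, 19 slots, 8 ms interval. */
        printf("%ld ms\n", worst_case_wait_ms(1, 19, 0, 8));   /* prints 152 */
        return 0;
    }
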
++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies, which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example, ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs, or ++20 and 40ms respectively (on a uniprocessor at 1000Hz). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline-based design. The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remain mandatory for ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. Most fair scheduling designs end up penalising such tasks ++indirectly, giving them less than their possible fair share because of the ++sleep, and have to use a mechanism of bonusing their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether a ++task that is waiting on a runqueue only intends to run for a short period and ++then sleep again after that runqueue wait. Furthermore, all such designs rely ++on a period of time passing to accumulate some form of statistic on the task ++before deciding how much preference to give it. The shorter this period, ++the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little if any of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up their full quota for that epoch, ++whether part of a quota remains or a full quota. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. This means that two tasks behaving completely differently, ++from fully cpu bound to waking/sleeping extremely frequently, will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones.
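
The dithered table in the Priority Matrix section above is hand-drawn; the sketch below mirrors the prio_matrix generation loop this patch adds to sched_init() (see the sched_init() hunk earlier in this patch) and prints the allocation the kernel actually computes, with row i holding nice level i - 20 and a 0 marking a slot the task may run in. Illustration only, not code from the patch:

    #include <stdio.h>

    #define PRIO_RANGE 40

    static int prio_matrix[PRIO_RANGE][PRIO_RANGE];

    int main(void)
    {
        /* Same arithmetic as the loop added to sched_init(): start with every
         * bit set (slot skipped) and clear bits at a nice-dependent stride. */
        for (int i = 0; i < PRIO_RANGE; i++) {
            for (int b = 0; b < PRIO_RANGE; b++)
                prio_matrix[i][b] = 1;                                  /* bitmap_fill() */
            int j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
            for (int k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
                prio_matrix[i][PRIO_RANGE - 1 - (k / PRIO_RANGE)] = 0;  /* __clear_bit() */
        }

        const int nice_levels[] = { -20, -10, 0, 5, 10, 15, 19 };
        for (unsigned n = 0; n < sizeof(nice_levels) / sizeof(nice_levels[0]); n++) {
            printf("nice %3d ", nice_levels[n]);
            for (int b = 0; b < PRIO_RANGE; b++)
                putchar('0' + prio_matrix[nice_levels[n] + 20][b]);
            putchar('\n');
        }
        return 0;
    }
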
++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled. Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. ++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000 +@@ -25,6 +25,9 @@ + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ + + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). ++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. 
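
The scheduler tunables this patch adds to the kernel sysctl table are exposed under /proc/sys/kernel, so rr_interval (and, alongside it, interactive, iso_cpu and iso_period) can be inspected and adjusted at runtime. A small sketch, illustration only and not part of the patch; writing requires root:

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/proc/sys/kernel/rr_interval";
        int ms = 0;

        FILE *f = fopen(path, "r");
        if (!f) {
            perror(path);
            return 1;
        }
        if (fscanf(f, "%d", &ms) == 1)
            printf("current rr_interval: %d ms\n", ms);
        fclose(f);

        /* Larger values trade latency for throughput; the valid range is 1-5000. */
        f = fopen(path, "w");
        if (f) {
            fprintf(f, "%d\n", 16);
            fclose(f);
        }
        return 0;
    }
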
++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.22-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000 +@@ -41,12 +41,7 @@ + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.22-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000 +@@ -165,7 +165,6 @@ + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.22-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000 +@@ -125,13 +125,15 @@ + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -158,6 +160,7 @@ + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.22-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000 +@@ -488,7 +488,7 @@ + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.22-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000 +@@ -285,8 +285,6 @@ + if (!cwq->wq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && +Index: 
linux-2.6.22-ck1/kernel/kthread.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000 +@@ -223,7 +223,6 @@ + + ignore_signals(tsk); + +- set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); + } + +Index: linux-2.6.22-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000 +@@ -1063,6 +1063,7 @@ + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); +Index: linux-2.6.22-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000 +@@ -60,6 +60,16 @@ + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.22-ck1/block/cfq-iosched.c +=================================================================== +--- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000 +@@ -1276,10 +1276,12 @@ + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.22-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 
14:55:21.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,8 @@ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,15 @@ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch ++- swap_prefetch_delay ++- swap_prefetch_sleep + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +91,27 @@ + + ============================================================== + ++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -216,3 +242,37 @@ + The default value is 0. + 1 and 2 are for failover of clustering. Please select either + according to your policy of failover. ++ ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. When the virtual ++memory subsystem has been extremely idle for at least swap_prefetch_sleep ++seconds it will start copying back pages from swap into the swapcache and keep ++a copy in swap. Valid values are 0 - 3. A value of 0 disables swap ++prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the ++presence of laptop_mode, and 3 enables it unconditionally, ignoring whether ++the system is idle or not. 
If set to 0, swap prefetch wil not even try to keep ++record of ram swapped out to have the most minimal impact on performance. ++ ++The default value is 1. ++ ++============================================================== ++ ++swap_prefetch_delay ++ ++This is the time in seconds that swap prefetching is delayed upon finding ++the system is not idle (ie the vm is busy or non-niced cpu load is present). ++ ++The default value is 1. ++ ++============================================================== ++ ++swap_prefetch_sleep ++ ++This is the time in seconds that the swap prefetch kernel thread is put to ++sleep for when the ram is found to be full and it is unable to prefetch ++further. ++ ++The default value is 5. +Index: linux-2.6.22-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000 +@@ -180,6 +180,7 @@ + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.22-ck1/init/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/init/Kconfig 2007-07-10 14:55:22.000000000 +1000 +@@ -105,6 +105,28 @@ + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. 
++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. ++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.22-ck1/mm/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.22-ck1/mm/swap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 +234,9 @@ + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. 
++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.22-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,542 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2007 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * sysctls: ++ * swap_prefetch: 0. Disable swap prefetching ++ * 1. Prefetch only when idle and not with laptop_mode ++ * 2. Prefetch when idle and with laptop_mode ++ * 3. Prefetch at all times. ++ * swap_prefetch_delay: Number of seconds to delay prefetching when system ++ * is not idle. ++ * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when ++ * unable to prefetch. ++ */ ++int swap_prefetch __read_mostly = 1; ++int swap_prefetch_delay __read_mostly = 1; ++int swap_prefetch_sleep __read_mostly = 5; ++ ++#define PREFETCH_DELAY (HZ * swap_prefetch_delay) ++#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1) ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups unless swap_prefetch is explicitly ++ * set to a higher value. ++ */ ++static inline int prefetch_enabled(void) ++{ ++ if (swap_prefetch <= laptop_mode) ++ return 0; ++ return 1; ++} ++ ++static int kprefetchd_awake; ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. Entries are removed lazily by kprefetchd. 
++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ ++ if (!prefetch_enabled()) ++ goto out; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * Once the number of entries exceeds maxcount we start ++ * removing the least recently used entries. ++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } else ++ kmem_cache_free(swapped.cache, entry); ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++out: ++ if (!kprefetchd_awake) ++ wake_up_process(kprefetchd_task); ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. ++ */ ++static void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ /* Free ram after a cycle of prefetching */ ++ unsigned long last_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long current_free; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ /* Which nodes are currently suited to prefetching */ ++ nodemask_t prefetch_nodes; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ unsigned long prefetched_pages; ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. 
++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ unsigned long flags; ++ struct page *page; ++ ++ read_lock_irqsave(&swapper_space.tree_lock, flags); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irqrestore(&swapper_space.tree_lock, flags); ++ if (page) ++ goto out; ++ ++ /* ++ * Get a new page to read from swap. We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) ++ goto out_release; ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ /* ++ * All entries are removed here lazily. This avoids the cost of ++ * remove_from_swapped_list during normal swapin. Thus there are ++ * usually many stale entries. ++ */ ++ remove_from_swapped_list(entry.val); ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. ++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(z)]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static enum trickle_return prefetch_suitable(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct zone *z; ++ int node; ++ ++ /* ++ * If swap_prefetch is set to a high value we can ignore load ++ * and prefetch whenever we can. Otherwise we test for vm and ++ * cpu activity. ++ */ ++ if (swap_prefetch < 3) { ++ /* Purposefully racy, may return false positive */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * above_background_load is expensive so we only perform it ++ * every SWAP_CLUSTER_MAX prefetched_pages. 
++ * We test to see if we're above_background_load as disk ++ * activity even at low priority can cause interrupt induced ++ * scheduling latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) && ++ above_background_load()) ++ goto out; ++ } ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. ++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = zone_to_nid(z); ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. ++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) ++ node_clear(node, sp_stat.prefetch_nodes); ++ } ++ ++ /* Nothing suitable, put kprefetchd back to sleep */ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ return TRICKLE_FAILED; ++ ++ /* Survived all that? Hooray we can prefetch! */ ++ ret = TRICKLE_SUCCESS; ++out: ++ return ret; ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. ++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, the list is empty or we have ++ * iterated over all entries once. ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return suitable, ret = TRICKLE_DELAY; ++ struct swapped_entry *pos, *n; ++ unsigned long flags; ++ ++ if (!prefetch_enabled()) ++ return ret; ++ ++ examine_free_limits(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) ++ return suitable; ++ if (list_empty(&swapped.list)) { ++ kprefetchd_awake = 0; ++ return TRICKLE_FAILED; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ cond_resched(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) { ++ ret = suitable; ++ goto out_unlocked; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (unlikely(!pos)) ++ continue; ++ node = get_swap_entry_node(pos); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. 
++ */ ++ continue; ++ } ++ swp_entry = pos->swp_entry; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ goto out_unlocked; ++ spin_lock_irqsave(&swapped.lock, flags); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++out_unlocked: ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, ¶m); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); ++ ++ while (!kthread_should_stop()) { ++ try_to_freeze(); ++ ++ if (!kprefetchd_awake) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ kprefetchd_awake = 1; ++ } ++ ++ if (trickle_swap() == TRICKLE_FAILED) ++ schedule_timeout_interruptible(PREFETCH_SLEEP); ++ else ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ clear_last_prefetch_free(); ++ } ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * We set the limit to more entries than the physical ram. ++ * We remove entries lazily so we need some headroom. ++ */ ++ swapped.maxcount = nr_free_pagecache_pages() * 2; ++ ++ for_each_zone(zone) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(zone)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(zone)]; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.22-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -95,7 +96,7 @@ + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + +@@ -148,6 +149,9 @@ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +324,9 @@ + struct page *found_page, *new_page = NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. Since this is normally +Index: linux-2.6.22-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ + * whole list at once. 
*/ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. + */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -803,10 +806,14 @@ + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. + */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -955,6 +962,41 @@ + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1011,7 +1053,8 @@ + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1019,15 +1062,20 @@ + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1040,7 +1088,7 @@ + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1070,7 +1118,7 @@ + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? don't OOM, then */ +@@ -1120,9 +1168,9 @@ + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1130,7 +1178,7 @@ + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1138,6 +1186,8 @@ + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1145,9 +1195,9 @@ + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1163,15 +1213,22 @@ + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1198,14 +1255,18 @@ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1238,7 +1299,7 @@ + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1272,6 +1333,8 @@ + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1319,6 +1382,8 @@ + for ( ; ; ) { + unsigned long new_order; + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1332,6 +1397,7 @@ + if (!freezing(current)) + schedule(); + ++ set_user_nice(tsk, 0); + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); +@@ -1349,9 +1415,10 @@ + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1363,7 +1430,9 @@ + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1382,6 +1451,8 @@ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1441,7 +1512,7 @@ + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1476,7 +1547,7 @@ + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1540,20 +1611,57 @@ + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
+ */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1624,7 +1732,7 @@ + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.22-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000 +@@ -13,6 +13,13 @@ + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,53 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++extern int swap_prefetch_delay; ++extern int swap_prefetch_sleep; ++ ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.22-ck1/mm/page_io.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + + static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +@@ -118,6 +119,7 @@ + ret = -ENOMEM; + goto out; + } ++ add_to_swapped_list(page); + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= (1 << BIO_RW_SYNC); + count_vm_event(PSWPOUT); +Index: 
linux-2.6.22-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000 +@@ -190,7 +190,7 @@ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_UNUSED19=19, /* was: Tendency to steal mapped memory */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.22-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -181,7 +182,7 @@ + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -452,6 +453,7 @@ + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -468,7 +470,7 @@ + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.22-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000 +@@ -1250,7 +1250,7 @@ + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1314,7 +1314,7 @@ + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1570,6 +1570,7 @@ + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1581,6 +1582,7 @@ + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3142,6 +3144,7 @@ + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + 
spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.22-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000 +@@ -356,7 +356,7 @@ + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.22-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -466,6 +466,16 @@ + return ret; + } + ++int add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -839,6 +849,34 @@ + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1051,8 +1089,19 @@ + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. ++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -1333,6 +1333,14 @@ + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. 
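As a rough illustration of the heuristic documented above: large_isize() in the mm/filemap.c hunk treats a file as "large" when it spans more than a sixth of total RAM and more than half of the currently unmapped pages, and only in that case does the read path queue its pages at the tail of the inactive list. The sketch below is not part of the patch; it restates the check as a standalone userspace program under assumed numbers — the page counts in main() are made-up example values, and total/mapped stand in for vm_total_pages and the NR_FILE_MAPPED + NR_ANON_PAGES sum that the kernel reads itself.

/*
 * Illustration only: a userspace restatement of the large_isize()
 * heuristic from the mm/filemap.c hunk above.  All inputs are plain
 * numbers here so the check can be run standalone; in the kernel they
 * come from vm_total_pages and the global page-state counters.
 */
#include <stdio.h>

/* Returns 1 when the file covers > 1/6 of RAM and > 1/2 of unmapped pages. */
static int large_isize(unsigned long file_pages, unsigned long total_pages,
                       unsigned long mapped_pages)
{
        /* Cheap pre-filter: only look closer if the file is > 1/6 of all RAM. */
        if (file_pages * 6 > total_pages) {
                unsigned long unmapped = total_pages - mapped_pages;

                if (file_pages * 2 > unmapped)
                        return 1;
        }
        return 0;
}

int main(void)
{
        /* Hypothetical machine: 4 GiB RAM (1 Mi 4 KiB pages), 1 GiB mapped. */
        unsigned long total = 1UL << 20, mapped = 1UL << 18;
        unsigned long small_file = 32UL << 10;  /* 128 MiB -> cached normally */
        unsigned long big_file = 768UL << 10;   /*   3 GiB -> read to LRU tail */

        printf("128 MiB file -> tail of inactive list: %d\n",
               large_isize(small_file, total, mapped));
        printf("3 GiB file   -> tail of inactive list: %d\n",
               large_isize(big_file, total, mapped));
        return 0;
}

With these assumed numbers the 3 GiB file trips the check while the 128 MiB file does not; when that trade-off is wrong for a workload, the behaviour is controlled by the tail_largefiles sysctl documented here (vm_tail_largefiles defaults to 1 in the patch).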
++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.22-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -550,7 +550,7 @@ + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -569,17 +569,17 @@ + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !HIGHMEM +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.22-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.22-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -226,10 +226,10 @@ + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.22-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -185,10 +185,10 @@ + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y + CONFIG_GENERIC_IRQ_PROBE=y +Index: linux-2.6.22-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + #elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000 +@@ -38,8 +38,8 @@ + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-2.6.22-ck1/init/calibrate.c +=================================================================== +--- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000 +@@ -122,12 +122,12 @@ + printk("Calibrating delay loop (skipped)... " + "%lu.%02lu BogoMIPS preset\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100); ++ (loops_per_jiffy * 10/(50000/HZ)) % 100); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk("Calibrating delay using timer specific routine.. "); + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } else { + loops_per_jiffy = (1<<12); +@@ -166,7 +166,7 @@ + /* Round the value and print it */ + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } + +Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000 +@@ -157,7 +157,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1094,7 +1094,7 @@ + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000 +@@ -35,8 +35,8 @@ + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: 
linux-2.6.22-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1047,7 +1047,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +Index: linux-2.6.22-ck1/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000 +@@ -1,8 +1,9 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 22 +-EXTRAVERSION = +-NAME = Holy Dancing Manatees, Batman! ++EXTRAVERSION = -ck1 ++NAME = So long, and thanks for all the fish ++JANAME = さようなら、いままで魚をありがとう + + # *DOCUMENTATION* + # To see a list of typical targets execute "make help" diff --git a/pkgs/tools/package-management/nix/unstable.nix b/pkgs/tools/package-management/nix/unstable.nix index 532863d66eb9..30d0b8f345eb 100644 --- a/pkgs/tools/package-management/nix/unstable.nix +++ b/pkgs/tools/package-management/nix/unstable.nix @@ -3,14 +3,14 @@ , stateDir ? "/nix/var" }: -let version = "0.11pre9692"; in +let version = "0.11pre9718"; in stdenv.mkDerivation { name = "nix-${version}"; src = fetchurl { url = "http://nix.cs.uu.nl/dist/nix/nix-${version}/nix-${version}.tar.bz2"; - md5 = "35c0bc68b81d20c7fb925bcf8faf4827"; + md5 = "cae130dcc51a30eff34fc194e17891f2"; }; buildInputs = [perl curl openssl]; diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix index 0840cfde0597..8bc2c48df0cc 100644 --- a/pkgs/top-level/all-packages.nix +++ b/pkgs/top-level/all-packages.nix @@ -154,6 +154,13 @@ rec { version = getConfig [ "environment" "versions" name ]; }; + # The same, another syntax. + # Warning: syntax for configuration.nix changed too + useVersion = name: f: f + { + version = getConfig [ "environment" "versions" name ]; + }; + # Whether user enabled given feature for the given package? 
getFlag = flag: package: default: getConfig [ "environment" "flags" package flag ] @@ -2803,6 +2810,10 @@ rec { inherit fetchurl stdenv; }; + bridge_utils = import ../os-specific/linux/bridge_utils { + inherit fetchurl stdenv autoconf automake; + }; + alsaUtils = import ../os-specific/linux/alsa/utils { inherit fetchurl stdenv alsaLib ncurses gettext; }; @@ -3011,14 +3022,16 @@ rec { kernel_2_6_22 = import ../os-specific/linux/kernel/linux-2.6.22.nix { inherit fetchurl stdenv perl mktemp module_init_tools; kernelPatches = [ - /*{ name = "ext3cow"; - patch = ../os-specific/linux/kernel/linux-2.6.20.3-ext3cow.patch; + /* + { name = "ext3cow"; + patch = ../os-specific/linux/kernel/linux-2.6.21.7-ext3cow_wouter.patch; extraConfig = "CONFIG_EXT3COW_FS=m\n" + "CONFIG_EXT3COW_FS_XATTR=y\n" + "CONFIG_EXT3COW_FS_POSIX_ACL=y\n" + "CONFIG_EXT3COW_FS_SECURITY=y\n"; - }*/ + } + */ { name = "paravirt-nvidia"; patch = ../os-specific/linux/kernel/2.6.22-paravirt-nvidia.patch; } @@ -3049,12 +3062,63 @@ rec { [(getConfig ["kernel" "addConfig"] "")]; }; + kernel_2_6_21_ck = import ../os-specific/linux/kernel/linux-2.6.21_ck.nix { + inherit fetchurl stdenv perl mktemp module_init_tools; + kernelPatches = [ + { name = "ext3cow"; + patch = ../os-specific/linux/kernel/linux-2.6.21.7-ext3cow_wouter.patch; + extraConfig = + "CONFIG_EXT3COW_FS=m\n" + + "CONFIG_EXT3COW_FS_XATTR=y\n" + + "CONFIG_EXT3COW_FS_POSIX_ACL=y\n" + + "CONFIG_EXT3COW_FS_SECURITY=y\n"; + } + { name = "Con Kolivas Patch"; + patch = ../os-specific/linux/kernel/patch-2.6.21-ck1; + } + { name = "paravirt-nvidia"; + patch = ../os-specific/linux/kernel/2.6.20-paravirt-nvidia.patch; + } + { name = "skas-2.6.20-v9-pre9"; + patch = fetchurl { + url = http://www.user-mode-linux.org/~blaisorblade/patches/skas3-2.6/skas-2.6.20-v9-pre9/skas-2.6.20-v9-pre9.patch.bz2; + md5 = "02e619e5b3aaf0f9768f03ac42753e74"; + }; + extraConfig = + "CONFIG_PROC_MM=y\n" + + "# CONFIG_PROC_MM_DUMPABLE is not set\n"; + } + { name = "fbsplash-0.9.2-r5-2.6.21"; + patch = fetchurl { + url = http://dev.gentoo.org/~dsd/genpatches/trunk/2.6.21/4200_fbsplash-0.9.2-r5.patch; + sha256 = "00s8074fzsly2zpir885zqkvq267qyzg6vhsn7n1z2v1z78avxd8"; + }; + extraConfig = "CONFIG_FB_SPLASH=y"; + } + ]; + }; + + + kernel_2_6_23 = import ../os-specific/linux/kernel/linux-2.6.23.nix { inherit fetchurl stdenv perl mktemp module_init_tools; kernelPatches = [ { name = "paravirt-nvidia"; patch = ../os-specific/linux/kernel/2.6.22-paravirt-nvidia.patch; } + { # resume with resume=swap:/dev/xx + name = "tux on ice"; # (swsusp2) + patch = fetchurl { + url = "http://www.tuxonice.net/downloads/all/tuxonice-3.0-rc2-for-2.6.23.1.patch.bz2"; + sha256 = "ef86267b6f3d7e309221f5173a881afae1dfa57418be5b3963f2380b0633ca1a"; + }; + extraConfig = " + CONFIG_SUSPEND2=y + CONFIG_SUSPEND2_FILE=y + CONFIG_SUSPEND2_SWAP=y + CONFIG_CRYPTO_LZF=y + "; + } { name = "fbsplash-0.9.2-r5-2.6.21"; patch = fetchurl { url = http://dev.gentoo.org/~dsd/genpatches/trunk/2.6.22/4200_fbsplash-0.9.2-r5.patch;