mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-11-19 08:36:41 +01:00
e20c559e5f
svn path=/nixpkgs/trunk/; revision=9748
5167 lines
155 KiB
Text
5167 lines
155 KiB
Text
Index: linux-2.6.22-ck1/include/linux/sched.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000
|
|
@@ -34,9 +34,14 @@
|
|
#define SCHED_FIFO 1
|
|
#define SCHED_RR 2
|
|
#define SCHED_BATCH 3
|
|
+#define SCHED_ISO 4
|
|
+#define SCHED_IDLEPRIO 5
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
+#define SCHED_MAX SCHED_IDLEPRIO
|
|
+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
|
|
+
|
|
struct sched_param {
|
|
int sched_priority;
|
|
};
|
|
@@ -129,7 +134,7 @@
|
|
extern unsigned long nr_active(void);
|
|
extern unsigned long nr_iowait(void);
|
|
extern unsigned long weighted_cpuload(const int cpu);
|
|
-
|
|
+extern int above_background_load(void);
|
|
|
|
/*
|
|
* Task state bitmask. NOTE! These bits are also
|
|
@@ -150,8 +155,7 @@
|
|
#define EXIT_ZOMBIE 16
|
|
#define EXIT_DEAD 32
|
|
/* in tsk->state again */
|
|
-#define TASK_NONINTERACTIVE 64
|
|
-#define TASK_DEAD 128
|
|
+#define TASK_DEAD 64
|
|
|
|
#define __set_task_state(tsk, state_value) \
|
|
do { (tsk)->state = (state_value); } while (0)
|
|
@@ -537,14 +541,19 @@
|
|
|
|
#define MAX_USER_RT_PRIO 100
|
|
#define MAX_RT_PRIO MAX_USER_RT_PRIO
|
|
+#define PRIO_RANGE (40)
|
|
+#define ISO_PRIO (MAX_RT_PRIO - 1)
|
|
|
|
-#define MAX_PRIO (MAX_RT_PRIO + 40)
|
|
+#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE)
|
|
|
|
-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
|
|
+#define rt_prio(prio) unlikely((prio) < ISO_PRIO)
|
|
#define rt_task(p) rt_prio((p)->prio)
|
|
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
|
|
-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
|
|
+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
|
|
+ (policy) == SCHED_RR)
|
|
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
|
|
+#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
|
|
+#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
|
|
|
|
/*
|
|
* Some day this will be a full-fledged user tracking system..
|
|
@@ -809,13 +818,6 @@
|
|
struct pipe_inode_info;
|
|
struct uts_namespace;
|
|
|
|
-enum sleep_type {
|
|
- SLEEP_NORMAL,
|
|
- SLEEP_NONINTERACTIVE,
|
|
- SLEEP_INTERACTIVE,
|
|
- SLEEP_INTERRUPTED,
|
|
-};
|
|
-
|
|
struct prio_array;
|
|
|
|
struct task_struct {
|
|
@@ -835,20 +837,33 @@
|
|
int load_weight; /* for niceness load balancing purposes */
|
|
int prio, static_prio, normal_prio;
|
|
struct list_head run_list;
|
|
+ /*
|
|
+ * This bitmap shows what priorities this task has received quota
|
|
+ * from for this major priority rotation on its current runqueue.
|
|
+ */
|
|
+ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1);
|
|
struct prio_array *array;
|
|
+ /* Which major runqueue rotation did this task run */
|
|
+ unsigned long rotation;
|
|
|
|
unsigned short ioprio;
|
|
#ifdef CONFIG_BLK_DEV_IO_TRACE
|
|
unsigned int btrace_seq;
|
|
#endif
|
|
- unsigned long sleep_avg;
|
|
unsigned long long timestamp, last_ran;
|
|
unsigned long long sched_time; /* sched_clock time spent running */
|
|
- enum sleep_type sleep_type;
|
|
|
|
unsigned int policy;
|
|
cpumask_t cpus_allowed;
|
|
- unsigned int time_slice, first_time_slice;
|
|
+ /*
|
|
+ * How much this task is entitled to run at the current priority
|
|
+ * before being requeued at a lower priority.
|
|
+ */
|
|
+ int time_slice;
|
|
+ /* Is this the very first time_slice this task has ever run. */
|
|
+ unsigned int first_time_slice;
|
|
+ /* How much this task receives at each priority level */
|
|
+ int quota;
|
|
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
|
struct sched_info sched_info;
|
|
@@ -1013,6 +1028,7 @@
|
|
struct held_lock held_locks[MAX_LOCK_DEPTH];
|
|
unsigned int lockdep_recursion;
|
|
#endif
|
|
+ unsigned long mutexes_held;
|
|
|
|
/* journalling filesystem info */
|
|
void *journal_info;
|
|
@@ -1181,9 +1197,11 @@
|
|
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
|
|
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
|
|
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
|
|
+#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */
|
|
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
|
|
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
|
|
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
|
|
+#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */
|
|
|
|
/*
|
|
* Only the _current_ task can read/write to tsk->flags, but other
|
|
@@ -1253,7 +1271,7 @@
|
|
#endif
|
|
|
|
extern void set_user_nice(struct task_struct *p, long nice);
|
|
-extern int task_prio(const struct task_struct *p);
|
|
+extern int task_prio(struct task_struct *p);
|
|
extern int task_nice(const struct task_struct *p);
|
|
extern int can_nice(const struct task_struct *p, const int nice);
|
|
extern int task_curr(const struct task_struct *p);
|
|
Index: linux-2.6.22-ck1/kernel/sched.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -16,6 +16,7 @@
|
|
* by Davide Libenzi, preemptible kernel bits by Robert Love.
|
|
* 2003-09-03 Interactivity tuning by Con Kolivas.
|
|
* 2004-04-02 Scheduler domains code by Nick Piggin
|
|
+ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
@@ -53,8 +54,9 @@
|
|
#include <linux/kprobes.h>
|
|
#include <linux/delayacct.h>
|
|
#include <linux/reciprocal_div.h>
|
|
-
|
|
+#include <linux/log2.h>
|
|
#include <asm/tlb.h>
|
|
+
|
|
#include <asm/unistd.h>
|
|
|
|
/*
|
|
@@ -84,147 +86,85 @@
|
|
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
|
|
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
|
|
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
|
|
+#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO)
|
|
|
|
-/*
|
|
- * Some helpers for converting nanosecond timing to jiffy resolution
|
|
- */
|
|
-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
|
|
+/* Some helpers for converting to/from various scales. */
|
|
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
|
|
-
|
|
-/*
|
|
- * These are the 'tuning knobs' of the scheduler:
|
|
- *
|
|
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
|
|
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
|
|
- * Timeslices get refilled after they expire.
|
|
- */
|
|
-#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
|
|
-#define DEF_TIMESLICE (100 * HZ / 1000)
|
|
-#define ON_RUNQUEUE_WEIGHT 30
|
|
-#define CHILD_PENALTY 95
|
|
-#define PARENT_PENALTY 100
|
|
-#define EXIT_WEIGHT 3
|
|
-#define PRIO_BONUS_RATIO 25
|
|
-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
|
|
-#define INTERACTIVE_DELTA 2
|
|
-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
|
|
-#define STARVATION_LIMIT (MAX_SLEEP_AVG)
|
|
-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
|
|
-
|
|
-/*
|
|
- * If a task is 'interactive' then we reinsert it in the active
|
|
- * array after it has expired its current timeslice. (it will not
|
|
- * continue to run immediately, it will still roundrobin with
|
|
- * other interactive tasks.)
|
|
- *
|
|
- * This part scales the interactivity limit depending on niceness.
|
|
- *
|
|
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
|
|
- * Here are a few examples of different nice levels:
|
|
- *
|
|
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
|
|
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
|
|
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
|
|
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
|
|
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
|
|
- *
|
|
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
|
|
- * priority range a task can explore, a value of '1' means the
|
|
- * task is rated interactive.)
|
|
- *
|
|
- * Ie. nice +19 tasks can never get 'interactive' enough to be
|
|
- * reinserted into the active array. And only heavily CPU-hog nice -20
|
|
- * tasks will be expired. Default nice 0 tasks are somewhere between,
|
|
- * it takes some effort for them to get interactive, but it's not
|
|
- * too hard.
|
|
- */
|
|
-
|
|
-#define CURRENT_BONUS(p) \
|
|
- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
|
|
- MAX_SLEEP_AVG)
|
|
-
|
|
-#define GRANULARITY (10 * HZ / 1000 ? : 1)
|
|
-
|
|
-#ifdef CONFIG_SMP
|
|
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
|
|
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
|
|
- num_online_cpus())
|
|
-#else
|
|
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
|
|
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
|
|
-#endif
|
|
-
|
|
-#define SCALE(v1,v1_max,v2_max) \
|
|
- (v1) * (v2_max) / (v1_max)
|
|
-
|
|
-#define DELTA(p) \
|
|
- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
|
|
- INTERACTIVE_DELTA)
|
|
-
|
|
-#define TASK_INTERACTIVE(p) \
|
|
- ((p)->prio <= (p)->static_prio - DELTA(p))
|
|
-
|
|
-#define INTERACTIVE_SLEEP(p) \
|
|
- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
|
|
- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
|
|
-
|
|
-#define TASK_PREEMPTS_CURR(p, rq) \
|
|
- ((p)->prio < (rq)->curr->prio)
|
|
-
|
|
-#define SCALE_PRIO(x, prio) \
|
|
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
|
|
-
|
|
-static unsigned int static_prio_timeslice(int static_prio)
|
|
-{
|
|
- if (static_prio < NICE_TO_PRIO(0))
|
|
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
|
|
- else
|
|
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
|
|
-}
|
|
-
|
|
-#ifdef CONFIG_SMP
|
|
-/*
|
|
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
|
|
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
|
|
+#define MS_TO_NS(TIME) ((TIME) * 1000000)
|
|
+#define MS_TO_US(TIME) ((TIME) * 1000)
|
|
+#define US_TO_MS(TIME) ((TIME) / 1000)
|
|
+
|
|
+#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio)
|
|
+
|
|
+/*
|
|
+ * This is the time all tasks within the same priority round robin.
|
|
+ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus.
|
|
+ * Tunable via /proc interface.
|
|
+ */
|
|
+int rr_interval __read_mostly = 6;
|
|
+int sched_interactive __read_mostly = 1;
|
|
+
|
|
+/*
|
|
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
|
|
+ * are allowed to run (over sched_iso_period seconds) as real time tasks.
|
|
+ * sched_iso_period - sysctl which determines the number of seconds over
|
|
+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are
|
|
+ * exceeding their allowable bandwidth.
|
|
+*/
|
|
+int sched_iso_cpu __read_mostly = 80;
|
|
+int sched_iso_period __read_mostly = 5;
|
|
+
|
|
+#define ISO_PERIOD ((sched_iso_period * HZ) + 1)
|
|
+
|
|
+/*
|
|
+ * This contains a bitmap for each dynamic priority level with empty slots
|
|
+ * for the valid priorities each different nice level can have. It allows
|
|
+ * us to stagger the slots where differing priorities run in a way that
|
|
+ * keeps latency differences between different nice levels at a minimum.
|
|
+ * The purpose of a pre-generated matrix is for rapid lookup of next slot in
|
|
+ * O(1) time without having to recalculate every time priority gets demoted.
|
|
+ * All nice levels use priority slot 39 as this allows less niced tasks to
|
|
+ * get all priority slots better than that before expiration is forced.
|
|
+ * ie, where 0 means a slot for that priority, priority running from left to
|
|
+ * right is from prio 0 to prio 39:
|
|
+ * nice -20 0000000000000000000000000000000000000000
|
|
+ * nice -10 1000100010001000100010001000100010010000
|
|
+ * nice 0 1010101010101010101010101010101010101010
|
|
+ * nice 5 1011010110110101101101011011010110110110
|
|
+ * nice 10 1110111011101110111011101110111011101110
|
|
+ * nice 15 1111111011111110111111101111111011111110
|
|
+ * nice 19 1111111111111111111111111111111111111110
|
|
*/
|
|
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
|
|
-{
|
|
- return reciprocal_divide(load, sg->reciprocal_cpu_power);
|
|
-}
|
|
+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
|
|
+ __read_mostly;
|
|
|
|
-/*
|
|
- * Each time a sched group cpu_power is changed,
|
|
- * we must compute its reciprocal value
|
|
- */
|
|
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
|
|
-{
|
|
- sg->__cpu_power += val;
|
|
- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
|
|
-}
|
|
-#endif
|
|
+struct rq;
|
|
|
|
/*
|
|
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
|
|
- * to time slice values: [800ms ... 100ms ... 5ms]
|
|
- *
|
|
- * The higher a thread's priority, the bigger timeslices
|
|
- * it gets during one round of execution. But even the lowest
|
|
- * priority thread gets MIN_TIMESLICE worth of execution time.
|
|
+ * These are the runqueue data structures:
|
|
*/
|
|
+struct prio_array {
|
|
+ /* Tasks queued at each priority */
|
|
+ struct list_head queue[MAX_PRIO + 1];
|
|
|
|
-static inline unsigned int task_timeslice(struct task_struct *p)
|
|
-{
|
|
- return static_prio_timeslice(p->static_prio);
|
|
-}
|
|
+ /*
|
|
+ * The bitmap of priorities queued for this array. While the expired
|
|
+ * array will never have realtime tasks on it, it is simpler to have
|
|
+ * equal sized bitmaps for a cheap array swap. Include 1 bit for
|
|
+ * delimiter.
|
|
+ */
|
|
+ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
|
|
|
|
-/*
|
|
- * These are the runqueue data structures:
|
|
- */
|
|
+ /*
|
|
+ * The best static priority (of the dynamic priority tasks) queued
|
|
+ * on this array.
|
|
+ */
|
|
+ int best_static_prio;
|
|
|
|
-struct prio_array {
|
|
- unsigned int nr_active;
|
|
- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
|
|
- struct list_head queue[MAX_PRIO];
|
|
+#ifdef CONFIG_SMP
|
|
+ /* For convenience looks back at rq */
|
|
+ struct rq *rq;
|
|
+#endif
|
|
};
|
|
|
|
/*
|
|
@@ -260,14 +200,28 @@
|
|
*/
|
|
unsigned long nr_uninterruptible;
|
|
|
|
- unsigned long expired_timestamp;
|
|
/* Cached timestamp set by update_cpu_clock() */
|
|
unsigned long long most_recent_timestamp;
|
|
struct task_struct *curr, *idle;
|
|
unsigned long next_balance;
|
|
struct mm_struct *prev_mm;
|
|
- struct prio_array *active, *expired, arrays[2];
|
|
- int best_expired_prio;
|
|
+
|
|
+ struct prio_array *active, *expired, *idleprio, arrays[2];
|
|
+ unsigned long *dyn_bitmap, *exp_bitmap;
|
|
+
|
|
+ /*
|
|
+ * The current dynamic priority level this runqueue is at per static
|
|
+ * priority level.
|
|
+ */
|
|
+ int prio_level[PRIO_RANGE];
|
|
+
|
|
+ /* How many times we have rotated the priority queue */
|
|
+ unsigned long prio_rotation;
|
|
+ unsigned long iso_ticks;
|
|
+ unsigned short iso_refractory;
|
|
+
|
|
+ /* Number of idleprio tasks running */
|
|
+ unsigned long nr_idleprio;
|
|
atomic_t nr_iowait;
|
|
|
|
#ifdef CONFIG_SMP
|
|
@@ -606,12 +560,9 @@
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
|
/*
|
|
* Called when a process is dequeued from the active array and given
|
|
- * the cpu. We should note that with the exception of interactive
|
|
- * tasks, the expired queue will become the active queue after the active
|
|
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
|
|
- * expired queue. (Interactive tasks may be requeued directly to the
|
|
- * active queue, thus delaying tasks in the expired queue from running;
|
|
- * see scheduler_tick()).
|
|
+ * the cpu. We should note that the expired queue will become the active
|
|
+ * queue after the active queue is empty, without explicitly dequeuing and
|
|
+ * requeuing tasks in the expired queue.
|
|
*
|
|
* This function is only called from sched_info_arrive(), rather than
|
|
* dequeue_task(). Even though a task may be queued and dequeued multiple
|
|
@@ -709,71 +660,304 @@
|
|
#define sched_info_switch(t, next) do { } while (0)
|
|
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
|
|
|
|
+static int idleprio_suitable(struct task_struct *p)
|
|
+{
|
|
+ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) &&
|
|
+ !(p->flags & (PF_NONSLEEP | PF_EXITING)));
|
|
+}
|
|
+
|
|
+static int idleprio(const struct task_struct *p)
|
|
+{
|
|
+ return (p->prio == MAX_PRIO);
|
|
+}
|
|
+
|
|
+static inline int task_queued(struct task_struct *task)
|
|
+{
|
|
+ return !list_empty(&task->run_list);
|
|
+}
|
|
+
|
|
+static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ __set_bit(p->prio, p->array->prio_bitmap);
|
|
+}
|
|
+
|
|
/*
|
|
- * Adding/removing a task to/from a priority array:
|
|
+ * Removing from a runqueue.
|
|
*/
|
|
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
|
|
+static void dequeue_task(struct task_struct *p, struct rq *rq)
|
|
{
|
|
- array->nr_active--;
|
|
- list_del(&p->run_list);
|
|
- if (list_empty(array->queue + p->prio))
|
|
- __clear_bit(p->prio, array->bitmap);
|
|
+ list_del_init(&p->run_list);
|
|
+ if (idleprio_task(p) && idleprio(p))
|
|
+ rq->nr_idleprio--;
|
|
+ else if (list_empty(p->array->queue + p->prio))
|
|
+ __clear_bit(p->prio, p->array->prio_bitmap);
|
|
}
|
|
|
|
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
|
|
+static void reset_first_time_slice(struct task_struct *p)
|
|
{
|
|
- sched_info_queued(p);
|
|
- list_add_tail(&p->run_list, array->queue + p->prio);
|
|
- __set_bit(p->prio, array->bitmap);
|
|
- array->nr_active++;
|
|
+ if (unlikely(p->first_time_slice))
|
|
+ p->first_time_slice = 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The task is being queued on a fresh array so it has its entitlement
|
|
+ * bitmap cleared.
|
|
+ */
|
|
+static void task_new_array(struct task_struct *p, struct rq *rq,
|
|
+ struct prio_array *array)
|
|
+{
|
|
+ bitmap_zero(p->bitmap, PRIO_RANGE);
|
|
+ p->rotation = rq->prio_rotation;
|
|
+ p->time_slice = p->quota;
|
|
p->array = array;
|
|
+ reset_first_time_slice(p);
|
|
+}
|
|
+
|
|
+/* Find the first slot from the relevant prio_matrix entry */
|
|
+static int first_prio_slot(struct task_struct *p)
|
|
+{
|
|
+ if (unlikely(p->policy == SCHED_BATCH))
|
|
+ return p->static_prio;
|
|
+ return SCHED_PRIO(find_first_zero_bit(
|
|
+ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
|
|
}
|
|
|
|
/*
|
|
- * Put task to the end of the run list without the overhead of dequeue
|
|
- * followed by enqueue.
|
|
+ * In sched_interactive mode priority allocation occurs per process per rq
|
|
+ * array swap. In !sched_interactive mode all waking tasks must obey the
|
|
+ * current prio level of all other tasks running per array swap.
|
|
*/
|
|
-static void requeue_task(struct task_struct *p, struct prio_array *array)
|
|
+static int minprio(struct rq *rq, int uprio)
|
|
{
|
|
- list_move_tail(&p->run_list, array->queue + p->prio);
|
|
+ if (sched_interactive)
|
|
+ return MAX_RT_PRIO;
|
|
+ return rq->prio_level[uprio];
|
|
}
|
|
|
|
-static inline void
|
|
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
|
|
+/*
|
|
+ * Find the first unused slot by this task that is also in its prio_matrix
|
|
+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take
|
|
+ * priority slots from their static_prio and above.
|
|
+ */
|
|
+static int next_entitled_slot(struct task_struct *p, struct rq *rq)
|
|
{
|
|
- list_add(&p->run_list, array->queue + p->prio);
|
|
- __set_bit(p->prio, array->bitmap);
|
|
- array->nr_active++;
|
|
- p->array = array;
|
|
+ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio);
|
|
+ struct prio_array *array = rq->active;
|
|
+ DECLARE_BITMAP(tmp, PRIO_RANGE);
|
|
+
|
|
+ /*
|
|
+ * Go straight to expiration if there are higher priority tasks
|
|
+ * already expired.
|
|
+ */
|
|
+ if (p->static_prio > rq->expired->best_static_prio)
|
|
+ return MAX_PRIO;
|
|
+ if (!rq->prio_level[uprio])
|
|
+ rq->prio_level[uprio] = MAX_RT_PRIO;
|
|
+ /*
|
|
+ * Only priorities equal to the prio_level and above for their
|
|
+ * static_prio are acceptable, and only if it's not better than
|
|
+ * a queued better static_prio's prio_level.
|
|
+ */
|
|
+ if (p->static_prio < array->best_static_prio) {
|
|
+ if (likely(p->policy != SCHED_BATCH))
|
|
+ array->best_static_prio = p->static_prio;
|
|
+ } else if (p->static_prio == array->best_static_prio) {
|
|
+ search_prio = minprio(rq, uprio);
|
|
+ } else {
|
|
+ int i;
|
|
+
|
|
+ search_prio = minprio(rq, uprio);
|
|
+ /* A bound O(n) function, worst case n is 40 */
|
|
+ for (i = array->best_static_prio; i <= p->static_prio ; i++) {
|
|
+ if (!rq->prio_level[USER_PRIO(i)])
|
|
+ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO;
|
|
+ search_prio = max(search_prio,
|
|
+ rq->prio_level[USER_PRIO(i)]);
|
|
+ }
|
|
+ }
|
|
+ if (unlikely(p->policy == SCHED_BATCH)) {
|
|
+ search_prio = max(search_prio, p->static_prio);
|
|
+ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
|
|
+ USER_PRIO(search_prio)));
|
|
+ }
|
|
+ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE);
|
|
+ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
|
|
+ USER_PRIO(search_prio)));
|
|
+}
|
|
+
|
|
+static void queue_expired(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ task_new_array(p, rq, rq->expired);
|
|
+ p->prio = p->normal_prio = first_prio_slot(p);
|
|
+ if (p->static_prio < rq->expired->best_static_prio)
|
|
+ rq->expired->best_static_prio = p->static_prio;
|
|
+ reset_first_time_slice(p);
|
|
}
|
|
|
|
+#ifdef CONFIG_SMP
|
|
/*
|
|
- * __normal_prio - return the priority that is based on the static
|
|
- * priority but is modified by bonuses/penalties.
|
|
- *
|
|
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
|
|
- * into the -5 ... 0 ... +5 bonus/penalty range.
|
|
- *
|
|
- * We use 25% of the full 0...39 priority range so that:
|
|
- *
|
|
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
|
|
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
|
|
- *
|
|
- * Both properties are important to certain workloads.
|
|
+ * If we're waking up a task that was previously on a different runqueue,
|
|
+ * update its data appropriately. Note we may be reading data from src_rq->
|
|
+ * outside of lock, but the occasional inaccurate result should be harmless.
|
|
*/
|
|
+ static void update_if_moved(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ struct rq *src_rq = p->array->rq;
|
|
+
|
|
+ if (src_rq == rq)
|
|
+ return;
|
|
+ /*
|
|
+ * Only need to set p->array when p->rotation == rq->prio_rotation as
|
|
+ * they will be set in recalc_task_prio when != rq->prio_rotation.
|
|
+ */
|
|
+ if (p->rotation == src_rq->prio_rotation) {
|
|
+ p->rotation = rq->prio_rotation;
|
|
+ if (p->array == src_rq->expired)
|
|
+ p->array = rq->expired;
|
|
+ else
|
|
+ p->array = rq->active;
|
|
+ } else
|
|
+ p->rotation = 0;
|
|
+}
|
|
+#else
|
|
+static inline void update_if_moved(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+}
|
|
+#endif
|
|
|
|
-static inline int __normal_prio(struct task_struct *p)
|
|
+static inline int isoprio_suitable(struct task_struct *p)
|
|
{
|
|
- int bonus, prio;
|
|
+ return !(p->flags & PF_ISOREF);
|
|
+}
|
|
|
|
- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
|
|
+static int task_timeslice(struct task_struct *p);
|
|
|
|
- prio = p->static_prio - bonus;
|
|
- if (prio < MAX_RT_PRIO)
|
|
- prio = MAX_RT_PRIO;
|
|
- if (prio > MAX_PRIO-1)
|
|
- prio = MAX_PRIO-1;
|
|
- return prio;
|
|
+/*
|
|
+ * recalc_task_prio determines what priority a non rt_task will be
|
|
+ * queued at. If the task has already been running during this runqueue's
|
|
+ * major rotation (rq->prio_rotation) then it continues at the same
|
|
+ * priority if it has tick entitlement left. If it does not have entitlement
|
|
+ * left, it finds the next priority slot according to its nice value that it
|
|
+ * has not extracted quota from. If it has not run during this major
|
|
+ * rotation, it starts at the next_entitled_slot and has its bitmap quota
|
|
+ * cleared. If it does not have any slots left it has all its slots reset and
|
|
+ * is queued on the expired at its first_prio_slot.
|
|
+ */
|
|
+static void recalc_task_prio(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ struct prio_array *array = rq->active;
|
|
+ int queue_prio;
|
|
+
|
|
+ if (iso_task(p)) {
|
|
+ if (isoprio_suitable(p)) {
|
|
+ /*
|
|
+ * If SCHED_ISO tasks have not used up their real time
|
|
+ * quota they run just better than the highest
|
|
+ * SCHED_NORMAL priority. Otherwise they run as
|
|
+ * SCHED_NORMAL.
|
|
+ */
|
|
+ p->prio = p->normal_prio = ISO_PRIO;
|
|
+ p->array = rq->active;
|
|
+ if (p->time_slice <= 0)
|
|
+ p->time_slice = p->quota;
|
|
+ return;
|
|
+ } else if (p->prio == ISO_PRIO) {
|
|
+ /* Just about to be demoted to SCHED_NORMAL */
|
|
+ p->time_slice = 0;
|
|
+ }
|
|
+ } else if (idleprio_task(p)) {
|
|
+ if (idleprio_suitable(p)) {
|
|
+ /*
|
|
+ * If suitable, idleprio_tasks are queued at MAX_PRIO
|
|
+ * only on the idleprio array. Their time_slice is
|
|
+ * their full task_timeslice as they cooperatively
|
|
+ * multitask.
|
|
+ */
|
|
+ p->prio = p->normal_prio = MAX_PRIO;
|
|
+ p->array = rq->idleprio;
|
|
+ if (p->time_slice <= 0)
|
|
+ p->time_slice = task_timeslice(p);
|
|
+ return;
|
|
+ }
|
|
+ /*
|
|
+ * If unsuitable, idleprio_tasks are queued equivalent to
|
|
+ * nice 19 tasks on the expired array.
|
|
+ */
|
|
+ p->flags &= ~PF_NONSLEEP;
|
|
+ p->prio = p->normal_prio = MAX_PRIO - 1;
|
|
+ p->array = rq->expired;
|
|
+ if (p->time_slice <= 0 || p->time_slice > p->quota)
|
|
+ p->time_slice = p->quota;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ update_if_moved(p, rq);
|
|
+ if (p->rotation == rq->prio_rotation) {
|
|
+ if (p->array == array) {
|
|
+ if (p->time_slice > 0)
|
|
+ return;
|
|
+ p->time_slice = p->quota;
|
|
+ } else if (p->array == rq->expired) {
|
|
+ queue_expired(p, rq);
|
|
+ return;
|
|
+ } else
|
|
+ task_new_array(p, rq, array);
|
|
+ } else
|
|
+ task_new_array(p, rq, array);
|
|
+
|
|
+ queue_prio = next_entitled_slot(p, rq);
|
|
+ if (queue_prio >= MAX_PRIO) {
|
|
+ queue_expired(p, rq);
|
|
+ return;
|
|
+ }
|
|
+ p->prio = p->normal_prio = queue_prio;
|
|
+ __set_bit(USER_PRIO(p->prio), p->bitmap);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Adding to a runqueue. The dynamic priority queue that it is added to is
|
|
+ * determined by recalc_task_prio() above.
|
|
+ */
|
|
+static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ if (rt_task(p))
|
|
+ p->array = rq->active;
|
|
+ else
|
|
+ recalc_task_prio(p, rq);
|
|
+
|
|
+ if (idleprio_task(p) && idleprio(p))
|
|
+ rq->nr_idleprio++;
|
|
+ sched_info_queued(p);
|
|
+ set_dynamic_bit(p, rq);
|
|
+}
|
|
+
|
|
+static void enqueue_task(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ __enqueue_task(p, rq);
|
|
+ list_add_tail(&p->run_list, p->array->queue + p->prio);
|
|
+}
|
|
+
|
|
+static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ __enqueue_task(p, rq);
|
|
+ list_add(&p->run_list, p->array->queue + p->prio);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * requeue_task is only called when p->static_prio does not change. p->prio
|
|
+ * can change with dynamic tasks.
|
|
+ */
|
|
+static void requeue_task(struct task_struct *p, struct rq *rq,
|
|
+ struct prio_array *old_array, int old_prio)
|
|
+{
|
|
+ if (p->array == rq->expired)
|
|
+ queue_expired(p, rq);
|
|
+ list_move_tail(&p->run_list, p->array->queue + p->prio);
|
|
+ if (!rt_task(p)) {
|
|
+ if (list_empty(old_array->queue + old_prio))
|
|
+ __clear_bit(old_prio, old_array->prio_bitmap);
|
|
+ set_dynamic_bit(p, rq);
|
|
+ }
|
|
}
|
|
|
|
/*
|
|
@@ -786,20 +970,29 @@
|
|
*/
|
|
|
|
/*
|
|
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
|
|
- * If static_prio_timeslice() is ever changed to break this assumption then
|
|
- * this code will need modification
|
|
- */
|
|
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
|
|
-#define LOAD_WEIGHT(lp) \
|
|
- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
|
|
-#define PRIO_TO_LOAD_WEIGHT(prio) \
|
|
- LOAD_WEIGHT(static_prio_timeslice(prio))
|
|
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
|
|
- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
|
|
+ * task_timeslice - the total duration a task can run during one major
|
|
+ * rotation. Returns value in milliseconds as the smallest value can be 1.
|
|
+ */
|
|
+static int task_timeslice(struct task_struct *p)
|
|
+{
|
|
+ int slice = p->quota; /* quota is in us */
|
|
+
|
|
+ if (!rt_task(p))
|
|
+ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
|
|
+ return US_TO_MS(slice);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The load weight is basically the task_timeslice in ms. Realtime tasks are
|
|
+ * special cased to be proportionately larger than nice -20 by their
|
|
+ * rt_priority. The weight for rt tasks can only be arbitrary at best.
|
|
+ */
|
|
+#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp))
|
|
|
|
static void set_load_weight(struct task_struct *p)
|
|
{
|
|
+ int load_weight;
|
|
+
|
|
if (has_rt_policy(p)) {
|
|
#ifdef CONFIG_SMP
|
|
if (p == task_rq(p)->migration_thread)
|
|
@@ -808,12 +1001,19 @@
|
|
* Giving its load any weight will skew balancing
|
|
* adversely.
|
|
*/
|
|
- p->load_weight = 0;
|
|
+ load_weight = 0;
|
|
else
|
|
#endif
|
|
- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
|
|
+ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
|
|
} else
|
|
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
|
|
+ load_weight = task_timeslice(p);
|
|
+ /*
|
|
+ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but
|
|
+ * still need to be weighted to allow balancing to occur.
|
|
+ */
|
|
+ if (likely(!idleprio_task(p)))
|
|
+ load_weight *= PRIO_RANGE;
|
|
+ p->load_weight = load_weight;
|
|
}
|
|
|
|
static inline void
|
|
@@ -841,28 +1041,38 @@
|
|
}
|
|
|
|
/*
|
|
- * Calculate the expected normal priority: i.e. priority
|
|
- * without taking RT-inheritance into account. Might be
|
|
- * boosted by interactivity modifiers. Changes upon fork,
|
|
- * setprio syscalls, and whenever the interactivity
|
|
- * estimator recalculates.
|
|
+ * __activate_task - move a task to the runqueue.
|
|
*/
|
|
-static inline int normal_prio(struct task_struct *p)
|
|
+static inline void __activate_task(struct task_struct *p, struct rq *rq)
|
|
{
|
|
- int prio;
|
|
+ enqueue_task(p, rq);
|
|
+ inc_nr_running(p, rq);
|
|
+}
|
|
|
|
+/*
|
|
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
|
|
+ */
|
|
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ enqueue_task_head(p, rq);
|
|
+ inc_nr_running(p, rq);
|
|
+}
|
|
+
|
|
+static inline int normal_prio(struct task_struct *p)
|
|
+{
|
|
if (has_rt_policy(p))
|
|
- prio = MAX_RT_PRIO-1 - p->rt_priority;
|
|
+ return MAX_RT_PRIO-1 - p->rt_priority;
|
|
+ /* Other tasks all have normal_prio set in recalc_task_prio */
|
|
+ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO))
|
|
+ return p->prio;
|
|
else
|
|
- prio = __normal_prio(p);
|
|
- return prio;
|
|
+ return p->static_prio;
|
|
}
|
|
|
|
/*
|
|
* Calculate the current priority, i.e. the priority
|
|
* taken into account by the scheduler. This value might
|
|
- * be boosted by RT tasks, or might be boosted by
|
|
- * interactivity modifiers. Will be RT if the task got
|
|
+ * be boosted by RT tasks as it will be RT if the task got
|
|
* RT-boosted. If not then it returns p->normal_prio.
|
|
*/
|
|
static int effective_prio(struct task_struct *p)
|
|
@@ -878,112 +1088,70 @@
|
|
return p->prio;
|
|
}
|
|
|
|
-/*
|
|
- * __activate_task - move a task to the runqueue.
|
|
- */
|
|
-static void __activate_task(struct task_struct *p, struct rq *rq)
|
|
+static inline unsigned int nice_quota_ms(int nice)
|
|
{
|
|
- struct prio_array *target = rq->active;
|
|
+ unsigned int rr = rr_interval;
|
|
|
|
- if (batch_task(p))
|
|
- target = rq->expired;
|
|
- enqueue_task(p, target);
|
|
- inc_nr_running(p, rq);
|
|
+ if (nice < -6) {
|
|
+ rr *= nice * nice;
|
|
+ rr /= 40;
|
|
+ } else if (nice > 0)
|
|
+ rr = rr / 2 ? : 1;
|
|
+ return rr;
|
|
}
|
|
|
|
+#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE)
|
|
+
|
|
/*
|
|
- * __activate_idle_task - move idle task to the _front_ of runqueue.
|
|
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
|
|
+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
|
|
+ * task of nice 0 or enough lower priority tasks to bring up the
|
|
+ * weighted_cpuload
|
|
*/
|
|
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
|
|
+int above_background_load(void)
|
|
{
|
|
- enqueue_task_head(p, rq->active);
|
|
- inc_nr_running(p, rq);
|
|
+ unsigned long cpu;
|
|
+
|
|
+ for_each_online_cpu(cpu) {
|
|
+ if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT)
|
|
+ return 1;
|
|
+ }
|
|
+ return 0;
|
|
}
|
|
|
|
/*
|
|
- * Recalculate p->normal_prio and p->prio after having slept,
|
|
- * updating the sleep-average too:
|
|
+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval.
|
|
+ * From nice 1 to 19 they are smaller than it only if they are at least one
|
|
+ * tick still. Below nice 0 they get progressively larger.
|
|
+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
|
|
+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
|
|
+ * Value returned is in microseconds.
|
|
*/
|
|
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
|
|
+static inline unsigned int rr_quota(struct task_struct *p)
|
|
{
|
|
- /* Caller must always ensure 'now >= p->timestamp' */
|
|
- unsigned long sleep_time = now - p->timestamp;
|
|
+ unsigned int quota;
|
|
|
|
- if (batch_task(p))
|
|
- sleep_time = 0;
|
|
-
|
|
- if (likely(sleep_time > 0)) {
|
|
- /*
|
|
- * This ceiling is set to the lowest priority that would allow
|
|
- * a task to be reinserted into the active array on timeslice
|
|
- * completion.
|
|
- */
|
|
- unsigned long ceiling = INTERACTIVE_SLEEP(p);
|
|
-
|
|
- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
|
|
- /*
|
|
- * Prevents user tasks from achieving best priority
|
|
- * with one single large enough sleep.
|
|
- */
|
|
- p->sleep_avg = ceiling;
|
|
- /*
|
|
- * Using INTERACTIVE_SLEEP() as a ceiling places a
|
|
- * nice(0) task 1ms sleep away from promotion, and
|
|
- * gives it 700ms to round-robin with no chance of
|
|
- * being demoted. This is more than generous, so
|
|
- * mark this sleep as non-interactive to prevent the
|
|
- * on-runqueue bonus logic from intervening should
|
|
- * this task not receive cpu immediately.
|
|
- */
|
|
- p->sleep_type = SLEEP_NONINTERACTIVE;
|
|
- } else {
|
|
- /*
|
|
- * Tasks waking from uninterruptible sleep are
|
|
- * limited in their sleep_avg rise as they
|
|
- * are likely to be waiting on I/O
|
|
- */
|
|
- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
|
|
- if (p->sleep_avg >= ceiling)
|
|
- sleep_time = 0;
|
|
- else if (p->sleep_avg + sleep_time >=
|
|
- ceiling) {
|
|
- p->sleep_avg = ceiling;
|
|
- sleep_time = 0;
|
|
- }
|
|
- }
|
|
-
|
|
- /*
|
|
- * This code gives a bonus to interactive tasks.
|
|
- *
|
|
- * The boost works by updating the 'average sleep time'
|
|
- * value here, based on ->timestamp. The more time a
|
|
- * task spends sleeping, the higher the average gets -
|
|
- * and the higher the priority boost gets as well.
|
|
- */
|
|
- p->sleep_avg += sleep_time;
|
|
-
|
|
- }
|
|
- if (p->sleep_avg > NS_MAX_SLEEP_AVG)
|
|
- p->sleep_avg = NS_MAX_SLEEP_AVG;
|
|
- }
|
|
+ if (rt_task(p))
|
|
+ quota = rr_interval;
|
|
+ else
|
|
+ quota = nice_quota_ms(TASK_NICE(p));
|
|
+ return MS_TO_US(quota);
|
|
+}
|
|
|
|
- return effective_prio(p);
|
|
+/* Every time we set the quota we need to set the load weight */
|
|
+static void set_quota(struct task_struct *p)
|
|
+{
|
|
+ p->quota = rr_quota(p);
|
|
+ set_load_weight(p);
|
|
}
|
|
|
|
/*
|
|
* activate_task - move a task to the runqueue and do priority recalculation
|
|
- *
|
|
- * Update all the scheduling statistics stuff. (sleep average
|
|
- * calculation, priority modifiers, etc.)
|
|
*/
|
|
static void activate_task(struct task_struct *p, struct rq *rq, int local)
|
|
{
|
|
- unsigned long long now;
|
|
-
|
|
- if (rt_task(p))
|
|
- goto out;
|
|
+ unsigned long long now = sched_clock();
|
|
|
|
- now = sched_clock();
|
|
#ifdef CONFIG_SMP
|
|
if (!local) {
|
|
/* Compensate for drifting sched_clock */
|
|
@@ -1004,32 +1172,9 @@
|
|
(now - p->timestamp) >> 20);
|
|
}
|
|
|
|
- p->prio = recalc_task_prio(p, now);
|
|
-
|
|
- /*
|
|
- * This checks to make sure it's not an uninterruptible task
|
|
- * that is now waking up.
|
|
- */
|
|
- if (p->sleep_type == SLEEP_NORMAL) {
|
|
- /*
|
|
- * Tasks which were woken up by interrupts (ie. hw events)
|
|
- * are most likely of interactive nature. So we give them
|
|
- * the credit of extending their sleep time to the period
|
|
- * of time they spend on the runqueue, waiting for execution
|
|
- * on a CPU, first time around:
|
|
- */
|
|
- if (in_interrupt())
|
|
- p->sleep_type = SLEEP_INTERRUPTED;
|
|
- else {
|
|
- /*
|
|
- * Normal first-time wakeups get a credit too for
|
|
- * on-runqueue time, but it will be weighted down:
|
|
- */
|
|
- p->sleep_type = SLEEP_INTERACTIVE;
|
|
- }
|
|
- }
|
|
+ set_quota(p);
|
|
+ p->prio = effective_prio(p);
|
|
p->timestamp = now;
|
|
-out:
|
|
__activate_task(p, rq);
|
|
}
|
|
|
|
@@ -1039,8 +1184,7 @@
|
|
static void deactivate_task(struct task_struct *p, struct rq *rq)
|
|
{
|
|
dec_nr_running(p, rq);
|
|
- dequeue_task(p, p->array);
|
|
- p->array = NULL;
|
|
+ dequeue_task(p, rq);
|
|
}
|
|
|
|
/*
|
|
@@ -1133,7 +1277,7 @@
|
|
* If the task is not on a runqueue (and not running), then
|
|
* it is sufficient to simply update the task's cpu field.
|
|
*/
|
|
- if (!p->array && !task_running(rq, p)) {
|
|
+ if (!task_queued(p) && !task_running(rq, p)) {
|
|
set_task_cpu(p, dest_cpu);
|
|
return 0;
|
|
}
|
|
@@ -1159,7 +1303,6 @@
|
|
{
|
|
unsigned long flags;
|
|
struct rq *rq;
|
|
- struct prio_array *array;
|
|
int running;
|
|
|
|
repeat:
|
|
@@ -1192,7 +1335,6 @@
|
|
*/
|
|
rq = task_rq_lock(p, &flags);
|
|
running = task_running(rq, p);
|
|
- array = p->array;
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
/*
|
|
@@ -1215,7 +1357,7 @@
|
|
* running right now), it's preempted, and we should
|
|
* yield - it could be a while.
|
|
*/
|
|
- if (unlikely(array)) {
|
|
+ if (unlikely(task_queued(p))) {
|
|
yield();
|
|
goto repeat;
|
|
}
|
|
@@ -1294,6 +1436,25 @@
|
|
}
|
|
|
|
/*
|
|
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
|
|
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
|
|
+ */
|
|
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
|
|
+{
|
|
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Each time a sched group cpu_power is changed,
|
|
+ * we must compute its reciprocal value
|
|
+ */
|
|
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
|
|
+{
|
|
+ sg->__cpu_power += val;
|
|
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
|
|
+}
|
|
+
|
|
+/*
|
|
* find_idlest_group finds and returns the least busy CPU group within the
|
|
* domain.
|
|
*/
|
|
@@ -1490,6 +1651,31 @@
|
|
}
|
|
#endif
|
|
|
|
+/*
|
|
+ * We need to have a special definition for an idle runqueue when testing
|
|
+ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as
|
|
+ * a realtime task in sched_idle_next.
|
|
+ */
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
+#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr))
|
|
+#else
|
|
+#define rq_idle(rq) ((rq)->curr == (rq)->idle)
|
|
+#endif
|
|
+
|
|
+static inline int task_preempts_curr(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ struct task_struct *curr = rq->curr;
|
|
+
|
|
+ return ((p->array == task_rq(p)->active &&
|
|
+ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq));
|
|
+}
|
|
+
|
|
+static inline void try_preempt(struct task_struct *p, struct rq *rq)
|
|
+{
|
|
+ if (task_preempts_curr(p, rq))
|
|
+ resched_task(rq->curr);
|
|
+}
|
|
+
|
|
/***
|
|
* try_to_wake_up - wake up a thread
|
|
* @p: the to-be-woken-up thread
|
|
@@ -1521,7 +1707,7 @@
|
|
if (!(old_state & state))
|
|
goto out;
|
|
|
|
- if (p->array)
|
|
+ if (task_queued(p))
|
|
goto out_running;
|
|
|
|
cpu = task_cpu(p);
|
|
@@ -1614,7 +1800,7 @@
|
|
old_state = p->state;
|
|
if (!(old_state & state))
|
|
goto out;
|
|
- if (p->array)
|
|
+ if (task_queued(p))
|
|
goto out_running;
|
|
|
|
this_cpu = smp_processor_id();
|
|
@@ -1623,25 +1809,9 @@
|
|
|
|
out_activate:
|
|
#endif /* CONFIG_SMP */
|
|
- if (old_state == TASK_UNINTERRUPTIBLE) {
|
|
+ if (old_state == TASK_UNINTERRUPTIBLE)
|
|
rq->nr_uninterruptible--;
|
|
- /*
|
|
- * Tasks on involuntary sleep don't earn
|
|
- * sleep_avg beyond just interactive state.
|
|
- */
|
|
- p->sleep_type = SLEEP_NONINTERACTIVE;
|
|
- } else
|
|
-
|
|
- /*
|
|
- * Tasks that have marked their sleep as noninteractive get
|
|
- * woken up with their sleep average not weighted in an
|
|
- * interactive way.
|
|
- */
|
|
- if (old_state & TASK_NONINTERACTIVE)
|
|
- p->sleep_type = SLEEP_NONINTERACTIVE;
|
|
-
|
|
|
|
- activate_task(p, rq, cpu == this_cpu);
|
|
/*
|
|
* Sync wakeups (i.e. those types of wakeups where the waker
|
|
* has indicated that it will leave the CPU in short order)
|
|
@@ -1650,15 +1820,22 @@
|
|
* the waker guarantees that the freshly woken up task is going
|
|
* to be considered on this CPU.)
|
|
*/
|
|
- if (!sync || cpu != this_cpu) {
|
|
- if (TASK_PREEMPTS_CURR(p, rq))
|
|
- resched_task(rq->curr);
|
|
- }
|
|
+ activate_task(p, rq, cpu == this_cpu);
|
|
+ if (!sync || cpu != this_cpu)
|
|
+ try_preempt(p, rq);
|
|
success = 1;
|
|
|
|
out_running:
|
|
p->state = TASK_RUNNING;
|
|
out:
|
|
+ /*
|
|
+ * Special case when freezing we need to reschedule idleprio tasks
|
|
+ * as SCHED_NORMAL or else they'll never freeze
|
|
+ */
|
|
+ if (idleprio_task(p) && freezing(p) && idleprio(p)) {
|
|
+ dequeue_task(p, rq);
|
|
+ enqueue_task(p, rq);
|
|
+ }
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
return success;
|
|
@@ -1676,7 +1853,6 @@
|
|
return try_to_wake_up(p, state, 0);
|
|
}
|
|
|
|
-static void task_running_tick(struct rq *rq, struct task_struct *p);
|
|
/*
|
|
* Perform scheduler related setup for a newly forked process p.
|
|
* p is forked by current.
|
|
@@ -1704,7 +1880,6 @@
|
|
p->prio = current->normal_prio;
|
|
|
|
INIT_LIST_HEAD(&p->run_list);
|
|
- p->array = NULL;
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
|
if (unlikely(sched_info_on()))
|
|
memset(&p->sched_info, 0, sizeof(p->sched_info));
|
|
@@ -1716,30 +1891,31 @@
|
|
/* Want to start with kernel preemption disabled. */
|
|
task_thread_info(p)->preempt_count = 1;
|
|
#endif
|
|
+ if (unlikely(p->policy == SCHED_FIFO))
|
|
+ goto out;
|
|
/*
|
|
* Share the timeslice between parent and child, thus the
|
|
* total amount of pending timeslices in the system doesn't change,
|
|
* resulting in more scheduling fairness.
|
|
*/
|
|
local_irq_disable();
|
|
- p->time_slice = (current->time_slice + 1) >> 1;
|
|
- /*
|
|
- * The remainder of the first timeslice might be recovered by
|
|
- * the parent if the child exits early enough.
|
|
- */
|
|
- p->first_time_slice = 1;
|
|
- current->time_slice >>= 1;
|
|
- p->timestamp = sched_clock();
|
|
- if (unlikely(!current->time_slice)) {
|
|
+ if (current->time_slice > 0) {
|
|
+ current->time_slice /= 2;
|
|
+ if (current->time_slice)
|
|
+ p->time_slice = current->time_slice;
|
|
+ else
|
|
+ p->time_slice = 1;
|
|
/*
|
|
- * This case is rare, it happens when the parent has only
|
|
- * a single jiffy left from its timeslice. Taking the
|
|
- * runqueue lock is not a problem.
|
|
+ * The remainder of the first timeslice might be recovered by
|
|
+ * the parent if the child exits early enough.
|
|
*/
|
|
- current->time_slice = 1;
|
|
- task_running_tick(cpu_rq(cpu), current);
|
|
- }
|
|
+ p->first_time_slice = 1;
|
|
+ } else
|
|
+ p->time_slice = 0;
|
|
+
|
|
+ p->timestamp = sched_clock();
|
|
local_irq_enable();
|
|
+out:
|
|
put_cpu();
|
|
}
|
|
|
|
@@ -1761,38 +1937,16 @@
|
|
this_cpu = smp_processor_id();
|
|
cpu = task_cpu(p);
|
|
|
|
- /*
|
|
- * We decrease the sleep average of forking parents
|
|
- * and children as well, to keep max-interactive tasks
|
|
- * from forking tasks that are max-interactive. The parent
|
|
- * (current) is done further down, under its lock.
|
|
- */
|
|
- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
|
|
- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
|
|
-
|
|
- p->prio = effective_prio(p);
|
|
-
|
|
if (likely(cpu == this_cpu)) {
|
|
+ activate_task(p, rq, 1);
|
|
if (!(clone_flags & CLONE_VM)) {
|
|
/*
|
|
* The VM isn't cloned, so we're in a good position to
|
|
* do child-runs-first in anticipation of an exec. This
|
|
* usually avoids a lot of COW overhead.
|
|
*/
|
|
- if (unlikely(!current->array))
|
|
- __activate_task(p, rq);
|
|
- else {
|
|
- p->prio = current->prio;
|
|
- p->normal_prio = current->normal_prio;
|
|
- list_add_tail(&p->run_list, ¤t->run_list);
|
|
- p->array = current->array;
|
|
- p->array->nr_active++;
|
|
- inc_nr_running(p, rq);
|
|
- }
|
|
set_need_resched();
|
|
- } else
|
|
- /* Run child last */
|
|
- __activate_task(p, rq);
|
|
+ }
|
|
/*
|
|
* We skip the following code due to cpu == this_cpu
|
|
*
|
|
@@ -1809,19 +1963,16 @@
|
|
*/
|
|
p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
|
|
+ rq->most_recent_timestamp;
|
|
- __activate_task(p, rq);
|
|
- if (TASK_PREEMPTS_CURR(p, rq))
|
|
- resched_task(rq->curr);
|
|
+ activate_task(p, rq, 0);
|
|
+ try_preempt(p, rq);
|
|
|
|
/*
|
|
* Parent and child are on different CPUs, now get the
|
|
- * parent runqueue to update the parent's ->sleep_avg:
|
|
+ * parent runqueue to update the parent's ->flags:
|
|
*/
|
|
task_rq_unlock(rq, &flags);
|
|
this_rq = task_rq_lock(current, &flags);
|
|
}
|
|
- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
|
|
- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
|
|
task_rq_unlock(this_rq, &flags);
|
|
}
|
|
|
|
@@ -1836,23 +1987,17 @@
|
|
*/
|
|
void fastcall sched_exit(struct task_struct *p)
|
|
{
|
|
+ struct task_struct *parent;
|
|
unsigned long flags;
|
|
struct rq *rq;
|
|
|
|
- /*
|
|
- * If the child was a (relative-) CPU hog then decrease
|
|
- * the sleep_avg of the parent as well.
|
|
- */
|
|
- rq = task_rq_lock(p->parent, &flags);
|
|
- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
|
|
- p->parent->time_slice += p->time_slice;
|
|
- if (unlikely(p->parent->time_slice > task_timeslice(p)))
|
|
- p->parent->time_slice = task_timeslice(p);
|
|
- }
|
|
- if (p->sleep_avg < p->parent->sleep_avg)
|
|
- p->parent->sleep_avg = p->parent->sleep_avg /
|
|
- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
|
|
- (EXIT_WEIGHT + 1);
|
|
+ parent = p->parent;
|
|
+ rq = task_rq_lock(parent, &flags);
|
|
+ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) {
|
|
+ parent->time_slice += p->time_slice;
|
|
+ if (unlikely(parent->time_slice > parent->quota))
|
|
+ parent->time_slice = parent->quota;
|
|
+ }
|
|
task_rq_unlock(rq, &flags);
|
|
}
|
|
|
|
@@ -2184,23 +2329,17 @@
|
|
* pull_task - move a task from a remote runqueue to the local runqueue.
|
|
* Both runqueues must be locked.
|
|
*/
|
|
-static void pull_task(struct rq *src_rq, struct prio_array *src_array,
|
|
- struct task_struct *p, struct rq *this_rq,
|
|
- struct prio_array *this_array, int this_cpu)
|
|
+static void pull_task(struct rq *src_rq, struct task_struct *p,
|
|
+ struct rq *this_rq, int this_cpu)
|
|
{
|
|
- dequeue_task(p, src_array);
|
|
+ dequeue_task(p, src_rq);
|
|
dec_nr_running(p, src_rq);
|
|
set_task_cpu(p, this_cpu);
|
|
inc_nr_running(p, this_rq);
|
|
- enqueue_task(p, this_array);
|
|
+ enqueue_task(p, this_rq);
|
|
p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
|
|
+ this_rq->most_recent_timestamp;
|
|
- /*
|
|
- * Note that idle threads have a prio of MAX_PRIO, for this test
|
|
- * to be always true for them.
|
|
- */
|
|
- if (TASK_PREEMPTS_CURR(p, this_rq))
|
|
- resched_task(this_rq->curr);
|
|
+ try_preempt(p, this_rq);
|
|
}
|
|
|
|
/*
|
|
@@ -2243,7 +2382,16 @@
|
|
return 1;
|
|
}
|
|
|
|
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
|
|
+static inline int rq_best_prio(struct rq *rq)
|
|
+{
|
|
+ int best_prio, exp_prio;
|
|
+
|
|
+ best_prio = sched_find_first_bit(rq->dyn_bitmap);
|
|
+ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO);
|
|
+ if (unlikely(best_prio > exp_prio))
|
|
+ best_prio = exp_prio;
|
|
+ return best_prio;
|
|
+}
|
|
|
|
/*
|
|
* move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
|
|
@@ -2259,7 +2407,7 @@
|
|
{
|
|
int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
|
|
best_prio_seen, skip_for_load;
|
|
- struct prio_array *array, *dst_array;
|
|
+ struct prio_array *array;
|
|
struct list_head *head, *curr;
|
|
struct task_struct *tmp;
|
|
long rem_load_move;
|
|
@@ -2286,31 +2434,29 @@
|
|
* be cache-cold, thus switching CPUs has the least effect
|
|
* on them.
|
|
*/
|
|
- if (busiest->expired->nr_active) {
|
|
- array = busiest->expired;
|
|
- dst_array = this_rq->expired;
|
|
- } else {
|
|
- array = busiest->active;
|
|
- dst_array = this_rq->active;
|
|
- }
|
|
-
|
|
+ array = busiest->expired;
|
|
new_array:
|
|
- /* Start searching at priority 0: */
|
|
- idx = 0;
|
|
+ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */
|
|
+ if (array == busiest->expired)
|
|
+ idx = MAX_RT_PRIO;
|
|
+ else
|
|
+ idx = 0;
|
|
skip_bitmap:
|
|
if (!idx)
|
|
- idx = sched_find_first_bit(array->bitmap);
|
|
+ idx = sched_find_first_bit(array->prio_bitmap);
|
|
else
|
|
- idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
|
|
- if (idx >= MAX_PRIO) {
|
|
- if (array == busiest->expired && busiest->active->nr_active) {
|
|
+ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx);
|
|
+ if (idx == MAX_PRIO) {
|
|
+ if (array == busiest->idleprio && busiest->nr_idleprio)
|
|
+ goto found_idleprio;
|
|
+ if (array == busiest->expired) {
|
|
array = busiest->active;
|
|
- dst_array = this_rq->active;
|
|
goto new_array;
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
+found_idleprio:
|
|
head = array->queue + idx;
|
|
curr = head->prev;
|
|
skip_queue:
|
|
@@ -2332,11 +2478,22 @@
|
|
best_prio_seen |= idx == best_prio;
|
|
if (curr != head)
|
|
goto skip_queue;
|
|
+ if (idx == MAX_PRIO) {
|
|
+ /*
|
|
+ * Occurs either when balancing idleprio tasks or
|
|
+ * there really are no more tasks to find.
|
|
+ */
|
|
+ if (array == busiest->expired) {
|
|
+ array = busiest->active;
|
|
+ goto new_array;
|
|
+ }
|
|
+ goto out;
|
|
+ }
|
|
idx++;
|
|
goto skip_bitmap;
|
|
}
|
|
|
|
- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
|
|
+ pull_task(busiest, tmp, this_rq, this_cpu);
|
|
pulled++;
|
|
rem_load_move -= tmp->load_weight;
|
|
|
|
@@ -2349,6 +2506,13 @@
|
|
this_best_prio = idx;
|
|
if (curr != head)
|
|
goto skip_queue;
|
|
+ if (idx == MAX_PRIO) {
|
|
+ if (array == busiest->expired) {
|
|
+ array = busiest->active;
|
|
+ goto new_array;
|
|
+ }
|
|
+ goto out;
|
|
+ }
|
|
idx++;
|
|
goto skip_bitmap;
|
|
}
|
|
@@ -3297,11 +3461,36 @@
|
|
/*
|
|
* This is called on clock ticks and on context switches.
|
|
* Bank in p->sched_time the ns elapsed since the last tick or switch.
|
|
+ * CPU scheduler quota accounting is also performed here in microseconds.
|
|
+ * The value returned from sched_clock() occasionally gives bogus values so
|
|
+ * some sanity checking is required.
|
|
*/
|
|
-static inline void
|
|
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
|
|
+static void
|
|
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
|
|
+ int tick)
|
|
{
|
|
- p->sched_time += now - p->last_ran;
|
|
+ long time_diff = now - p->last_ran;
|
|
+
|
|
+ if (tick) {
|
|
+ /*
|
|
+ * Called from scheduler_tick() there should be less than two
|
|
+ * jiffies worth, and not negative/overflow.
|
|
+ */
|
|
+ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0)
|
|
+ time_diff = JIFFIES_TO_NS(1);
|
|
+ } else {
|
|
+ /*
|
|
+ * Called from context_switch there should be less than one
|
|
+ * jiffy worth, and not negative/overflow. There should be
|
|
+ * some time banked here so use a nominal 1us.
|
|
+ */
|
|
+ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
|
|
+ time_diff = 1000;
|
|
+ }
|
|
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
|
|
+ if (p != rq->idle && p->policy != SCHED_FIFO)
|
|
+ p->time_slice -= time_diff / 1000;
|
|
+ p->sched_time += time_diff;
|
|
p->last_ran = rq->most_recent_timestamp = now;
|
|
}
|
|
|
|
@@ -3322,27 +3511,6 @@
|
|
}
|
|
|
|
/*
|
|
- * We place interactive tasks back into the active array, if possible.
|
|
- *
|
|
- * To guarantee that this does not starve expired tasks we ignore the
|
|
- * interactivity of a task if the first expired task had to wait more
|
|
- * than a 'reasonable' amount of time. This deadline timeout is
|
|
- * load-dependent, as the frequency of array switched decreases with
|
|
- * increasing number of running tasks. We also ignore the interactivity
|
|
- * if a better static_prio task has expired:
|
|
- */
|
|
-static inline int expired_starving(struct rq *rq)
|
|
-{
|
|
- if (rq->curr->static_prio > rq->best_expired_prio)
|
|
- return 1;
|
|
- if (!STARVATION_LIMIT || !rq->expired_timestamp)
|
|
- return 0;
|
|
- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
|
|
- return 1;
|
|
- return 0;
|
|
-}
|
|
-
|
|
-/*
|
|
* Account user cpu time to a process.
|
|
* @p: the process that the cpu time gets accounted to
|
|
* @hardirq_offset: the offset to subtract from hardirq_count()
|
|
@@ -3357,7 +3525,7 @@
|
|
|
|
/* Add user time to cpustat. */
|
|
tmp = cputime_to_cputime64(cputime);
|
|
- if (TASK_NICE(p) > 0)
|
|
+ if (TASK_NICE(p) > 0 || idleprio_task(p))
|
|
cpustat->nice = cputime64_add(cpustat->nice, tmp);
|
|
else
|
|
cpustat->user = cputime64_add(cpustat->user, tmp);
|
|
@@ -3415,87 +3583,94 @@
|
|
cpustat->steal = cputime64_add(cpustat->steal, tmp);
|
|
}
|
|
|
|
-static void task_running_tick(struct rq *rq, struct task_struct *p)
|
|
+/*
|
|
+ * The task has used up its quota of running in this prio_level so it must be
|
|
+ * dropped a priority level, all managed by recalc_task_prio().
|
|
+ */
|
|
+static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
|
|
{
|
|
- if (p->array != rq->active) {
|
|
- /* Task has expired but was not scheduled yet */
|
|
- set_tsk_need_resched(p);
|
|
+ int overrun;
|
|
+
|
|
+ reset_first_time_slice(p);
|
|
+ if (rt_task(p)) {
|
|
+ p->time_slice += p->quota;
|
|
+ list_move_tail(&p->run_list, p->array->queue + p->prio);
|
|
return;
|
|
}
|
|
- spin_lock(&rq->lock);
|
|
+ overrun = p->time_slice;
|
|
+ dequeue_task(p, rq);
|
|
+ enqueue_task(p, rq);
|
|
/*
|
|
- * The task was running during this tick - update the
|
|
- * time slice counter. Note: we do not update a thread's
|
|
- * priority until it either goes to sleep or uses up its
|
|
- * timeslice. This makes it possible for interactive tasks
|
|
- * to use up their timeslices at their highest priority levels.
|
|
+ * Subtract any extra time this task ran over its time_slice; ie
|
|
+ * overrun will either be 0 or negative.
|
|
*/
|
|
- if (rt_task(p)) {
|
|
- /*
|
|
- * RR tasks need a special form of timeslice management.
|
|
- * FIFO tasks have no timeslices.
|
|
- */
|
|
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
|
|
- p->time_slice = task_timeslice(p);
|
|
- p->first_time_slice = 0;
|
|
- set_tsk_need_resched(p);
|
|
+ p->time_slice += overrun;
|
|
+}
|
|
|
|
- /* put it at the end of the queue: */
|
|
- requeue_task(p, rq->active);
|
|
- }
|
|
- goto out_unlock;
|
|
+/*
|
|
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
|
|
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
|
|
+ * for unsetting the flag.
|
|
+ */
|
|
+static unsigned int test_ret_isorefractory(struct rq *rq)
|
|
+{
|
|
+ if (likely(!rq->iso_refractory)) {
|
|
+ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
|
|
+ rq->iso_refractory = 1;
|
|
+ } else {
|
|
+ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
|
|
+ rq->iso_refractory = 0;
|
|
}
|
|
- if (!--p->time_slice) {
|
|
- dequeue_task(p, rq->active);
|
|
- set_tsk_need_resched(p);
|
|
- p->prio = effective_prio(p);
|
|
- p->time_slice = task_timeslice(p);
|
|
- p->first_time_slice = 0;
|
|
+ return rq->iso_refractory;
|
|
+}
|
|
|
|
- if (!rq->expired_timestamp)
|
|
- rq->expired_timestamp = jiffies;
|
|
- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
|
|
- enqueue_task(p, rq->expired);
|
|
- if (p->static_prio < rq->best_expired_prio)
|
|
- rq->best_expired_prio = p->static_prio;
|
|
- } else
|
|
- enqueue_task(p, rq->active);
|
|
- } else {
|
|
- /*
|
|
- * Prevent a too long timeslice allowing a task to monopolize
|
|
- * the CPU. We do this by splitting up the timeslice into
|
|
- * smaller pieces.
|
|
- *
|
|
- * Note: this does not mean the task's timeslices expire or
|
|
- * get lost in any way, they just might be preempted by
|
|
- * another task of equal priority. (one with higher
|
|
- * priority would have preempted this task already.) We
|
|
- * requeue this task to the end of the list on this priority
|
|
- * level, which is in essence a round-robin of tasks with
|
|
- * equal priority.
|
|
- *
|
|
- * This only applies to tasks in the interactive
|
|
- * delta range with at least TIMESLICE_GRANULARITY to requeue.
|
|
- */
|
|
- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
|
|
- p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
|
|
- (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
|
|
- (p->array == rq->active)) {
|
|
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
|
|
+static inline void no_iso_tick(struct rq *rq)
|
|
+{
|
|
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
|
|
+}
|
|
|
|
- requeue_task(p, rq->active);
|
|
- set_tsk_need_resched(p);
|
|
- }
|
|
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
|
|
+static void task_running_tick(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ /*
|
|
+ * If a SCHED_ISO task is running we increment the iso_ticks. In
|
|
+ * order to prevent SCHED_ISO tasks from causing starvation in the
|
|
+ * presence of true RT tasks we account those as iso_ticks as well.
|
|
+ */
|
|
+ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) {
|
|
+ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
|
|
+ rq->iso_ticks += 100;
|
|
+ } else
|
|
+ no_iso_tick(rq);
|
|
+
|
|
+ if (iso_task(p)) {
|
|
+ if (unlikely(test_ret_isorefractory(rq))) {
|
|
+ if (isoprio_suitable(p)) {
|
|
+ /*
|
|
+ * SCHED_ISO task is running as RT and limit
|
|
+ * has been hit. Set the PF_ISOREF flag and
|
|
+ * force it to reschedule as SCHED_NORMAL
|
|
+ * by zeroing its time_slice
|
|
+ */
|
|
+ p->flags |= PF_ISOREF;
|
|
+ p->time_slice = 0;
|
|
+ }
|
|
+ } else
|
|
+ p->flags &= ~PF_ISOREF;
|
|
}
|
|
-out_unlock:
|
|
- spin_unlock(&rq->lock);
|
|
+ /* SCHED_FIFO tasks never run out of timeslice. */
|
|
+ if (p->time_slice > 0 || p->policy == SCHED_FIFO)
|
|
+ return;
|
|
+ /* p->time_slice <= 0 */
|
|
+ set_tsk_need_resched(p);
|
|
+ if (likely(task_queued(p)))
|
|
+ task_expired_entitlement(rq, p);
|
|
}
|
|
|
|
/*
|
|
* This function gets called by the timer code, with HZ frequency.
|
|
* We call it with interrupts disabled.
|
|
- *
|
|
- * It also gets called by the fork code, when changing the parent's
|
|
- * timeslices.
|
|
*/
|
|
void scheduler_tick(void)
|
|
{
|
|
@@ -3505,10 +3680,14 @@
|
|
int idle_at_tick = idle_cpu(cpu);
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
- update_cpu_clock(p, rq, now);
|
|
+ update_cpu_clock(p, rq, now, 1);
|
|
|
|
+ spin_lock(&rq->lock);
|
|
if (!idle_at_tick)
|
|
task_running_tick(rq, p);
|
|
+ else
|
|
+ no_iso_tick(rq);
|
|
+ spin_unlock(&rq->lock);
|
|
#ifdef CONFIG_SMP
|
|
update_load(rq);
|
|
rq->idle_at_tick = idle_at_tick;
|
|
@@ -3554,10 +3733,80 @@
|
|
|
|
#endif
|
|
|
|
-static inline int interactive_sleep(enum sleep_type sleep_type)
|
|
+static void reset_prio_levels(struct rq *rq)
|
|
{
|
|
- return (sleep_type == SLEEP_INTERACTIVE ||
|
|
- sleep_type == SLEEP_INTERRUPTED);
|
|
+ rq->active->best_static_prio = MAX_PRIO - 1;
|
|
+ rq->expired->best_static_prio = MAX_PRIO - 1;
|
|
+ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the
|
|
+ * idleprio array if it isn't already active
|
|
+ */
|
|
+static struct task_struct *next_idleprio_task(struct rq *rq)
|
|
+{
|
|
+ struct prio_array *array = rq->active;
|
|
+ struct list_head *queue;
|
|
+
|
|
+ if (array != rq->idleprio) {
|
|
+ rq->active = rq->idleprio;
|
|
+ rq->expired = array;
|
|
+ array = rq->active;
|
|
+ rq->exp_bitmap = rq->expired->prio_bitmap;
|
|
+ rq->dyn_bitmap = rq->active->prio_bitmap;
|
|
+ }
|
|
+ rq->prio_rotation++;
|
|
+ reset_prio_levels(rq);
|
|
+ queue = array->queue + MAX_PRIO;
|
|
+ return list_entry(queue->next, struct task_struct, run_list);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * next_dynamic_task finds the next suitable dynamic task.
|
|
+ */
|
|
+static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
|
|
+{
|
|
+ struct prio_array *array = rq->active;
|
|
+ struct task_struct *next;
|
|
+ struct list_head *queue;
|
|
+ int nstatic;
|
|
+
|
|
+retry:
|
|
+ if (unlikely(rq->nr_running == rq->nr_idleprio))
|
|
+ return next_idleprio_task(rq);
|
|
+ if (idx >= MAX_PRIO) {
|
|
+ /* There are no more tasks in the active array. Swap arrays */
|
|
+ array = rq->expired;
|
|
+ rq->expired = rq->active;
|
|
+ rq->active = array;
|
|
+ rq->exp_bitmap = rq->expired->prio_bitmap;
|
|
+ rq->dyn_bitmap = rq->active->prio_bitmap;
|
|
+ rq->prio_rotation++;
|
|
+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
|
|
+ reset_prio_levels(rq);
|
|
+ }
|
|
+ queue = array->queue + idx;
|
|
+ next = list_entry(queue->next, struct task_struct, run_list);
|
|
+ if (unlikely(next->time_slice <= 0 && !(iso_task(next) &&
|
|
+ isoprio_suitable(next)))) {
|
|
+ /*
|
|
+ * Unlucky enough that this task ran out of time_slice
|
|
+ * before it hit a scheduler_tick so it should have its
|
|
+ * priority reassessed and choose another task (possibly
|
|
+ * the same one)
|
|
+ */
|
|
+ task_expired_entitlement(rq, next);
|
|
+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
|
|
+ goto retry;
|
|
+ }
|
|
+ next->rotation = rq->prio_rotation;
|
|
+ nstatic = next->static_prio;
|
|
+ if (nstatic < array->best_static_prio)
|
|
+ array->best_static_prio = nstatic;
|
|
+ if (idx > rq->prio_level[USER_PRIO(nstatic)])
|
|
+ rq->prio_level[USER_PRIO(nstatic)] = idx;
|
|
+ return next;
|
|
}
|
|
|
|
/*
|
|
@@ -3566,13 +3815,11 @@
|
|
asmlinkage void __sched schedule(void)
|
|
{
|
|
struct task_struct *prev, *next;
|
|
- struct prio_array *array;
|
|
struct list_head *queue;
|
|
unsigned long long now;
|
|
- unsigned long run_time;
|
|
- int cpu, idx, new_prio;
|
|
long *switch_count;
|
|
struct rq *rq;
|
|
+ int cpu, idx;
|
|
|
|
/*
|
|
* Test if we are atomic. Since do_exit() needs to call into
|
|
@@ -3608,18 +3855,6 @@
|
|
|
|
schedstat_inc(rq, sched_cnt);
|
|
now = sched_clock();
|
|
- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
|
|
- run_time = now - prev->timestamp;
|
|
- if (unlikely((long long)(now - prev->timestamp) < 0))
|
|
- run_time = 0;
|
|
- } else
|
|
- run_time = NS_MAX_SLEEP_AVG;
|
|
-
|
|
- /*
|
|
- * Tasks charged proportionately less run_time at high sleep_avg to
|
|
- * delay them losing their interactive status
|
|
- */
|
|
- run_time /= (CURRENT_BONUS(prev) ? : 1);
|
|
|
|
spin_lock_irq(&rq->lock);
|
|
|
|
@@ -3630,8 +3865,10 @@
|
|
unlikely(signal_pending(prev))))
|
|
prev->state = TASK_RUNNING;
|
|
else {
|
|
- if (prev->state == TASK_UNINTERRUPTIBLE)
|
|
+ if (prev->state == TASK_UNINTERRUPTIBLE) {
|
|
+ prev->flags |= PF_NONSLEEP;
|
|
rq->nr_uninterruptible++;
|
|
+ }
|
|
deactivate_task(prev, rq);
|
|
}
|
|
}
|
|
@@ -3641,59 +3878,29 @@
|
|
idle_balance(cpu, rq);
|
|
if (!rq->nr_running) {
|
|
next = rq->idle;
|
|
- rq->expired_timestamp = 0;
|
|
goto switch_tasks;
|
|
}
|
|
}
|
|
|
|
- array = rq->active;
|
|
- if (unlikely(!array->nr_active)) {
|
|
- /*
|
|
- * Switch the active and expired arrays.
|
|
- */
|
|
- schedstat_inc(rq, sched_switch);
|
|
- rq->active = rq->expired;
|
|
- rq->expired = array;
|
|
- array = rq->active;
|
|
- rq->expired_timestamp = 0;
|
|
- rq->best_expired_prio = MAX_PRIO;
|
|
- }
|
|
-
|
|
- idx = sched_find_first_bit(array->bitmap);
|
|
- queue = array->queue + idx;
|
|
- next = list_entry(queue->next, struct task_struct, run_list);
|
|
-
|
|
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
|
|
- unsigned long long delta = now - next->timestamp;
|
|
- if (unlikely((long long)(now - next->timestamp) < 0))
|
|
- delta = 0;
|
|
-
|
|
- if (next->sleep_type == SLEEP_INTERACTIVE)
|
|
- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
|
|
-
|
|
- array = next->array;
|
|
- new_prio = recalc_task_prio(next, next->timestamp + delta);
|
|
-
|
|
- if (unlikely(next->prio != new_prio)) {
|
|
- dequeue_task(next, array);
|
|
- next->prio = new_prio;
|
|
- enqueue_task(next, array);
|
|
- }
|
|
+ idx = sched_find_first_bit(rq->dyn_bitmap);
|
|
+ if (likely(idx > ISO_PRIO))
|
|
+ next = next_dynamic_task(rq, idx);
|
|
+ else {
|
|
+ queue = rq->active->queue + idx;
|
|
+ next = list_entry(queue->next, struct task_struct, run_list);
|
|
}
|
|
- next->sleep_type = SLEEP_NORMAL;
|
|
switch_tasks:
|
|
- if (next == rq->idle)
|
|
+ if (next == rq->idle) {
|
|
+ reset_prio_levels(rq);
|
|
+ rq->prio_rotation++;
|
|
schedstat_inc(rq, sched_goidle);
|
|
+ }
|
|
prefetch(next);
|
|
prefetch_stack(next);
|
|
clear_tsk_need_resched(prev);
|
|
rcu_qsctr_inc(task_cpu(prev));
|
|
|
|
- update_cpu_clock(prev, rq, now);
|
|
-
|
|
- prev->sleep_avg -= run_time;
|
|
- if ((long)prev->sleep_avg <= 0)
|
|
- prev->sleep_avg = 0;
|
|
+ update_cpu_clock(prev, rq, now, 0);
|
|
prev->timestamp = prev->last_ran = now;
|
|
|
|
sched_info_switch(prev, next);
|
|
@@ -4129,29 +4336,22 @@
|
|
*/
|
|
void rt_mutex_setprio(struct task_struct *p, int prio)
|
|
{
|
|
- struct prio_array *array;
|
|
unsigned long flags;
|
|
+ int queued, oldprio;
|
|
struct rq *rq;
|
|
- int oldprio;
|
|
|
|
BUG_ON(prio < 0 || prio > MAX_PRIO);
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
oldprio = p->prio;
|
|
- array = p->array;
|
|
- if (array)
|
|
- dequeue_task(p, array);
|
|
+ queued = task_queued(p);
|
|
+ if (queued)
|
|
+ dequeue_task(p, rq);
|
|
p->prio = prio;
|
|
|
|
- if (array) {
|
|
- /*
|
|
- * If changing to an RT priority then queue it
|
|
- * in the active array!
|
|
- */
|
|
- if (rt_task(p))
|
|
- array = rq->active;
|
|
- enqueue_task(p, array);
|
|
+ if (queued) {
|
|
+ enqueue_task(p, rq);
|
|
/*
|
|
* Reschedule if we are currently running on this runqueue and
|
|
* our priority decreased, or if we are not currently running on
|
|
@@ -4160,8 +4360,8 @@
|
|
if (task_running(rq, p)) {
|
|
if (p->prio > oldprio)
|
|
resched_task(rq->curr);
|
|
- } else if (TASK_PREEMPTS_CURR(p, rq))
|
|
- resched_task(rq->curr);
|
|
+ } else
|
|
+ try_preempt(p, rq);
|
|
}
|
|
task_rq_unlock(rq, &flags);
|
|
}
|
|
@@ -4170,8 +4370,7 @@
|
|
|
|
void set_user_nice(struct task_struct *p, long nice)
|
|
{
|
|
- struct prio_array *array;
|
|
- int old_prio, delta;
|
|
+ int queued, old_prio,delta;
|
|
unsigned long flags;
|
|
struct rq *rq;
|
|
|
|
@@ -4192,26 +4391,27 @@
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
|
goto out_unlock;
|
|
}
|
|
- array = p->array;
|
|
- if (array) {
|
|
- dequeue_task(p, array);
|
|
+ queued = task_queued(p);
|
|
+ if (queued) {
|
|
+ dequeue_task(p, rq);
|
|
dec_raw_weighted_load(rq, p);
|
|
}
|
|
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
|
- set_load_weight(p);
|
|
old_prio = p->prio;
|
|
p->prio = effective_prio(p);
|
|
+ set_quota(p);
|
|
delta = p->prio - old_prio;
|
|
|
|
- if (array) {
|
|
- enqueue_task(p, array);
|
|
+ if (queued) {
|
|
+ enqueue_task(p, rq);
|
|
inc_raw_weighted_load(rq, p);
|
|
/*
|
|
* If the task increased its priority or is running and
|
|
* lowered its priority, then reschedule its CPU:
|
|
*/
|
|
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
|
|
+ if (delta < 0 || ((delta > 0 || idleprio_task(p)) &&
|
|
+ task_running(rq, p)))
|
|
resched_task(rq->curr);
|
|
}
|
|
out_unlock:
|
|
@@ -4281,11 +4481,23 @@
|
|
*
|
|
* This is the priority value as seen by users in /proc.
|
|
* RT tasks are offset by -200. Normal tasks are centered
|
|
- * around 0, value goes from -16 to +15.
|
|
+ * around 1, value goes from 0 to +79. Values higher than
|
|
+ * 39 indicate task is on the expired array. This is done
|
|
+ * lockless and may rarely return an active instead of
|
|
+ * expired value.
|
|
*/
|
|
-int task_prio(const struct task_struct *p)
|
|
+int task_prio(struct task_struct *p)
|
|
{
|
|
- return p->prio - MAX_RT_PRIO;
|
|
+ int prio = p->prio - MAX_RT_PRIO;
|
|
+
|
|
+ if (task_queued(p)) {
|
|
+ struct rq *rq = task_rq(p);
|
|
+ struct prio_array *array = p->array;
|
|
+
|
|
+ if (rq && rq->expired == array)
|
|
+ prio += PRIO_RANGE;
|
|
+ }
|
|
+ return prio;
|
|
}
|
|
|
|
/**
|
|
@@ -4328,19 +4540,14 @@
|
|
/* Actually do priority change: must hold rq lock. */
|
|
static void __setscheduler(struct task_struct *p, int policy, int prio)
|
|
{
|
|
- BUG_ON(p->array);
|
|
+ BUG_ON(task_queued(p));
|
|
|
|
p->policy = policy;
|
|
p->rt_priority = prio;
|
|
p->normal_prio = normal_prio(p);
|
|
/* we are holding p->pi_lock already */
|
|
p->prio = rt_mutex_getprio(p);
|
|
- /*
|
|
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
|
|
- */
|
|
- if (policy == SCHED_BATCH)
|
|
- p->sleep_avg = 0;
|
|
- set_load_weight(p);
|
|
+ set_quota(p);
|
|
}
|
|
|
|
/**
|
|
@@ -4354,19 +4561,36 @@
|
|
int sched_setscheduler(struct task_struct *p, int policy,
|
|
struct sched_param *param)
|
|
{
|
|
- int retval, oldprio, oldpolicy = -1;
|
|
- struct prio_array *array;
|
|
+ struct sched_param zero_param = { .sched_priority = 0 };
|
|
+ int queued, retval, oldprio, oldpolicy = -1;
|
|
+ unsigned long rlim_rtprio = 0;
|
|
unsigned long flags;
|
|
struct rq *rq;
|
|
|
|
/* may grab non-irq protected spin_locks */
|
|
BUG_ON(in_interrupt());
|
|
+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
|
|
+ unsigned long lflags;
|
|
+
|
|
+ if (!lock_task_sighand(p, &lflags))
|
|
+ return -ESRCH;
|
|
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
|
|
+ unlock_task_sighand(p, &lflags);
|
|
+ if (rlim_rtprio)
|
|
+ goto recheck;
|
|
+ /*
|
|
+ * If the caller requested an RT policy without having the
|
|
+ * necessary rights, we downgrade the policy to SCHED_ISO.
|
|
+ * We also set the parameter to zero to pass the checks.
|
|
+ */
|
|
+ policy = SCHED_ISO;
|
|
+ param = &zero_param;
|
|
+ }
|
|
recheck:
|
|
/* double check policy once rq lock held */
|
|
if (policy < 0)
|
|
policy = oldpolicy = p->policy;
|
|
- else if (policy != SCHED_FIFO && policy != SCHED_RR &&
|
|
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
|
|
+ else if (!SCHED_RANGE(policy))
|
|
return -EINVAL;
|
|
/*
|
|
* Valid priorities for SCHED_FIFO and SCHED_RR are
|
|
@@ -4385,14 +4609,6 @@
|
|
*/
|
|
if (!capable(CAP_SYS_NICE)) {
|
|
if (is_rt_policy(policy)) {
|
|
- unsigned long rlim_rtprio;
|
|
- unsigned long flags;
|
|
-
|
|
- if (!lock_task_sighand(p, &flags))
|
|
- return -ESRCH;
|
|
- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
|
|
- unlock_task_sighand(p, &flags);
|
|
-
|
|
/* can't set/change the rt policy */
|
|
if (policy != p->policy && !rlim_rtprio)
|
|
return -EPERM;
|
|
@@ -4401,6 +4617,31 @@
|
|
if (param->sched_priority > p->rt_priority &&
|
|
param->sched_priority > rlim_rtprio)
|
|
return -EPERM;
|
|
+ } else {
|
|
+ switch (p->policy) {
|
|
+ /*
|
|
+ * Can only downgrade policies but not back to
|
|
+ * SCHED_NORMAL
|
|
+ */
|
|
+ case SCHED_ISO:
|
|
+ if (policy == SCHED_ISO)
|
|
+ goto out;
|
|
+ if (policy == SCHED_NORMAL)
|
|
+ return -EPERM;
|
|
+ break;
|
|
+ case SCHED_BATCH:
|
|
+ if (policy == SCHED_BATCH)
|
|
+ goto out;
|
|
+ if (policy != SCHED_IDLEPRIO)
|
|
+ return -EPERM;
|
|
+ break;
|
|
+ case SCHED_IDLEPRIO:
|
|
+ if (policy == SCHED_IDLEPRIO)
|
|
+ goto out;
|
|
+ return -EPERM;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
}
|
|
|
|
/* can't change other user's priorities */
|
|
@@ -4409,6 +4650,11 @@
|
|
return -EPERM;
|
|
}
|
|
|
|
+ if (!(p->mm) && policy == SCHED_IDLEPRIO) {
|
|
+ /* Don't allow kernel threads to be SCHED_IDLEPRIO. */
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
retval = security_task_setscheduler(p, policy, param);
|
|
if (retval)
|
|
return retval;
|
|
@@ -4429,12 +4675,12 @@
|
|
spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
goto recheck;
|
|
}
|
|
- array = p->array;
|
|
- if (array)
|
|
+ queued = task_queued(p);
|
|
+ if (queued)
|
|
deactivate_task(p, rq);
|
|
oldprio = p->prio;
|
|
__setscheduler(p, policy, param->sched_priority);
|
|
- if (array) {
|
|
+ if (queued) {
|
|
__activate_task(p, rq);
|
|
/*
|
|
* Reschedule if we are currently running on this runqueue and
|
|
@@ -4444,14 +4690,15 @@
|
|
if (task_running(rq, p)) {
|
|
if (p->prio > oldprio)
|
|
resched_task(rq->curr);
|
|
- } else if (TASK_PREEMPTS_CURR(p, rq))
|
|
- resched_task(rq->curr);
|
|
+ } else
|
|
+ try_preempt(p, rq);
|
|
}
|
|
__task_rq_unlock(rq);
|
|
spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
|
|
rt_mutex_adjust_pi(p);
|
|
|
|
+out:
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sched_setscheduler);
|
|
@@ -4718,41 +4965,34 @@
|
|
* sys_sched_yield - yield the current processor to other threads.
|
|
*
|
|
* This function yields the current CPU by moving the calling thread
|
|
- * to the expired array. If there are no other threads running on this
|
|
- * CPU then this function will return.
|
|
+ * to the expired array if SCHED_NORMAL or the end of its current priority
|
|
+ * queue if a realtime task. If there are no other threads running on this
|
|
+ * cpu this function will return.
|
|
*/
|
|
asmlinkage long sys_sched_yield(void)
|
|
{
|
|
struct rq *rq = this_rq_lock();
|
|
- struct prio_array *array = current->array, *target = rq->expired;
|
|
+ struct task_struct *p = current;
|
|
|
|
schedstat_inc(rq, yld_cnt);
|
|
- /*
|
|
- * We implement yielding by moving the task into the expired
|
|
- * queue.
|
|
- *
|
|
- * (special rule: RT tasks will just roundrobin in the active
|
|
- * array.)
|
|
- */
|
|
- if (rt_task(current))
|
|
- target = rq->active;
|
|
-
|
|
- if (array->nr_active == 1) {
|
|
- schedstat_inc(rq, yld_act_empty);
|
|
- if (!rq->expired->nr_active)
|
|
- schedstat_inc(rq, yld_both_empty);
|
|
- } else if (!rq->expired->nr_active)
|
|
- schedstat_inc(rq, yld_exp_empty);
|
|
-
|
|
- if (array != target) {
|
|
- dequeue_task(current, array);
|
|
- enqueue_task(current, target);
|
|
- } else
|
|
- /*
|
|
- * requeue_task is cheaper so perform that if possible.
|
|
- */
|
|
- requeue_task(current, array);
|
|
+ if (rq->nr_running == 1)
|
|
+ schedstat_inc(rq, yld_both_empty);
|
|
+ else {
|
|
+ struct prio_array *old_array = p->array;
|
|
+ int old_prio = p->prio;
|
|
+
|
|
+ if (idleprio_task(p)) {
|
|
+ dequeue_task(p, rq);
|
|
+ enqueue_task(p, rq);
|
|
+ goto out_release;
|
|
+ }
|
|
+ /* p->prio will be updated in requeue_task via queue_expired */
|
|
+ if (!rt_task(p))
|
|
+ p->array = rq->expired;
|
|
+ requeue_task(p, rq, old_array, old_prio);
|
|
+ }
|
|
|
|
+out_release:
|
|
/*
|
|
* Since we are going to call schedule() anyway, there's
|
|
* no need to preempt or enable interrupts:
|
|
@@ -4902,6 +5142,8 @@
|
|
break;
|
|
case SCHED_NORMAL:
|
|
case SCHED_BATCH:
|
|
+ case SCHED_ISO:
|
|
+ case SCHED_IDLEPRIO:
|
|
ret = 0;
|
|
break;
|
|
}
|
|
@@ -4926,6 +5168,8 @@
|
|
break;
|
|
case SCHED_NORMAL:
|
|
case SCHED_BATCH:
|
|
+ case SCHED_ISO:
|
|
+ case SCHED_IDLEPRIO:
|
|
ret = 0;
|
|
}
|
|
return ret;
|
|
@@ -4959,8 +5203,8 @@
|
|
if (retval)
|
|
goto out_unlock;
|
|
|
|
- jiffies_to_timespec(p->policy == SCHED_FIFO ?
|
|
- 0 : task_timeslice(p), &t);
|
|
+ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 :
|
|
+ MS_TO_NS(task_timeslice(p)));
|
|
read_unlock(&tasklist_lock);
|
|
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
|
|
out_nounlock:
|
|
@@ -5056,10 +5300,10 @@
|
|
struct rq *rq = cpu_rq(cpu);
|
|
unsigned long flags;
|
|
|
|
- idle->timestamp = sched_clock();
|
|
- idle->sleep_avg = 0;
|
|
- idle->array = NULL;
|
|
- idle->prio = idle->normal_prio = MAX_PRIO;
|
|
+ bitmap_zero(idle->bitmap, PRIO_RANGE);
|
|
+ idle->timestamp = idle->last_ran = sched_clock();
|
|
+ idle->array = rq->active;
|
|
+ idle->prio = idle->normal_prio = NICE_TO_PRIO(0);
|
|
idle->state = TASK_RUNNING;
|
|
idle->cpus_allowed = cpumask_of_cpu(cpu);
|
|
set_task_cpu(idle, cpu);
|
|
@@ -5178,7 +5422,7 @@
|
|
goto out;
|
|
|
|
set_task_cpu(p, dest_cpu);
|
|
- if (p->array) {
|
|
+ if (task_queued(p)) {
|
|
/*
|
|
* Sync timestamp with rq_dest's before activating.
|
|
* The same thing could be achieved by doing this step
|
|
@@ -5189,8 +5433,7 @@
|
|
+ rq_dest->most_recent_timestamp;
|
|
deactivate_task(p, rq_src);
|
|
__activate_task(p, rq_dest);
|
|
- if (TASK_PREEMPTS_CURR(p, rq_dest))
|
|
- resched_task(rq_dest->curr);
|
|
+ try_preempt(p, rq_dest);
|
|
}
|
|
ret = 1;
|
|
out:
|
|
@@ -5487,7 +5730,7 @@
|
|
/* Idle task back to normal (off runqueue, low prio) */
|
|
rq = task_rq_lock(rq->idle, &flags);
|
|
deactivate_task(rq->idle, rq);
|
|
- rq->idle->static_prio = MAX_PRIO;
|
|
+ rq->idle->static_prio = NICE_TO_PRIO(0);
|
|
__setscheduler(rq->idle, SCHED_NORMAL, 0);
|
|
migrate_dead_tasks(cpu);
|
|
task_rq_unlock(rq, &flags);
|
|
@@ -7013,6 +7256,13 @@
|
|
/* Move init over to a non-isolated CPU */
|
|
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
|
|
BUG();
|
|
+
|
|
+ /*
|
|
+ * Assume that every added cpu gives us slightly less overall latency
|
|
+ * allowing us to increase the base rr_interval, but in a non linear
|
|
+ * fashion.
|
|
+ */
|
|
+ rr_interval *= 1 + ilog2(num_online_cpus());
|
|
}
|
|
#else
|
|
void __init sched_init_smp(void)
|
|
@@ -7035,6 +7285,16 @@
|
|
int i, j, k;
|
|
int highest_cpu = 0;
|
|
|
|
+ /* Generate the priority matrix */
|
|
+ for (i = 0; i < PRIO_RANGE; i++) {
|
|
+ bitmap_fill(prio_matrix[i], PRIO_RANGE);
|
|
+ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
|
|
+ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) {
|
|
+ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE),
|
|
+ prio_matrix[i]);
|
|
+ }
|
|
+ }
|
|
+
|
|
for_each_possible_cpu(i) {
|
|
struct prio_array *array;
|
|
struct rq *rq;
|
|
@@ -7042,12 +7302,20 @@
|
|
rq = cpu_rq(i);
|
|
spin_lock_init(&rq->lock);
|
|
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
|
|
+ rq->iso_ticks = 0;
|
|
rq->nr_running = 0;
|
|
+ rq->nr_idleprio = 0;
|
|
+ rq->prio_rotation = 0;
|
|
rq->active = rq->arrays;
|
|
+ rq->idleprio = rq->active;
|
|
rq->expired = rq->arrays + 1;
|
|
- rq->best_expired_prio = MAX_PRIO;
|
|
+ reset_prio_levels(rq);
|
|
+ rq->dyn_bitmap = rq->active->prio_bitmap;
|
|
+ rq->exp_bitmap = rq->expired->prio_bitmap;
|
|
|
|
#ifdef CONFIG_SMP
|
|
+ rq->active->rq = rq;
|
|
+ rq->expired->rq = rq;
|
|
rq->sd = NULL;
|
|
for (j = 1; j < 3; j++)
|
|
rq->cpu_load[j] = 0;
|
|
@@ -7060,17 +7328,16 @@
|
|
atomic_set(&rq->nr_iowait, 0);
|
|
|
|
for (j = 0; j < 2; j++) {
|
|
+
|
|
array = rq->arrays + j;
|
|
- for (k = 0; k < MAX_PRIO; k++) {
|
|
+ for (k = 0; k <= MAX_PRIO; k++)
|
|
INIT_LIST_HEAD(array->queue + k);
|
|
- __clear_bit(k, array->bitmap);
|
|
- }
|
|
- // delimiter for bitsearch
|
|
- __set_bit(MAX_PRIO, array->bitmap);
|
|
+ bitmap_zero(array->prio_bitmap, MAX_PRIO);
|
|
+ /* delimiter for bitsearch */
|
|
+ __set_bit(MAX_PRIO, array->prio_bitmap);
|
|
}
|
|
highest_cpu = i;
|
|
}
|
|
-
|
|
set_load_weight(&init_task);
|
|
|
|
#ifdef CONFIG_SMP
|
|
@@ -7125,25 +7392,25 @@
|
|
#ifdef CONFIG_MAGIC_SYSRQ
|
|
void normalize_rt_tasks(void)
|
|
{
|
|
- struct prio_array *array;
|
|
struct task_struct *g, *p;
|
|
unsigned long flags;
|
|
struct rq *rq;
|
|
+ int queued;
|
|
|
|
read_lock_irq(&tasklist_lock);
|
|
|
|
do_each_thread(g, p) {
|
|
- if (!rt_task(p))
|
|
+ if (!rt_task(p) && !iso_task(p))
|
|
continue;
|
|
|
|
spin_lock_irqsave(&p->pi_lock, flags);
|
|
rq = __task_rq_lock(p);
|
|
|
|
- array = p->array;
|
|
- if (array)
|
|
+ queued = task_queued(p);
|
|
+ if (queued)
|
|
deactivate_task(p, task_rq(p));
|
|
__setscheduler(p, SCHED_NORMAL, 0);
|
|
- if (array) {
|
|
+ if (queued) {
|
|
__activate_task(p, task_rq(p));
|
|
resched_task(rq->curr);
|
|
}
|
|
Index: linux-2.6.22-ck1/kernel/sysctl.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -22,6 +22,7 @@
|
|
#include <linux/mm.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/slab.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/capability.h>
|
|
@@ -70,6 +71,7 @@
|
|
extern char core_pattern[];
|
|
extern int pid_max;
|
|
extern int min_free_kbytes;
|
|
+extern int vm_tail_largefiles;
|
|
extern int printk_ratelimit_jiffies;
|
|
extern int printk_ratelimit_burst;
|
|
extern int pid_max_min, pid_max_max;
|
|
@@ -78,6 +80,10 @@
|
|
extern int compat_log;
|
|
extern int maps_protect;
|
|
extern int sysctl_stat_interval;
|
|
+extern int rr_interval;
|
|
+extern int sched_interactive;
|
|
+extern int sched_iso_cpu;
|
|
+extern int sched_iso_period;
|
|
|
|
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
|
|
static int maxolduid = 65535;
|
|
@@ -161,6 +167,14 @@
|
|
#endif
|
|
|
|
|
|
+/* Constants for minimum and maximum testing.
|
|
+ We use these as one-element integer vectors. */
|
|
+static int __read_mostly zero;
|
|
+static int __read_mostly one = 1;
|
|
+static int __read_mostly one_hundred = 100;
|
|
+static int __read_mostly five_thousand = 5000;
|
|
+
|
|
+
|
|
/* The default sysctl tables: */
|
|
|
|
static ctl_table root_table[] = {
|
|
@@ -501,6 +515,47 @@
|
|
.mode = 0444,
|
|
.proc_handler = &proc_dointvec,
|
|
},
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "rr_interval",
|
|
+ .data = &rr_interval,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .strategy = &sysctl_intvec,
|
|
+ .extra1 = &one,
|
|
+ .extra2 = &five_thousand,
|
|
+ },
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "interactive",
|
|
+ .data = &sched_interactive,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "iso_cpu",
|
|
+ .data = &sched_iso_cpu,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .strategy = &sysctl_intvec,
|
|
+ .extra1 = &zero,
|
|
+ .extra2 = &one_hundred,
|
|
+ },
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "iso_period",
|
|
+ .data = &sched_iso_period,
|
|
+ .maxlen = sizeof (int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .strategy = &sysctl_intvec,
|
|
+ .extra1 = &one,
|
|
+ .extra2 = &one_hundred,
|
|
+ },
|
|
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
|
|
{
|
|
.ctl_name = KERN_UNKNOWN_NMI_PANIC,
|
|
@@ -619,14 +674,16 @@
|
|
{ .ctl_name = 0 }
|
|
};
|
|
|
|
-/* Constants for minimum and maximum testing in vm_table.
|
|
- We use these as one-element integer vectors. */
|
|
-static int zero;
|
|
-static int one_hundred = 100;
|
|
-
|
|
-
|
|
static ctl_table vm_table[] = {
|
|
{
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "tail_largefiles",
|
|
+ .data = &vm_tail_largefiles,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
+ {
|
|
.ctl_name = VM_OVERCOMMIT_MEMORY,
|
|
.procname = "overcommit_memory",
|
|
.data = &sysctl_overcommit_memory,
|
|
@@ -705,16 +762,24 @@
|
|
.proc_handler = &proc_dointvec,
|
|
},
|
|
{
|
|
- .ctl_name = VM_SWAPPINESS,
|
|
- .procname = "swappiness",
|
|
- .data = &vm_swappiness,
|
|
- .maxlen = sizeof(vm_swappiness),
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "mapped",
|
|
+ .data = &vm_mapped,
|
|
+ .maxlen = sizeof(vm_mapped),
|
|
.mode = 0644,
|
|
.proc_handler = &proc_dointvec_minmax,
|
|
.strategy = &sysctl_intvec,
|
|
.extra1 = &zero,
|
|
.extra2 = &one_hundred,
|
|
},
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "hardmaplimit",
|
|
+ .data = &vm_hardmaplimit,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
{
|
|
.ctl_name = VM_HUGETLB_PAGES,
|
|
@@ -882,6 +947,32 @@
|
|
.extra1 = &zero,
|
|
},
|
|
#endif
|
|
+#ifdef CONFIG_SWAP_PREFETCH
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "swap_prefetch",
|
|
+ .data = &swap_prefetch,
|
|
+ .maxlen = sizeof(swap_prefetch),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "swap_prefetch_delay",
|
|
+ .data = &swap_prefetch_delay,
|
|
+ .maxlen = sizeof(swap_prefetch_delay),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
+ {
|
|
+ .ctl_name = CTL_UNNUMBERED,
|
|
+ .procname = "swap_prefetch_sleep",
|
|
+ .data = &swap_prefetch_sleep,
|
|
+ .maxlen = sizeof(swap_prefetch_sleep),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec,
|
|
+ },
|
|
+#endif
|
|
{ .ctl_name = 0 }
|
|
};
|
|
|
|
Index: linux-2.6.22-ck1/Documentation/sched-design.txt
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -1,11 +1,14 @@
|
|
- Goals, Design and Implementation of the
|
|
- new ultra-scalable O(1) scheduler
|
|
+ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by
|
|
+ Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by
|
|
+ Con Kolivas.
|
|
|
|
|
|
- This is an edited version of an email Ingo Molnar sent to
|
|
- lkml on 4 Jan 2002. It describes the goals, design, and
|
|
- implementation of Ingo's new ultra-scalable O(1) scheduler.
|
|
- Last Updated: 18 April 2002.
|
|
+ This was originally an edited version of an email Ingo Molnar sent to
|
|
+ lkml on 4 Jan 2002. It describes the goals, design, and implementation
|
|
+ of Ingo's ultra-scalable O(1) scheduler. It now contains a description
|
|
+ of the Staircase Deadline priority scheduler that was built on this
|
|
+ design.
|
|
+ Last Updated: Fri, 4 May 2007
|
|
|
|
|
|
Goal
|
|
@@ -163,3 +166,222 @@
|
|
code is smaller than the old one.
|
|
|
|
Ingo
|
|
+
|
|
+
|
|
+Staircase Deadline cpu scheduler policy
|
|
+================================================
|
|
+
|
|
+Design summary
|
|
+==============
|
|
+
|
|
+A novel design which incorporates a foreground-background descending priority
|
|
+system (the staircase) via a bandwidth allocation matrix according to nice
|
|
+level.
|
|
+
|
|
+
|
|
+Features
|
|
+========
|
|
+
|
|
+A starvation free, strict fairness O(1) scalable design with interactivity
|
|
+as good as the above restrictions can provide. There is no interactivity
|
|
+estimator, no sleep/run measurements and only simple fixed accounting.
|
|
+The design and its accounting are strict enough that task behaviour
|
|
+can be modelled and maximum scheduling latencies can be predicted by
|
|
+the virtual deadline mechanism that manages runqueues. The prime concern
|
|
+in this design is to maintain fairness at all costs determined by nice level,
|
|
+yet to maintain as good interactivity as can be allowed within the
|
|
+constraints of strict fairness.
|
|
+
|
|
+
|
|
+Design description
|
|
+==================
|
|
+
|
|
+SD works off the principle of providing each task a quota of runtime that it is
|
|
+allowed to run at a number of priority levels determined by its static priority
|
|
+(ie. its nice level). If the task uses up its quota it has its priority
|
|
+decremented to the next level determined by a priority matrix. Once every
|
|
+runtime quota has been consumed of every priority level, a task is queued on the
|
|
+"expired" array. When no other tasks exist with quota, the expired array is
|
|
+activated and fresh quotas are handed out. This is all done in O(1).
|
|
+
|
|
+Design details
|
|
+==============
|
|
+
|
|
+Each task keeps a record of its own entitlement of cpu time. Most of the rest of
|
|
+these details apply to non-realtime tasks as rt task management is straight
|
|
+forward.
|
|
+
|
|
+Each runqueue keeps a record of what major epoch it is up to in the
|
|
+rq->prio_rotation field which is incremented on each major epoch. It also
|
|
+keeps a record of the current prio_level for each static priority task.
|
|
+
|
|
+Each task keeps a record of what major runqueue epoch it was last running
|
|
+on in p->rotation. It also keeps a record of what priority levels it has
|
|
+already been allocated quota from during this epoch in a bitmap p->bitmap.
|
|
+
|
|
+The only tunable that determines all other details is the RR_INTERVAL. This
|
|
+is set to 8ms, and is scaled gently upwards with more cpus. This value is
|
|
+tunable via a /proc interface.
|
|
+
|
|
+All tasks are initially given a quota based on RR_INTERVAL. This is equal to
|
|
+RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and
|
|
+progressively larger for nice values from -7 to -20. This is assigned to
|
|
+p->quota and only changes with changes in nice level.
|
|
+
|
|
+As a task is first queued, it checks in recalc_task_prio to see if it has run at
|
|
+this runqueue's current priority rotation. If it has not, it will have its
|
|
+p->prio level set according to the first slot in a "priority matrix" and will be
|
|
+given a p->time_slice equal to the p->quota, and has its allocation bitmap bit
|
|
+set in p->bitmap for this prio level. It is then queued on the current active
|
|
+priority array.
|
|
+
|
|
+If a task has already been running during this major epoch, and it has
|
|
+p->time_slice left and the rq->prio_quota for the task's p->prio still
|
|
+has quota, it will be placed back on the active array, but no more quota
|
|
+will be added.
|
|
+
|
|
+If a task has been running during this major epoch, but does not have
|
|
+p->time_slice left, it will find the next lowest priority in its bitmap that it
|
|
+has not been allocated quota from. It then gets a full quota in
|
|
+p->time_slice. It is then queued on the current active priority array at the
|
|
+newly determined lower priority.
|
|
+
|
|
+If a task has been running during this major epoch, and does not have
|
|
+any entitlement left in p->bitmap and no time_slice left, it will have its
|
|
+bitmap cleared, and be queued at its best prio again, but on the expired
|
|
+priority array.
|
|
+
|
|
+When a task is queued, it has its relevant bit set in the array->prio_bitmap.
|
|
+
|
|
+p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on
|
|
+schedule() and scheduler_tick. If p->time_slice is below zero then the
|
|
+recalc_task_prio is readjusted and the task rescheduled.
|
|
+
|
|
+
|
|
+Priority Matrix
|
|
+===============
|
|
+
|
|
+In order to minimise the latencies between tasks of different nice levels
|
|
+running concurrently, the dynamic priority slots where different nice levels
|
|
+are queued are dithered instead of being sequential. What this means is that
|
|
+there are 40 priority slots where a task may run during one major rotation,
|
|
+and the allocation of slots is dependent on nice level. In the
|
|
+following table, a zero represents a slot where the task may run.
|
|
+
|
|
+PRIORITY:0..................20.................39
|
|
+nice -20 0000000000000000000000000000000000000000
|
|
+nice -10 1000100010001000100010001000100010010000
|
|
+nice 0 1010101010101010101010101010101010101010
|
|
+nice 5 1011010110110101101101011011010110110110
|
|
+nice 10 1110111011101110111011101110111011101110
|
|
+nice 15 1111111011111110111111101111111011111110
|
|
+nice 19 1111111111111111111111111111111111111110
|
|
+
|
|
+As can be seen, a nice -20 task runs in every priority slot whereas a nice 19
|
|
+task only runs one slot per major rotation. This dithered table allows for the
|
|
+smallest possible maximum latencies between tasks of varying nice levels, thus
|
|
+allowing vastly different nice levels to be used.
|
|
+
|
|
+SCHED_BATCH tasks are managed slightly differently, receiving only the top
|
|
+slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but
|
|
+slightly higher latencies.
|
|
+
|
|
+
|
|
+Modelling deadline behaviour
|
|
+============================
|
|
+
|
|
+As the accounting in this design is hard and not modified by sleep average
|
|
+calculations or interactivity modifiers, it is possible to accurately
|
|
+predict the maximum latency that a task may experience under different
|
|
+conditions. This is a virtual deadline mechanism enforced by mandatory
|
|
+timeslice expiration and not outside bandwidth measurement.
|
|
+
|
|
+The maximum duration a task can run during one major epoch is determined by its
|
|
+nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL
|
|
+duration during each epoch. Nice 10 tasks can run at 9 priority levels for each
|
|
+epoch, and so on. The table in the priority matrix above demonstrates how this
|
|
+is enforced.
|
|
+
|
|
+Therefore the maximum duration a runqueue epoch can take is determined by
|
|
+the number of tasks running, and their nice level. After that, the maximum
|
|
+duration a task can wait before it gets scheduled is
|
|
+determined by the position of its first slot on the matrix.
|
|
+
|
|
+In the following examples, these are _worst case scenarios_ and would rarely
|
|
+occur, but can be modelled nonetheless to determine the maximum possible
|
|
+latency.
|
|
+
|
|
+So for example, if two nice 0 tasks are running, and one has just expired as
|
|
+another is activated for the first time receiving a full quota for this
|
|
+runqueue rotation, the first task will wait:
|
|
+
|
|
+nr_tasks * max_duration + nice_difference * rr_interval
|
|
+1 * 19 * RR_INTERVAL + 0 = 152ms
|
|
+
|
|
+In the presence of a nice 10 task, a nice 0 task would wait a maximum of
|
|
+1 * 10 * RR_INTERVAL + 0 = 80ms
|
|
+
|
|
+In the presence of a nice 0 task, a nice 10 task would wait a maximum of
|
|
+1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms
|
|
+
|
|
+More useful than these values, though, are the average latencies which are
|
|
+a matter of determining the average distance between priority slots of
|
|
+different nice values and multiplying them by the tasks' quota. For example
|
|
+in the presence of a nice -10 task, a nice 0 task will wait either one or
|
|
+two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL,
|
|
+this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or
|
|
+20 and 40ms respectively (on uniprocessor at 1000HZ).
|
|
+
|
|
+
|
|
+Achieving interactivity
|
|
+=======================
|
|
+
|
|
+A requirement of this scheduler design was to achieve good interactivity
|
|
+despite being a completely fair deadline based design. The disadvantage of
|
|
+designs that try to achieve interactivity is that they usually do so at
|
|
+the expense of maintaining fairness. As cpu speeds increase, the requirement
|
|
+for some sort of metered unfairness towards interactive tasks becomes a less
|
|
+desirable phenomenon, but low latency and fairness remains mandatory to
|
|
+good interactive performance.
|
|
+
|
|
+This design relies on the fact that interactive tasks, by their nature,
|
|
+sleep often. Most fair scheduling designs end up penalising such tasks
|
|
+indirectly giving them less than their fair possible share because of the
|
|
+sleep, and have to use a mechanism of bonusing their priority to offset
|
|
+this based on the duration they sleep. This becomes increasingly inaccurate
|
|
+as the number of running tasks rises and more tasks spend time waiting on
|
|
+runqueues rather than sleeping, and it is impossible to tell whether the
|
|
+task that's waiting on a runqueue only intends to run for a short period and
|
|
+then sleep again after that runqueue wait. Furthermore, all such designs rely
|
|
+on a period of time to pass to accumulate some form of statistic on the task
|
|
+before deciding on how much to give them preference. The shorter this period,
|
|
+the more rapidly bursts of cpu ruin the interactive tasks behaviour. The
|
|
+longer this period, the longer it takes for interactive tasks to get low
|
|
+scheduling latencies and fair cpu.
|
|
+
|
|
+This design does not measure sleep time at all. Interactive tasks that sleep
|
|
+often will wake up having consumed very little if any of their quota for
|
|
+the current major priority rotation. The longer they have slept, the less
|
|
+likely they are to even be on the current major priority rotation. Once
|
|
+woken up, though, they get to use up their full quota for that epoch,
|
|
+whether part of a quota remains or a full quota. Overall, however, they
|
|
+can still only run as much cpu time for that epoch as any other task of the
|
|
+same nice level. This means that two tasks behaving completely differently
|
|
+from fully cpu bound to waking/sleeping extremely frequently will still
|
|
+get the same quota of cpu, but the latter will be using its quota for that
|
|
+epoch in bursts rather than continuously. This guarantees that interactive
|
|
+tasks get the same amount of cpu as cpu bound ones.
|
|
+
|
|
+The other requirement of interactive tasks is also to obtain low latencies
|
|
+for when they are scheduled. Unlike fully cpu bound tasks and the maximum
|
|
+latencies possible described in the modelling deadline behaviour section
|
|
+above, tasks that sleep will wake up with quota available usually at the
|
|
+current runqueue's priority_level or better. This means that the most latency
|
|
+they are likely to see is one RR_INTERVAL, and often they will preempt the
|
|
+current task if it is not of a sleeping nature. This then guarantees very
|
|
+low latency for interactive tasks, and the lowest latencies for the least
|
|
+cpu bound tasks.
|
|
+
|
|
+
|
|
+Fri, 4 May 2007
|
|
+Con Kolivas <kernel@kolivas.org>
|
|
Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000
|
|
@@ -25,6 +25,9 @@
|
|
- domainname
|
|
- hostname
|
|
- hotplug
|
|
+- interactive
|
|
+- iso_cpu
|
|
+- iso_period
|
|
- java-appletviewer [ binfmt_java, obsolete ]
|
|
- java-interpreter [ binfmt_java, obsolete ]
|
|
- kstack_depth_to_print [ X86 only ]
|
|
@@ -43,6 +46,7 @@
|
|
- printk
|
|
- real-root-dev ==> Documentation/initrd.txt
|
|
- reboot-cmd [ SPARC only ]
|
|
+- rr_interval
|
|
- rtsig-max
|
|
- rtsig-nr
|
|
- sem
|
|
@@ -164,6 +168,40 @@
|
|
|
|
==============================================================
|
|
|
|
+interactive:
|
|
+
|
|
+The staircase-deadline cpu scheduler can be set in either purely
|
|
+forward-looking mode for absolutely rigid fairness and cpu distribution
|
|
+according to nice level, or it can allow a small per-process history
|
|
+to smooth out cpu usage perturbations common in interactive tasks by
|
|
+enabling this sysctl. While small fairness issues can arise with this
|
|
+enabled, overall fairness is usually still strongly maintained and
|
|
+starvation is never possible. Enabling this can significantly smooth
|
|
+out 3d graphics and games.
|
|
+
|
|
+Default value is 1 (enabled).
|
|
+
|
|
+==============================================================
|
|
+
|
|
+iso_cpu:
|
|
+
|
|
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
|
|
+run effectively at realtime priority, averaged over a rolling iso_period
|
|
+seconds.
|
|
+
|
|
+Set to 80 (percent) by default.
|
|
+
|
|
+==============================================================
|
|
+
|
|
+iso_period:
|
|
+
|
|
+This sets the number of seconds over which SCHED_ISO cpu usage is averaged
|
|
+to see if it exceeds its allocated cpu bandwidth.
|
|
+
|
|
+Set to 5 (seconds) by default.
|
|
+
|
|
+==============================================================
|
|
+
|
|
l2cr: (PPC only)
|
|
|
|
This flag controls the L2 cache of G3 processor boards. If
|
|
@@ -288,6 +326,19 @@
|
|
|
|
==============================================================
|
|
|
|
+rr_interval:
|
|
+
|
|
+This is the smallest duration that any cpu process scheduling unit
|
|
+will run for. Increasing this value can increase throughput of cpu
|
|
+bound tasks substantially but at the expense of increased latencies
|
|
+overall. This value is in milliseconds and the default value chosen
|
|
+depends on the number of cpus available at scheduler initialisation
|
|
+with a minimum of 8.
|
|
+
|
|
+Valid values are from 1-5000.
|
|
+
|
|
+==============================================================
|
|
+
|
|
rtsig-max & rtsig-nr:
|
|
|
|
The file rtsig-max can be used to tune the maximum number
|
|
Index: linux-2.6.22-ck1/fs/pipe.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -41,12 +41,7 @@
|
|
{
|
|
DEFINE_WAIT(wait);
|
|
|
|
- /*
|
|
- * Pipes are system-local resources, so sleeping on them
|
|
- * is considered a noninteractive wait:
|
|
- */
|
|
- prepare_to_wait(&pipe->wait, &wait,
|
|
- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
|
|
+ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
|
|
if (pipe->inode)
|
|
mutex_unlock(&pipe->inode->i_mutex);
|
|
schedule();
|
|
Index: linux-2.6.22-ck1/fs/proc/array.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -165,7 +165,6 @@
|
|
rcu_read_lock();
|
|
buffer += sprintf(buffer,
|
|
"State:\t%s\n"
|
|
- "SleepAVG:\t%lu%%\n"
|
|
"Tgid:\t%d\n"
|
|
"Pid:\t%d\n"
|
|
"PPid:\t%d\n"
|
|
@@ -173,7 +172,6 @@
|
|
"Uid:\t%d\t%d\t%d\t%d\n"
|
|
"Gid:\t%d\t%d\t%d\t%d\n",
|
|
get_task_state(p),
|
|
- (p->sleep_avg/1024)*100/(1020000000/1024),
|
|
p->tgid, p->pid,
|
|
pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
|
|
pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
|
|
Index: linux-2.6.22-ck1/include/linux/init_task.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000
|
|
@@ -125,13 +125,15 @@
|
|
.prio = MAX_PRIO-20, \
|
|
.static_prio = MAX_PRIO-20, \
|
|
.normal_prio = MAX_PRIO-20, \
|
|
+ .rotation = 0, \
|
|
.policy = SCHED_NORMAL, \
|
|
.cpus_allowed = CPU_MASK_ALL, \
|
|
.mm = NULL, \
|
|
.active_mm = &init_mm, \
|
|
.run_list = LIST_HEAD_INIT(tsk.run_list), \
|
|
.ioprio = 0, \
|
|
- .time_slice = HZ, \
|
|
+ .time_slice = 1000000000, \
|
|
+ .quota = 1000000000, \
|
|
.tasks = LIST_HEAD_INIT(tsk.tasks), \
|
|
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
|
|
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
|
|
@@ -158,6 +160,7 @@
|
|
.signal = {{0}}}, \
|
|
.blocked = {{0}}, \
|
|
.alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
|
|
+ .mutexes_held = 0, \
|
|
.journal_info = NULL, \
|
|
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
|
|
.fs_excl = ATOMIC_INIT(0), \
|
|
Index: linux-2.6.22-ck1/kernel/softirq.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -488,7 +488,7 @@
|
|
|
|
static int ksoftirqd(void * __bind_cpu)
|
|
{
|
|
- set_user_nice(current, 19);
|
|
+ set_user_nice(current, 15);
|
|
current->flags |= PF_NOFREEZE;
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
Index: linux-2.6.22-ck1/kernel/workqueue.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -285,8 +285,6 @@
|
|
if (!cwq->wq->freezeable)
|
|
current->flags |= PF_NOFREEZE;
|
|
|
|
- set_user_nice(current, -5);
|
|
-
|
|
for (;;) {
|
|
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
|
|
if (!freezing(current) &&
|
|
Index: linux-2.6.22-ck1/kernel/kthread.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000
|
|
@@ -223,7 +223,6 @@
|
|
|
|
ignore_signals(tsk);
|
|
|
|
- set_user_nice(tsk, -5);
|
|
set_cpus_allowed(tsk, CPU_MASK_ALL);
|
|
}
|
|
|
|
Index: linux-2.6.22-ck1/kernel/fork.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000
|
|
@@ -1063,6 +1063,7 @@
|
|
p->io_context = NULL;
|
|
p->io_wait = NULL;
|
|
p->audit_context = NULL;
|
|
+ p->mutexes_held = 0;
|
|
cpuset_fork(p);
|
|
#ifdef CONFIG_NUMA
|
|
p->mempolicy = mpol_copy(p->mempolicy);
|
|
Index: linux-2.6.22-ck1/kernel/mutex.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000
|
|
@@ -60,6 +60,16 @@
|
|
static void fastcall noinline __sched
|
|
__mutex_lock_slowpath(atomic_t *lock_count);
|
|
|
|
+static inline void inc_mutex_count(void)
|
|
+{
|
|
+ current->mutexes_held++;
|
|
+}
|
|
+
|
|
+static inline void dec_mutex_count(void)
|
|
+{
|
|
+ current->mutexes_held--;
|
|
+}
|
|
+
|
|
/***
|
|
* mutex_lock - acquire the mutex
|
|
* @lock: the mutex to be acquired
|
|
@@ -89,6 +99,7 @@
|
|
* 'unlocked' into 'locked' state.
|
|
*/
|
|
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
|
|
+ inc_mutex_count();
|
|
}
|
|
|
|
EXPORT_SYMBOL(mutex_lock);
|
|
@@ -114,6 +125,7 @@
|
|
* into 'unlocked' state:
|
|
*/
|
|
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
|
|
+ dec_mutex_count();
|
|
}
|
|
|
|
EXPORT_SYMBOL(mutex_unlock);
|
|
@@ -283,9 +295,14 @@
|
|
*/
|
|
int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
|
|
{
|
|
+ int ret;
|
|
+
|
|
might_sleep();
|
|
- return __mutex_fastpath_lock_retval
|
|
+ ret = __mutex_fastpath_lock_retval
|
|
(&lock->count, __mutex_lock_interruptible_slowpath);
|
|
+ if (likely(!ret))
|
|
+ inc_mutex_count();
|
|
+ return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL(mutex_lock_interruptible);
|
|
@@ -340,8 +357,12 @@
|
|
*/
|
|
int fastcall __sched mutex_trylock(struct mutex *lock)
|
|
{
|
|
- return __mutex_fastpath_trylock(&lock->count,
|
|
+ int ret = __mutex_fastpath_trylock(&lock->count,
|
|
__mutex_trylock_slowpath);
|
|
+
|
|
+ if (likely(ret))
|
|
+ inc_mutex_count();
|
|
+ return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL(mutex_trylock);
|
|
Index: linux-2.6.22-ck1/block/cfq-iosched.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000
|
|
@@ -1276,10 +1276,12 @@
|
|
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
|
|
case IOPRIO_CLASS_NONE:
|
|
/*
|
|
- * no prio set, place us in the middle of the BE classes
|
|
+ * Select class and ioprio according to policy and nice
|
|
*/
|
|
+ cfqq->ioprio_class = task_policy_ioprio_class(tsk);
|
|
cfqq->ioprio = task_nice_ioprio(tsk);
|
|
- cfqq->ioprio_class = IOPRIO_CLASS_BE;
|
|
+ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE)
|
|
+ cfq_clear_cfqq_idle_window(cfqq);
|
|
break;
|
|
case IOPRIO_CLASS_RT:
|
|
cfqq->ioprio = task_ioprio(tsk);
|
|
Index: linux-2.6.22-ck1/include/linux/ioprio.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 14:55:21.000000000 +1000
|
|
@@ -22,7 +22,7 @@
|
|
* class, the default for any process. IDLE is the idle scheduling class, it
|
|
* is only served when no one else is using the disk.
|
|
*/
|
|
-enum {
|
|
+enum ioprio_class {
|
|
IOPRIO_CLASS_NONE,
|
|
IOPRIO_CLASS_RT,
|
|
IOPRIO_CLASS_BE,
|
|
@@ -51,8 +51,25 @@
|
|
return IOPRIO_PRIO_DATA(task->ioprio);
|
|
}
|
|
|
|
+static inline enum ioprio_class
|
|
+ task_policy_ioprio_class(struct task_struct *task)
|
|
+{
|
|
+ if (rt_task(task))
|
|
+ return IOPRIO_CLASS_RT;
|
|
+ if (idleprio_task(task))
|
|
+ return IOPRIO_CLASS_IDLE;
|
|
+ return IOPRIO_CLASS_BE;
|
|
+}
|
|
+
|
|
static inline int task_nice_ioprio(struct task_struct *task)
|
|
{
|
|
+ if (rt_task(task))
|
|
+ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR /
|
|
+ (MAX_RT_PRIO + 1);
|
|
+ if (iso_task(task))
|
|
+ return 0;
|
|
+ if (idleprio_task(task))
|
|
+ return IOPRIO_BE_NR - 1;
|
|
return (task_nice(task) + 20) / 5;
|
|
}
|
|
|
|
Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -22,6 +22,8 @@
|
|
- dirty_background_ratio
|
|
- dirty_expire_centisecs
|
|
- dirty_writeback_centisecs
|
|
+- hardmaplimit
|
|
+- mapped
|
|
- max_map_count
|
|
- min_free_kbytes
|
|
- laptop_mode
|
|
@@ -31,12 +33,15 @@
|
|
- min_unmapped_ratio
|
|
- min_slab_ratio
|
|
- panic_on_oom
|
|
+- swap_prefetch
|
|
+- swap_prefetch_delay
|
|
+- swap_prefetch_sleep
|
|
|
|
==============================================================
|
|
|
|
dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
|
|
dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
|
|
-block_dump, swap_token_timeout, drop-caches:
|
|
+block_dump, swap_token_timeout, drop-caches, tail_largefiles:
|
|
|
|
See Documentation/filesystems/proc.txt
|
|
|
|
@@ -86,6 +91,27 @@
|
|
|
|
==============================================================
|
|
|
|
+hardmaplimit:
|
|
+
|
|
+This flag makes the vm adhere to the mapped value as closely as possible
|
|
+except in the most extreme vm stress where doing so would provoke an out
|
|
+of memory condition (see mapped below).
|
|
+
|
|
+Enabled by default.
|
|
+
|
|
+==============================================================
|
|
+
|
|
+mapped:
|
|
+
|
|
+This is the percentage ram that is filled with mapped pages (applications)
|
|
+before the vm will start reclaiming mapped pages by moving them to swap.
|
|
+It is altered by the relative stress of the vm at the time so is not
|
|
+strictly adhered to, so as to avoid provoking out of memory kills.
|
|
+
|
|
+Set to 66 by default.
|
|
+
|
|
+==============================================================
|
|
+
|
|
max_map_count:
|
|
|
|
This file contains the maximum number of memory map areas a process
|
|
@@ -216,3 +242,37 @@
|
|
The default value is 0.
|
|
1 and 2 are for failover of clustering. Please select either
|
|
according to your policy of failover.
|
|
+
|
|
+==============================================================
|
|
+
|
|
+swap_prefetch:
|
|
+
|
|
+This enables or disables the swap prefetching feature. When the virtual
|
|
+memory subsystem has been extremely idle for at least swap_prefetch_sleep
|
|
+seconds it will start copying back pages from swap into the swapcache and keep
|
|
+a copy in swap. Valid values are 0 - 3. A value of 0 disables swap
|
|
+prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the
|
|
+presence of laptop_mode, and 3 enables it unconditionally, ignoring whether
|
|
+the system is idle or not. If set to 0, swap prefetch will not even try to keep
|
|
+record of ram swapped out to have the most minimal impact on performance.
|
|
+
|
|
+The default value is 1.
|
|
+
|
|
+==============================================================
|
|
+
|
|
+swap_prefetch_delay:
|
|
+
|
|
+This is the time in seconds that swap prefetching is delayed upon finding
|
|
+the system is not idle (ie the vm is busy or non-niced cpu load is present).
|
|
+
|
|
+The default value is 1.
|
|
+
|
|
+==============================================================
|
|
+
|
|
+swap_prefetch_sleep:
|
|
+
|
|
+This is the time in seconds that the swap prefetch kernel thread is put to
|
|
+sleep for when the ram is found to be full and it is unable to prefetch
|
|
+further.
|
|
+
|
|
+The default value is 5.
|
|
Index: linux-2.6.22-ck1/include/linux/swap.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -180,6 +180,7 @@
|
|
/* linux/mm/swap.c */
|
|
extern void FASTCALL(lru_cache_add(struct page *));
|
|
extern void FASTCALL(lru_cache_add_active(struct page *));
|
|
+extern void FASTCALL(lru_cache_add_tail(struct page *));
|
|
extern void FASTCALL(activate_page(struct page *));
|
|
extern void FASTCALL(mark_page_accessed(struct page *));
|
|
extern void lru_add_drain(void);
|
|
@@ -188,9 +189,11 @@
|
|
extern void swap_setup(void);
|
|
|
|
/* linux/mm/vmscan.c */
|
|
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
|
|
+extern unsigned long try_to_free_pages(struct zone **, gfp_t,
|
|
+ struct task_struct *p);
|
|
extern unsigned long shrink_all_memory(unsigned long nr_pages);
|
|
-extern int vm_swappiness;
|
|
+extern int vm_mapped;
|
|
+extern int vm_hardmaplimit;
|
|
extern int remove_mapping(struct address_space *mapping, struct page *page);
|
|
extern long vm_total_pages;
|
|
|
|
@@ -237,6 +240,7 @@
|
|
extern struct page * lookup_swap_cache(swp_entry_t);
|
|
extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
|
|
unsigned long addr);
|
|
+extern int add_to_swap_cache(struct page *page, swp_entry_t entry);
|
|
/* linux/mm/swapfile.c */
|
|
extern long total_swap_pages;
|
|
extern unsigned int nr_swapfiles;
|
|
Index: linux-2.6.22-ck1/init/Kconfig
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/init/Kconfig 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -105,6 +105,28 @@
|
|
used to provide more virtual memory than the actual RAM present
|
|
in your computer. If unsure say Y.
|
|
|
|
+config SWAP_PREFETCH
|
|
+ bool "Support for prefetching swapped memory"
|
|
+ depends on SWAP
|
|
+ default y
|
|
+ ---help---
|
|
+ This option will allow the kernel to prefetch swapped memory pages
|
|
+ when idle. The pages will be kept on both swap and in swap_cache
|
|
+ thus avoiding the need for further I/O if either ram or swap space
|
|
+ is required.
|
|
+
|
|
+ What this will do on workstations is slowly bring back applications
|
|
+ that have swapped out after memory intensive workloads back into
|
|
+ physical ram if you have free ram at a later stage and the machine
|
|
+ is relatively idle. This means that when you come back to your
|
|
+ computer after leaving it idle for a while, applications will come
|
|
+ to life faster. Note that your swap usage will appear to increase
|
|
+ but these are cached pages, can be dropped freely by the vm, and it
|
|
+ should stabilise around 50% swap usage maximum.
|
|
+
|
|
+ Workstations and multiuser workstation servers will most likely want
|
|
+ to say Y.
|
|
+
|
|
config SYSVIPC
|
|
bool "System V IPC"
|
|
---help---
|
|
Index: linux-2.6.22-ck1/mm/Makefile
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -17,6 +17,7 @@
|
|
obj-y += bounce.o
|
|
endif
|
|
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
|
|
+obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o
|
|
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
|
|
obj-$(CONFIG_NUMA) += mempolicy.o
|
|
obj-$(CONFIG_SPARSEMEM) += sparse.o
|
|
Index: linux-2.6.22-ck1/mm/swap.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -17,6 +17,7 @@
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/swap.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/pagevec.h>
|
|
@@ -176,6 +177,7 @@
|
|
*/
|
|
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
|
|
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
|
|
+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, };
|
|
|
|
void fastcall lru_cache_add(struct page *page)
|
|
{
|
|
@@ -197,6 +199,31 @@
|
|
put_cpu_var(lru_add_active_pvecs);
|
|
}
|
|
|
|
+static void __pagevec_lru_add_tail(struct pagevec *pvec)
|
|
+{
|
|
+ int i;
|
|
+ struct zone *zone = NULL;
|
|
+
|
|
+ for (i = 0; i < pagevec_count(pvec); i++) {
|
|
+ struct page *page = pvec->pages[i];
|
|
+ struct zone *pagezone = page_zone(page);
|
|
+
|
|
+ if (pagezone != zone) {
|
|
+ if (zone)
|
|
+ spin_unlock_irq(&zone->lru_lock);
|
|
+ zone = pagezone;
|
|
+ spin_lock_irq(&zone->lru_lock);
|
|
+ }
|
|
+ BUG_ON(PageLRU(page));
|
|
+ SetPageLRU(page);
|
|
+ add_page_to_inactive_list_tail(zone, page);
|
|
+ }
|
|
+ if (zone)
|
|
+ spin_unlock_irq(&zone->lru_lock);
|
|
+ release_pages(pvec->pages, pvec->nr, pvec->cold);
|
|
+ pagevec_reinit(pvec);
|
|
+}
|
|
+
|
|
static void __lru_add_drain(int cpu)
|
|
{
|
|
struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
|
|
@@ -207,6 +234,9 @@
|
|
pvec = &per_cpu(lru_add_active_pvecs, cpu);
|
|
if (pagevec_count(pvec))
|
|
__pagevec_lru_add_active(pvec);
|
|
+ pvec = &per_cpu(lru_add_tail_pvecs, cpu);
|
|
+ if (pagevec_count(pvec))
|
|
+ __pagevec_lru_add_tail(pvec);
|
|
}
|
|
|
|
void lru_add_drain(void)
|
|
@@ -403,6 +433,20 @@
|
|
}
|
|
|
|
/*
|
|
+ * Function used uniquely to put pages back to the lru at the end of the
|
|
+ * inactive list to preserve the lru order.
|
|
+ */
|
|
+void fastcall lru_cache_add_tail(struct page *page)
|
|
+{
|
|
+ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs);
|
|
+
|
|
+ page_cache_get(page);
|
|
+ if (!pagevec_add(pvec, page))
|
|
+ __pagevec_lru_add_tail(pvec);
|
|
+ put_cpu_var(lru_add_tail_pvecs); /* pairs with get_cpu_var() above */
|
|
+}
|
|
+
|
|
+/*
|
|
* Try to drop buffers from the pages in a pagevec
|
|
*/
|
|
void pagevec_strip(struct pagevec *pvec)
|
|
@@ -514,6 +558,9 @@
|
|
* Right now other parts of the system means that we
|
|
* _really_ don't want to cluster much more
|
|
*/
|
|
+
|
|
+ prepare_swap_prefetch();
|
|
+
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
hotcpu_notifier(cpu_swap_callback, 0);
|
|
#endif
|
|
Index: linux-2.6.22-ck1/mm/swap_prefetch.c
|
|
===================================================================
|
|
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
|
|
+++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -0,0 +1,542 @@
|
|
+/*
|
|
+ * linux/mm/swap_prefetch.c
|
|
+ *
|
|
+ * Copyright (C) 2005-2007 Con Kolivas
|
|
+ *
|
|
+ * Written by Con Kolivas <kernel@kolivas.org>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or modify
|
|
+ * it under the terms of the GNU General Public License version 2 as
|
|
+ * published by the Free Software Foundation.
|
|
+ */
|
|
+
|
|
+#include <linux/fs.h>
|
|
+#include <linux/mm.h>
|
|
+#include <linux/swap.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
+#include <linux/ioprio.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/pagemap.h>
|
|
+#include <linux/syscalls.h>
|
|
+#include <linux/writeback.h>
|
|
+#include <linux/vmstat.h>
|
|
+#include <linux/freezer.h>
|
|
+
|
|
+/*
|
|
+ * sysctls:
|
|
+ * swap_prefetch: 0. Disable swap prefetching
|
|
+ * 1. Prefetch only when idle and not with laptop_mode
|
|
+ * 2. Prefetch when idle and with laptop_mode
|
|
+ * 3. Prefetch at all times.
|
|
+ * swap_prefetch_delay: Number of seconds to delay prefetching when system
|
|
+ * is not idle.
|
|
+ * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when
|
|
+ * unable to prefetch.
|
|
+ */
|
|
+int swap_prefetch __read_mostly = 1;
|
|
+int swap_prefetch_delay __read_mostly = 1;
|
|
+int swap_prefetch_sleep __read_mostly = 5;
|
|
+
|
|
+#define PREFETCH_DELAY (HZ * swap_prefetch_delay)
|
|
+#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1)
|
|
+
|
|
+struct swapped_root {
|
|
+ unsigned long busy; /* vm busy */
|
|
+ spinlock_t lock; /* protects all data */
|
|
+ struct list_head list; /* MRU list of swapped pages */
|
|
+ struct radix_tree_root swap_tree; /* Lookup tree of pages */
|
|
+ unsigned int count; /* Number of entries */
|
|
+ unsigned int maxcount; /* Maximum entries allowed */
|
|
+ struct kmem_cache *cache; /* Of struct swapped_entry */
|
|
+};
|
|
+
|
|
+static struct swapped_root swapped = {
|
|
+ .lock = SPIN_LOCK_UNLOCKED,
|
|
+ .list = LIST_HEAD_INIT(swapped.list),
|
|
+ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC),
|
|
+};
|
|
+
|
|
+static struct task_struct *kprefetchd_task;
|
|
+
|
|
+/*
|
|
+ * We check to see no part of the vm is busy. If it is this will interrupt
|
|
+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
|
|
+ */
|
|
+inline void delay_swap_prefetch(void)
|
|
+{
|
|
+ if (!test_bit(0, &swapped.busy))
|
|
+ __set_bit(0, &swapped.busy);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If laptop_mode is enabled don't prefetch to avoid hard drives
|
|
+ * doing unnecessary spin-ups unless swap_prefetch is explicitly
|
|
+ * set to a higher value.
|
|
+ */
|
|
+static inline int prefetch_enabled(void)
|
|
+{
|
|
+ if (swap_prefetch <= laptop_mode)
|
|
+ return 0;
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static int kprefetchd_awake;
|
|
+
|
|
+/*
|
|
+ * Drop behind accounting which keeps a list of the most recently used swap
|
|
+ * entries. Entries are removed lazily by kprefetchd.
|
|
+ */
|
|
+void add_to_swapped_list(struct page *page)
|
|
+{
|
|
+ struct swapped_entry *entry;
|
|
+ unsigned long index, flags;
|
|
+
|
|
+ if (!prefetch_enabled())
|
|
+ goto out;
|
|
+
|
|
+ spin_lock_irqsave(&swapped.lock, flags);
|
|
+ if (swapped.count >= swapped.maxcount) {
|
|
+ /*
|
|
+ * Once the number of entries exceeds maxcount we start
|
|
+ * removing the least recently used entries.
|
|
+ */
|
|
+ entry = list_entry(swapped.list.next,
|
|
+ struct swapped_entry, swapped_list);
|
|
+ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
|
|
+ list_del(&entry->swapped_list);
|
|
+ swapped.count--;
|
|
+ } else {
|
|
+ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
|
|
+ if (unlikely(!entry))
|
|
+ /* bad, can't allocate more mem */
|
|
+ goto out_locked;
|
|
+ }
|
|
+
|
|
+ index = page_private(page);
|
|
+ entry->swp_entry.val = index;
|
|
+ /*
|
|
+ * On numa we need to store the node id to ensure that we prefetch to
|
|
+ * the same node it came from.
|
|
+ */
|
|
+ store_swap_entry_node(entry, page);
|
|
+
|
|
+ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
|
|
+ list_add(&entry->swapped_list, &swapped.list);
|
|
+ swapped.count++;
|
|
+ } else
|
|
+ kmem_cache_free(swapped.cache, entry);
|
|
+
|
|
+out_locked:
|
|
+ spin_unlock_irqrestore(&swapped.lock, flags);
|
|
+out:
|
|
+ if (!kprefetchd_awake)
|
|
+ wake_up_process(kprefetchd_task);
|
|
+ return;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Removes entries from the swapped_list. The radix tree allows us to quickly
|
|
+ * look up the entry from the index without having to iterate over the whole
|
|
+ * list.
|
|
+ */
|
|
+static void remove_from_swapped_list(const unsigned long index)
|
|
+{
|
|
+ struct swapped_entry *entry;
|
|
+ unsigned long flags;
|
|
+
|
|
+ spin_lock_irqsave(&swapped.lock, flags);
|
|
+ entry = radix_tree_delete(&swapped.swap_tree, index);
|
|
+ if (likely(entry)) {
|
|
+ list_del(&entry->swapped_list);
|
|
+ swapped.count--;
|
|
+ kmem_cache_free(swapped.cache, entry);
|
|
+ }
|
|
+ spin_unlock_irqrestore(&swapped.lock, flags);
|
|
+}
|
|
+
|
|
+enum trickle_return {
|
|
+ TRICKLE_SUCCESS,
|
|
+ TRICKLE_FAILED,
|
|
+ TRICKLE_DELAY,
|
|
+};
|
|
+
|
|
+struct node_stats {
|
|
+ /* Free ram after a cycle of prefetching */
|
|
+ unsigned long last_free;
|
|
+ /* Free ram on this cycle of checking prefetch_suitable */
|
|
+ unsigned long current_free;
|
|
+ /* The amount of free ram before we start prefetching */
|
|
+ unsigned long highfree[MAX_NR_ZONES];
|
|
+ /* The amount of free ram where we will stop prefetching */
|
|
+ unsigned long lowfree[MAX_NR_ZONES];
|
|
+ /* highfree or lowfree depending on whether we've hit a watermark */
|
|
+ unsigned long *pointfree[MAX_NR_ZONES];
|
|
+};
|
|
+
|
|
+/*
|
|
+ * prefetch_stats stores the free ram data of each node and this is used to
|
|
+ * determine if a node is suitable for prefetching into.
|
|
+ */
|
|
+struct prefetch_stats {
|
|
+ /* Which nodes are currently suited to prefetching */
|
|
+ nodemask_t prefetch_nodes;
|
|
+ /* Total pages we've prefetched on this wakeup of kprefetchd */
|
|
+ unsigned long prefetched_pages;
|
|
+ struct node_stats node[MAX_NUMNODES];
|
|
+};
|
|
+
|
|
+static struct prefetch_stats sp_stat;
|
|
+
|
|
+/*
|
|
+ * This tries to read a swp_entry_t into swap cache for swap prefetching.
|
|
+ * If it returns TRICKLE_DELAY we should delay further prefetching.
|
|
+ */
|
|
+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
|
|
+ const int node)
|
|
+{
|
|
+ enum trickle_return ret = TRICKLE_FAILED;
|
|
+ unsigned long flags;
|
|
+ struct page *page;
|
|
+
|
|
+ read_lock_irqsave(&swapper_space.tree_lock, flags);
|
|
+ /* Entry may already exist */
|
|
+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
|
|
+ read_unlock_irqrestore(&swapper_space.tree_lock, flags);
|
|
+ if (page)
|
|
+ goto out;
|
|
+
|
|
+ /*
|
|
+ * Get a new page to read from swap. We have already checked the
|
|
+ * watermarks so __alloc_pages will not call on reclaim.
|
|
+ */
|
|
+ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
|
|
+ if (unlikely(!page)) {
|
|
+ ret = TRICKLE_DELAY;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (add_to_swap_cache(page, entry)) {
|
|
+ /* Failed to add to swap cache */
|
|
+ goto out_release;
|
|
+ }
|
|
+
|
|
+ /* Add them to the tail of the inactive list to preserve LRU order */
|
|
+ lru_cache_add_tail(page);
|
|
+ if (unlikely(swap_readpage(NULL, page)))
|
|
+ goto out_release;
|
|
+
|
|
+ sp_stat.prefetched_pages++;
|
|
+ sp_stat.node[node].last_free--;
|
|
+
|
|
+ ret = TRICKLE_SUCCESS;
|
|
+out_release:
|
|
+ page_cache_release(page);
|
|
+out:
|
|
+ /*
|
|
+ * All entries are removed here lazily. This avoids the cost of
|
|
+ * remove_from_swapped_list during normal swapin. Thus there are
|
|
+ * usually many stale entries.
|
|
+ */
|
|
+ remove_from_swapped_list(entry.val);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void clear_last_prefetch_free(void)
|
|
+{
|
|
+ int node;
|
|
+
|
|
+ /*
|
|
+ * Reset the nodes suitable for prefetching to all nodes. We could
|
|
+ * update the data to take into account memory hotplug if desired..
|
|
+ */
|
|
+ sp_stat.prefetch_nodes = node_online_map;
|
|
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
|
|
+ struct node_stats *ns = &sp_stat.node[node];
|
|
+
|
|
+ ns->last_free = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void clear_current_prefetch_free(void)
|
|
+{
|
|
+ int node;
|
|
+
|
|
+ sp_stat.prefetch_nodes = node_online_map;
|
|
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
|
|
+ struct node_stats *ns = &sp_stat.node[node];
|
|
+
|
|
+ ns->current_free = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This updates the high and low watermarks of amount of free ram in each
|
|
+ * node used to start and stop prefetching. We prefetch from pages_high * 4
|
|
+ * down to pages_high * 3.
|
|
+ */
|
|
+static void examine_free_limits(void)
|
|
+{
|
|
+ struct zone *z;
|
|
+
|
|
+ for_each_zone(z) {
|
|
+ struct node_stats *ns;
|
|
+ int idx;
|
|
+
|
|
+ if (!populated_zone(z))
|
|
+ continue;
|
|
+
|
|
+ ns = &sp_stat.node[zone_to_nid(z)];
|
|
+ idx = zone_idx(z);
|
|
+ ns->lowfree[idx] = z->pages_high * 3;
|
|
+ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
|
|
+
|
|
+ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) {
|
|
+ /*
|
|
+ * We've gotten above the high watermark of free pages
|
|
+ * so we can start prefetching till we get to the low
|
|
+ * watermark.
|
|
+ */
|
|
+ ns->pointfree[idx] = &ns->lowfree[idx];
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We want to be absolutely certain it's ok to start prefetching.
|
|
+ */
|
|
+static enum trickle_return prefetch_suitable(void)
|
|
+{
|
|
+ enum trickle_return ret = TRICKLE_DELAY;
|
|
+ struct zone *z;
|
|
+ int node;
|
|
+
|
|
+ /*
|
|
+ * If swap_prefetch is set to a high value we can ignore load
|
|
+ * and prefetch whenever we can. Otherwise we test for vm and
|
|
+ * cpu activity.
|
|
+ */
|
|
+ if (swap_prefetch < 3) {
|
|
+ /* Purposefully racy, may return false positive */
|
|
+ if (test_bit(0, &swapped.busy)) {
|
|
+ __clear_bit(0, &swapped.busy);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * above_background_load is expensive so we only perform it
|
|
+ * every SWAP_CLUSTER_MAX prefetched_pages.
|
|
+ * We test to see if we're above_background_load as disk
|
|
+ * activity even at low priority can cause interrupt induced
|
|
+ * scheduling latencies.
|
|
+ */
|
|
+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) &&
|
|
+ above_background_load())
|
|
+ goto out;
|
|
+ }
|
|
+ clear_current_prefetch_free();
|
|
+
|
|
+ /*
|
|
+ * Have some hysteresis between where page reclaiming and prefetching
|
|
+ * will occur to prevent ping-ponging between them.
|
|
+ */
|
|
+ for_each_zone(z) {
|
|
+ struct node_stats *ns;
|
|
+ unsigned long free;
|
|
+ int idx;
|
|
+
|
|
+ if (!populated_zone(z))
|
|
+ continue;
|
|
+
|
|
+ node = zone_to_nid(z);
|
|
+ ns = &sp_stat.node[node];
|
|
+ idx = zone_idx(z);
|
|
+
|
|
+ free = zone_page_state(z, NR_FREE_PAGES);
|
|
+ if (free < *ns->pointfree[idx]) {
|
|
+ /*
|
|
+ * Free pages have dropped below the low watermark so
|
|
+ * we won't start prefetching again till we hit the
|
|
+ * high watermark of free pages.
|
|
+ */
|
|
+ ns->pointfree[idx] = &ns->highfree[idx];
|
|
+ node_clear(node, sp_stat.prefetch_nodes);
|
|
+ continue;
|
|
+ }
|
|
+ ns->current_free += free;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We iterate over each node testing to see if it is suitable for
|
|
+ * prefetching and clear the nodemask if it is not.
|
|
+ */
|
|
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
|
|
+ struct node_stats *ns = &sp_stat.node[node];
|
|
+
|
|
+ /*
|
|
+ * We check to see that pages are not being allocated
|
|
+ * elsewhere at any significant rate implying any
|
|
+ * degree of memory pressure (eg during file reads)
|
|
+ */
|
|
+ if (ns->last_free) {
|
|
+ if (ns->current_free + SWAP_CLUSTER_MAX <
|
|
+ ns->last_free) {
|
|
+ ns->last_free = ns->current_free;
|
|
+ node_clear(node,
|
|
+ sp_stat.prefetch_nodes);
|
|
+ continue;
|
|
+ }
|
|
+ } else
|
|
+ ns->last_free = ns->current_free;
|
|
+
|
|
+ /* We shouldn't prefetch when we are doing writeback */
|
|
+ if (node_page_state(node, NR_WRITEBACK))
|
|
+ node_clear(node, sp_stat.prefetch_nodes);
|
|
+ }
|
|
+
|
|
+ /* Nothing suitable, put kprefetchd back to sleep */
|
|
+ if (nodes_empty(sp_stat.prefetch_nodes))
|
|
+ return TRICKLE_FAILED;
|
|
+
|
|
+ /* Survived all that? Hooray we can prefetch! */
|
|
+ ret = TRICKLE_SUCCESS;
|
|
+out:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * trickle_swap is the main function that initiates the swap prefetching. It
|
|
+ * first checks to see if the busy flag is set, and does not prefetch if it
|
|
+ * is, as the flag implies we are low on memory or currently swapping in.
|
|
+ * Otherwise it runs until prefetch_suitable fails which occurs when the
|
|
+ * vm is busy, we prefetch to the watermark, the list is empty or we have
|
|
+ * iterated over all entries once.
|
|
+ */
|
|
+static enum trickle_return trickle_swap(void)
|
|
+{
|
|
+ enum trickle_return suitable, ret = TRICKLE_DELAY;
|
|
+ struct swapped_entry *pos, *n;
|
|
+ unsigned long flags;
|
|
+
|
|
+ if (!prefetch_enabled())
|
|
+ return ret;
|
|
+
|
|
+ examine_free_limits();
|
|
+ suitable = prefetch_suitable();
|
|
+ if (suitable != TRICKLE_SUCCESS)
|
|
+ return suitable;
|
|
+ if (list_empty(&swapped.list)) {
|
|
+ kprefetchd_awake = 0;
|
|
+ return TRICKLE_FAILED;
|
|
+ }
|
|
+
|
|
+ spin_lock_irqsave(&swapped.lock, flags);
|
|
+ list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) {
|
|
+ swp_entry_t swp_entry;
|
|
+ int node;
|
|
+
|
|
+ spin_unlock_irqrestore(&swapped.lock, flags);
|
|
+ cond_resched();
|
|
+ suitable = prefetch_suitable();
|
|
+ if (suitable != TRICKLE_SUCCESS) {
|
|
+ ret = suitable;
|
|
+ goto out_unlocked;
|
|
+ }
|
|
+
|
|
+ spin_lock_irqsave(&swapped.lock, flags);
|
|
+ if (unlikely(!pos))
|
|
+ continue;
|
|
+ node = get_swap_entry_node(pos);
|
|
+ if (!node_isset(node, sp_stat.prefetch_nodes)) {
|
|
+ /*
|
|
+ * We found an entry that belongs to a node that is
|
|
+ * not suitable for prefetching so skip it.
|
|
+ */
|
|
+ continue;
|
|
+ }
|
|
+ swp_entry = pos->swp_entry;
|
|
+ spin_unlock_irqrestore(&swapped.lock, flags);
|
|
+
|
|
+ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
|
|
+ goto out_unlocked;
|
|
+ spin_lock_irqsave(&swapped.lock, flags);
|
|
+ }
|
|
+ spin_unlock_irqrestore(&swapped.lock, flags);
|
|
+
|
|
+out_unlocked:
|
|
+ if (sp_stat.prefetched_pages) {
|
|
+ lru_add_drain();
|
|
+ sp_stat.prefetched_pages = 0;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int kprefetchd(void *__unused)
|
|
+{
|
|
+ struct sched_param param = { .sched_priority = 0 };
|
|
+
|
|
+ sched_setscheduler(current, SCHED_BATCH, &param);
|
|
+ set_user_nice(current, 19);
|
|
+ /* Set ioprio to lowest if supported by i/o scheduler */
|
|
+ sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE);
|
|
+
|
|
+ while (!kthread_should_stop()) {
|
|
+ try_to_freeze();
|
|
+
|
|
+ if (!kprefetchd_awake) {
|
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
|
+ schedule();
|
|
+ kprefetchd_awake = 1;
|
|
+ }
|
|
+
|
|
+ if (trickle_swap() == TRICKLE_FAILED)
|
|
+ schedule_timeout_interruptible(PREFETCH_SLEEP);
|
|
+ else
|
|
+ schedule_timeout_interruptible(PREFETCH_DELAY);
|
|
+ clear_last_prefetch_free();
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Create kmem cache for swapped entries
|
|
+ */
|
|
+void __init prepare_swap_prefetch(void)
|
|
+{
|
|
+ struct zone *zone;
|
|
+
|
|
+ swapped.cache = kmem_cache_create("swapped_entry",
|
|
+ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
|
|
+
|
|
+ /*
|
|
+ * We set the limit to more entries than the physical ram.
|
|
+ * We remove entries lazily so we need some headroom.
|
|
+ */
|
|
+ swapped.maxcount = nr_free_pagecache_pages() * 2;
|
|
+
|
|
+ for_each_zone(zone) {
|
|
+ struct node_stats *ns;
|
|
+ int idx;
|
|
+
|
|
+ if (!populated_zone(zone))
|
|
+ continue;
|
|
+
|
|
+ ns = &sp_stat.node[zone_to_nid(zone)];
|
|
+ idx = zone_idx(zone);
|
|
+ ns->pointfree[idx] = &ns->highfree[idx];
|
|
+ }
|
|
+}
|
|
+
|
|
+static int __init kprefetchd_init(void)
|
|
+{
|
|
+ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void __exit kprefetchd_exit(void)
|
|
+{
|
|
+ kthread_stop(kprefetchd_task);
|
|
+}
|
|
+
|
|
+module_init(kprefetchd_init);
|
|
+module_exit(kprefetchd_exit);
|
|
Index: linux-2.6.22-ck1/mm/swap_state.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -10,6 +10,7 @@
|
|
#include <linux/mm.h>
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/swap.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
#include <linux/init.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/buffer_head.h>
|
|
@@ -95,7 +96,7 @@
|
|
return error;
|
|
}
|
|
|
|
-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
|
|
+int add_to_swap_cache(struct page *page, swp_entry_t entry)
|
|
{
|
|
int error;
|
|
|
|
@@ -148,6 +149,9 @@
|
|
swp_entry_t entry;
|
|
int err;
|
|
|
|
+ /* Swap prefetching is delayed if we're swapping pages */
|
|
+ delay_swap_prefetch();
|
|
+
|
|
BUG_ON(!PageLocked(page));
|
|
|
|
for (;;) {
|
|
@@ -320,6 +324,9 @@
|
|
struct page *found_page, *new_page = NULL;
|
|
int err;
|
|
|
|
+ /* Swap prefetching is delayed if we're already reading from swap */
|
|
+ delay_swap_prefetch();
|
|
+
|
|
do {
|
|
/*
|
|
* First check the swap cache. Since this is normally
|
|
Index: linux-2.6.22-ck1/mm/vmscan.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -16,6 +16,7 @@
|
|
#include <linux/slab.h>
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/swap.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/init.h>
|
|
#include <linux/highmem.h>
|
|
@@ -36,6 +37,7 @@
|
|
#include <linux/rwsem.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/kthread.h>
|
|
+#include <linux/timer.h>
|
|
#include <linux/freezer.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
@@ -63,7 +65,7 @@
|
|
* whole list at once. */
|
|
int swap_cluster_max;
|
|
|
|
- int swappiness;
|
|
+ int mapped;
|
|
|
|
int all_unreclaimable;
|
|
};
|
|
@@ -110,9 +112,10 @@
|
|
#endif
|
|
|
|
/*
|
|
- * From 0 .. 100. Higher means more swappy.
|
|
+ * From 0 .. 100. Lower means more swappy.
|
|
*/
|
|
-int vm_swappiness = 60;
|
|
+int vm_mapped __read_mostly = 66;
|
|
+int vm_hardmaplimit __read_mostly = 1;
|
|
long vm_total_pages; /* The total number of pages which the VM controls */
|
|
|
|
static LIST_HEAD(shrinker_list);
|
|
@@ -803,10 +806,14 @@
|
|
* The distress ratio is important - we don't want to start
|
|
* going oom.
|
|
*
|
|
- * A 100% value of vm_swappiness overrides this algorithm
|
|
- * altogether.
|
|
+ * This distress value is ignored if we apply a hardmaplimit except
|
|
+ * in extreme distress.
|
|
+ *
|
|
+ * A 0% value of vm_mapped overrides this algorithm altogether.
|
|
*/
|
|
- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
|
|
+ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1);
|
|
+ if (!vm_hardmaplimit || distress == 100)
|
|
+ swap_tendency += distress;
|
|
|
|
/*
|
|
* Now use this metric to decide whether to start moving mapped
|
|
@@ -955,6 +962,41 @@
|
|
}
|
|
|
|
/*
|
|
+ * Helper functions to adjust nice level of kswapd, based on the priority of
|
|
+ * the task (p) that called it. If it is already higher priority we do not
|
|
+ * demote its nice level since it is still working on behalf of a higher
|
|
+ * priority task. With kernel threads we leave it at nice 0.
|
|
+ *
|
|
+ * We don't ever run kswapd real time, so if a real time task calls kswapd we
|
|
+ * set it to highest SCHED_NORMAL priority.
|
|
+ */
|
|
+static int effective_sc_prio(struct task_struct *p)
|
|
+{
|
|
+ if (likely(p->mm)) {
|
|
+ if (rt_task(p))
|
|
+ return -20;
|
|
+ if (idleprio_task(p))
|
|
+ return 19;
|
|
+ return task_nice(p);
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p,
|
|
+ int active)
|
|
+{
|
|
+ long nice = effective_sc_prio(p);
|
|
+
|
|
+ if (task_nice(kswapd) > nice || !active)
|
|
+ set_user_nice(kswapd, nice);
|
|
+}
|
|
+
|
|
+static int sc_priority(struct task_struct *p)
|
|
+{
|
|
+ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40));
|
|
+}
|
|
+
|
|
+/*
|
|
* This is the direct reclaim path, for page-allocating processes. We only
|
|
* try to reclaim pages from zones which will satisfy the caller's allocation
|
|
* request.
|
|
@@ -1011,7 +1053,8 @@
|
|
* holds filesystem locks which prevent writeout this might not work, and the
|
|
* allocation attempt will fail.
|
|
*/
|
|
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
|
|
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
|
|
+ struct task_struct *p)
|
|
{
|
|
int priority;
|
|
int ret = 0;
|
|
@@ -1019,15 +1062,20 @@
|
|
unsigned long nr_reclaimed = 0;
|
|
struct reclaim_state *reclaim_state = current->reclaim_state;
|
|
unsigned long lru_pages = 0;
|
|
- int i;
|
|
+ int i, scan_priority = DEF_PRIORITY;
|
|
struct scan_control sc = {
|
|
.gfp_mask = gfp_mask,
|
|
.may_writepage = !laptop_mode,
|
|
.swap_cluster_max = SWAP_CLUSTER_MAX,
|
|
.may_swap = 1,
|
|
- .swappiness = vm_swappiness,
|
|
+ .mapped = vm_mapped,
|
|
};
|
|
|
|
+ if (p)
|
|
+ scan_priority = sc_priority(p);
|
|
+
|
|
+ delay_swap_prefetch();
|
|
+
|
|
count_vm_event(ALLOCSTALL);
|
|
|
|
for (i = 0; zones[i] != NULL; i++) {
|
|
@@ -1040,7 +1088,7 @@
|
|
+ zone_page_state(zone, NR_INACTIVE);
|
|
}
|
|
|
|
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
|
|
+ for (priority = scan_priority; priority >= 0; priority--) {
|
|
sc.nr_scanned = 0;
|
|
if (!priority)
|
|
disable_swap_token();
|
|
@@ -1070,7 +1118,7 @@
|
|
}
|
|
|
|
/* Take a nap, wait for some writeback to complete */
|
|
- if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
|
|
+ if (sc.nr_scanned && priority < scan_priority - 2)
|
|
congestion_wait(WRITE, HZ/10);
|
|
}
|
|
/* top priority shrink_caches still had more to do? don't OOM, then */
|
|
@@ -1120,9 +1168,9 @@
|
|
*/
|
|
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
|
|
{
|
|
- int all_zones_ok;
|
|
+ int all_zones_ok = 0;
|
|
int priority;
|
|
- int i;
|
|
+ int i, scan_priority;
|
|
unsigned long total_scanned;
|
|
unsigned long nr_reclaimed;
|
|
struct reclaim_state *reclaim_state = current->reclaim_state;
|
|
@@ -1130,7 +1178,7 @@
|
|
.gfp_mask = GFP_KERNEL,
|
|
.may_swap = 1,
|
|
.swap_cluster_max = SWAP_CLUSTER_MAX,
|
|
- .swappiness = vm_swappiness,
|
|
+ .mapped = vm_mapped,
|
|
};
|
|
/*
|
|
* temp_priority is used to remember the scanning priority at which
|
|
@@ -1138,6 +1186,8 @@
|
|
*/
|
|
int temp_priority[MAX_NR_ZONES];
|
|
|
|
+ scan_priority = sc_priority(pgdat->kswapd);
|
|
+
|
|
loop_again:
|
|
total_scanned = 0;
|
|
nr_reclaimed = 0;
|
|
@@ -1145,9 +1195,9 @@
|
|
count_vm_event(PAGEOUTRUN);
|
|
|
|
for (i = 0; i < pgdat->nr_zones; i++)
|
|
- temp_priority[i] = DEF_PRIORITY;
|
|
+ temp_priority[i] = scan_priority;
|
|
|
|
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
|
|
+ for (priority = scan_priority; priority >= 0; priority--) {
|
|
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
|
|
unsigned long lru_pages = 0;
|
|
|
|
@@ -1163,15 +1213,22 @@
|
|
*/
|
|
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
|
|
struct zone *zone = pgdat->node_zones + i;
|
|
+ unsigned long watermark;
|
|
|
|
if (!populated_zone(zone))
|
|
continue;
|
|
|
|
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
|
|
+ if (zone->all_unreclaimable && priority != scan_priority)
|
|
continue;
|
|
|
|
- if (!zone_watermark_ok(zone, order, zone->pages_high,
|
|
- 0, 0)) {
|
|
+ /*
|
|
+ * The watermark is relaxed depending on the
|
|
+ * level of "priority" till it drops to
|
|
+ * pages_high.
|
|
+ */
|
|
+ watermark = zone->pages_high + (zone->pages_high *
|
|
+ priority / scan_priority);
|
|
+ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) {
|
|
end_zone = i;
|
|
break;
|
|
}
|
|
@@ -1198,14 +1255,18 @@
|
|
for (i = 0; i <= end_zone; i++) {
|
|
struct zone *zone = pgdat->node_zones + i;
|
|
int nr_slab;
|
|
+ unsigned long watermark;
|
|
|
|
if (!populated_zone(zone))
|
|
continue;
|
|
|
|
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
|
|
+ if (zone->all_unreclaimable && priority != scan_priority)
|
|
continue;
|
|
|
|
- if (!zone_watermark_ok(zone, order, zone->pages_high,
|
|
+ watermark = zone->pages_high + (zone->pages_high *
|
|
+ priority / scan_priority);
|
|
+
|
|
+ if (!zone_watermark_ok(zone, order, watermark,
|
|
end_zone, 0))
|
|
all_zones_ok = 0;
|
|
temp_priority[i] = priority;
|
|
@@ -1238,7 +1299,7 @@
|
|
* OK, kswapd is getting into trouble. Take a nap, then take
|
|
* another pass across the zones.
|
|
*/
|
|
- if (total_scanned && priority < DEF_PRIORITY - 2)
|
|
+ if (total_scanned && priority < scan_priority - 2)
|
|
congestion_wait(WRITE, HZ/10);
|
|
|
|
/*
|
|
@@ -1272,6 +1333,8 @@
|
|
return nr_reclaimed;
|
|
}
|
|
|
|
+#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */
|
|
+
|
|
/*
|
|
* The background pageout daemon, started as a kernel thread
|
|
* from the init process.
|
|
@@ -1319,6 +1382,8 @@
|
|
for ( ; ; ) {
|
|
unsigned long new_order;
|
|
|
|
+ /* kswapd has been busy so delay watermark_timer */
|
|
+ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY);
|
|
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
|
|
new_order = pgdat->kswapd_max_order;
|
|
pgdat->kswapd_max_order = 0;
|
|
@@ -1332,6 +1397,7 @@
|
|
if (!freezing(current))
|
|
schedule();
|
|
|
|
+ set_user_nice(tsk, 0);
|
|
order = pgdat->kswapd_max_order;
|
|
}
|
|
finish_wait(&pgdat->kswapd_wait, &wait);
|
|
@@ -1349,9 +1415,10 @@
|
|
/*
|
|
* A zone is low on free memory, so wake its kswapd task to service it.
|
|
*/
|
|
-void wakeup_kswapd(struct zone *zone, int order)
|
|
+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p)
|
|
{
|
|
pg_data_t *pgdat;
|
|
+ int active;
|
|
|
|
if (!populated_zone(zone))
|
|
return;
|
|
@@ -1363,7 +1430,9 @@
|
|
pgdat->kswapd_max_order = order;
|
|
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
|
|
return;
|
|
- if (!waitqueue_active(&pgdat->kswapd_wait))
|
|
+ active = waitqueue_active(&pgdat->kswapd_wait);
|
|
+ set_kswapd_nice(pgdat->kswapd, p, active);
|
|
+ if (!active)
|
|
return;
|
|
wake_up_interruptible(&pgdat->kswapd_wait);
|
|
}
|
|
@@ -1382,6 +1451,8 @@
|
|
struct zone *zone;
|
|
unsigned long nr_to_scan, ret = 0;
|
|
|
|
+ delay_swap_prefetch();
|
|
+
|
|
for_each_zone(zone) {
|
|
|
|
if (!populated_zone(zone))
|
|
@@ -1441,7 +1512,7 @@
|
|
.may_swap = 0,
|
|
.swap_cluster_max = nr_pages,
|
|
.may_writepage = 1,
|
|
- .swappiness = vm_swappiness,
|
|
+ .mapped = vm_mapped,
|
|
};
|
|
|
|
current->reclaim_state = &reclaim_state;
|
|
@@ -1476,7 +1547,7 @@
|
|
/* Force reclaiming mapped pages in the passes #3 and #4 */
|
|
if (pass > 2) {
|
|
sc.may_swap = 1;
|
|
- sc.swappiness = 100;
|
|
+ sc.mapped = 0;
|
|
}
|
|
|
|
for (prio = DEF_PRIORITY; prio >= 0; prio--) {
|
|
@@ -1540,20 +1611,57 @@
|
|
}
|
|
|
|
/*
|
|
+ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots
|
|
+ */
|
|
+static void watermark_wakeup(unsigned long data)
|
|
+{
|
|
+ pg_data_t *pgdat = (pg_data_t *)data;
|
|
+ struct timer_list *wt = &pgdat->watermark_timer;
|
|
+ int i;
|
|
+
|
|
+ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load())
|
|
+ goto out;
|
|
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
|
|
+ struct zone *z = pgdat->node_zones + i;
|
|
+
|
|
+ if (!populated_zone(z) || is_highmem(z)) {
|
|
+ /* We are better off leaving highmem full */
|
|
+ continue;
|
|
+ }
|
|
+ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) {
|
|
+ wake_up_interruptible(&pgdat->kswapd_wait);
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+out:
|
|
+ mod_timer(wt, jiffies + WT_EXPIRY);
|
|
+ return;
|
|
+}
|
|
+
|
|
+/*
|
|
* This kswapd start function will be called by init and node-hot-add.
|
|
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
|
|
*/
|
|
int kswapd_run(int nid)
|
|
{
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
+ struct timer_list *wt;
|
|
int ret = 0;
|
|
|
|
if (pgdat->kswapd)
|
|
return 0;
|
|
|
|
+ wt = &pgdat->watermark_timer;
|
|
+ init_timer(wt);
|
|
+ wt->data = (unsigned long)pgdat;
|
|
+ wt->function = watermark_wakeup;
|
|
+ wt->expires = jiffies + WT_EXPIRY;
|
|
+ add_timer(wt);
|
|
+
|
|
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
|
|
if (IS_ERR(pgdat->kswapd)) {
|
|
/* failure at boot is fatal */
|
|
+ del_timer(wt);
|
|
BUG_ON(system_state == SYSTEM_BOOTING);
|
|
printk("Failed to start kswapd on node %d\n",nid);
|
|
ret = -1;
|
|
@@ -1624,7 +1732,7 @@
|
|
.swap_cluster_max = max_t(unsigned long, nr_pages,
|
|
SWAP_CLUSTER_MAX),
|
|
.gfp_mask = gfp_mask,
|
|
- .swappiness = vm_swappiness,
|
|
+ .mapped = vm_mapped,
|
|
};
|
|
unsigned long slab_reclaimable;
|
|
|
|
Index: linux-2.6.22-ck1/include/linux/mm_inline.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -13,6 +13,13 @@
|
|
}
|
|
|
|
static inline void
|
|
+add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
|
|
+{
|
|
+ list_add_tail(&page->lru, &zone->inactive_list);
|
|
+ __inc_zone_state(zone, NR_INACTIVE);
|
|
+}
|
|
+
|
|
+static inline void
|
|
del_page_from_active_list(struct zone *zone, struct page *page)
|
|
{
|
|
list_del(&page->lru);
|
|
Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h
|
|
===================================================================
|
|
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
|
|
+++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -0,0 +1,53 @@
|
|
+#ifndef SWAP_PREFETCH_H_INCLUDED
|
|
+#define SWAP_PREFETCH_H_INCLUDED
|
|
+
|
|
+#ifdef CONFIG_SWAP_PREFETCH
|
|
+/* mm/swap_prefetch.c */
|
|
+extern int swap_prefetch;
|
|
+extern int swap_prefetch_delay;
|
|
+extern int swap_prefetch_sleep;
|
|
+
|
|
+struct swapped_entry {
|
|
+ swp_entry_t swp_entry; /* The actual swap entry */
|
|
+ struct list_head swapped_list; /* Linked list of entries */
|
|
+#if MAX_NUMNODES > 1
|
|
+ int node; /* Node id */
|
|
+#endif
|
|
+} __attribute__((packed));
|
|
+
|
|
+static inline void store_swap_entry_node(struct swapped_entry *entry,
|
|
+ struct page *page)
|
|
+{
|
|
+#if MAX_NUMNODES > 1
|
|
+ entry->node = page_to_nid(page);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline int get_swap_entry_node(struct swapped_entry *entry)
|
|
+{
|
|
+#if MAX_NUMNODES > 1
|
|
+ return entry->node;
|
|
+#else
|
|
+ return 0;
|
|
+#endif
|
|
+}
|
|
+
|
|
+extern void add_to_swapped_list(struct page *page);
|
|
+extern void delay_swap_prefetch(void);
|
|
+extern void prepare_swap_prefetch(void);
|
|
+
|
|
+#else /* CONFIG_SWAP_PREFETCH */
|
|
+static inline void add_to_swapped_list(struct page *__unused)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void prepare_swap_prefetch(void)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void delay_swap_prefetch(void)
|
|
+{
|
|
+}
|
|
+#endif /* CONFIG_SWAP_PREFETCH */
|
|
+
|
|
+#endif /* SWAP_PREFETCH_H_INCLUDED */
|
|
Index: linux-2.6.22-ck1/mm/page_io.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -17,6 +17,7 @@
|
|
#include <linux/bio.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/writeback.h>
|
|
+#include <linux/swap-prefetch.h>
|
|
#include <asm/pgtable.h>
|
|
|
|
static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
|
|
@@ -118,6 +119,7 @@
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
+ add_to_swapped_list(page);
|
|
if (wbc->sync_mode == WB_SYNC_ALL)
|
|
rw |= (1 << BIO_RW_SYNC);
|
|
count_vm_event(PSWPOUT);
|
|
Index: linux-2.6.22-ck1/include/linux/sysctl.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -190,7 +190,7 @@
|
|
VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
|
|
VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
|
|
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
|
|
- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */
|
|
+ VM_UNUSED19=19, /* was: Tendency to steal mapped memory */
|
|
VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
|
|
VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */
|
|
VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */
|
|
Index: linux-2.6.22-ck1/include/linux/mmzone.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -13,6 +13,7 @@
|
|
#include <linux/init.h>
|
|
#include <linux/seqlock.h>
|
|
#include <linux/nodemask.h>
|
|
+#include <linux/timer.h>
|
|
#include <asm/atomic.h>
|
|
#include <asm/page.h>
|
|
|
|
@@ -181,7 +182,7 @@
|
|
|
|
struct zone {
|
|
/* Fields commonly accessed by the page allocator */
|
|
- unsigned long pages_min, pages_low, pages_high;
|
|
+ unsigned long pages_min, pages_low, pages_high, pages_lots;
|
|
/*
|
|
* We don't know if the memory that we're going to allocate will be freeable
|
|
* or/and it will be released eventually, so to avoid totally wasting several
|
|
@@ -452,6 +453,7 @@
|
|
wait_queue_head_t kswapd_wait;
|
|
struct task_struct *kswapd;
|
|
int kswapd_max_order;
|
|
+ struct timer_list watermark_timer;
|
|
} pg_data_t;
|
|
|
|
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
|
|
@@ -468,7 +470,7 @@
|
|
void get_zone_counts(unsigned long *active, unsigned long *inactive,
|
|
unsigned long *free);
|
|
void build_all_zonelists(void);
|
|
-void wakeup_kswapd(struct zone *zone, int order);
|
|
+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p);
|
|
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
|
int classzone_idx, int alloc_flags);
|
|
enum memmap_context {
|
|
Index: linux-2.6.22-ck1/mm/page_alloc.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -1250,7 +1250,7 @@
|
|
goto nopage;
|
|
|
|
for (z = zonelist->zones; *z; z++)
|
|
- wakeup_kswapd(*z, order);
|
|
+ wakeup_kswapd(*z, order, p);
|
|
|
|
/*
|
|
* OK, we're below the kswapd watermark and have kicked background
|
|
@@ -1314,7 +1314,7 @@
|
|
reclaim_state.reclaimed_slab = 0;
|
|
p->reclaim_state = &reclaim_state;
|
|
|
|
- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
|
|
+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p);
|
|
|
|
p->reclaim_state = NULL;
|
|
p->flags &= ~PF_MEMALLOC;
|
|
@@ -1570,6 +1570,7 @@
|
|
" min:%lukB"
|
|
" low:%lukB"
|
|
" high:%lukB"
|
|
+ " lots:%lukB"
|
|
" active:%lukB"
|
|
" inactive:%lukB"
|
|
" present:%lukB"
|
|
@@ -1581,6 +1582,7 @@
|
|
K(zone->pages_min),
|
|
K(zone->pages_low),
|
|
K(zone->pages_high),
|
|
+ K(zone->pages_lots),
|
|
K(zone_page_state(zone, NR_ACTIVE)),
|
|
K(zone_page_state(zone, NR_INACTIVE)),
|
|
K(zone->present_pages),
|
|
@@ -3142,6 +3144,7 @@
|
|
|
|
zone->pages_low = zone->pages_min + (tmp >> 2);
|
|
zone->pages_high = zone->pages_min + (tmp >> 1);
|
|
+ zone->pages_lots = zone->pages_min + tmp;
|
|
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
|
}
|
|
|
|
Index: linux-2.6.22-ck1/fs/buffer.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000
|
|
@@ -356,7 +356,7 @@
|
|
for_each_online_pgdat(pgdat) {
|
|
zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
|
|
if (*zones)
|
|
- try_to_free_pages(zones, GFP_NOFS);
|
|
+ try_to_free_pages(zones, GFP_NOFS, NULL);
|
|
}
|
|
}
|
|
|
|
Index: linux-2.6.22-ck1/mm/filemap.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -466,6 +466,16 @@
|
|
return ret;
|
|
}
|
|
|
|
+int add_to_page_cache_lru_tail(struct page *page,
|
|
+ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
|
|
+{
|
|
+ int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
|
|
+
|
|
+ if (ret == 0)
|
|
+ lru_cache_add_tail(page);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
#ifdef CONFIG_NUMA
|
|
struct page *__page_cache_alloc(gfp_t gfp)
|
|
{
|
|
@@ -839,6 +849,34 @@
|
|
ra->ra_pages /= 4;
|
|
}
|
|
|
|
+/*
|
|
+ * Sysctl which determines whether we should read from large files to the
|
|
+ * tail of the inactive lru list.
|
|
+ */
|
|
+int vm_tail_largefiles __read_mostly = 1;
|
|
+
|
|
+static inline int nr_mapped(void)
|
|
+{
|
|
+ return global_page_state(NR_FILE_MAPPED) +
|
|
+ global_page_state(NR_ANON_PAGES);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This examines how large in pages a file size is and returns 1 if it is
|
|
+ * more than half the unmapped ram. Avoid calling global_page_state(), which is
|
|
+ * expensive unless we already know it is likely to be large enough.
|
|
+ */
|
|
+static int large_isize(unsigned long nr_pages)
|
|
+{
|
|
+ if (nr_pages * 6 > vm_total_pages) {
|
|
+ unsigned long unmapped_ram = vm_total_pages - nr_mapped();
|
|
+
|
|
+ if (nr_pages * 2 > unmapped_ram)
|
|
+ return 1;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/**
|
|
* do_generic_mapping_read - generic file read routine
|
|
* @mapping: address_space to be read
|
|
@@ -1051,8 +1089,19 @@
|
|
goto out;
|
|
}
|
|
}
|
|
- error = add_to_page_cache_lru(cached_page, mapping,
|
|
- index, GFP_KERNEL);
|
|
+
|
|
+ /*
|
|
+ * If we know the file is large we add the pages read to the
|
|
+ * end of the lru as we're unlikely to be able to cache the
|
|
+ * whole file in ram so make those pages the first to be
|
|
+ * dropped if not referenced soon.
|
|
+ */
|
|
+ if (vm_tail_largefiles && large_isize(end_index))
|
|
+ error = add_to_page_cache_lru_tail(cached_page,
|
|
+ mapping, index, GFP_KERNEL);
|
|
+ else
|
|
+ error = add_to_page_cache_lru(cached_page, mapping,
|
|
+ index, GFP_KERNEL);
|
|
if (error) {
|
|
if (error == -EEXIST)
|
|
goto find_page;
|
|
Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -1333,6 +1333,14 @@
|
|
As this is a non-destructive operation and dirty objects are not freeable, the
|
|
user should run `sync' first.
|
|
|
|
+tail_largefiles
|
|
+---------------
|
|
+
|
|
+When enabled, large files are read to the tail end of the inactive lru list.
|
|
+This means that any cache from reading large files is dropped very quickly,
|
|
+preventing loss of mapped ram and useful pagecache when large files are read.
|
|
+This does, however, make caching less effective when working with large files.
|
|
+
|
|
|
|
2.5 /proc/sys/dev - Device specific parameters
|
|
----------------------------------------------
|
|
Index: linux-2.6.22-ck1/arch/i386/Kconfig
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -550,7 +550,7 @@
|
|
|
|
choice
|
|
depends on EXPERIMENTAL
|
|
- prompt "Memory split" if EMBEDDED
|
|
+ prompt "Memory split"
|
|
default VMSPLIT_3G
|
|
help
|
|
Select the desired split between kernel and user memory.
|
|
@@ -569,17 +569,17 @@
|
|
option alone!
|
|
|
|
config VMSPLIT_3G
|
|
- bool "3G/1G user/kernel split"
|
|
+ bool "Default 896MB lowmem (3G/1G user/kernel split)"
|
|
config VMSPLIT_3G_OPT
|
|
depends on !HIGHMEM
|
|
- bool "3G/1G user/kernel split (for full 1G low memory)"
|
|
+ bool "1GB lowmem (3G/1G user/kernel split)"
|
|
config VMSPLIT_2G
|
|
- bool "2G/2G user/kernel split"
|
|
+ bool "2GB lowmem (2G/2G user/kernel split)"
|
|
config VMSPLIT_2G_OPT
|
|
depends on !HIGHMEM
|
|
- bool "2G/2G user/kernel split (for full 2G low memory)"
|
|
+ bool "Full 2GB lowmem (2G/2G user/kernel split)"
|
|
config VMSPLIT_1G
|
|
- bool "1G/3G user/kernel split"
|
|
+ bool "3GB lowmem (1G/3G user/kernel split)"
|
|
endchoice
|
|
|
|
config PAGE_OFFSET
|
|
Index: linux-2.6.22-ck1/kernel/Kconfig.hz
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -4,7 +4,7 @@
|
|
|
|
choice
|
|
prompt "Timer frequency"
|
|
- default HZ_250
|
|
+ default HZ_1000
|
|
help
|
|
Allows the configuration of the timer frequency. It is customary
|
|
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
|
|
@@ -13,8 +13,7 @@
|
|
contention and cacheline bounces as a result of timer interrupts.
|
|
Note that the timer interrupt occurs on each processor in an SMP
|
|
environment leading to NR_CPUS * HZ number of timer interrupts
|
|
- per second.
|
|
-
|
|
+ per second. Laptops may also show improved battery life.
|
|
|
|
config HZ_100
|
|
bool "100 HZ"
|
|
@@ -23,13 +22,14 @@
|
|
with lots of processors that may show reduced performance if
|
|
too many timer interrupts are occurring.
|
|
|
|
- config HZ_250
|
|
+ config HZ_250_NODEFAULT
|
|
bool "250 HZ"
|
|
help
|
|
- 250 Hz is a good compromise choice allowing server performance
|
|
- while also showing good interactive responsiveness even
|
|
- on SMP and NUMA systems. If you are going to be using NTSC video
|
|
- or multimedia, selected 300Hz instead.
|
|
+ 250 HZ is a lousy compromise choice allowing server interactivity
|
|
+ while also showing desktop throughput and no extra power saving on
|
|
+ laptops. Good for when you can't make up your mind.
|
|
+
|
|
+ Recommend 100 or 1000 instead.
|
|
|
|
config HZ_300
|
|
bool "300 HZ"
|
|
@@ -45,12 +45,76 @@
|
|
1000 Hz is the preferred choice for desktop systems and other
|
|
systems requiring fast interactive responses to events.
|
|
|
|
+ config HZ_1500
|
|
+ bool "1500 HZ"
|
|
+ help
|
|
+ 1500 Hz is an insane value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_2000
|
|
+ bool "2000 HZ"
|
|
+ help
|
|
+ 2000 Hz is an insane value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_3000
|
|
+ bool "3000 HZ"
|
|
+ help
|
|
+ 3000 Hz is an insane value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_4000
|
|
+ bool "4000 HZ"
|
|
+ help
|
|
+ 4000 Hz is an insane value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_5000
|
|
+ bool "5000 HZ"
|
|
+ help
|
|
+ 5000 Hz is an obscene value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_7500
|
|
+ bool "7500 HZ"
|
|
+ help
|
|
+ 7500 Hz is an obscene value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+ config HZ_10000
|
|
+ bool "10000 HZ"
|
|
+ help
|
|
+ 10000 Hz is an obscene value to use to run broken software that is Hz
|
|
+ limited.
|
|
+
|
|
+ Being over 1000, driver breakage is likely.
|
|
+
|
|
+
|
|
endchoice
|
|
|
|
config HZ
|
|
int
|
|
default 100 if HZ_100
|
|
- default 250 if HZ_250
|
|
+ default 250 if HZ_250_NODEFAULT
|
|
default 300 if HZ_300
|
|
default 1000 if HZ_1000
|
|
+ default 1500 if HZ_1500
|
|
+ default 2000 if HZ_2000
|
|
+ default 3000 if HZ_3000
|
|
+ default 4000 if HZ_4000
|
|
+ default 5000 if HZ_5000
|
|
+ default 7500 if HZ_7500
|
|
+ default 10000 if HZ_10000
|
|
|
|
Index: linux-2.6.22-ck1/arch/i386/defconfig
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -226,10 +226,10 @@
|
|
# CONFIG_IRQBALANCE is not set
|
|
CONFIG_SECCOMP=y
|
|
# CONFIG_HZ_100 is not set
|
|
-CONFIG_HZ_250=y
|
|
+# CONFIG_HZ_250_NODEFAULT is not set
|
|
# CONFIG_HZ_300 is not set
|
|
-# CONFIG_HZ_1000 is not set
|
|
-CONFIG_HZ=250
|
|
+CONFIG_HZ_1000=y
|
|
+CONFIG_HZ=1000
|
|
# CONFIG_KEXEC is not set
|
|
# CONFIG_CRASH_DUMP is not set
|
|
CONFIG_PHYSICAL_START=0x100000
|
|
Index: linux-2.6.22-ck1/arch/x86_64/defconfig
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000
|
|
@@ -185,10 +185,10 @@
|
|
CONFIG_SECCOMP=y
|
|
# CONFIG_CC_STACKPROTECTOR is not set
|
|
# CONFIG_HZ_100 is not set
|
|
-CONFIG_HZ_250=y
|
|
+# CONFIG_HZ_250_NODEFAULT is not set
|
|
# CONFIG_HZ_300 is not set
|
|
-# CONFIG_HZ_1000 is not set
|
|
-CONFIG_HZ=250
|
|
+CONFIG_HZ_1000=y
|
|
+CONFIG_HZ=1000
|
|
CONFIG_K8_NB=y
|
|
CONFIG_GENERIC_HARDIRQS=y
|
|
CONFIG_GENERIC_IRQ_PROBE=y
|
|
Index: linux-2.6.22-ck1/include/linux/jiffies.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -29,6 +29,12 @@
|
|
# define SHIFT_HZ 9
|
|
#elif HZ >= 768 && HZ < 1536
|
|
# define SHIFT_HZ 10
|
|
+#elif HZ >= 1536 && HZ < 3072
|
|
+# define SHIFT_HZ 11
|
|
+#elif HZ >= 3072 && HZ < 6144
|
|
+# define SHIFT_HZ 12
|
|
+#elif HZ >= 6144 && HZ < 12288
|
|
+# define SHIFT_HZ 13
|
|
#else
|
|
# error You lose.
|
|
#endif
|
|
Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -38,8 +38,8 @@
|
|
* If time > 4sec, it is "slow" path, no recycling is required,
|
|
* so that we select tick to get range about 4 seconds.
|
|
*/
|
|
-#if HZ <= 16 || HZ > 4096
|
|
-# error Unsupported: HZ <= 16 or HZ > 4096
|
|
+#if HZ <= 16 || HZ > 16384
|
|
+# error Unsupported: HZ <= 16 or HZ > 16384
|
|
#elif HZ <= 32
|
|
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
#elif HZ <= 64
|
|
@@ -54,8 +54,12 @@
|
|
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
#elif HZ <= 2048
|
|
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
-#else
|
|
+#elif HZ <= 4096
|
|
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
+#elif HZ <= 8192
|
|
+# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
+#else
|
|
+# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
|
|
#endif
|
|
|
|
/* TIME_WAIT reaping mechanism. */
|
|
Index: linux-2.6.22-ck1/init/calibrate.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -122,12 +122,12 @@
|
|
printk("Calibrating delay loop (skipped)... "
|
|
"%lu.%02lu BogoMIPS preset\n",
|
|
loops_per_jiffy/(500000/HZ),
|
|
- (loops_per_jiffy/(5000/HZ)) % 100);
|
|
+ (loops_per_jiffy * 10/(50000/HZ)) % 100);
|
|
} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
|
|
printk("Calibrating delay using timer specific routine.. ");
|
|
printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
|
|
loops_per_jiffy/(500000/HZ),
|
|
- (loops_per_jiffy/(5000/HZ)) % 100,
|
|
+ (loops_per_jiffy * 10/(50000/HZ)) % 100,
|
|
loops_per_jiffy);
|
|
} else {
|
|
loops_per_jiffy = (1<<12);
|
|
@@ -166,7 +166,7 @@
|
|
/* Round the value and print it */
|
|
printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
|
|
loops_per_jiffy/(500000/HZ),
|
|
- (loops_per_jiffy/(5000/HZ)) % 100,
|
|
+ (loops_per_jiffy * 10/(50000/HZ)) % 100,
|
|
loops_per_jiffy);
|
|
}
|
|
|
|
Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -157,7 +157,7 @@
|
|
|
|
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
|
|
c->loops_per_jiffy/(500000/HZ),
|
|
- (c->loops_per_jiffy/(5000/HZ)) % 100);
|
|
+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100);
|
|
seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
|
|
|
|
return 0;
|
|
Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -1094,7 +1094,7 @@
|
|
"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
|
|
cpucount+1,
|
|
bogosum/(500000/HZ),
|
|
- (bogosum/(5000/HZ))%100);
|
|
+ (bogosum * 10/(50000/HZ))%100);
|
|
|
|
Dprintk("Before bogocount - setting activated=1.\n");
|
|
|
|
Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -35,8 +35,8 @@
|
|
|
|
};
|
|
|
|
-/* thread usage wraps very million seconds (approx one fortnight) */
|
|
-#define NFSD_USAGE_WRAP (HZ*1000000)
|
|
+/* thread usage wraps every one hundred thousand seconds (approx one day) */
|
|
+#define NFSD_USAGE_WRAP (HZ*100000)
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
Index: linux-2.6.22-ck1/arch/x86_64/kernel/setup.c
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -1047,7 +1047,7 @@
|
|
|
|
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
|
|
c->loops_per_jiffy/(500000/HZ),
|
|
- (c->loops_per_jiffy/(5000/HZ)) % 100);
|
|
+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100);
|
|
|
|
if (c->x86_tlbsize > 0)
|
|
seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
|
|
Index: linux-2.6.22-ck1/Makefile
|
|
===================================================================
|
|
--- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000
|
|
+++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000
|
|
@@ -1,8 +1,9 @@
|
|
VERSION = 2
|
|
PATCHLEVEL = 6
|
|
SUBLEVEL = 22
|
|
-EXTRAVERSION =
|
|
-NAME = Holy Dancing Manatees, Batman!
|
|
+EXTRAVERSION = -ck1
|
|
+NAME = So long, and thanks for all the fish
|
|
+JANAME = さようなら、いままで魚をありがとう
|
|
|
|
# *DOCUMENTATION*
|
|
# To see a list of typical targets execute "make help"
|