Implementing a Fair Share Scheduler on Kernel 2.4.19
Presenter: pirlo21
Date: scheduled for October 16, 2004
1 What is a Fair Share Scheduler?
In the current Linux kernel (the 2.4 series), the next task to run is chosen from the sum of each process's static and dynamic priorities, i.e. its nice value and its counter value. As a result, an equal share of resources for every user and every task is not guaranteed.
The Fair Share scheduler addresses this: it is a scheduler designed to distribute resources equally among all users (or among tasks or groups).
2 The Conventional Linux Scheduler
The conventional Linux scheduler selects the next task in schedule() as follows (kernel/sched.c):
601 /*
602 * Default process to select..
603 */
604 next = idle_task(this_cpu);
605 c = -1000;
606 list_for_each(tmp, &runqueue_head) {
607 p = list_entry(tmp, struct task_struct, run_list);
608 if (can_schedule(p, this_cpu)) {
609 int weight = goodness(p, this_cpu, prev->active_mm);
610 if (weight > c)
611 c = weight, next = p;
612 }
613 }
Lines 606-613: the tasks on the runqueue_head list are examined one by one, the priority of each is computed by the goodness() function and stored in the weight variable, and the task with the largest weight becomes the next task to run.
Overview of the goodness() function:
168 weight = p->counter;
169 if (!weight)
170 goto out;
171
172 #ifdef CONFIG_SMP
173 /* Give a largish advantage to the same processor... */
174 /* (this is equivalent to penalizing other processors) */
175 if (p->processor == this_cpu)
176 weight += PROC_CHANGE_PENALTY;
177 #endif
178
179 /* .. and a slight advantage to the current MM */
180 if (p->mm == this_mm || !p->mm)
181 weight += 1;
182 weight += 20 - p->nice;
183 goto out;
Line 168 loads the task's remaining counter value (its dynamic priority) into weight. Line 182 adds the static priority derived from the nice value (20 - p->nice). Lines 191-193 below are reached only by real-time tasks, which receive a weight of at least 1000 and therefore always beat ordinary tasks:
191 weight = 1000 + p->rt_priority;
192 out:
193 return weight;
194 }
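For illustration, the weight rules above can be condensed into a small userspace function. This is a sketch, not the kernel code; the PROC_CHANGE_PENALTY value used here is the i386 one.

/* Userspace sketch of the goodness() rules above -- illustration only. */
#include <stdio.h>

#define PROC_CHANGE_PENALTY 15         /* i386 value in 2.4 */

struct fake_task { int counter, nice, rt_priority, is_rt; };

static int weight_of(const struct fake_task *p, int same_cpu, int same_mm)
{
        if (p->is_rt)                  /* real-time: always >= 1000 */
                return 1000 + p->rt_priority;
        if (!p->counter)               /* timeslice used up */
                return 0;
        int w = p->counter;            /* dynamic priority */
        if (same_cpu)
                w += PROC_CHANGE_PENALTY;      /* CPU affinity bonus */
        if (same_mm)
                w += 1;                /* cheap address-space switch */
        return w + 20 - p->nice;       /* static priority */
}

int main(void)
{
        struct fake_task a = { .counter = 6, .nice = 0 };
        struct fake_task b = { .counter = 6, .nice = 10 };
        /* prints "a=26 b=16": the nicer task loses the comparison */
        printf("a=%d b=%d\n", weight_of(&a, 0, 0), weight_of(&b, 0, 0));
        return 0;
}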
When every runnable process has used up its counter value, the following code (lines 616-626) runs:
616 if (unlikely(!c)) {
617 struct task_struct *p;
618
619 spin_unlock_irq(&runqueue_lock);
620 read_lock(&tasklist_lock);
621 for_each_task(p)
622 p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
623 read_unlock(&tasklist_lock);
624 spin_lock_irq(&runqueue_lock);
625 goto repeat_schedule;
626 }
which recalculates and resets the counter value of every process in the system, not just the runnable tasks.
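To make the formula concrete, assume HZ = 100, so TICK_SCALE(x) is (x) >> 2 and NICE_TO_TICKS(0) = (20 >> 2) + 1 = 6 ticks (roughly 60 ms). The sketch below shows how the counter of a task that sleeps through several recalculations converges:

/* Worked example of the counter recalculation, assuming HZ = 100. */
#include <stdio.h>

#define TICK_SCALE(x)           ((x) >> 2)     /* the HZ < 200 case */
#define NICE_TO_TICKS(nice)     (TICK_SCALE(20 - (nice)) + 1)

int main(void)
{
        int counter = 0;       /* timeslice fully consumed at the start */
        int round;

        for (round = 1; round <= 5; round++) {
                counter = (counter >> 1) + NICE_TO_TICKS(0);
                printf("round %d: counter = %d\n", round, counter);
        }
        /* prints 6, 9, 10, 11, 11: a sleeping task accumulates up to
         * about 2 * NICE_TO_TICKS, while a CPU hog restarts from 6. */
        return 0;
}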
3 Implementation Overview
- Each user holds a fixed budget of counter ticks in a variable; a task borrows the counter value it needs for execution from its user's budget (see the sketch after this list).
- Every user is granted the same fixed amount of counter ticks.
- A list structure is built so that all users can be traversed easily.
- The fair-share recalculation is hooked into the counter-recalculation path, not into goodness().
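A minimal userspace sketch of this design follows; the names sim_user, sim_task and sim_recalc are ours, while the kernel patch below implements the same flow with user_struct.cpu_ticks.

/* Userspace sketch of per-user fair share: users get equal ticket
 * pools and tasks borrow their timeslice from their owner's pool. */
#include <stdio.h>

#define NICE_TO_TICKS(nice)     (((20 - (nice)) >> 2) + 1)    /* HZ = 100 */

struct sim_user { long cpu_ticks; };
struct sim_task { struct sim_user *user; int counter, nice; };

static void sim_recalc(struct sim_user *users, int nusers,
                       struct sim_task *tasks, int ntasks)
{
        int i;

        /* mirrors recalculate_peruser_cputicks(): refill every pool */
        for (i = 0; i < nusers; i++)
                users[i].cpu_ticks = users[i].cpu_ticks / 2 + NICE_TO_TICKS(0);

        /* mirrors recalculate_priorities(): tasks borrow from the pool */
        for (i = 0; i < ntasks; i++) {
                struct sim_user *up = tasks[i].user;
                if (up->cpu_ticks > 0) {
                        int newc = (tasks[i].counter >> 1)
                                   + NICE_TO_TICKS(tasks[i].nice);
                        up->cpu_ticks += tasks[i].counter;  /* hand back */
                        up->cpu_ticks -= newc;              /* borrow */
                        tasks[i].counter = newc;
                }       /* a task whose user's pool is empty gets nothing */
        }
}

int main(void)
{
        struct sim_user users[2] = { { 0 }, { 0 } };   /* alice, bob */
        struct sim_task tasks[4] = {                   /* alice: 3, bob: 1 */
                { &users[0], 0, 0 }, { &users[0], 0, 0 },
                { &users[0], 0, 0 }, { &users[1], 0, 0 },
        };
        int i;

        sim_recalc(users, 2, tasks, 4);
        for (i = 0; i < 4; i++)
                printf("task %d: counter = %d\n", i, tasks[i].counter);
        /* alice's first task and bob's only task each get 6 ticks; the
         * rest get none, so both users end up with equal CPU time. */
        return 0;
}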
4 Implementation
4.1 Editing /include/linux/sched.h
+ /* Linked list for for_each_user */
+ struct list_head list;
+ long cpu_ticks;
};
Two fields are added to struct user_struct: a list_head for linking every user into one list, and the cpu_ticks variable that holds the user's ticket budget. (The comment mentions for_each_user, but the patch actually walks the list with list_for_each.)
4.2 Editing /kernel/sched.c
First, define a helper that unlinks a task and re-links it at the tail of the circular task list headed by init_task (an open-coded list_del() followed by list_add_tail()):
+#ifdef CONFIG_FAIRSCHED
+/* Toggle the per-user fair scheduler on/off */
+int fairsched = 1;     /* set the flag enabling the fair share scheduler */
+
+/* Move a task to the tail of the tasklist */
+static inline void move_last_tasklist(struct task_struct * p)
+{
+ /* list_del */
+ p->next_task->prev_task = p->prev_task;
+ p->prev_task->next_task = p->next_task;
+
+ /* list_add_tail */
+ p->next_task = &init_task;
+ p->prev_task = init_task.prev_task;
+ init_task.prev_task->next_task = p;
+ init_task.prev_task = p;
+}
+
Next, define the function that recalculates priorities:
+static void recalculate_priorities(void)
+{
+#ifdef CONFIG_FAIRSCHED
+ if (fairsched) {
+ struct task_struct *p, *next;
+ struct user_struct *up;
+ long oldcounter, newcounter;
+
+ recalculate_peruser_cputicks(); /* refill each user's ticket pool */
+
+ write_lock_irq(&tasklist_lock);
+ safe_for_each_task(p) { /* for every task in the system */
+ up = p->user;
+ if (up->cpu_ticks > 0) {
+ oldcounter = p->counter;
+ newcounter = (oldcounter >> 1) +
+ NICE_TO_TICKS(p->nice);
+ up->cpu_ticks += oldcounter; /* return the unused counter to the user's cpu_ticks, */
+ up->cpu_ticks -= newcounter; /* then subtract the slice now handed to the process */
+ /*
+ * If a user is very busy, only some of its
+ * processes can get CPU time. We move those
+ * processes out of the way to prevent
+ * starvation of others.
+ */
+ if (oldcounter != newcounter) {
+ p->counter = newcounter;
+ move_last_tasklist(p);
+ }
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ } else
+#endif /* CONFIG_FAIRSCHED */
+ { /* the conventional recalculation: halve the counter and add back the nice-derived ticks */
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+ read_unlock(&tasklist_lock);
+ }
+}
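Two details are worth noting: the fairsched path takes tasklist_lock for writing because move_last_tasklist() modifies the task list, and any task whose counter actually changed is rotated to the tail of the list, so a busy user's previously starved tasks come first at the next recalculation.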
The recalculation site in schedule() is then changed to call the new function:
if (unlikely(!c)) {
- struct task_struct *p;
-
spin_unlock_irq(&runqueue_lock);
- read_lock(&tasklist_lock);
- for_each_task(p)
- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
- read_unlock(&tasklist_lock);
+
+ recalculate_priorities();
+
spin_lock_irq(&runqueue_lock);
goto repeat_schedule;
}
4.3 Editing /kernel/user.c
The root user is created differently from ordinary users, so its statically defined user_struct must be extended separately:
struct user_struct root_user = {
__count: ATOMIC_INIT(1),
processes: ATOMIC_INIT(1),
- files: ATOMIC_INIT(0)
+ files: ATOMIC_INIT(0),
+ cpu_ticks: NICE_TO_TICKS(0)
};
The user registration and removal routines are modified so that a user is added to and removed from user_list at the same time as the uid hash:
/*
* These routines must be called with the uidhash spinlock held!
*/
@@ -44,6 +47,8 @@
next->pprev = &up->next;
up->pprev = hashent;
*hashent = up;
+
+ list_add(&up->list, &user_list);
}
static inline void uid_hash_remove(struct user_struct *up)
@@ -54,6 +59,8 @@
if (next)
next->pprev = pprev;
*pprev = next;
+
+ list_del(&up->list);
}
static inline struct user_struct *uid_hash_find(uid_t uid, struct user_struct **hashent)
@@ -101,6 +108,7 @@
atomic_set(&new->__count, 1);
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
+ new->cpu_ticks = NICE_TO_TICKS(0);
Finally, define the function that recalculates the number of tickets each user holds:
+/* Fair scheduler, recalculate the per user cpu time cap. */
+void recalculate_peruser_cputicks(void)
+{
+ struct list_head * entry;
+ struct user_struct * user;
+
+ spin_lock(&uidhash_lock);
+ list_for_each(entry, &user_list) {
+ user = list_entry(entry, struct user_struct, list);
+ user->cpu_ticks = (user->cpu_ticks / 2) + NICE_TO_TICKS(0);
+ }
+ /* Needed hack, we can get called before uid_cache_init ... */
+ root_user.cpu_ticks = (root_user.cpu_ticks / 2) + NICE_TO_TICKS(0);
+ spin_unlock(&uidhash_lock);
+}
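Beyond the pieces shown above, the full patch below also adds a sysctl entry (KERN_FAIRSCHED) that exposes the fairsched flag, so the fair scheduler can be toggled at runtime through /proc/sys/kernel/fairsched, plus config.in entries that make CONFIG_FAIRSCHED selectable when CONFIG_EXPERIMENTAL is enabled.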
5 The Fair Share Scheduler Patch
From: http://www.surriel.com/patches/
--- linux/kernel/sched.c.fair 2002-09-20 10:58:49.000000000 -0300
+++ linux/kernel/sched.c 2002-09-24 14:46:31.000000000 -0300
@@ -45,31 +45,33 @@
extern void mem_use(void);
+#ifdef CONFIG_FAIRSCHED
+/* Toggle the per-user fair scheduler on/off */
+int fairsched = 1;
+
+/* Move a task to the tail of the tasklist */
+static inline void move_last_tasklist(struct task_struct * p)
+{
+ /* list_del */
+ p->next_task->prev_task = p->prev_task;
+ p->prev_task->next_task = p->next_task;
+
+ /* list_add_tail */
+ p->next_task = &init_task;
+ p->prev_task = init_task.prev_task;
+ init_task.prev_task->next_task = p;
+ init_task.prev_task = p;
+}
+
/*
- * Scheduling quanta.
- *
- * NOTE! The unix "nice" value influences how long a process
- * gets. The nice value ranges from -20 to +19, where a -20
- * is a "high-priority" task, and a "+10" is a low-priority
- * task.
- *
- * We want the time-slice to be around 50ms or so, so this
- * calculation depends on the value of HZ.
+ * Remember p->next, in case we call move_last_tasklist(p) in the
+ * fairsched recalculation code.
*/
-#if HZ < 200
-#define TICK_SCALE(x) ((x) >> 2)
-#elif HZ < 400
-#define TICK_SCALE(x) ((x) >> 1)
-#elif HZ < 800
-#define TICK_SCALE(x) (x)
-#elif HZ < 1600
-#define TICK_SCALE(x) ((x) << 1)
-#else
-#define TICK_SCALE(x) ((x) << 2)
-#endif
-
-#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
+#define safe_for_each_task(p) \
+ for (p = init_task.next_task, next = p->next_task ; p != &init_task ; \
+ p = next, next = p->next_task)
+#endif /* CONFIG_FAIRSCHED */
/*
* Init task must be ok at boot for the ix86 as we will check its signals
@@ -460,6 +462,54 @@
}
/*
+ * If the remaining timeslice lengths of all runnable tasks reach 0
+ * the scheduler recalculates the priority of all processes.
+ */
+static void recalculate_priorities(void)
+{
+#ifdef CONFIG_FAIRSCHED
+ if (fairsched) {
+ struct task_struct *p, *next;
+ struct user_struct *up;
+ long oldcounter, newcounter;
+
+ recalculate_peruser_cputicks();
+
+ write_lock_irq(&tasklist_lock);
+ safe_for_each_task(p) {
+ up = p->user;
+ if (up->cpu_ticks > 0) {
+ oldcounter = p->counter;
+ newcounter = (oldcounter >> 1) +
+ NICE_TO_TICKS(p->nice);
+ up->cpu_ticks += oldcounter;
+ up->cpu_ticks -= newcounter;
+ /*
+ * If a user is very busy, only some of its
+ * processes can get CPU time. We move those
+ * processes out of the way to prevent
+ * starvation of others.
+ */
+ if (oldcounter != newcounter) {
+ p->counter = newcounter;
+ move_last_tasklist(p);
+ }
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ } else
+#endif /* CONFIG_FAIRSCHED */
+ {
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+ read_unlock(&tasklist_lock);
+ }
+}
+
+/*
* schedule_tail() is getting called from the fork return path. This
* cleans up all remaining scheduler things, without impacting the
* common case.
@@ -616,13 +666,10 @@
/* Do we need to re-calculate counters? */
if (unlikely(!c)) {
- struct task_struct *p;
-
spin_unlock_irq(&runqueue_lock);
- read_lock(&tasklist_lock);
- for_each_task(p)
- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
- read_unlock(&tasklist_lock);
+
+ recalculate_priorities();
+
spin_lock_irq(&runqueue_lock);
goto repeat_schedule;
}
--- linux/kernel/user.c.fair 2002-09-20 11:50:56.000000000 -0300
+++ linux/kernel/user.c 2002-09-24 16:06:11.000000000 -0300
@@ -29,9 +29,12 @@
struct user_struct root_user = {
__count: ATOMIC_INIT(1),
processes: ATOMIC_INIT(1),
- files: ATOMIC_INIT(0)
+ files: ATOMIC_INIT(0),
+ cpu_ticks: NICE_TO_TICKS(0)
};
+static LIST_HEAD(user_list);
+
/*
* These routines must be called with the uidhash spinlock held!
*/
@@ -44,6 +47,8 @@
next->pprev = &up->next;
up->pprev = hashent;
*hashent = up;
+
+ list_add(&up->list, &user_list);
}
static inline void uid_hash_remove(struct user_struct *up)
@@ -54,6 +59,8 @@
if (next)
next->pprev = pprev;
*pprev = next;
+
+ list_del(&up->list);
}
static inline struct user_struct *uid_hash_find(uid_t uid, struct user_struct **hashent)
@@ -101,6 +108,7 @@
atomic_set(&new->__count, 1);
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
+ new->cpu_ticks = NICE_TO_TICKS(0);
/*
* Before adding this, check whether we raced
@@ -120,6 +128,21 @@
return up;
}
+/* Fair scheduler, recalculate the per user cpu time cap. */
+void recalculate_peruser_cputicks(void)
+{
+ struct list_head * entry;
+ struct user_struct * user;
+
+ spin_lock(&uidhash_lock);
+ list_for_each(entry, &user_list) {
+ user = list_entry(entry, struct user_struct, list);
+ user->cpu_ticks = (user->cpu_ticks / 2) + NICE_TO_TICKS(0);
+ }
+ /* Needed hack, we can get called before uid_cache_init ... */
+ root_user.cpu_ticks = (root_user.cpu_ticks / 2) + NICE_TO_TICKS(0);
+ spin_unlock(&uidhash_lock);
+}
static int __init uid_cache_init(void)
{
--- linux/kernel/sysctl.c.fair 2002-09-21 00:00:36.000000000 -0300
+++ linux/kernel/sysctl.c 2002-09-21 20:40:49.000000000 -0300
@@ -50,6 +50,7 @@
extern int sysrq_enabled;
extern int core_uses_pid;
extern int cad_pid;
+extern int fairsched;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -256,6 +257,10 @@
{KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug",
&sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec},
#endif
+#ifdef CONFIG_FAIRSCHED
+ {KERN_FAIRSCHED, "fairsched", &fairsched, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+#endif
{0}
};
--- linux/include/linux/sched.h.fair 2002-09-20 10:59:03.000000000 -0300
+++ linux/include/linux/sched.h 2002-09-24 15:12:50.000000000 -0300
@@ -275,6 +275,10 @@
/* Hash table maintenance information */
struct user_struct *next, **pprev;
uid_t uid;
+
+ /* Linked list for for_each_user */
+ struct list_head list;
+ long cpu_ticks;
};
#define get_current_user() ({ \
@@ -282,6 +286,7 @@
atomic_inc(&__user->__count); \
__user; })
+extern void recalculate_peruser_cputicks(void);
extern struct user_struct root_user;
#define INIT_USER (&root_user)
@@ -422,6 +427,31 @@
};
/*
+ * Scheduling quanta.
+ *
+ * NOTE! The unix "nice" value influences how long a process
+ * gets. The nice value ranges from -20 to +19, where a -20
+ * is a "high-priority" task, and a "+10" is a low-priority
+ * task.
+ *
+ * We want the time-slice to be around 50ms or so, so this
+ * calculation depends on the value of HZ.
+ */
+#if HZ < 200
+#define TICK_SCALE(x) ((x) >> 2)
+#elif HZ < 400
+#define TICK_SCALE(x) ((x) >> 1)
+#elif HZ < 800
+#define TICK_SCALE(x) (x)
+#elif HZ < 1600
+#define TICK_SCALE(x) ((x) << 1)
+#else
+#define TICK_SCALE(x) ((x) << 2)
+#endif
+
+#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
+
+/*
* Per process flags
*/
#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
--- linux/include/linux/sysctl.h.fair 2002-09-21 20:41:18.000000000 -0300
+++ linux/include/linux/sysctl.h 2002-09-21 20:41:43.000000000 -0300
@@ -124,6 +124,7 @@
KERN_CORE_USES_PID=52, /* int: use core or core.%pid */
KERN_TAINTED=53, /* int: various kernel tainted flags */
KERN_CADPID=54, /* int: PID of the process to notify on CAD */
+ KERN_FAIRSCHED=55, /* int: turn fair scheduler on/off */
};
--- linux/arch/i386/config.in.fair 2002-09-21 20:42:06.000000000 -0300
+++ linux/arch/i386/config.in 2002-09-21 20:42:35.000000000 -0300
@@ -261,6 +261,9 @@
bool 'System V IPC' CONFIG_SYSVIPC
bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
bool 'Sysctl support' CONFIG_SYSCTL
+if [ "$CONFIG_EXPERIMENTAL" = "y" ] ; then
+ bool 'Fair scheduler' CONFIG_FAIRSCHED
+fi
if [ "$CONFIG_PROC_FS" = "y" ]; then
choice 'Kernel core (/proc/kcore) format' \
"ELF CONFIG_KCORE_ELF \
--- linux/arch/alpha/config.in.fair 2002-08-02 21:39:42.000000000 -0300
+++ linux/arch/alpha/config.in 2002-09-21 20:43:21.000000000 -0300
@@ -273,6 +273,9 @@
bool 'System V IPC' CONFIG_SYSVIPC
bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
bool 'Sysctl support' CONFIG_SYSCTL
+if [ "$CONFIG_EXPERIMENTAL" = "y" ] ; then
+ bool 'Fair scheduler' CONFIG_FAIRSCHED
+fi
if [ "$CONFIG_PROC_FS" = "y" ]; then
choice 'Kernel core (/proc/kcore) format' \
"ELF CONFIG_KCORE_ELF \
6 Exercises
This time we implemented a user-fair scheduler. As a follow-up exercise, implement a group-fair scheduler.
7 References