排程器7—TASK_UNINTERRUPTIBLE和TASK_INTERRUPTIBLE
一、D狀態簡介
1. D狀態的由來
__schedule(bool preempt) { ... if (prev != next) { trace_sched_switch(preempt, prev, next); } ... }
trace_sched_switch() 中若 prev->state 為 TASK_UNINTERRUPTIBLE,在解析後的 trace 上就顯示為 D 狀態。
只要將程序狀態設定為 TASK_UNINTERRUPTIBLE,然後觸發任務切換將當前任務切走,此時在解析後的trace上看prev執行緒就是D狀態的,若是 TASK_INTERRUPTIBLE,trace上看就是sleep狀態。UNINTERRUPTIBLE 的意思是不被訊號喚醒。
2. 使用邏輯
(1) 和 schedule_timeout 配合使用,定時器到期後由 process_timeout 函式呼叫 wake_up_process(timeout->task) 喚醒自己,喚醒函式中會將任務狀態設定為 TASK_RUNNING。
static int sdias_sclp_send(struct sclp_req *req) //sclp_sdias.c { for (...) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(500)); } }
(2) 和 hrtimer 配合使用
和 schedule_timeout 搭配使用的時間精度是 jiffies,精度太低。可以使用高精度定時器,定時器到期後使用 hrtimer_wakeup 來喚醒任務。
int jbd2_journal_stop(handle_t *handle) //transaction.c { ... ktime_t expires = ktime_add_ns(ktime_get(), commit_time); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); .... }
(3) 和等待佇列配合使用,當條件滿足時喚醒自己
init_waitqueue_head(&pp->wait); static int smu_release(struct inode *inode, struct file *file) //smu.c { ... DECLARE_WAITQUEUE(wait, current); add_wait_queue(&pp->wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); schedule(); if (pp->cmd.status != 1) break; } } remove_wait_queue(&pp->wait, &wait); ... } wake_up_all(&pp->wait);
先定義一個全域性等待佇列頭 wait_queue_head_t 結構,然後再定義一個 wait_queue_entry 結構來儲存需要喚醒的任務和指定喚醒函式 default_wake_function(預設),然後將 wait_queue_entry 掛在全域性連結串列 wait_queue_head_t 上,當條件滿足時呼叫 wake_up_all 相關函式喚醒全域性連結串列上的任務,任務喚醒後判斷條件是否滿足,滿足就退出,不滿足就切出任務繼續休眠。
注意這裡的 wait_queue_entry wait 是一個區域性變數,儲存在棧中,由於程序休眠後此函式沒有退出,沒有退棧,因此是沒有問題的。
3. 可以指定喚醒何種狀態的任務
int wake_up_state(struct task_struct *p, unsigned int state); int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, int sibling_count_hint); /* 常用的 wake_up_q 只用戶喚醒 interrupt 和 uninterruptable 型別的任務 */ void wake_up_q(struct wake_q_head *head) { try_to_wake_up(task, TASK_NORMAL, 0, 1); //TASK_NORMAL == (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) }
這裡有個引數 state,是個掩碼,只喚醒此時是這個掩碼包含狀態的任務,與它交集為空的任務不喚醒。
二、D狀態的使用機制
1. 大量驅動中進行自定義使用
就是上面三種使用方式,先 set_current_state(TASK_UNINTERRUPTIBLE) 然後再將任務切走,並等待喚醒。
2. swait/swakeup機制
__swait_XXX 函式進入等待,swake_up_XXX 喚醒,就是對上面機制的簡單封裝,見 swait.c/swait.h
3. wait/wakeup機制
wait_event_XXX 函式進入等待,__wake_up_XXX 喚醒,就是對上面機制的簡單封裝,見 wait.c/wait.h
4. wait_on_bit/wake_up_bit
wait_on_bit_XXX 函式進入等待,wake_up_bit 等函式喚醒,就是對上面機制的簡單封裝,見 wait_bit.c/wait_bit.h
5. semaphore
/* 使用的是 TASK_UNINTERRUPTIBLE */ extern void down(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); /* 使用的是 TASK_INTERRUPTIBLE */ extern int __must_check down_interruptible(struct semaphore *sem); /* 使用的是 TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) */ extern int __must_check down_killable(struct semaphore *sem); /* 只對 sem->count - 1 進行判斷 */ extern int __must_check down_trylock(struct semaphore *sem); /* 使用 list_first_entry(&sem->wait_list, ...) 只喚醒wait連結串列上的首個任務 */ extern void up(struct semaphore *sem);
6. rwsem
/* 使用的是 TASK_UNINTERRUPTIBLE */ void __sched down_read(struct rw_semaphore *sem); void __sched down_write(struct rw_semaphore *sem); /* 使用的是 TASK_KILLABLE */ int __sched down_read_killable(struct rw_semaphore *sem); int __sched down_write_killable(struct rw_semaphore *sem);
讀寫訊號量匯出的函式中只使用了 TASK_UNINTERRUPTIBLE 和 TASK_KILLABLE,沒有使用 TASK_INTERRUPTIBLE,實現見 rwsem.c
7. mutex
/* 使用的是 TASK_UNINTERRUPTIBLE */ void __sched mutex_lock(struct mutex *lock); /* 使用的是 TASK_INTERRUPTIBLE */ int __sched mutex_lock_interruptible(struct mutex *lock) /* 使用的是 TASK_KILLABLE */ int __sched mutex_lock_killable(struct mutex *lock)
8. rtmutex
/* 使用的是 TASK_UNINTERRUPTIBLE */ void __sched rt_mutex_lock(struct rt_mutex *lock) /* 使用的是 TASK_INTERRUPTIBLE */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
9. completion
/* 使用的是 TASK_UNINTERRUPTIBLE */ void __sched wait_for_completion(struct completion *x) unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout); void __sched wait_for_completion_io(struct completion *x) unsigned long __sched wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) /* 使用的是 TASK_INTERRUPTIBLE */ int __sched wait_for_completion_interruptible(struct completion *x) /* 使用的是 TASK_KILLABLE */ int __sched wait_for_completion_killable(struct completion *x) long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout)
10. futex 使用者空間鎖
/* 使用的是 TASK_INTERRUPTIBLE,然後使用 wake_up_q 喚醒 */ void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) //futex.c
注:以上是在 5.4 核心中檢索 TASK_UNINTERRUPTIBLE,然後刪除重複項得出來的,應該是比較全面。
三、測試例子
#define pr_fmt(fmt) "mytest: " fmt #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sysfs.h> #include <linux/string.h> #include <linux/wait.h> #include <linux/sched.h> #define mytest_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = 0644, \ }, \ .show = _name##_show, \ .store = _name##_store, \ } #define mytest_attr_ro(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = S_IRUGO, \ }, \ .show = _name##_show, \ } #define mytest_attr_wo(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = S_IWUGO, \ }, \ .store = _name##_store, \ } struct mytest { int tri_value; struct kobject *kobj; wait_queue_head_t uninter_wait; wait_queue_head_t inter_wait; wait_queue_head_t killable_wait; }; struct mytest test; ssize_t uninter_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test.tri_value != 1) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(&test.uninter_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); schedule(); pr_info("uninter pid=%d %d was waken up! state=0x%x\n", current->pid, ((struct task_struct *)wait.private)->pid, ((struct task_struct *)wait.private)->state); if (test.tri_value == 1) { break; } } remove_wait_queue(&test.uninter_wait, &wait); } return sprintf(buf, "%d\n", test.tri_value); } mytest_attr_ro(uninter); ssize_t inter_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test.tri_value != 2) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(&test.inter_wait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); schedule(); pr_info("inter pid=%d %d was waken up! 
state=0x%x\n", current->pid, ((struct task_struct *)wait.private)->pid, ((struct task_struct *)wait.private)->state); if (test.tri_value == 2) { break; } } remove_wait_queue(&test.inter_wait, &wait); } return sprintf(buf, "%d\n", test.tri_value); } mytest_attr_ro(inter); ssize_t killable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test.tri_value != 3) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(&test.killable_wait, &wait); for (;;) { set_current_state(TASK_KILLABLE); schedule(); pr_info("killable pid=%d %d was waken up! state=0x%x\n", current->pid, ((struct task_struct *)wait.private)->pid, ((struct task_struct *)wait.private)->state); if (test.tri_value == 3) { break; } } remove_wait_queue(&test.killable_wait, &wait); } return sprintf(buf, "%d\n", test.tri_value); } mytest_attr_ro(killable); ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int val; if (sscanf(buf, "%d", &val) != 1) { return -EINVAL; } test.tri_value = val; switch(test.tri_value) { case 1: wake_up_all(&test.uninter_wait); break; case 2: wake_up_all(&test.inter_wait); break; case 3: wake_up_all(&test.killable_wait); break; default: break; } return count; } mytest_attr_wo(trigger); static struct attribute *mytest_attrs[] = { &uninter_attr.attr, &inter_attr.attr, &killable_attr.attr, &trigger_attr.attr, NULL, }; static struct attribute_group mytest_attr_group = { .name = "mytest", .attrs = mytest_attrs, }; static int mytest_device_file_init(void) { int ret = 0; test.kobj = kobject_create_and_add("test", NULL); if (!test.kobj) { pr_info("kobject_create_and_add failed!\n"); return -ENOMEM; } ret = sysfs_create_group(test.kobj, &mytest_attr_group); if (ret) { pr_info("sysfs_create_group failed!\n"); return ret; } return ret; } static int __init mytest_init(void) { int ret; init_waitqueue_head(&test.uninter_wait); init_waitqueue_head(&test.inter_wait); init_waitqueue_head(&test.killable_wait); ret = 
mytest_device_file_init(); pr_info("mytest_init probed! ret=%d\n", ret); return ret; } static void __exit mytest_exit(void) { pr_info("mytest_exit removed\n"); } module_init(mytest_init); module_exit(mytest_exit); MODULE_LICENSE("GPL");
四、結論
大多數機制都是支援 interrupt 和 uninterrupt 的兩種進入等待方式的。核心中的鎖相關機制若無特殊標識,一般是使用 TASK_UNINTERRUPTIBLE,而使用者空間鎖機制,在核心中使用的是TASK_INTERRUPTIBLE 。
TODO:
但是由 signal_pending_state() 的實現可知,SIGKILL(9) 無法被遮蔽?
訊號量使用 __down_killable(),看 TASK_UNINTERRUPTIBLE 能否被 kill 喚醒?#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)