Linux epoll源碼註釋
Linux系統運行源碼剖析-epoll代碼註釋
理解了中斷、等待隊列、調度,你就能懂Linux的80%。
--老子
轉發的話,請註明出處哦:http://www.cnblogs.com/stonehat/
Linux系統內核提供了三個系統調用:
include/linux/syscalls.h
// epoll_create,創建epoll描述符 asmlinkage long sys_epoll_create(int size); // epoll_ctl, 操作epoll描述符,增刪改 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event); // epoll_wait, 你懂的 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout);
其函數實現在fs/eventpoll.c
eventpoll 本身也是一個支持poll操作的文件,所以可以把eventpoll組成一個樹形關系。
下面分別按照sys_epoll_create,sys_epoll_ctl,sys_epoll_wait的順序分析三個系統調用。
重要的結構體
// struct eventpoll: state of one epoll instance; sys_epoll_ctl() and
// sys_epoll_wait() fetch it from the epoll file's private_data.
struct eventpoll {
/* Protects access to this structure */
rwlock_t lock;
/*
* Kernel semaphore used for synchronization
* (sys_epoll_ctl() takes it for writing around add/delete/modify).
*/
struct rw_semaphore sem;
/**
* Wait queue used by epoll_wait(): the calling thread is parked on this queue.
*/
wait_queue_head_t wq;
/* Wait queue used by file->poll(); for an epoll file that function is ep_eventpoll_poll */
wait_queue_head_t poll_wait;
/* Ready list */
struct list_head rdllist;
/* Red-black tree root; ep_insert() adds each registered item to it via ep_rbtree_insert() */
struct rb_root rbr;
};
// struct file: a file as represented inside the kernel
struct file {
struct list_head f_list;
struct dentry *f_dentry;
struct vfsmount *f_vfsmnt;
// Pointer to the file's operations table
struct file_operations *f_op;
atomic_t f_count;
unsigned int f_flags;
mode_t f_mode;
int f_error;
loff_t f_pos;
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
struct file_ra_state f_ra;
unsigned long f_version;
void *f_security;
/* Private, implementation-defined data of the file (epoll keeps its struct eventpoll here) */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
/* Taken by ep_insert() around updates of f_ep_links */
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
};
/* Table of operations a file implementation can provide; epoll relies on the poll member. */
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
int (*readdir) (struct file *, void *, filldir_t);
// Non-blocking: checks the file's state (readable, writable, ...); if the condition is
// not met, pt is added to a wait queue. (That is the usual logic; the actual behavior
// is up to the device driver.)
unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, struct dentry *, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*dir_notify)(struct file *filp, unsigned long arg);
int (*flock) (struct file *, int, struct file_lock *);
};
概念與關系
- 文件描述符fd:進程打開的文件的數字代表形式,是文件指針的索引。
- struct file:在內核中表示進程打開的文件。task.files[fd]=file
- struct inode:靜態的文件表示。
一. sys_epoll_create
代碼如下:為了方便理解原理,無關緊要的代碼邏輯和異常處理刪掉了
/*
* sys_epoll_create - create a new epoll instance and return its descriptor.
*
* Abridged excerpt: the "....." lines stand for logic and error handling
* that the article removed, so the error values assigned below are not
* checked in the code shown here.
*/
asmlinkage long sys_epoll_create(int size)
{
int error, fd;
struct inode *inode;
struct file *file;
.....
/*
* Create a new file and inode and obtain the fd that maps to the file,
* adding the file to the current process's open-file list.
*/
error = ep_getfd(&fd, &inode, &file);
/* Create the struct eventpoll and hang it on the file's private_data pointer */
error = ep_file_init(file);
.....
return fd;
}
ep_getfd簡單流程
/*
* ep_getfd - build the file, inode and descriptor that back a new epoll instance.
*
* On success stores the new descriptor in *efd, the inode in *einode and the
* file in *efile, and returns 0. On failure returns the error code of the
* step that failed, after releasing whatever had been acquired before it.
*/
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
struct qstr this;
char name[32];
struct dentry *dentry;
struct inode *inode;
struct file *file;
int error, fd;
/* Get a ready-to-use file; error is preset so that a failure reports -ENFILE */
error = -ENFILE;
file = get_empty_filp();
if (!file)
goto eexit_1;
/* Allocates an inode from the eventpoll file system */
inode = ep_eventpoll_inode();
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto eexit_2;
/* Allocates a free descriptor to plug the file onto */
error = get_unused_fd();
if (error < 0)
goto eexit_3;
fd = error;
/*
* Link the inode to a directory entry by creating a unique name
* using the inode number.
*/
error = -ENOMEM;
sprintf(name, "[%lu]", inode->i_ino);
this.name = name;
this.len = strlen(name);
this.hash = inode->i_ino;
dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
if (!dentry)
goto eexit_4;
dentry->d_op = &eventpollfs_dentry_operations;
d_add(dentry, inode);
/* Fill in the file: read-only, using the epoll operations table, no private data yet */
file->f_vfsmnt = mntget(eventpoll_mnt);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_pos = 0;
file->f_flags = O_RDONLY;
file->f_op = &eventpoll_fops;
file->f_mode = FMODE_READ;
file->f_version = 0;
file->private_data = NULL;
/* Install the new setup file into the allocated fd. */
fd_install(fd, file);
*efd = fd;
*einode = inode;
*efile = file;
return 0;
/* Error unwinding: each label releases one resource and falls through to the next */
eexit_4:
put_unused_fd(fd);
eexit_3:
iput(inode);
eexit_2:
put_filp(file);
eexit_1:
return error;
}
查找一個沒有用的文件描述符。記為fd
創建一個空文件file結構體。記為epfile
在epoll的文件系統中創建一個inode
epfile和inode做關聯。
epfile的f_op成員(文件操作指針)和epoll的自定義函數組eventpoll_fops做關聯。比較重要的一點是eventpoll_fops有一個自定義的poll函數,這個函數很重要,是實現epoll級聯模型的關鍵。後面可以通過比較f_op是否等於eventpoll_fops來判斷file是不是epoll file。
/* Operations table that ep_getfd() installs as f_op on an epoll file; only release and poll are set here. */
static struct file_operations eventpoll_fops = {
.release = ep_eventpoll_close,
.poll = ep_eventpoll_poll
};
將epfile放到進程的打開文件列表中管理,用fd做索引。
初始化eventpoll結構,初始化等待隊列和就緒隊列等。
將epfile的private_data指向eventpoll結構。方便後面取eventpoll的數據。
返回給調用線程fd。
二、sys_epoll_ctl
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));
error = -EFAULT;
// 1. 從用戶空間拷貝event數據。
if (EP_OP_HASH_EVENT(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto eexit_1;
/* 2. 根據epollfile的文件描述符獲得對應的file結構體,內核中fd和file是有一個映射關系的*/
error = -EBADF;
file = fget(epfd);
if (!file)
goto eexit_1;
/* 3. 獲得要操作的描述符的file指針,例如socket描述符 */
tfile = fget(fd);
if (!tfile)
goto eexit_2;
/* 4. 校驗tfile是否支持poll操作,必須支持poll才能使用epoll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
goto eexit_3;
/*
* 5. 校驗是否是epoll的file指針
*/
error = -EINVAL;
if (file == tfile || !IS_FILE_EPOLL(file))
goto eexit_3;
/*
* 6. 取eventpoll,從創建時,我們知道epoll把自己的eventpoll結構體放在file->private_data了裏面。
*/
ep = file->private_data;
down_write(&ep->sem);
/* Try to lookup the file inside our hash table */
epi = ep_find(ep, tfile, fd);
// 7. 具體的邏輯操作
error = -EINVAL;
switch (op) {
// 添加
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
break;
// 刪除
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
// 修改
case EPOLL_CTL_MOD:
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
/*
* The function ep_find() increments the usage count of the structure
* so, if this is not NULL, we need to release it.
*/
if (epi)
ep_release_epitem(epi);
up_write(&ep->sem);
eexit_3:
fput(tfile);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
current, epfd, op, fd, event, error));
return error;
}
上面的邏輯很簡單
- 驗證輸入有效性
邏輯上,只需要了解添加即可。epoll的添加是理解整個流程的關鍵
epoll添加
/*
* ep_insert - register tfile/fd with the epoll instance ep.
*
* Allocates and initializes an epitem, polls the target file once so that
* ep_ptable_queue_proc() can hook the item onto the file's wait queue,
* links the item into the file's f_ep_links list and into the epoll
* rb-tree, and puts it on the ready list straight away when a requested
* event is already pending. Returns 0 on success, -ENOMEM on failure.
*/
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM;
if (!(epi = EPI_MEM_ALLOC()))
goto eexit_1;
/* Item initialization follow here ... */
EP_RB_INITNODE(&epi->rbn);
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
EP_SET_FFD(&epi->ffd, tfile, fd);
epi->event = *event;
atomic_set(&epi->usecnt, 1);
epi->nwait = 0;
/* Initialize the poll table: when poll is called, ep_ptable_queue_proc() is invoked to add the item to the wait queue */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Pass epq.pt to the target file's poll; poll ends up calling ep_ptable_queue_proc().
*/
revents = tfile->f_op->poll(tfile, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0)
goto eexit_2;
/* Under the target file's f_ep_lock, add this item to the file's list of epoll links.
*/
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);
/* Add the current item to the rb-tree */
ep_rbtree_insert(ep, epi);
/* If a requested event is already pending, queue the item as ready and wake the epoll_wait and poll wait queues */
if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
/* NOTE(review): psw is not declared in this excerpt; presumably a file-scope object of fs/eventpoll.c */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
eexit_2:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (EP_IS_LINKED(&epi->rdllink))
EP_LIST_DEL(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
EPI_MEM_FREE(epi);
eexit_1:
return error;
}
整理一下,向epoll添加一個描述符主要步驟如下:
構建epitem,epitem之後會加入到eventpoll.rbr中。
調用init_poll_funcptr,將ep_ptable_queue_proc函數指針賦值給poll_table的qproc,poll_table記為epq.pt,在file的poll函數中,可以傳入poll_table作為參數,poll函數會主動調用poll_table的qproc函數。
poll_table的結構體如下:
```C
/**
*@param f:poll的file指針
*@param whead f的等待隊列
*@param pt
/
typedef void (poll_queue_proc)(struct file f, wait_queue_head_t whead, struct poll_table_struct *pt);
typedef struct poll_table_struct {
poll_queue_proc qproc;
} poll_table;
```
poll函數原型
// When the upper layer passes in a pt structure, the driver's poll function should call
// poll_table_struct.qproc to add the waiter to its wait queue.
unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);
- 調用待監控的文件的poll函數,按第2步所說,poll函數規範的實現應該最終會調用到ep_ptable_queue_proc函數,ep_ptable_queue_proc主要是初始化一個等待隊列項(以ep_poll_callback為回調函數),然後將等待隊列項塞到驅動的等待隊列中。ep_ptable_queue_proc註釋如下:
/* One entry of a wait queue. */
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
// Task pointer; must be set when func is the default wake function.
struct task_struct * task;
// Function executed when the wait queue is woken up
wait_queue_func_t func;
struct list_head task_list;
};
typedef struct __wait_queue wait_queue_t;
/*
 * ep_ptable_queue_proc - queueing callback that ep_insert() stores in the
 * poll table before polling the target file.
 *
 * Allocates an eppoll_entry whose wake function is ep_poll_callback, adds it
 * to the wait queue whead and records it on the item's pwqlist. When an
 * earlier call already failed (nwait < 0) or the allocation fails, nwait is
 * set to -1 so that ep_insert() can detect the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	/*
	 * pt lives inside the struct ep_pqueue that ep_insert() set up next to
	 * the epitem pointer; the macro presumably recovers the item from it
	 * by container offset.
	 */
	struct epitem *item = EP_ITEM_FROM_EPQUEUE(pt);
	struct eppoll_entry *entry = NULL;

	/* Only try to allocate while no previous registration has failed. */
	if (item->nwait >= 0)
		entry = PWQ_MEM_ALLOC();

	if (!entry) {
		/* We have to signal that an error occurred */
		item->nwait = -1;
		return;
	}

	/*
	 * Set up a wait queue entry that runs ep_poll_callback when the queue
	 * is woken up; this callback is the key to the whole mechanism.
	 */
	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = item;

	/* Hook the new entry onto the wait queue and remember it on the item. */
	add_wait_queue(whead, &entry->wait);
	list_add_tail(&entry->llink, &item->pwqlist);
	item->nwait++;
}
/*
 * init_waitqueue_func_entry - prepare wait queue entry q so that func is the
 * function run on wake-up; no task is attached and no flags are set.
 */
static inline void init_waitqueue_func_entry(wait_queue_t *q,
					     wait_queue_func_t func)
{
	q->func = func;
	q->task = NULL;
	q->flags = 0;
}
- 至此,添加一個文件描述符到epoll監控內的流程完成了,總的來講,就是在對應的file中設置等待隊列,等待回調ep_poll_callback。至於對應的file用什麽機制來確保文件異步就緒,epoll不管。不過一般是通過中斷來實現的。
epoll模型的poll函數實現:
* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); typedef struct poll_table_struct { poll_queue_proc qproc; } poll_table; //poll_wait函數實現,其實內部調用了poll_table.qproc成員,poll_table.qproc在epoll中對應了上面的ep_ptable_queue_proc函數 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && wait_address) p->qproc(filp, wait_address, p); } // epollevent的poll函數實現,驅動的邏輯都差不多,有參考意義 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { unsigned int pollflags = 0; unsigned long flags; struct eventpoll *ep = file->private_data; /* 1. 加入等待隊列中*/ poll_wait(file, &ep->poll_wait, wait); /* Check our condition */ read_lock_irqsave(&ep->lock, flags); if (!list_empty(&ep->rdllist)) pollflags = POLLIN | POLLRDNORM; read_unlock_irqrestore(&ep->lock, flags); return pollflags; }
三、sys_epoll_wait
了解了ep_insert的話,這個其實就很容易理解了:
/* Operations table of an epoll file; sys_epoll_wait() accepts only files that pass IS_FILE_EPOLL(). */
static struct file_operations eventpoll_fops = {
.release = ep_eventpoll_close,
.poll = ep_eventpoll_poll
};
/*
 * sys_epoll_wait - entry point of the epoll_wait system call.
 *
 * Resolves epfd to its struct file, checks that it is an epoll file and
 * hands the actual waiting to ep_poll(). Returns ep_poll()'s result, -EBADF
 * when epfd has no open file, or -EINVAL when the file is not an epoll file.
 * (The argument validation code was left out of this excerpt.)
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int rc;
	struct file *epfile;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* Step 1: look up the file that belongs to epfd. */
	epfile = fget(epfd);
	if (!epfile) {
		rc = -EBADF;
	} else {
		/*
		 * Step 2: make sure it is an epoll file (per the article, the
		 * check compares f_op with eventpoll_fops).
		 */
		if (IS_FILE_EPOLL(epfile)) {
			/* Step 3: the eventpoll structure hangs off private_data. */
			struct eventpoll *ep = epfile->private_data;

			/*
			 * Step 4: ep_poll() does the real work. Despite the
			 * name it is not a poll implementation.
			 */
			rc = ep_poll(ep, events, maxevents, timeout);
		} else {
			rc = -EINVAL;
		}
		fput(epfile);
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, rc));

	return rc;
}
epoll_wait最終調用ep_poll來實現核心功能。
/*
* ep_poll - core of epoll_wait: wait until the ready list is non-empty, the
* timeout runs out or a signal is pending, then transfer events to user space.
*
* Returns the result of ep_events_transfer() when events were available,
* 0 when none were, or -EINTR when interrupted by a signal.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;
/*
* 1. The kernel keeps time in ticks, so convert the timeout into ticks.
* A timeout of -1, or one too large to convert, becomes MAX_SCHEDULE_TIMEOUT;
* otherwise the value is scaled by HZ / 1000, rounding up.
*/
jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;
retry:
write_lock_irqsave(&ep->lock, flags);
res = 0;
// 2. If the ready list is empty, wait
if (list_empty(&ep->rdllist)) {
/*
* 3. Add the thread calling epoll_wait to the wq wait queue; ep_poll_callback() wakes it up.
* current stands for the running thread and is ultimately obtained from the CPU.
*/
init_waitqueue_entry(&wait, current);
add_wait_queue(&ep->wq, &wait);
// Loop until something is ready, the timeout expires or a signal arrives.
for (;;) {
/*
* 4. Mark the task interruptible so that signals can be handled.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
// 5. Bail out with -EINTR when a signal is pending
if (signal_pending(current)) {
res = -EINTR;
break;
}
write_unlock_irqrestore(&ep->lock, flags);
// Sleep-like call whose return value is the remaining time. It switches this task off the CPU, so the next line does not run until the task is scheduled again.
jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
// Remove the calling thread from the wait queue.
remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
eavail = !list_empty(&ep->rdllist);
write_unlock_irqrestore(&ep->lock, flags);
/*
* Transfer the events back to user space; when nothing was transferred
* and time is left, go back and wait again.
*/
if (!res && eavail &&
!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
goto retry;
return res;
}
ep_poll的步驟如下:
轉換超時時間為cpu滴答計數。
查詢就緒隊列是否就緒,如果有就緒的,就直接返回給上層。
如果沒有就緒的,就等待。
a. 把調用線程添加到eventpoll.wq隊列中。
b. 設置自身為可打斷狀態
c. 檢查現在是否有就緒,有的話就直接返給上層。
d. 處理信號。
e. 發起調度,將自身切換為阻塞狀態。等待被喚醒。喚醒的方式有:ep_poll_callback喚醒eventpoll.wq隊列或者其他中斷喚醒。ep_poll_callback是sys_epoll_ctl添加epoll監聽的時候設置的等待隊列回調。其實現為:
/*
* ep_poll_callback - wake function that ep_ptable_queue_proc() installs on a
* monitored file's wait queue. Puts the item on the epoll ready list and
* wakes the waiters of the epoll instance. Always returns 1.
*
* Abridged excerpt: the "...." line stands for code the article removed
* (the jumps to the is_linked / is_disabled labels are not shown).
*/
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
// 1. Special macro: wait is embedded in the eppoll_entry that ep_ptable_queue_proc() set up
//    (entry->base points to the epitem), so the item is presumably recovered by container offset.
struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
// 2. Get the eventpoll the item belongs to
struct eventpoll *ep = epi->ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
current, epi->file, epi, ep));
write_lock_irqsave(&ep->lock, flags);
....
// 3. Add the ready item to the ready list
list_add_tail(&epi->rdllink, &ep->rdllist);
is_linked:
/*
* 4. Wake the wq wait queue (i.e. the threads blocked in epoll_wait)
*/
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
is_disabled:
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
return 1;
}
Linux epoll源碼註釋