



select、poll、epoll这三个都是对内核poll机制的封装。




//好像是 linux-5.2

#ifndef __sigset_t_defined
#define __sigset_t_defined
typedef __sigset_t sigset_t;

//epoll_create()/epoll_create1() 可能返回的错误:
EINVAL : 无效的标志
EMFILE : 用户打开的文件超过了限制
ENFILE : 系统打开的文件超过了限制
ENOMEM : 没有足够的内存完成当前操作
//epoll_ctl() 可能返回的错误:
EBADF : epfd或者fd不是一个有效的文件描述符
EEXIST : op为EPOLL_CTL_ADD,但fd已经被监控
EINVAL : epfd是无效的epoll文件描述符
ENOMEM : 没有足够的内存完成当前操作
ENOSPC : epoll实例超过了/proc/sys/fs/epoll/max_user_watches中限制的监听数量
//epoll_wait() 可能返回的错误:
EBADF : epfd不是一个有效的文件描述符
EFAULT : events指向的内存无权访问
EINTR : 在请求事件发生或者过期之前,调用被信号打断
EINVAL : epfd是无效的epoll文件描述符
EPOLL_CLOEXEC = 02000000,
//EPOLL_CLOEXEC 它是fd的一个标识说明,用来设置文件close-on-exec状态的。

EPOLLIN = 0x001,
EPOLLPRI = 0x002,
EPOLLOUT = 0x004,
EPOLLMSG = 0x400,
EPOLLERR = 0x008,
//这个事件是默认的 后续代码有 epds.events |= EPOLLERR | EPOLLHUP;
EPOLLHUP = 0x010,
//这个事件是默认的 后续代码有 epds.events |= EPOLLERR | EPOLLHUP;
EPOLLRDHUP = 0x2000,
EPOLLONESHOT = (1 << 30),
//设置关联的fd为one-shot的工作方式。
//表示只监听一次事件,如果要再次监听,需要重置这个socket上的EPOLLONESHOT事件。
//使用场合:多线程环境
//如果主线程在epoll_wait返回了套接字conn,之后子线程1在处理conn,主线程回到epoll_wait,
//但还没等到子线程1返回conn又可读了,此时主线程epoll_wait返回,又分配给另一个线程,此时两个线程同时使用一个套接字,这当然是不行的,
//出现了两个线程同时操作一个socket的局面。
//可以使用epoll的EPOLLONESHOT事件实现一个socket连接在任一时刻都只被一个线程处理。
//作用:
// 对于注册了EPOLLONESHOT事件的文件描述符,操作系统最多触发其上注册的一个可读,可写或异常事件,且只能触发一次。
//使用:
// 注册了EPOLLONESHOT事件的socket一旦被某个线程处理完毕,
// 该线程就应该立即重置这个socket上的EPOLLONESHOT事件,以确保这个socket下一次可读时,
// (即在该线程重置此套接字之前:主线程不允许返回任何关于此套接字的事件,这样就做到同一时刻只可能有一个线程处理该套接字)
// 其EPOLLIN事件能被触发,进而让其他工作线程有机会继续处理这个socket。
//效果:
// 尽管一个socket在不同时间可能被不同的线程处理,但同一时刻肯定只有一个线程在为它服务,这就保证了连接的完整性,从而避免了很多可能的竞态条件。
//也可以使用add,并忽略epoll_ctl()返回的错误码EEXIST来重置。(??还没试过)
//EPOLLONESHOT优先于水平触发(默认)的处理,即同时设置水平触发和EPOLLONESHOT并不会把epi添加到ready链表。
//如果设置了EPOLLONESHOT标志位,则设置epi->event.events &= EP_PRIVATE_BITS,
//因而通过epoll_ctl进行ADD操作后会提示File exists错误。
EPOLLET = (1 << 31)
//设置关联的fd为ET的工作方式,epoll的默认工作方式是LT。(LT/ET触发)LT水平触发 ET边缘触发
#define EPOLLET EPOLLET
//LT模式是epoll默认的工作方式
//LT模式状态时,主线程正在epoll_wait等待事件时,请求到了,epoll_wait返回后没有去处理请求(recv),
//那么下次epoll_wait时此请求还是会返回(立刻返回了);
//而ET模式状态下,这次没处理,下次epoll_wait时将不返回(所以我们应该每次一定要处理)
//本质的区别在于:没有设置EPOLLET(即LT模式)的fd在wait把事件发送到用户空间之后,会重新挂回到就绪队列中,
//等待下次wait返回(会重新查看每个socket是否真的有数据,并不是挂上去就绪队列了就返回);而设置了EPOLLET的fd不会被重新挂回。
//可查找这段代码if (!(epi->event.events & EPOLLET)) 仔细研读 看清楚有个!

#define EPOLL_CTL_ADD 1
// 注册目标fd到epfd中,同时关联内部event到fd上
#define EPOLL_CTL_DEL 2
// 从epfd中删除/移除已注册的fd,event可以被忽略,也可以为NULL
#define EPOLL_CTL_MOD 3
// 修改已经注册到fd的监听事件

struct epoll_event
uint32_t events;
epoll_data_t data;
} __attribute__((__packed__));

typedef union epoll_data
void *ptr;//指定与fd相关的用户数据
int fd; //指定事件所从属的目标文件描述符
uint32_t u32;
uint64_t u64;
} epoll_data_t;

extern int epoll_create(int __size) __THROW;
//创建一个epoll实例。 返回新实例的fd。
extern int epoll_create1(int __flags) __THROW;
//与epoll_create相同,但带有FLAGS参数。 无用的SIZE参数已被删除。

extern int epoll_ctl(int __epfd, int __op, int __fd,
struct epoll_event *__event) __THROW;
//epoll实例“epfd”。 成功时返回0,错误时返回-1(“errno”变量将包含特定错误代码)
//“op”参数是上面定义的EPOLL_CTL_ *常量之一。

extern int epoll_wait(int __epfd, struct epoll_event *__events,
int __maxevents, int __timeout);
//返回值为“events”缓冲区中返回的触发事件数。 如果出错,“errno”变量设置为特定错误代码,则返回-1。
//“timeout”参数指定最长等待时间(以毫秒为单位)(-1 ==无限)。

extern int epoll_pwait(int __epfd, struct epoll_event *__events,
int __maxevents, int __timeout,
__const __sigset_t *__ss);


// 1) epmutex (mutex)
// 2) ep->mtx (mutex)
// 3) ep->wq.lock (spinlock)
// 尤其是调用spin_lock_irqsave()的时候, 中断关闭, 不会发生进程调度,
// 被保护的资源其它CPU也无法访问。 这个锁是很强力的, 所以只能锁一些
// 非常轻量级的操作。


//将一个epoll fd插入另一个epoll中时也会获得epmutex。
//当一个epoll fd被添加到另一个epoll fd时,有必要同时获得多个“ep->mtx”。(最多4层嵌套)
//在这种情况下,我们总是按嵌套顺序获取锁(即在epoll_ctl(e1,EPOLL_CTL_ADD,e2)之后,e1->mtx将始终在e2->mtx之前获取)。
//也可以去掉“ep->mtx”,只使用全局mutex“epmutex”(与“ep->wq.lock”一起)来工作,
//但是保留“ep->mtx”会使接口更具可扩展性。 需要持有“epmutex”的事件非常罕见,
//而对于正常操作,epoll私有的“ep->mtx”将保证更好的可扩展性。

//RCU(Read - Copy Update),顾名思义就是读 - 拷贝修改,它是基于其原理命名的。



//在深入了解epoll的实现之前, 先来了解内核的3个方面.
// 1. 等待队列 waitqueue
// 我们简单解释一下等待队列:
// 队列头(wait_queue_head_t)往往是资源生产者,
// 队列成员(wait_queue_t)往往是资源消费者,
// 当头的资源ready后, 会逐个执行每个成员指定的回调函数,
// 来通知它们资源已经ready了, 等待队列大致就这个意思.
// 2. 内核的poll机制
// 被Poll的fd, 必须在实现上支持内核的Poll技术,
// 比如fd是某个字符设备,或者是个socket, 它必须实现
// file_operations中的poll操作, 给自己分配有一个等待队列头.
// 主动poll fd的某个进程必须分配一个等待队列成员, 添加到
// fd的等待队列里面去, 并指定资源ready时的回调函数.
// 用socket做例子, 它必须有实现一个poll操作, 这个Poll是
// 发起轮询的代码必须主动调用的, 该函数中必须调用poll_wait(),
// poll_wait会将发起者作为等待队列成员加入到socket的等待队列中去.
// 这样socket发生状态变化时可以通过队列头逐个通知所有关心它的进程.
// 这一点必须很清楚的理解, 否则会想不明白epoll是如何
// 得知fd的状态发生变化的.
// 3. epollfd本身也是个fd, 所以它本身也可以被epoll,
// (最多4层嵌套)EP_MAX_NESTS


//如果设置了EPOLLONESHOT标志位,则设置epi->event.events &= EP_PRIVATE_BITS,


#define EP_MAX_NESTS 4//指最多4层epoll嵌套
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
#define EP_UNACTIVE_PTR ((void *) -1L)
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
struct file *file;
int fd;

struct nested_call_node {
struct list_head llink;
void *cookie;
void *ctx;

struct nested_calls {
struct list_head tasks_call_list;
spinlock_t lock;

//epitem 表示一个被监听的fd
struct epitem {
union {
struct rb_node rbn;
//rb_node, 当使用epoll_ctl()将多个fds加入到某个epollfd时, 内核会用slab分配
//多个epitem与fds对应, 而且它们以红黑树的形式组织起来,
//tree的root保存在eventpoll rbr中.
struct rcu_head rcu;

struct list\_head rdllink;  

struct epitem \*next;  

struct epoll\_filefd ffd;  
//epitem对应的fd和struct file

/\* Number of active wait queue attached to poll operations \*/  
int nwait;  

/\* List containing poll wait queues \*/  
struct list\_head pwqlist;  

// 同一个文件上可能会监视多种事件,  
// 这些事件可能属于不同的wait\_queue中  
// (取决于对应文件类型的实现),  
// 所以需要使用链表  
struct eventpoll \*ep;  

struct list\_head fllink;  
//把一个file加到了两个epoll中(file 的 f\_ep\_links链表中会有两个epitem的fllink)

/\* wakeup\_source used when EPOLLWAKEUP is set \*/  
struct wakeup\_source \_\_rcu\* ws;  

struct epoll\_event event;  


* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
* Access to it is protected by the lock inside wq.
//这个结构存储在file->private_data。是每个epoll fd(epfd)对应的主要数据结构
struct eventpoll {
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
struct mutex mtx;
//添加, 修改或者删除监听fd的时候, 以及epoll_wait返回, 向用户空间传递数据时都会持有这个互斥锁,
//所以在用户空间可以放心的在多个线程中同时执行epoll相关的操作, 内核级已经做了保护.

wait\_queue\_head\_t wq;  

/\* Wait queue used by file->poll() \*/  
wait\_queue\_head\_t poll\_wait;  

struct list\_head rdllist;  

struct rb\_root\_cached rbr;  

struct epitem \*ovflist;  

/\* wakeup\_source used when ep\_scan\_ready\_list is running \*/  
struct wakeup\_source \*ws;  

struct user\_struct \*user;  
//这里保存了一些用户变量, 比如fd监听数量的最大值等等

struct file \*file;  

/\* used to optimize loop detection check \*/  
int visited;  
struct list\_head visited\_list\_link;  

/* Wait structure used by the poll hooks */

struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct list_head llink;
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;
* Wait queue item that will be linked to the target file wait
* queue head.
wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;

* Configuration options available inside /proc/sys/fs/epoll/
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;
* This mutex is used to serialize ep_free() and eventpoll_release_file().
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
static LIST_HEAD(visited_list);

* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
static LIST_HEAD(tfile_check_list);

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
return f->f_op == &eventpoll_fops;

static inline void ep_set_ffd(struct epoll_filefd *ffd,
struct file *file, int fd)
ffd->file = file;
ffd->fd = fd;

static inline int ep_cmp_ffd(struct epoll_filefd *p1,
struct epoll_filefd *p2)
return (p1->file > p2->file ? +1 :
(p1->file < p2->file ? -1 : p1->fd - p2->fd));

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
return !list_empty(&epi->rdllink);

//container_of 通过结构体变量中某个成员的首地址进而获得整个结构体变量的首地址。
//container_of(ptr, type, member)
//ptr : 表示结构体中member的地址
//type : 表示结构体类型
//member : 表示结构体中的成员
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
return container_of(p, struct eppoll_entry, wait);

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
return container_of(p, struct eppoll_entry, wait)->base;

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
return op != EPOLL_CTL_DEL;

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)

* ep_events_available - Checks if ready events might be available.
* @ep: Pointer to the eventpoll context.
* Returns: Returns a value different than zero if ready events are available,
* or zero otherwise.
static inline int ep_events_available(struct eventpoll *ep)
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;

static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)

* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
* the same nested call (by the meaning of same cookie) is
* not re-entered.
* @ncalls: Pointer to the nested_calls structure to be used for this call.
* @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
* @ctx: This instance context.
* Returns: Returns the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded.

static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
int(*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
int error, call_nests = 0;
unsigned long flags;
struct list_head *lsthead = &ncalls->tasks_call_list;
struct nested_call_node *tncur;
struct nested_call_node tnode;

spin\_lock\_irqsave(&ncalls->lock, flags);

\* Try to see if the current task is already inside this wakeup call.  
\* We use a list here, since the population inside this set is always  
\* very much limited.  
list\_for\_each\_entry(tncur, lsthead, llink) {  
    if (tncur->ctx == ctx &&  
        (tncur->cookie == cookie || ++call\_nests > max\_nests)) {  
        \* Ops ... loop detected or maximum nest level reached.  
        \* We abort this wake by breaking the cycle itself.  
        error = -1;  
        goto out\_unlock;  

/\* Add the current task and cookie to the list \*/  
tnode.ctx = ctx;  
tnode.cookie = cookie;  
list\_add(&tnode.llink, lsthead);

spin\_unlock\_irqrestore(&ncalls->lock, flags);

/\* Call the nested function \*/  
error = (\*nproc)(priv, cookie, call\_nests);

/\* Remove the current task from the list \*/  
spin\_lock\_irqsave(&ncalls->lock, flags);  

spin_unlock_irqrestore(&ncalls->lock, flags);

return error;  


* As described in commit 0ccf831cb lockdep: annotate epoll
* the use of wait queues used by epoll is done in a very controlled
* manner. Wake ups can nest inside each other, but are never done
* with the same locking. For example:
* dfd = socket(…);
* efd1 = epoll_create();
* efd2 = epoll_create();
* epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, …);
* epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, …);
* When a packet arrives to the device underneath "dfd", the net code will
* issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
* callback wakeup entry on that queue, and the wake_up() performed by the
* "dfd" net code will end up in ep_poll_callback(). At this point epoll
* (efd1) notices that it may have some event ready, so it needs to wake up
* the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
* that ends up in another wake_up(), after having checked about the
* recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
* avoid stack blasting.
* When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
* this special case of epoll.

static void ep_poll_safewake(wait_queue_head_t *wq)
wake_up_poll(wq, EPOLLIN);

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
wait_queue_head_t *whead;
* If it is cleared by POLLFREE, it should be rcu-safe.
* If we read NULL we need a barrier paired with
* smp_store_release() in ep_poll_callback(), otherwise
* we rely on whead->lock.
whead = smp_load_acquire(&pwq->whead);
if (whead)
remove_wait_queue(whead, &pwq->wait);

* This function unregisters poll callbacks from the associated file
* descriptor. Must be called with "mtx" held (or "epmutex" if called from
* ep_free).
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
struct list_head *lsthead = &epi->pwqlist;
struct eppoll_entry *pwq;

while (!list\_empty(lsthead)) {  
    pwq = list\_first\_entry(lsthead, struct eppoll\_entry, llink);

    kmem\_cache\_free(pwq\_cache, pwq);  


//__pm_stay_awake,通知PM core,ws产生了wakeup event,且正在处理,因此不允许系统suspend(stay awake);
//PM 电源管理
void __pm_stay_awake(wakeup_source *ws);
/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
struct wakeup_source *ws = ep_wakeup_source(epi);

if (ws)  


static inline bool ep_has_wakeup_source(struct epitem *epi)
return rcu_access_pointer(epi->ws) ? true : false;

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
struct wakeup_source *ws;

ws = rcu\_dereference(epi->ws);  
if (ws)  


static void epi_rcu_free(struct rcu_head *head)
struct epitem *epi = container_of(head, struct epitem, rcu);
kmem_cache_free(epi_cache, epi);

static void ep_free(struct eventpoll *ep)
struct rb_node *rbp;
struct epitem *epi;

/\* We need to release all tasks waiting for these file \*/  
//waitqueue\_active(): 如果等待队列为空(没有等待者)则返回0; 如果不为空则返回1  
if (waitqueue\_active(&ep->poll\_wait))  

\* We need to lock this because we could be hit by  
\* eventpoll\_release\_file() while we're freeing the "struct eventpoll".  
\* We do not need to hold "ep->mtx" here because the epoll file  
\* is on the way to be removed and no one has references to it  
\* anymore. The only hit might come from eventpoll\_release\_file() but  
\* holding "epmutex" is sufficient here.  

\* Walks through the whole tree by unregistering poll callbacks.  
for (rbp = rb\_first\_cached(&ep->rbr); rbp; rbp = rb\_next(rbp)) {  
    epi = rb\_entry(rbp, struct epitem, rbn);

    ep\_unregister\_pollwait(ep, epi);  

\* Walks through the whole tree by freeing each "struct epitem". At this  
\* point we are sure no poll callbacks will be lingering around, and also by  
\* holding "epmutex" we can be sure that no file cleanup code will hit  
\* us during this operation. So we can avoid the lock on "ep->wq.lock".  
\* We do not need to lock ep->mtx, either, we only do it to prevent  
\* a lockdep warning.  

while ((rbp = rb\_first\_cached(&ep->rbr)) != NULL) {  
    epi = rb\_entry(rbp, struct epitem, rbn);  
    ep\_remove(ep, epi);  


static int ep_eventpoll_release(struct inode *inode, struct file *file)
struct eventpoll *ep = file->private_data;

if (ep)  

return 0;  


static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
struct epitem *epi, *tmp;
poll_table pt;
int depth = *(int *)priv;
init_poll_funcptr(&pt, NULL);
//list_for_each_entry_safe(pos, n, head, member),
list_for_each_entry_safe(epi, tmp, head, rdllink) {
if (ep_item_poll(epi, &pt, depth)) {
else {
* Item has been dropped into the ready list by the poll
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
//通知PM core,ws没有正在处理的wakeup event,允许系统suspend(relax)

return 0;  


* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
//当epoll主动poll某个fd时, 用来将epitem与指定的fd关联起来(将epitem加入到指定文件的wait队列).


static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;

if (epi->nwait >= 0 && (pwq = kmem\_cache\_alloc(pwq\_cache, GFP\_KERNEL))) {

    //初始化等待队列, 指定ep\_poll\_callback为唤醒时的回调函数,  
    //当我们监听的fd发生状态改变时, 也就是队列头被唤醒时, 这个回调函数就会被调用.  
    init\_waitqueue\_func\_entry(&pwq->wait, ep\_poll\_callback);  
    pwq->whead = whead;  
    pwq->base = epi;  
    //add\_wait\_queue() 用来将一个进程添加到等待队列  
    //add\_wait\_queue\_exclusive()将进程插入到队列尾部,同时还设置了 WQ\_EXCLUSIVE 标志。  
    if (epi->event.events & EPOLLEXCLUSIVE)  
        add\_wait\_queue\_exclusive(whead, &pwq->wait);  
        add\_wait\_queue(whead, &pwq->wait);  
    list\_add\_tail(&pwq->llink, &epi->pwqlist);  
else {  
    epi->nwait = -1;  


* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
//这个是关键性的回调函数, 当我们监听的fd发生状态改变时, 它会被调用,
//参数key被当作一个unsigned long整数使用, 携带的是events.


static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
int ewake = 0;

spin\_lock\_irqsave(&ep->wq.lock, flags);


\* If the event mask does not contain any poll(2) event, we consider the  
\* descriptor to be disabled. This condition is likely the effect of the  
\* EPOLLONESHOT bit that disables the descriptor when an event is received,  
\* until the next EPOLL\_CTL\_MOD will be issued.  
if (!(epi->event.events & ~EP\_PRIVATE\_BITS))  
    goto out\_unlock;

\* Check the events coming with the callback. At this stage, not  
\* every device reports the events in the "key" parameter of the  
\* callback. We need to be able to handle both cases here, hence the  
\* test for "key" != NULL before the event match test.  
//检查回调附带的事件。 在此阶段,并非每个设备都在回调的“key”参数中报告事件。  
//我们需要能够在这里处理这两种情况,因此在事件匹配测试之前测试“key”!= NULL。

if (pollflags && !(pollflags & epi->event.events))  
    goto out\_unlock;

\* If we are transferring events to userspace, we can hold no locks  
\* (because we're accessing user memory, and because of linux f\_op->poll()  
\* semantics). All the events that happen during that period of time are  
\* chained in ep->ovflist and requeued later on.  

//如果ep->ovflist != EP\_UNACTIVE\_PTR说明此时正在扫描rdllist链表, 这时不直接把epi加入rdllist, 而是先把它链到ep->ovflist上, 等扫描结束后再重新放回rdllist.  
if (unlikely(ep->ovflist != EP\_UNACTIVE\_PTR)) {  
    if (epi->next == EP\_UNACTIVE\_PTR) {  
        epi->next = ep->ovflist;  
        ep->ovflist = epi;  
        if (epi->ws) {  
            \* Activate ep->ws since epi->ws may get  
            \* deactivated at any time.  
    goto out\_unlock;  

/\* If this file is already in the ready list we exit soon \*/  
if (!ep\_is\_linked(epi)) {  
    list\_add\_tail(&epi->rdllink, &ep->rdllist);  

\* Wake up ( if active ) both the eventpoll wait list and the ->poll()  
\* wait list.  
if (waitqueue\_active(&ep->wq)) {  
    if ((epi->event.events & EPOLLEXCLUSIVE) &&  
        !(pollflags & POLLFREE)) {  
        switch (pollflags & EPOLLINOUT\_BITS) {  
        case EPOLLIN:  
            if (epi->event.events & EPOLLIN)  
                ewake = 1;  
        case EPOLLOUT:  
            if (epi->event.events & EPOLLOUT)  
                ewake = 1;  
        case 0:  
            ewake = 1;  
//如果该epoll也被poll(即其他epoll添加了该epoll fd,嵌套epoll), 那就唤醒poll在该epoll上的等待epoll队列  
if (waitqueue\_active(&ep->poll\_wait))  

spin_unlock_irqrestore(&ep->wq.lock, flags);

/\* We have to call this outside the lock \*/  
if (pwake)  

if (!(epi->event.events & EPOLLEXCLUSIVE))  
    ewake = 1;

if (pollflags & POLLFREE) {  
    \* If we race with ep\_remove\_wait\_queue() it can miss  
    \* ->whead = NULL and do another remove\_wait\_queue() after  
    \* us, so we can't use \_\_remove\_wait\_queue().  
    \* ->whead != NULL protects us from the race with ep\_free()  
    \* or ep\_remove(), ep\_remove\_wait\_queue() takes whead->lock  
    \* held by the caller. Once we nullify it, nothing protects  
    \* ep/epi or even wait.  
    smp\_store\_release(&ep\_pwq\_from\_wait(wait)->whead, NULL);  

return ewake;  




static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
struct eventpoll *ep;
bool locked;
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
return vfs_poll(epi->ffd.file, pt) & epi->event.events;

ep = epi->ffd.file->private\_data;  
poll\_wait(epi->ffd.file, &ep->poll\_wait, pt);  
locked = pt && (pt->\_qproc == ep\_ptable\_queue\_proc);

return ep\_scan\_ready\_list(epi->ffd.file->private\_data,  
    ep\_read\_events\_proc, &depth, depth,  
    locked) & epi->event.events;  


//epoll文件的 poll机制
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
struct eventpoll *ep = file->private_data;
int depth = 0;

/\* Insert inside our poll wait queue \*/  
poll\_wait(file, &ep->poll\_wait, wait);

\* Proceed to find out if wanted events are really available inside  
\* the ready list.  
return ep\_scan\_ready\_list(ep, ep\_read\_events\_proc,  
    &depth, depth, false);  


/* File callbacks that implement the eventpoll file behaviour */
//epoll文件支持的操作 主要是poll
static const struct file_operations eventpoll_fops = {
.show_fdinfo = ep_show_fdinfo,
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,

* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
* closed without being removed from the eventpoll interface.
void eventpoll_release_file(struct file *file)
struct eventpoll *ep;
struct epitem *epi, *next;

\* We don't want to get "file->f\_lock" because it is not  
\* necessary. It is not necessary because we're in the "struct file"  
\* cleanup path, and this means that no one is using this file anymore.  
\* So, for example, epoll\_ctl() cannot hit here since if we reach this  
\* point, the file counter already went to zero and fget() would fail.  
\* The only hit might come from ep\_free() but by holding the mutex  
\* will correctly serialize the operation. We do need to acquire  
\* "ep->mtx" after "epmutex" because ep\_remove() requires it when called  
\* from anywhere but ep\_free().  
\* Besides, ep\_remove() acquires the lock, so we can't hold it here.  
list\_for\_each\_entry\_safe(epi, next, &file->f\_ep\_links, fllink) {  
    ep = epi->ep;  
    mutex\_lock\_nested(&ep->mtx, 0);  
    ep\_remove(ep, epi);  

static int ep_alloc(struct eventpoll **pep)
int error;
struct user_struct *user;
struct eventpoll *ep;
//获取当前进程的一些信息, 比如是不是root啦, 最大监听fd数目啦
user = get_current_user();
error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
if (unlikely(!ep))
goto free_uid;
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
*pep = ep;
return 0;
return error;

* Search the file inside the eventpoll tree. The RB tree operations
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
int kcmp;
struct rb_node *rbp;
struct epitem *epi, *epir = NULL;
struct epoll_filefd ffd;

ep\_set\_ffd(&ffd, file, fd);  
for (rbp = ep->rbr.rb\_root.rb\_node; rbp; ) {  
    epi = rb\_entry(rbp, struct epitem, rbn);  
    kcmp = ep\_cmp\_ffd(&ffd, &epi->ffd);  
    if (kcmp > 0)  
        rbp = rbp->rb\_right;  
    else if (kcmp < 0)  
        rbp = rbp->rb\_left;  
    else {  
        epir = epi;  
return epir;  


* This is the callback that is used to add our wait queue to the
* target file wakeup lists.

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
int kcmp;
struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
struct epitem *epic;
bool leftmost = true;

while (\*p) {  
    parent = \*p;  
    epic = rb\_entry(parent, struct epitem, rbn);  
    kcmp = ep\_cmp\_ffd(&epi->ffd, &epic->ffd);  
    if (kcmp > 0) {  
        p = &parent->rb\_right;  
        leftmost = false;  
        p = &parent->rb\_left;  
rb\_link\_node(&epi->rbn, parent, p);  
rb\_insert\_color\_cached(&epi->rbn, &ep->rbr, leftmost);  


#define PATH_ARR_SIZE 5
* These are the number paths of length 1 to 5, that we are allowing to emanate
* from a single file of interest. For example, we allow 1000 paths of length
* 1, to emanate from each file of interest. This essentially represents the
* potential wakeup paths, which need to be limited in order to avoid massive
* uncontrolled wakeup storms. The common use case should be a single ep which
* is connected to n file sources. In this case each file source has 1 path
* of length 1. Thus, the numbers below should be more than sufficient. These
* path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
* and delete can't add additional paths. Protected by the epmutex.
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

static int path_count_inc(int nests)
/* Allow an arbitrary number of depth 1 paths */
if (nests == 0)
return 0;

if (++path\_count\[nests\] > path\_limits\[nests\])  
    return -1;  
return 0;  


static void path_count_init(void)
int i;

for (i = 0; i < PATH\_ARR\_SIZE; i++)  
    path\_count\[i\] = 0;  

static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
int error = 0;
struct file *file = priv;
struct file *child_file;
struct epitem *epi;

/\* CTL\_DEL can remove links here, but that can't increase our count \*/  
list\_for\_each\_entry\_rcu(epi, &file->f\_ep\_links, fllink)  
    child\_file = epi->ep->file;  
    if (is\_file\_epoll(child\_file)) {  
        if (list\_empty(&child\_file->f\_ep\_links)) {  
            if (path\_count\_inc(call\_nests)) {  
                error = -1;  
        else {  
            error = ep\_call\_nested(&poll\_loop\_ncalls,  
                child\_file, child\_file,  
        if (error != 0)  
    else {  
        printk(KERN\_ERR "reverse\_path\_check\_proc: "  
            "file is not an ep!\\n");  
return error;  


* reverse_path_check - The tfile_check_list is list of file *, which have
* links that are proposed to be newly added. We need to
* make sure that those added links don't add too many
* paths such that we will spend all our time waking up
* eventpoll objects.
* Returns: Returns zero if the proposed links don't create too many paths,
* -1 otherwise.
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int reverse_path_check(void)
int error = 0;
struct file *current_file;

/\* let's call this for all tfiles \*/  
list\_for\_each\_entry(current\_file, &tfile\_check\_list, f\_tfile\_llink) {  
    error = ep\_call\_nested(&poll\_loop\_ncalls, EP\_MAX\_NESTS,  
        reverse\_path\_check\_proc, current\_file,  
        current\_file, current);  
    if (error)  
return error;  


static int ep_create_wakeup_source(struct epitem *epi)
const char *name;
struct wakeup_source *ws;

if (!epi->ep->ws) {  
    epi->ep->ws = wakeup\_source\_register("eventpoll");  
    if (!epi->ep->ws)  
        return -ENOMEM;  

name = epi->ffd.file->f\_path.dentry->d\_name.name;  
ws = wakeup\_source\_register(name);

if (!ws)  
    return -ENOMEM;  
rcu\_assign\_pointer(epi->ws, ws);

return 0;  


/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
struct wakeup_source *ws = ep_wakeup_source(epi);


\* wait for ep\_pm\_stay\_awake\_rcu to finish, synchronize\_rcu is  
\* used internally by wakeup\_source\_remove, too (called by  
\* wakeup\_source\_unregister), so we cannot use call\_rcu  


* Must be called with "mtx" held.

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
int maxevents;
struct epoll_event __user *events;
int res;

static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
struct ep_send_events_data esed;

esed.maxevents = maxevents;  
esed.events = events;  
ep\_scan\_ready\_list(ep, ep\_send\_events\_proc, &esed, 0, false);  
return esed.res;  


* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
* O(NumReady) performance.
* @ep: Pointer to the epoll private data structure.
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
* @depth: The current depth of recursive f_op->poll calls.
* @ep_locked: caller already holds ep->mtx
* Returns: The same integer error code returned by the @sproc callback.
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
__poll_t(*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv, int depth, bool ep_locked)
__poll_t res;
int pwake = 0;
struct epitem *epi, *nepi;
//LIST_HEAD() -- 生成一个名为txlist的双向链表头节点


\* We need to lock this because we could be hit by  
\* eventpoll\_release\_file() and epoll\_ctl().  

if (!ep\_locked)  
    mutex\_lock\_nested(&ep->mtx, depth);

\* Steal the ready list, and re-init the original one to the  
\* empty list. Also, set ep->ovflist to NULL so that events  
\* happening while looping w/out locks, are not lost. We cannot  
\* have the poll callback to queue directly on ep->rdllist,  
\* because we want the "sproc" callback to be able to do it  
\* in a lockless way.  


//此外,将ep->ovflist设置为NULL,以便在不持锁(w/out locks)循环期间发生的事件不会丢失。  
//我们不能让poll回调直接在ep->rdllist上排队,因为我们希望“sproc”回调能够以无锁的方式处理该链表。  
list\_splice\_init(&ep->rdllist, &txlist);  
ep->ovflist = NULL;  

\* Now call the callback function.  
//在这个回调函数里面处理每个epitem \*sproc 就是 ep\_send\_events\_proc或者ep\_read\_events\_proc

res = (\*sproc)(ep, &txlist, priv);  
\* During the time we spent inside the "sproc" callback, some  
\* other events might have been queued by the poll callback.  
\* We re-insert them inside the main ready-list here.  

for (nepi = ep->ovflist; (epi = nepi) != NULL;  
    nepi = epi->next, epi->next = EP\_UNACTIVE\_PTR) {  
    \* We need to check if the item is already in the list.  
    \* During the "sproc" callback execution time, items are  
    \* queued into ->ovflist but the "txlist" might already  
    \* contain them, and the list\_splice() below takes care of them.  

    if (!ep\_is\_linked(epi)) {  
        list\_add\_tail(&epi->rdllink, &ep->rdllist);  
\* We need to set back ep->ovflist to EP\_UNACTIVE\_PTR, so that after  
\* releasing the lock, events will be queued in the normal way inside  
\* ep->rdllist.  
ep->ovflist = EP\_UNACTIVE\_PTR;

\* Quickly re-inject items left on "txlist".  

//上一次没有处理完的epitem, 重新插入到ready list,可能是因为用户空间只取了一部分走  
list\_splice(&txlist, &ep->rdllist);  

if (!list\_empty(&ep->rdllist)) {  
    \* Wake up (if active) both the eventpoll wait list and  
    \* the ->poll() wait list (delayed after we release the lock).  
    if (waitqueue\_active(&ep->wq))  
    if (waitqueue\_active(&ep->poll\_wait))  

if (!ep\_locked)  

/\* We have to call this outside the lock \*/  
if (pwake)  

return res;  


//如果掩码不为0,说明该epitem对象对应的事件发生了,那么就将其对应的struct epoll_event类型的对象拷贝到用户态指定的内存中(封装在struct epoll_event,从epoll_wait返回)。
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
struct ep_send_events_data *esed = priv;
__poll_t revents;
struct epitem *epi;
struct epoll_event __user *uevent;
struct wakeup_source *ws;
poll_table pt;

init\_poll\_funcptr(&pt, NULL);

\* We can loop without lock because we are passed a task private list.  
\* Items cannot vanish during the loop because ep\_scan\_ready\_list() is  
\* holding "mtx" during this call.  
for (esed->res = 0, uevent = esed->events;  
    !list\_empty(head) && esed->res < esed->maxevents;) {  
    epi = list\_first\_entry(head, struct epitem, rdllink);

    \* Activate ep->ws before deactivating epi->ws to prevent  
    \* triggering auto-suspend here (in case we reactive epi->ws  
    \* below).  
    \* This could be rearranged to delay the deactivation of epi->ws  
    \* instead, but then epi->ws would temporarily be out of sync  
    \* with ep\_is\_linked().  
    ws = ep\_wakeup\_source(epi);  
    if (ws) {  
        if (ws->active)  
    //list\_del\_init(entry) 的作用是从双链表中删除entry节点,并将entry节点的前继节点和后继节点都指向entry本身。  

    //注意events我们ep\_poll\_callback()里面已经取过一次了, 为啥还要再取?  
    //1. 我们当然希望能拿到此刻的最新数据, events是会变的~  
    //2. 不是所有的poll实现, 都通过等待队列传递了events, 有可能某些驱动压根没传必须主动去读取.

    revents = ep\_item\_poll(epi, &pt, 1);

    \* If the event mask intersect the caller-requested one,  
    \* deliver the event to userspace. Again, ep\_scan\_ready\_list()  
    \* is holding "mtx", so no operations coming from userspace  
    \* can change the item.  
    if (revents) {  
        if (\_\_put\_user(revents, &uevent->events) ||  
            \_\_put\_user(epi->event.data, &uevent->data)) {  
            list\_add(&epi->rdllink, head);  
            if (!esed->res)  
                esed->res = -EFAULT;  
            return 0;  
        //如果设置了EPOLLONESHOT标志位,则设置epi->event.events &= EP\_PRIVATE\_BITS,  
        //注意设置了EPOLLONESHOT触发一次后并没有删除epi,因而通过epoll\_ctl进行ADD操作后会提示File exists错误。  
        if (epi->event.events & EPOLLONESHOT)  
            epi->event.events &= EP\_PRIVATE\_BITS;

        //下一次epoll\_wait时, 会立即返回, 并通知给用户空间.当然如果这个被监听的fds确实没事件也没数据了, epoll\_wait会返回一个0,空转一次.  
        else if (!(epi->event.events & EPOLLET)) {

            \* If this file has been added with Level  
            \* Trigger mode, we need to insert back inside  
            \* the ready list, so that the next call to  
            \* epoll\_wait() will check again the events  
            \* availability. At this point, no one can insert  
            \* into ep->rdllist besides us. The epoll\_ctl()  
            \* callers are locked out by  
            \* ep\_scan\_ready\_list() holding "mtx" and the  
            \* poll callback will queue them in ep->ovflist.  
            list\_add\_tail(&epi->rdllink, &ep->rdllist);  

return 0;  


static inline struct timespec64 ep_set_mstimeout(long ms)
struct timespec64 now, ts = {
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),

return timespec64\_add\_safe(now, ts);  


* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
* milliseconds. If the @timeout is zero, the function will not block,
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.

//如果epoll_wait入参定时时间为0, 那么不会阻塞, 直接通过ep_events_available判断当前是否有用户感兴趣的事件发生,如果有则通过ep_send_events进行处理
//如果定时时间不为0(大于0表示最多等待这么多毫秒, 小于0表示一直阻塞直到至少取到一个事件或出错),并且当前没有用户关注的事件发生,则进行休眠,并添加到ep->wq等待队列的头部。 对等待队列项设置WQ_FLAG_EXCLUSIVE标志
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;//等待队列
ktime_t expires, *to = NULL;

if (timeout > 0)  
    struct timespec64 end\_time = ep\_set\_mstimeout(timeout);

    slack = select\_estimate\_accuracy(&end\_time);  
    to = &expires;  
    \*to = timespec64\_to\_ktime(end\_time);  
else if (timeout == 0)  
    timed\_out = 1;  
    goto check\_events;  

if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);


if (!ep\_events\_available(ep)) {  


    (1) 支持NAPI的网卡驱动必须提供轮询方法poll()。  
    (2) 非NAPI的内核接口为netif\_rx(),NAPI的内核接口为napi\_schedule()。  
    (3) 非NAPI使用共享的CPU队列softnet\_data->input\_pkt\_queue,NAPI使用设备内存(或者设备驱动程序的接收环)。  
    \* Busy poll timed out.  Drop NAPI ID for now, we can add  
    \* it back in when we have moved a socket with a valid NAPI  
    \* ID onto the ready list.  


    \* We don't have any available event to return to the caller.  
    \* We need to sleep here, and we will be wake up by  
    \* ep\_poll\_callback() when events will become available.  
    //OK, 初始化一个等待队列, 准备直接把自己挂起,  
    //注意current是一个宏, 返回的是一个thread\_info结构task字段(我们称之为进程描述符)的变量,task正好指向与thread\_info结构关联的那个进程描述符  
    init\_waitqueue\_entry(&wait, current);  
    ////挂载到eventpoll的等待队列,等待文件状态就绪或直到超时, 或被信号中断  
    \_\_add\_wait\_queue\_exclusive(&ep->wq, &wait);

    for (;;) {  

        //将当前进程设置为睡眠状态, 但是可以被信号唤醒,  
        //注意这个设置是"将来时", 此刻还没有真正睡眠

        \* Always short-circuit for fatal signals to allow  
        \* threads to make a timely exit without the chance of  
        \* finding more events available and fetching  
        \* repeatedly.  
        if (fatal\_signal\_pending(current)) {  
            res = -EINTR;  

        if (ep\_events\_available(ep) || timed\_out)  
        ///有信号产生, 也退出循环,唤醒  
        if (signal\_pending(current)) {  
            res = -EINTR;  
        //什么都没有,解锁, 睡眠  

        //超时时间(to)到期后, 进程会被唤醒;  
        //如果在此之前ep\_poll\_callback()被调用(有事件就绪), 那么我们就会直接被唤醒, 不用等到超时...  
        //ep\_poll\_callback()的调用时机是由被监听的fd的具体实现(就绪事件), 比如socket或者某个设备驱动来决定的,  
        //因为等待队列头是他们持有的, epoll和当前进程只是单纯的等待

        if (!schedule\_hrtimeout\_range(to, slack, HRTIMER\_MODE\_ABS))  
            timed\_out = 1;

    \_\_remove\_wait\_queue(&ep->wq, &wait);  

/* Is it worth to try to dig for events ? */

eavail = ep\_events\_available(ep);


\* Try to transfer events to user space. In case we get 0 events and  
\* there's still timeout left over, we go trying again in search of  
\* more luck.  
//如果一切正常, 有event发生, 就开始准备数据copy给用户空间了

if (!res && eavail &&  
    !(res = ep\_send\_events(ep, events, maxevents)) && !timed\_out)  
    goto fetch\_events;

return res;  


* ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
* API, to verify that adding an epoll file inside another
* epoll structure, does not violate the constraints, in
* terms of closed loops, or too deep chains (which can
* result in excessive stack usage).
* @priv: Pointer to the epoll file to be currently checked.
* @cookie: Original cookie for this call. This is the top-of-the-chain epoll
* data structure pointer.
* @call_nests: Current dept of the @ep_call_nested() call stack.
* Returns: Returns zero if adding the epoll @file inside current epoll
* structure @ep does not violate the constraints, or -1 otherwise.

static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
int error = 0;
struct file *file = priv;
struct eventpoll *ep = file->private_data;
struct eventpoll *ep_tovisit;
struct rb_node *rbp;
struct epitem *epi;

mutex\_lock\_nested(&ep->mtx, call\_nests + 1);  
ep->visited = 1;  
list\_add(&ep->visited\_list\_link, &visited\_list);  
for (rbp = rb\_first\_cached(&ep->rbr); rbp; rbp = rb\_next(rbp)) {  
    epi = rb\_entry(rbp, struct epitem, rbn);  
    if (unlikely(is\_file\_epoll(epi->ffd.file))) {  
        ep\_tovisit = epi->ffd.file->private\_data;  
        if (ep\_tovisit->visited)  
        error = ep\_call\_nested(&poll\_loop\_ncalls, EP\_MAX\_NESTS,  
            ep\_loop\_check\_proc, epi->ffd.file,  
            ep\_tovisit, current);  
        if (error != 0)  
    else {  
        \* If we've reached a file that is not associated with  
        \* an ep, then we need to check if the newly added  
        \* links are going to add too many wakeup paths. We do  
        \* this by adding it to the tfile\_check\_list, if it's  
        \* not already there, and calling reverse\_path\_check()  
        \* during ep\_insert().  
        if (list\_empty(&epi->ffd.file->f\_tfile\_llink))  

return error;  


* ep_loop_check - Performs a check to verify that adding an epoll file (@file)
* another epoll file (represented by @ep) does not create
* closed loops or too deep chains.
* @ep: Pointer to the epoll private data structure.
* @file: Pointer to the epoll file to be checked.
* Returns: Returns zero if adding the epoll @file inside current epoll
* structure @ep does not violate the constraints, or -1 otherwise.

static int ep_loop_check(struct eventpoll *ep, struct file *file)
int ret;
struct eventpoll *ep_cur, *ep_next;

ret = ep\_call\_nested(&poll\_loop\_ncalls, EP\_MAX\_NESTS,  
    ep\_loop\_check\_proc, file, ep, current);  
/\* clear visited list \*/  

list\_for\_each\_entry\_safe(ep\_cur, ep\_next, &visited\_list,  
    visited\_list\_link) {  
    ep\_cur->visited = 0;  
return ret;  

static void clear_tfile_check_list(void)
struct file *file;

/\* first clear the tfile\_check\_list \*/  
while (!list\_empty(&tfile\_check\_list)) {  
    file = list\_first\_entry(&tfile\_check\_list, struct file,  


static int do_epoll_create(int flags)
int error, fd;
struct eventpoll *ep = NULL;//主描述符
struct file *file;
error = ep_alloc(&ep);
//epollfd本身并不存在一个真正的文件与之对应, 所以内核需要创建一个
//"虚拟"的文件, 并为之分配真正的struct file结构, 而且有真正的fd及inode节点.
//eventpoll_fops: fops就是file operations, 就是当你对这个文件(这里是虚拟的)进行操作(比如读)时,
//fops里面的函数指针指向真正的操作实现, epoll只实现了poll和release(就是close)操作, 其它文件系统操作都由VFS全权处理了.
//ep: ep就是struct eventpoll, 它会作为一个私有数据保存在struct file的private_data指针里面.(sys_epoll_ctl会取用)
//其实, 就是为了能通过fd找到struct file, 通过struct file能找到eventpoll结构.

//O\_CLOEXEC close-on-exec  
fd = get\_unused\_fd\_flags(O\_RDWR | (flags & O\_CLOEXEC));



file = anon\_inode\_getfile("\[eventpoll\]", &eventpoll\_fops, ep,  
    O\_RDWR | (flags & O\_CLOEXEC));

ep->file = file;

fd\_install(fd, file);

return fd;  


SYSCALL_DEFINE1(epoll_create1, int, flags)
return do_epoll_create(flags);

SYSCALL_DEFINE1(epoll_create, int, size)
if (size <= 0)
return -EINVAL;
return do_epoll_create(0);

//epfd 就是epoll fd
//fd 需要监听的描述符
//event 我们关心的events
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;

error = -EFAULT;

if (ep\_op\_has\_event(op) &&  
    copy\_from\_user(&epds, event, sizeof(struct epoll\_event)))  
    goto error\_return;

//为了取得epoll的struct file结构,  
f = fdget(epfd);

//获取被监控的文件描述符对应的struct fd类型的对象  
tf = fdget(fd);

error = -EPERM;  
//...epoll就是封装poll机制,select poll也都是封装poll机制  
if (!file\_can\_poll(tf.file))  
    goto error\_tgt\_fput;

/\* Check if EPOLLWAKEUP is allowed \*/  
if (ep\_op\_has\_event(op))  

\* We have to check that the file structure underneath the file descriptor  
\* the user passed to us \_is\_ an eventpoll file. And also we do not permit  
\* adding an epoll file descriptor inside itself.  
error = -EINVAL;  
if (f.file == tf.file || !is\_file\_epoll(f.file))  
    goto error\_tgt\_fput;

\* epoll adds to the wakeup queue at EPOLL\_CTL\_ADD time only,  
\* so EPOLLEXCLUSIVE is not allowed for a EPOLL\_CTL\_MOD operation.  
\* Also, we do not currently supported nested exclusive wakeups.  

if (ep\_op\_has\_event(op) && (epds.events & EPOLLEXCLUSIVE)) {  
    if (op == EPOLL\_CTL\_MOD)  
        goto error\_tgt\_fput;  
    if (op == EPOLL\_CTL\_ADD && (is\_file\_epoll(tf.file) ||  
        (epds.events & ~EPOLLEXCLUSIVE\_OK\_BITS)))  
        goto error\_tgt\_fput;  

//取到我们的eventpoll结构, 来自epoll\_create1()中的分配  
ep = f.file->private\_data;

\* When we insert an epoll file descriptor, inside another epoll file  
\* descriptor, there is the change of creating closed loops, which are  
\* better be handled here, than in more critical paths. While we are  
\* checking for loops we also determine the list of files reachable  
\* and hang them on the tfile\_check\_list, so we can check that we  
\* haven't created too many possible wakeup paths.  
\* We do not need to take the global 'epumutex' on EPOLL\_CTL\_ADD when  
\* the epoll file descriptor is attaching directly to a wakeup source,  
\* unless the epoll file descriptor is nested. The purpose of taking the  
\* 'epmutex' on add is to prevent complex toplogies such as loops and  
\* deep wakeup paths from forming in parallel through multiple  
\* EPOLL\_CTL\_ADD operations.  
// 接下来的操作有可能修改数据结构内容, 锁起来  
mutex\_lock\_nested(&ep->mtx, 0);  
//list\_empty(&f.file->f\_ep\_links) 这句的意思是, 看f这个文件有没有被epoll监听  
//把一个file加到了两个epoll中(file 的 f\_ep\_links 链表中会有两个epitem的fllink)  
if (op == EPOLL\_CTL\_ADD) {  
    if (!list\_empty(&f.file->f\_ep\_links) ||  
        is\_file\_epoll(tf.file)) {  
        full\_check = 1;  
        if (is\_file\_epoll(tf.file)) {  
            error = -ELOOP;  
            if (ep\_loop\_check(ep, tf.file) != 0) {  
                goto error\_tgt\_fput;  
        mutex\_lock\_nested(&ep->mtx, 0);  
        if (is\_file\_epoll(tf.file)) {  
            tep = tf.file->private\_data;  
            mutex\_lock\_nested(&tep->mtx, 1);  

epi = ep\_find(ep, tf.file, fd);  
error = -EINVAL;  
switch (op) {  
    // 注册新的fd到epfd中  
case EPOLL\_CTL\_ADD:  
    if (!epi) {  
        epds.events |= EPOLLERR | EPOLLHUP;  
        error = ep\_insert(ep, &epds, tf.file, fd, full\_check);  
        error = -EEXIST;  
    if (full\_check)  
    // 从epfd中删除一个fd  
case EPOLL\_CTL\_DEL:  
    if (epi)  
        error = ep\_remove(ep, epi);  
        error = -ENOENT;  
    // 修改已经注册的fd的监听事件  
case EPOLL\_CTL\_MOD:  
    if (epi) {  
        if (!(epi->event.events & EPOLLEXCLUSIVE)) {  
            epds.events |= EPOLLERR | EPOLLHUP;  
            error = ep\_modify(ep, epi, &epds);  
        error = -ENOENT;  
if (tep != NULL)  

if (full_check)



return error;  


* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
struct file *file = epi->ffd.file;


ep\_unregister\_pollwait(ep, epi);

/\* Remove the current item from the list of epoll hooks \*/  

rb\_erase\_cached(&epi->rbn, &ep->rbr);

if (ep\_is\_linked(epi))  
\* At this point it is safe to free the eventpoll item. Use the union  
\* field epi->rcu, since we are trying to minimize the size of  
\* 'struct epitem'. The 'rbn' field is no longer in use. Protected by  
\* ep->mtx. The rcu read side, reverse\_path\_check\_proc(), does not make  
\* use of the rbn field.  
call\_rcu(&epi->rcu, epi\_rcu\_free);

return 0;  


//ep_insert()在epoll_ctl()中被调用, 完成往epollfd里面添加一个监听fd的工作
//tfile是fd的struct file结构


//即sock_def_readable(),在这个回调函数中则会遍历socket 文件中的等待队列,

static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
user_watches = atomic_long_read(&ep->user->epoll_watches);

if (unlikely(user\_watches >= max\_user\_watches))  
    return -ENOSPC;  
if (!(epi = kmem\_cache\_alloc(epi\_cache, GFP\_KERNEL)))  
    return -ENOMEM;

epi->ep = ep;  
ep\_set\_ffd(&epi->ffd, tfile, fd);  
epi->event = \*event;  
epi->nwait = 0;  
epi->next = EP\_UNACTIVE\_PTR;  
//根据EPOLLWAKEUP标志注册wake up  
if (epi->event.events & EPOLLWAKEUP) {  
    error = ep\_create\_wakeup\_source(epi);  
    if (error)  
        goto error\_create\_wakeup\_source;  
else {  
    RCU\_INIT\_POINTER(epi->ws, NULL);  

epq.epi = epi;

// 安装poll回调函数  
init\_poll\_funcptr(&epq.pt, ep\_ptable\_queue\_proc);

\* Attach the item to the poll hooks and get current event bits.  
\* We can safely use the file\* here because its usage count has  
\* been increased by the caller of this function. Note that after  
\* this operation completes, the poll callback can start hitting  
\* the new item.  

revents = ep\_item\_poll(epi, &epq.pt, 1);

\* We have to check if something went wrong during the poll wait queue  
\* install process. Namely an allocation for a wait queue failed due  
\* high memory pressure.  
error = -ENOMEM;  
if (epi->nwait < 0)  
    goto error\_unregister;

/\* Add the current item to the list of active epoll hook for this file \*/  
list\_add\_tail\_rcu(&epi->fllink, &tfile->f\_ep\_links);  

\* Add the current item to the RB tree. All RB tree operations are  
\* protected by "mtx", and ep\_insert() is called with "mtx" held.  
ep\_rbtree\_insert(ep, epi);  
// 将该epitem插入到ep的红黑树中

/\* now check if we've created too many backpaths \*/  
error = -EINVAL;  
if (full\_check && reverse\_path\_check())  
    goto error\_remove\_epi;

/\* We have to drop the new item inside our item list to keep track of it \*/  

/\* record NAPI ID of new item if present \*/  

/\* If the file is already "ready" we drop it inside the ready list \*/  
// revents & event->events: 刚才fop->poll的返回值中标识的事件有用户event关心的事件发生  

if (revents && !ep\_is\_linked(epi)) {  
    list\_add\_tail(&epi->rdllink, &ep->rdllist);


    /\* Notify waiting tasks that events are available \*/

    //waitqueue\_active(q) 等待队列q中有等待的进程返回1,否则返回0。  
    if (waitqueue\_active(&ep->wq))  

    //如果有进程等待eventpoll文件本身的事件就绪(该eventpoll也被其他eventpoll poll住了),则增加临时变量pwake的值,  
    if (waitqueue\_active(&ep->poll\_wait))  


/\* We have to call this outside the lock \*/  
if (pwake)  
return 0;


rb\_erase\_cached(&epi->rbn, &ep->rbr);

ep_unregister_pollwait(ep, epi);

\* We need to do this because an event could have been arrived on some  
\* allocated wait queue. Note that we don't care about the ep->ovflist  
\* list, since that is used/cleaned only inside a section bound by "mtx".  
\* And ep\_insert() is called with "mtx" held.  
if (ep\_is\_linked(epi))  


kmem_cache_free(epi_cache, epi);

return error;  


* Modify the interest event mask by dropping an event if the new mask
* has a match in the current file status. Must be called with "mtx" held.

static int ep_modify(struct eventpoll *ep, struct epitem *epi,
const struct epoll_event *event)
int pwake = 0;
poll_table pt;


init\_poll\_funcptr(&pt, NULL);

\* Set the new event interest mask before calling f\_op->poll();  
\* otherwise we might miss an event that happens between the  
\* f\_op->poll() call and the new event set registering.  

epi->event.events = event->events; /\* need barrier below \*/  
epi->event.data = event->data; /\* protected by mtx \*/  
                               // 根据EPOLLWAKEUP更新wake up  
if (epi->event.events & EPOLLWAKEUP) {  
    if (!ep\_has\_wakeup\_source(epi))  
else if (ep\_has\_wakeup\_source(epi)) {  

\* The following barrier has two effects:  
\* 1) Flush epi changes above to other CPUs.  This ensures  
\*    we do not miss events from ep\_poll\_callback if an  
\*    event occurs immediately after we call f\_op->poll().  
\*    We need this because we did not take ep->wq.lock while  
\*    changing epi above (but ep\_poll\_callback does take  
\*    ep->wq.lock).  
\* 2) We also need to ensure we do not miss \_past\_ events  
\*    when calling f\_op->poll().  This barrier also  
\*    pairs with the barrier in wq\_has\_sleeper (see  
\*    comments for wq\_has\_sleeper).  
\* This barrier will now guarantee ep\_poll\_callback or f\_op->poll  
\* (or both) will notice the readiness of an item.  
//1)把上面对epi的修改刷新给其他CPU。这保证了如果在我们调用f\_op->poll()之后立刻有事件发生,我们不会漏掉来自ep\_poll\_callback的事件。  
//之所以需要这样做,是因为上面修改epi时我们没有持有ep->wq.lock(而ep\_poll\_callback是会获取ep->wq.lock的)。  
//2)我们还需要保证在调用f\_op->poll()时不会漏掉\_过去\_已经发生的事件。该屏障同时与wq\_has\_sleeper中的屏障配对(参见wq\_has\_sleeper的注释)。  
//有了这个内存屏障,ep\_poll\_callback或f\_op->poll(或两者)一定能注意到该item已就绪。  

\* Get current event bits. We can safely use the file\* here because  
\* its usage count has been increased by the caller of this function.  
\* If the item is "hot" and it is not registered inside the ready  
\* list, push it inside.  
//那么把epi插入ready链表尾部 list\_add\_tail(&epi->rdllink, &ep->rdllist);  
if (ep\_item\_poll(epi, &pt, 1)) {  
    if (!ep\_is\_linked(epi)) {  
        list\_add\_tail(&epi->rdllink, &ep->rdllist);  

        /\* Notify waiting tasks that events are available \*/  
        if (waitqueue\_active(&ep->wq))  
        if (waitqueue\_active(&ep->poll\_wait))  

/\* We have to call this outside the lock \*/  
if (pwake)  

return 0;  

* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
int error;
struct fd f;
struct eventpoll *ep;

/\* The maximum number of event must be greater than zero \*/  
if (maxevents <= 0 || maxevents > EP\_MAX\_EVENTS)  
    return -EINVAL;  

if (!access\_ok(VERIFY\_WRITE, events, maxevents \* sizeof(struct epoll\_event)))  
    return -EFAULT;

/\* Get the "struct file \*" for the eventpoll file \*/  
//获取epoll的struct file  
//再通过对应的struct file获得eventpoll  
f = fdget(epfd);  
if (!f.file)  
    return -EBADF;

\* We have to check that the file structure underneath the fd  
\* the user passed to us \_is\_ an eventpoll file.  
error = -EINVAL;  
//检查一下它是不是一个真正的epoll 是不是eventpoll file.  
if (!is\_file\_epoll(f.file))  
    goto error\_fput;

\* At this point it is safe to assume that the "private\_data" contains  
\* our own data structure.  
// 根据private\_data得到eventpoll结构  
ep = f.file->private\_data;

error = ep\_poll(ep, events, maxevents, timeout);  

return error;

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
return do_epoll_wait(epfd, events, maxevents, timeout);

* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).

SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
int error;
sigset_t ksigmask, sigsaved;

\* If the caller wants a certain signal mask to be set during the wait,  
\* we apply it here.  
//  内核为每个进程维护pending位向量(待处理信号集)和blocked位向量(被阻塞信号集)。  
//  如果一个进程正在处理类型为k的信号,那么此时当另一个k信号到达的时候,pending的第k位会被设置,表示有一个待处理的k信号;  
//  如果此时又来了一个k信号,那么由于此前pending位向量已被设置,所以这个信号会被丢弃。  
//  一旦进程接收了信号k,那么内核就会清除pending的第k位。  
//  blocked位向量(被阻塞信号集)维护着进程阻塞的信号,在这里设置的信号不会被进程接收处理。  
//  所以我们知道,pending & ~blocked 的结果指示了进程将要去接收处理的信号集。

if (sigmask) {  
    if (sigsetsize != sizeof(sigset\_t))  
        return -EINVAL;  
    if (copy\_from\_user(&ksigmask, sigmask, sizeof(ksigmask)))  
        return -EFAULT;  
    sigsaved = current->blocked;  
error = do\_epoll\_wait(epfd, events, maxevents, timeout);

\* If we changed the signal mask, we need to restore the original one.  
\* In case we've got a signal while waiting, we do not restore the  
\* signal mask yet, and we allow do\_signal() to deliver the signal on  
\* the way back to userspace, before the signal mask is restored.  
//如果我们在等待期间收到了信号,则此处暂不恢复信号掩码,而是让do\_signal()在返回用户空间的途中先递送该信号,之后再恢复原来的信号掩码。  

//ep\_poll()会检查signal\_pending(),并在有信号时返回-EINTR。  
if (sigmask) {  
    //EINTR : 在请求事件发生或者过期之前,调用被信号打断  
    if (error == -EINTR) {  
        memcpy(&current->saved\_sigmask, &sigsaved,  
return error;  


static int __init eventpoll_init(void)
struct sysinfo si;
* Allows top 4% of lomem to be allocated for epoll watches (per user).
//PAGE_SHIFT: 内存页大小对应的移位数(页大小 = 1 << PAGE_SHIFT), 页数左移PAGE_SHIFT位即换算成字节数
//(si.totalram - si.totalhigh)低端内存页面数



//(32位x86上)内核地址空间划分成2个部分: 低端的896MB + 高端的128MB, 低端896MB使用固定的线性映射, 直接映射到物理内存.  


//Linux物理内存空间分为DMA内存区(DMA Zone)、低端内存区(Normal Zone)与高端内存  
//区(Highmem Zone)三部分。DMA Zone通常很小,只有几十M,低端内存区与高端内存区的划分来源于Linux内核空间大小的限制。


//过去,CPU的地址总线只有32位, 32位的地址总线无论是从逻辑上还是从物理上都只能描述4GB的地址空间.  






//MemTotal   1547MB

//HighTotal     825MB

//LowTotal     721MB

max\_user\_watches = (((si.totalram - si.totalhigh) / 25) << PAGE\_SHIFT) / EP\_ITEM\_COST;  
BUG\_ON(max\_user\_watches < 0);

\* Initialize the structure used to perform epoll file descriptor  
\* inclusion loops checks.  

/* Initialize the structure used to perform safe poll wait head wake ups */

\* We can have many thousands of epitems, so prevent this from  
\* using an extra cache line on 64-bit (and smaller) CPUs  
BUILD\_BUG\_ON(sizeof(void \*) <= 8 && sizeof(struct epitem) > 128);

//(slab分配器)分配内存用来存放struct epitem  
epi\_cache = kmem\_cache\_create("eventpoll\_epi", sizeof(struct epitem),  

//(slab分配器)分配内存用来存放struct eppoll\_entry。  
pwq\_cache = kmem\_cache\_create("eventpoll\_pwq",  
    sizeof(struct eppoll\_entry), 0, SLAB\_PANIC | SLAB\_ACCOUNT, NULL);

return 0;  


//从slab缓存中创建一个eventpoll对象, 并且创建一个匿名的fd跟fd对应的file对象,
//而eventpoll对象保存在struct file结构的private指针中, 并且返回,
//该fd对应的file operations只是实现了poll跟release操作
//获取当前用户信息, 是不是root, 最大监听fd数目等并且保存到eventpoll对象中
//初始化等待队列, 初始化就绪链表, 初始化红黑树的头结点
//如果是add或mod操作, 将epoll_event结构拷贝到内核空间中
//并且判断加入的fd是否支持poll操作(epoll, poll, select等I/O多路复用都要求被监听的文件支持poll操作).
//并且从epfd对应的file->private_data获取eventpoll对象, 根据op区分是添加删除还是修改,
//首先在eventpoll结构中的红黑树查找是否已经存在了相对应的fd, 没找到就支持插入操作, 否则报重复的错误.
//相对应的修改, 删除类似。

//插入操作时, 会创建一个与fd对应的epitem结构, 并且初始化相关成员, 比如保存监听的fd跟file结构之类的(ffd)
//(其内部, 初始化设备的等待队列, 将该进程注册到等待队列)完成这一步, 我们的epitem就跟这个socket关联起来了,
//最后调用加入的fd的file operation->poll函数(最后会调用poll_wait操作)用于完成注册操作.
//计算睡眠时间(如果有), 判断eventpoll对象的就绪链表是否为空, 不为空那就直接干活不睡眠. 否则初始化一个等待队列项, 把自己挂到ep->wq上, 设置自己的进程状态
//为可睡眠状态. 判断是否有信号到来(有的话直接被中断醒来), 如果啥事都没有那就调用schedule_hrtimeout_range进行睡眠, 如果超时或者被唤醒, 首先把自己从等待队列中删除,
//然后开始拷贝资源给用户空间了
//拷贝资源则是先把就绪事件链表转移到中间链表, 然后挨个遍历拷贝到用户空间,
//并且挨个判断其是否为水平触发, 是的话再次插入到就绪链表