Analyzing epoll's Usage from Its Source Code

First, let's look at the real implementation of epoll_create:

SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;
    // In other words, size is only sanity-checked; the value itself is never used
    return sys_epoll_create1(0);
}

Next, the real implementation of epoll_create1:

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;

    error = ep_alloc(&ep);
    if (error < 0)
        return error;

    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                  O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}
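As the flag check above shows, passing anything other than EPOLL_CLOEXEC fails with EINVAL. A minimal user-space sketch (error handling abridged):

#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    // epoll_create1(0) behaves like epoll_create(size) for any size > 0
    int epfd = epoll_create1(EPOLL_CLOEXEC);
    if (epfd < 0) {
        perror("epoll_create1");   // EINVAL for any unsupported flag
        return 1;
    }
    // ... register fds with epoll_ctl, wait with epoll_wait ...
    close(epfd);
    return 0;
}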

1. For epoll, the only valid flag at present is EPOLL_CLOEXEC.
2. ep_alloc initializes the spinlock_t lock and the mutex lock (a sketch of ep_alloc follows the struct below).
3. Each epoll_create1 call that produces an epollfd makes the kernel allocate a matching struct eventpoll:
struct eventpoll {
    spinlock_t lock;

    /* Held while adding, modifying, or removing fds, while epoll_wait returns,
     * and while the kernel copies data to user space, so operating on one epoll
     * instance from multiple threads is safe; the kernel protects it for us. */
    struct mutex mtx;

    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;

    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;

    /* Every triggered epitem is put on this list */
    struct list_head rdllist;

    /* Root of the red-black tree; every monitored epitem lives in this tree,
     * so we can regard each tree node as an epitem */
    struct rb_root rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist;

    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;

    struct file *file;

    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};
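Point 2 above mentioned ep_alloc. For reference, this is roughly what it does in kernels of this vintage (lightly abridged; field details vary between versions):

static int ep_alloc(struct eventpoll **pep)
{
    int error;
    struct user_struct *user;
    struct eventpoll *ep;

    user = get_current_user();
    error = -ENOMEM;
    ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    if (unlikely(!ep))
        goto free_uid;

    // Initialize the spinlock_t and mutex mentioned in point 2, plus the
    // two wait queues, the ready list, and the red-black tree root
    spin_lock_init(&ep->lock);
    mutex_init(&ep->mtx);
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);
    INIT_LIST_HEAD(&ep->rdllist);
    ep->rbr = RB_ROOT;
    ep->ovflist = EP_UNACTIVE_PTR;
    ep->user = user;

    *pep = ep;
    return 0;

free_uid:
    free_uid(user);
    return error;
}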
4. Because an epollfd, unlike a socket, has no real file behind it, the kernel allocates a genuine file structure with a genuine fd (the anon_inode_getfile call above) and ties it to the epollfd:
// struct file, abridged to the epoll-relevant fields
struct file {
    void *private_data;            // the eventpoll is stored here
    struct list_head f_ep_links;
};
In this way, the epollfd leads us to its in-kernel file, and through that file we find the stored eventpoll.
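The check that a file really is an epoll file is just a comparison against eventpoll_fops; this is how the kernel implements it:

// A file is an epoll file iff its file_operations are eventpoll_fops
static inline int is_file_epoll(struct file *f)
{
    return f->f_op == &eventpoll_fops;
}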
5. Each monitored fd corresponds to a struct epitem:
struct epitem {
    /* RB tree node used to link this structure to the eventpoll RB tree */
    struct rb_node rbn;

    /* When this node triggers, it is linked onto the rdllist of the
     * eventpoll mentioned earlier */
    struct list_head rdllink;

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /* The fd this epitem corresponds to, together with its real file */
    struct epoll_filefd ffd;

    /* Number of active wait queue attached to poll operations */
    int nwait;

    /* List containing poll wait queues */
    struct list_head pwqlist;

    /* The eventpoll this epitem belongs to */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source *ws;

    /* The structure that describes the interested events and the source fd,
     * i.e. the events this epitem cares about */
    struct epoll_event event;
};

struct epoll_filefd {
    struct file *file;
    int fd;
};
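struct epoll_filefd is the key by which epitems are ordered in the red-black tree. The kernel's comparison function orders first by the file pointer and then by the fd, which is why the same (file, fd) pair can be added only once:

/* Compare RB tree keys: order by struct file pointer first, then by fd */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1 :
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}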

Finally, let's look at the real implementation of epoll_ctl:

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error;
    int did_lock_epmutex = 0;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    error = -EFAULT;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;

    /* Get the "struct file *" for the eventpoll file */
    error = -EBADF;
    // As described earlier: find the file behind the epollfd; the eventpoll will be found through this file below
    file = fget(epfd);
    if (!file)
        goto error_return;

    /* Get the "struct file *" for the target file */
    tfile = fget(fd);
    if (!tfile)
        goto error_fput;

    /* The target file descriptor must support poll */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto error_tgt_fput;

    /* Check if EPOLLWAKEUP is allowed */
    if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
        epds.events &= ~EPOLLWAKEUP;

    /*
     * We have to check that the file structure underneath the file descriptor
     * the user passed to us _is_ an eventpoll file. And also we do not permit
     * adding an epoll file descriptor inside itself.
     */
    error = -EINVAL;
    // epoll cannot monitor itself
    if (file == tfile || !is_file_epoll(file))
        goto error_tgt_fput;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
     // Here the corresponding eventpoll is found through the file's private_data
    ep = file->private_data;

    /*
     * When we insert an epoll file descriptor, inside another epoll file
     * descriptor, there is the chance of creating closed loops, which are
     * better be handled here, than in more critical paths. While we are
     * checking for loops we also determine the list of files reachable
     * and hang them on the tfile_check_list, so we can check that we
     * haven't created too many possible wakeup paths.
     *
     * We need to hold the epmutex across both ep_insert and ep_remove
     * b/c we want to make sure we are looking at a coherent view of
     * epoll network.
     */
    if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
        mutex_lock(&epmutex);
        did_lock_epmutex = 1;
    }
    if (op == EPOLL_CTL_ADD) {
        if (is_file_epoll(tfile)) {
            error = -ELOOP;
            if (ep_loop_check(ep, tfile) != 0) {
                clear_tfile_check_list();
                goto error_tgt_fput;
            }
        } else
            list_add(&tfile->f_tfile_llink, &tfile_check_list);
    }

    mutex_lock_nested(&ep->mtx, 0);

    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     */
     // At the API level we know a given fd can be added only once; in the red-black tree it corresponds to one epitem
    epi = ep_find(ep, tfile, fd);

    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        clear_tfile_check_list();
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }
    mutex_unlock(&ep->mtx);

error_tgt_fput:
    if (did_lock_epmutex)
        mutex_unlock(&epmutex);

    fput(tfile);
error_fput:
    fput(file);
error_return:

    return error;
}
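From user space, the three operations map directly onto the switch above. A minimal sketch of registering a socket for read events (listen_fd stands for an already-created listening socket; the name is illustrative):

#include <sys/epoll.h>
#include <stdio.h>

// Register listen_fd for read events on epfd; returns 0 on success
int add_read_fd(int epfd, int listen_fd)
{
    struct epoll_event ev;
    ev.events = EPOLLIN;      // POLLERR and POLLHUP are added by the kernel
    ev.data.fd = listen_fd;
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) {
        perror("epoll_ctl");  // e.g. EEXIST if the fd was already added
        return -1;
    }
    return 0;
}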

Here we can clearly see that the EPOLL_CTL_ADD, EPOLL_CTL_DEL, and EPOLL_CTL_MOD operations are all protected by locks.

ep_insert uses the spinlock_t lock. It first checks the user member of the eventpoll against the maximum number of fds this user may watch, then allocates an epitem and registers the callback ep_ptable_queue_proc through the target file's poll. ep_ptable_queue_proc hangs the epitem on the wait queue owned by the fd and installs ep_poll_callback as the wakeup callback; when the fd's state changes, ep_poll_callback is invoked and puts the triggered epitem onto the eventpoll's rdllist described earlier.

Our epoll_wait then walks this rdllist: if events have triggered, it copies the data from kernel space to user space, again under the spinlock_t lock. After the copy comes the place where ET and LT differ. With ET, the epitem does not go back onto rdllist unless the fd changes state again and ep_poll_callback is called. With LT, whether or not there are still active events or valid data, the epitem is reinserted into rdllist and returned to you again on the next epoll_wait.
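The practical consequence of the ET branch is that a consumer must drain the fd until read reports EAGAIN; otherwise the leftover data is never reported again. A minimal sketch (assumes the fd is non-blocking):

#include <errno.h>
#include <unistd.h>

// With EPOLLET, read until EAGAIN: the epitem will not go back onto
// rdllist until ep_poll_callback fires for a new state change.
static void drain_fd(int fd)
{
    char buf[4096];
    for (;;) {
        ssize_t n = read(fd, buf, sizeof(buf));
        if (n > 0)
            continue;          // ... process n bytes, then keep reading ...
        if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
            break;             // drained; wait for the next callback
        break;                 // n == 0 (peer closed) or a real error
    }
}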
Summary:
1. We do not have to do accept and recv on the main thread after listen and then hand the data to a worker pool. Since EPOLL_CTL_ADD, EPOLL_CTL_DEL, and EPOLL_CTL_MOD are all thread-safe, a thread pool can do the accept and recv in place of the main thread (see the sketch after this list). That pool is CPU-bound, so its size is best set to the number of CPU cores. The main thread then does only one thing, listening; connection management goes to this pool, and data processing still goes to the worker pool.
2. Compared with select: every select call copies the whole fd set from user space to kernel space, repeating the copy each time, whereas epoll copies an fd only once, at EPOLL_CTL_ADD time.
3. Compared with select: every select return requires the kernel to scan the whole fd set that was passed in, whereas epoll looks fds up faster through its red-black tree, and triggered events are placed onto rdllist by the callback, so an epoll_wait return merely takes already-triggered events off rdllist. Both select and epoll go through the sleep/wakeup cycle, but on wakeup select has to scan while epoll only checks whether the list is empty, which also saves CPU.
4. Compared with select: select supports 1024 file descriptors by default, and even if you raise that limit, the scan gets slower and slower and never beats the red-black tree. epoll supports as many file descriptors as the process can open; 1 GB of memory gives roughly 100,000.
5. Relating this to the famous "thundering herd": in a multi-threaded program, can one fd's event wake several threads in epoll_wait to handle it? No, because fetching events off rdllist inside epoll_wait is done under the lock.
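As a sketch of point 1 (the names here are illustrative, not from the kernel source): because epoll_ctl takes ep->mtx internally, connection-pool threads can register the fds they accept on a shared epollfd without any extra user-space locking:

#include <sys/epoll.h>
#include <pthread.h>
#include <stdio.h>

int g_epfd;   // shared epoll instance, created once by the main thread

// Body for a connection-pool thread: it accepted conn_fd itself and now
// registers it; the kernel serializes EPOLL_CTL_* operations on ep->mtx.
void *conn_worker(void *arg)
{
    int conn_fd = (int)(long)arg;
    struct epoll_event ev = { .events = EPOLLIN | EPOLLET,
                              .data.fd = conn_fd };
    if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, conn_fd, &ev) < 0)
        perror("epoll_ctl in worker");
    return NULL;
}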

