1. 程式人生 > >Linux環境下libpcap庫原始碼分析


linux環境下libpcap 原始碼分析


libpcap 原始碼官方下載地址:
git clone https://github.com/the-tcpdump-group/libpcap.git

git clone git://bpf.tcpdump.org/tcpdump

tcpdump.c使用libpcap裡的pcap_open_live和pcap_loop 完成兩個最關鍵的動作:獲取捕獲報文的介面,以及捕獲報文並將報文交給callback(關於tcpdump原始碼的構架,請參考作者的tcpdump原始碼分析)。
本文分析pcap_open_live和pcap_loop的實現機制,並進入linux核心,展示linux核心對這兩個API的響應動作。 tcpdump.c對pcap_open_live的使用是: pd = pcap_open_live(device, snaplen, !pflag, 1000, ebuf); pcap_open_live定義如下: pcap_t *pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf) source 為指定的網路介面。 snaplen 為捕獲報文的最大長度。
promisc 表示是否將裝置設定為混雜模式。 to_ms 為超時時間(毫秒)。 errbuf 為存放錯誤資訊描述字串的緩衝區。 返回值為pcap_t型別的指標,pcap_t 定義是: typedef struct pcap pcap_t; struct pcap { /*typedef int (*read_op_t)(pcap_t *, int cnt, pcap_handler, u_char *); read_op為從網路介面讀取報文的函式指標,待其得到賦值後,呼叫實現函式*/ read_op_t read_op; //從檔案裡讀取報文的函式指標 int (*next_packet_op)(pcap_t *, struct pcap_pkthdr *, u_char **); //檔案描述符
,socket int fd; int selectable_fd; int bufsize; //read緩衝區大小 u_char *buffer; //read緩衝區指標 u_char *bp; int cc; ... int snapshot; int linktype; /* Network linktype */ int linktype_ext; int tzoff; /* timezone offset */ int offset; /* offset for proper alignment */ int activated; /* true if the capture is really started */ int oldstyle; /* if we're opening with pcap_open_live() */ struct pcap_opt opt; u_char *pkt; ... //啟用函式,啟用函式在得到呼叫後,會建立起與底層IPCsocket activate_op_t activate_op; ... }; pcap_t * pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf){ pcap_t *p; int status;
    p = pcap_create(source, errbuf);

    if (p == NULL)
        return (NULL);
    status = pcap_set_snaplen(p, snaplen);
    if (status < 0)
        goto fail;
    status = pcap_set_promisc(p, promisc);
    if (status < 0)
        goto fail;
    status = pcap_set_timeout(p, to_ms);
    if (status < 0)
        goto fail;
    p->oldstyle = 1;
	//pcap_activate呼叫pcap_t的activate_op, 建立起與底層的IPC通道
    status = pcap_activate(p);

    if (status < 0)
        goto fail;
    return (p);

pcap_t *pcap_create(const char *source, char *errbuf){   
    size_t i;
    int is_theirs;
    pcap_t *p;

    if (source == NULL)
        source = "any";

    for (i = 0; capture_source_types[i].create_op != NULL; i++) {
        is_theirs = 0;
        p = capture_source_types[i].create_op(source, errbuf, &is_theirs);
        if (is_theirs) {
                return (p);

    //如果沒有, 那麼就將source作為普通網路介面
    return (pcap_create_interface(source, errbuf));
pcap_create_interface() 函式在libpcap下有多個實現,可由編譯巨集來指定特定的pcap_create_interface來初始化read_op等函式指標。linux環境裡預設是libpcap/pcap-linux.c中的 pcap_create_interface():

pcap_t *
pcap_create_interface(const char *device, char *ebuf)
    pcap_t *handle;
/*可將 pcap_create_common看做pcap_t結構的建構函式,初始化一個pcap_t*/
    handle = pcap_create_common(device, ebuf, sizeof (struct pcap_linux));
    if (handle == NULL)
        return NULL;
	//pcap_t 的啟用函式指標填充具體實現函式
    handle->activate_op = pcap_activate_linux;

    handle->can_set_rfmon_op = pcap_can_set_rfmon_linux;
    return handle;

完成後回到pcap_open_live,設定snaplen,promisc,to_ms後,呼叫status = pcap_activate(p),該函式執行status = p->activate_op(p) 
進而呼叫 pcap_activate_linux(), 完成read_op等重要函式指標的具體賦值。

static int
 pcap_activate_linux(pcap_t *handle)
    struct pcap_linux *handlep = handle->priv;
    const char  *device;
    int     status = 0;
    device = handle->opt.source;
    handle->inject_op = pcap_inject_linux;
    handle->setfilter_op = pcap_setfilter_linux;
    handle->setdirection_op = pcap_setdirection_linux;
    handle->set_datalink_op = pcap_set_datalink_linux;
    handle->getnonblock_op = pcap_getnonblock_fd;
    handle->setnonblock_op = pcap_setnonblock_fd;
    handle->cleanup_op = pcap_cleanup_linux;
    handle->read_op = pcap_read_linux;

    handle->stats_op = pcap_stats_linux;

    if (strcmp(device, "any") == 0) {
        if (handle->opt.promisc) {
            handle->opt.promisc = 0;
            /* Just a warning. */
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                "Promiscuous mode not supported on the \"any\" device");
            status = PCAP_WARNING_PROMISC_NOTSUP;
    handlep->device = strdup(device);
    if (handlep->device == NULL) {
        snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "strdup: %s",
             pcap_strerror(errno) );
        return PCAP_ERROR;
    handlep->timeout = handle->opt.timeout;

    if (handle->opt.promisc)
        handlep->proc_dropped = linux_if_drops(handlep->device);

    status = activate_new(handle);
    if (status < 0) {
        goto fail;
    if (status == 1) {
        switch (activate_mmap(handle, &status)) {
        case 1:
            return status;
        case 0:

        case -1:
            goto fail;
	//如果status為0, 再嘗試使用activate_old()函式
    else if (status == 0) {
        /* Non-fatal error; try old way */
        if ((status = activate_old(handle)) != 1) {
            goto fail;
    status = 0;
    if (handle->opt.buffer_size != 0) {
        if (setsockopt(handle->fd, SOL_SOCKET, SO_RCVBUF,
            sizeof(handle->opt.buffer_size)) == -1) {
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                 "SO_RCVBUF: %s", pcap_strerror(errno));
            status = PCAP_ERROR;
            goto fail;
     handle->selectable_fd = handle->fd;
    return status;

static int
activate_new(pcap_t *handle)
   struct pcap_linux *handlep = handle->priv;
    const char  *device = handle->opt.source;
    int         is_any_device = (strcmp(device, "any") == 0);
    int         sock_fd = -1, arptype;
    int         err = 0;
    struct packet_mreq  mr;

/*socket()開啟不同的協議,返回控制代碼裡也包括了該協議的底層模組提供的ops. 只不過linux下面沒法將網路協議當作普通檔案(如/dev/xx)處理,所以才有了另一套socket特定的APIs*/
    sock_fd = is_any_device ?
        socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL)) :
        socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

    handlep->sock_packet = 0;
/*iface_get_id()使用ioctl(fd, SIOCGIFINDEX, &ifr)獲取lo環回(loopback)裝置的索引值*/
    handlep->lo_ifindex = iface_get_id(sock_fd, "lo", handle->errbuf);
    handle->offset   = 0;

    if (!is_any_device) {
        handlep->cooked = 0;
        if (handle->opt.rfmon) {
            err = enter_rfmon_mode(handle, sock_fd, device);
            if (err < 0) {
                return err;
            if (err == 0) {
                return PCAP_ERROR_RFMON_NOTSUP;

            if (handlep->mondevice != NULL)
                device = handlep->mondevice;
/*iface_get_arptype()呼叫ioctl(fd, SIOCGIFHWADDR, &ifr)獲取硬體型別 */
        arptype = iface_get_arptype(sock_fd, device, handle->errbuf);
        if (arptype < 0) {
            return arptype;
        map_arphrd_to_dlt(handle, arptype, 1);
        handlep->ifindex = iface_get_id(sock_fd, device,

        if (handlep->ifindex == -1) {
            return PCAP_ERROR;

/*iface_bind()將裝置的索引值作為struct sockaddr_ll的索引值與socket繫結
    struct sockaddr_ll  sll; 
    sll.sll_family      = AF_PACKET;                                                               
    sll.sll_ifindex     = ifindex;
    sll.sll_protocol    = htons(ETH_P_ALL);
bind(fd, (struct sockaddr *) &sll, sizeof(sll)) == -1 */
        if ((err = iface_bind(sock_fd, handlep->ifindex,
handle->errbuf)) != 1) {
            if (err < 0)
                return err;
                return 0;   /* try old mechanism */
    if (!is_any_device && handle->opt.promisc) {
        memset(&mr, 0, sizeof(mr));
        mr.mr_ifindex = handlep->ifindex;
        mr.mr_type    = PACKET_MR_PROMISC;
        if (setsockopt(sock_fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
            &mr, sizeof(mr)) == -1) {
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                "setsockopt: %s", pcap_strerror(errno));
            return PCAP_ERROR;
    if (handlep->cooked) {
        if (handle->snapshot < SLL_HDR_LEN + 1)
            handle->snapshot = SLL_HDR_LEN + 1;
    handle->bufsize = handle->snapshot;
    //根據乙太網鏈路層型別決定VLAN Tag在報文中的偏移值
    switch (handle->linktype) {
    case DLT_EN10MB:
        handlep->vlan_offset = 2 * ETH_ALEN;
    case DLT_LINUX_SLL:
        handlep->vlan_offset = 14;
        handlep->vlan_offset = -1; /* unknown */
    handle->fd = sock_fd;

至此,通過pcap_open_live完成全部準備階段的內容, 之後就可以使用pcap_loop()來獲取來自底層的資料並提交給callback函式進行應用處理。tcpdump.c對pcap_loop的使用是: 

status = pcap_loop(pd, cnt, callback, pcap_userdata); 
//cnt 為指定捕獲報文的個數

pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
    register int n;
    for (;;) {
        if (p->rfile != NULL) {
            n = pcap_offline_read(p, cnt, callback, user);
        } else {
            do {
                n = p->read_op(p, cnt, callback, user);

            } while (n == 0);
        if (n <= 0)
            return (n);
        if (cnt > 0) {
            cnt -= n;
            if (cnt <= 0)
                return (0);



pcap_read_packet(pcap_t *handle, pcap_handler callback, u_char *userdata
	struct pcap_linux   *handlep = handle->priv;
    	u_char          *bp;
 	struct sockaddr_ll  from;
        if (handle->break_loop) {
            handle->break_loop = 0;
            return PCAP_ERROR_BREAK;
        fromlen = sizeof(from);
//從socket接收資料存入bp指向的快取區, 每次最多接收bufsize位元組; MSG_TRUNC表示即使報文被截斷, 也返回報文的實際長度
	  packet_len = recvfrom(
		(struct sockaddr *) &from, &fromlen);

    } while (packet_len == -1 && errno == EINTR);

	caplen = packet_len;
    if (caplen > handle->snapshot)
        caplen = handle->snapshot;

    pcap_header.caplen  = caplen;                                                                 
    pcap_header.len     = packet_len;

    callback(userdata, &pcap_header, bp);
    return 1;   
Linux核心對recvfrom 的響應:


SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
        unsigned, flags, struct sockaddr __user *, addr,
        int __user *, addr_len)
 struct socket *sock;
    struct iovec iov;
    struct msghdr msg;
    struct sockaddr_storage address;
    int err, err2;
    int fput_needed;
    if (size > INT_MAX)
        size = INT_MAX;
    if (!sock)
        goto out;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_iovlen = 1;
    msg.msg_iov = &iov;
    iov.iov_base = ubuf;
   //將msg_name指標指向address, 後面呼叫中,為msg_name賦值時address便得到賦值
    msg.msg_name = (struct sockaddr *)&address;
    msg.msg_namelen = sizeof(address);
    if (sock->file->f_flags & O_NONBLOCK)
        flags |= MSG_DONTWAIT;

/*使用者層的呼叫 packet_len = recvfrom(handle->fd, bp + offset,
	handle->bufsize - offset, MSG_TRUNC,
	(struct sockaddr *) &from, &fromlen); */
    if (err >= 0 && addr != NULL) {
        err2 = move_addr_to_user((struct sockaddr *)&address,
                     msg.msg_namelen, addr, addr_len);
        if (err2 < 0)
            err = err2;


int sock_recvmsg(struct socket *sock, struct msghdr *msg,
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
                 struct msghdr *msg, size_t size, int flags)
    int err;
    struct sock_iocb *si = kiocb_to_siocb(iocb);
    si->sock = sock;
    si->scm = NULL;
    si->msg = msg;
    si->size = size;
    si->flags = flags;
    err = security_socket_recvmsg(sock, msg, size, flags);
    if (err)
        return err;
    return sock->ops->recvmsg(iocb, sock, msg, size, flags);

由於activate_new()裡面建立了 PF_PACKET協議的socket, 所以,linux會呼叫建立PF_PACKET的底層模組af_packet來響應recvmsg。 在linux啟動階段,af_packet模組初始化完成後,會填充ops->recvmsg等函式指標,對上層/net/sock完成介面對接。


static int __init packet_init(void)

static struct net_proto_family packet_family_ops = {
    .family =   PF_PACKET,
    .create =   packet_create,
    .owner  =   THIS_MODULE,

static int packet_create(struct net *net, struct socket *sock, int protocol)
    sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
    if (sk == NULL)
        goto out;
    sock->ops = &packet_ops; 
    return 0;

在packet_ops(型別為struct proto_ops, 由packet_create賦給sock->ops)裡, 函式指標recvmsg被填充為實現函式packet_recvmsg

static const struct proto_ops packet_ops = {
    .family =   PF_PACKET,
    .owner =    THIS_MODULE,
    .release =  packet_release,
    .bind =     packet_bind,
    .connect =  sock_no_connect,
    .socketpair =   sock_no_socketpair,
    .accept =   sock_no_accept,
    .getname =  packet_getname,
    .poll =     packet_poll,
    .ioctl =    packet_ioctl,
    .listen =   sock_no_listen,
    .shutdown = sock_no_shutdown,
    .setsockopt =   packet_setsockopt,
    .getsockopt =   packet_getsockopt,
    .sendmsg =  packet_sendmsg,
    .recvmsg =  packet_recvmsg,
    .mmap =     packet_mmap,
    .sendpage = sock_no_sendpage,

packet_recvmsg 封裝了接收報文並將資料拷貝到使用者層的全部動作:

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
              struct msghdr *msg, size_t len, int flags)
    struct sock *sk = sock->sk;
    struct sk_buff *skb;
    int copied, err;
    struct sockaddr_ll *sll;
    skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
    if (skb == NULL)        
        goto out;           
    copied = skb->len;
    if (copied > len) {
        copied = len;

   //第二步, 將獲取到的資料skb拷貝到iov裡,即完成資料對使用者層的傳遞
    err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
    if (err)    
        goto out_free;
    sock_recv_timestamp(msg, sk, skb);
/*將skb的cb裡儲存的位址(sa)拷貝給msg->msg_name, 這樣在net/socket.c的
move_addr_to_user((struct sockaddr *)&address, msg.msg_namelen, addr, addr_len)
就可以將此msg_name 傳給使用者層 。*/    
    if (msg->msg_name)
        memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
    if (pkt_sk(sk)->auxdata) {
        struct tpacket_auxdata aux;
        aux.tp_status = TP_STATUS_USER;
        if (skb->ip_summed == CHECKSUM_PARTIAL)
            aux.tp_status |= TP_STATUS_CSUMNOTREADY;
        aux.tp_len = PACKET_SKB_CB(skb)->origlen;
        aux.tp_snaplen = skb->len;
        aux.tp_mac = 0;
        aux.tp_net = skb_network_offset(skb);
        aux.tp_vlan_tci = skb->vlan_tci;
        put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
    err = (flags&MSG_TRUNC) ? skb->len : copied;
    return err;


struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
                  int noblock, int *err)
    int peeked;
    return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),                                                                
                   &peeked, err);
__skb_recv_datagram的作用就是接收一個數據報快取的資料結構,本文的分析就到__skb_recv_datagram從sk->sk_receive_queue 中取得skb結構資料為止,至於這個接收佇列是由誰建立的,傳送端在哪裡,後續介紹。       
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
                    int *peeked, int *err)
    struct sk_buff *skb;
    long timeo;
    int error = sock_error(sk);
    if (error)
        goto no_packet;

    do {
        unsigned long cpu_flags;
        spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);

        skb = skb_peek(&sk->sk_receive_queue);
        if (skb) {
            *peeked = skb->peeked;
            if (flags & MSG_PEEK) {
                skb->peeked = 1;
            } else
                __skb_unlink(skb, &sk->sk_receive_queue);
        spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
        if (skb)
            return skb;
/*如果peek時沒有資料到達,在阻塞情況下,等待一定時間,當達到超時時間還沒有接收到資料,透過err返回錯誤碼,退出本函式; 在非阻塞情況下,timeo為0,直接報錯後退出*/
        error = -EAGAIN;
        if (!timeo)
            goto no_packet;
    } while (!wait_for_packet(sk, err, &timeo));
    return NULL;
    *err = error;
    return NULL;

接收到skb後,呼叫skb_copy_datagram_iovec 將其拷貝到msg的iov裡

struct iovec{  
    void __user *iov_base;  		//快取的首地址
    __kernel_size_t iov_len; 		//快取可用的大小
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
                struct iovec *to, int len)
    int start = skb_headlen(skb);
    int i, copy = start – offset;
    struct sk_buff *frag_iter;
    trace_skb_copy_datagram_iovec(skb, len);
    if (copy > 0) {
        if (copy > len)
            copy = len;
        if (memcpy_toiovec(to, skb->data + offset, copy))
            goto fault;
        if ((len -= copy) == 0)
            return 0;
        offset += copy;
        int end;
        WARN_ON(start > offset + len);
        end = start + frag_iter->len;
        if ((copy = end - offset) > 0) {
            if (copy > len)
                copy = len;
            if (skb_copy_datagram_iovec(frag_iter,t,
                goto fault;
            if ((len -= copy) == 0)
                return 0;
            offset += copy;
        start = end;
    if (!len)
        return 0; 
    return -EFAULT;

int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len){       
    while (len > 0) { 
        if (iov->iov_len) {
/*如果iov的iov_len大於len, 說明iov的快取區還可以容納資料,那麼設定本次拷貝大小為len*/
            if (copy_to_user(iov->iov_base, kdata, copy))
                return -EFAULT;
            kdata += copy;
            len -= copy;
		//每次拷貝後, 將iov_len減去已經使用的長度
            iov->iov_len -= copy;
            iov->iov_base += copy;
    return 0;


pcap_open_live 呼叫pcap_create()建立pcap_t並設定啟用函式pcap_activate_linux;隨後pcap_activate()呼叫該啟用函式,為pcap_t填充read_op等函式指標,並建立socket與linux底層模組af_packet通訊。 

pcap_loop 呼叫了read_op的實現函式 pcap_read_linux, pcap_read_linux 裡面(經由上文的pcap_read_packet)使用了recvfrom 獲取乙太網原始資料,linux的af_packet模組會響應並完成recvfrom動作;recvfrom完成後呼叫callback指向的函式處理這些資料,callback指標是在tcpdump裡根據具體鏈路層型別賦值的。
