Linux環境下libpcap庫原始碼分析
阿新 • • 發佈:2019-01-08
linux環境下libpcap 原始碼分析 韓大衛@吉林師範大學 libpcap 原始碼官方下載地址: git clone https://github.com/the-tcpdump-group/libpcap.git tcpdump原始碼官方下載地址: git clone git://bpf.tcpdump.org/tcpdump tcpdump.c使用libpcap裡的pcap_open_live和pcap_loop 完成兩個最關鍵的動作:獲取捕獲報文的介面,和捕獲報文並將報文交給callback。 (關於tcpdump原始碼的構架,請參考作者的tcpdump原始碼分析) 現結合libpcap原始碼分析pcap_open_live和pcap_loop的實現機制,並進入linux核心,展示linux核心對這兩個API的響應動作。 tcpdump.c對pcap_open_live的使用是: pd = pcap_open_live(device, snaplen, !pflag, 1000, ebuf); pcap_open_live定義如下: pcap_t *pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf) source 為指定的網路介面。 snaplen 為最大報文長度。 promisc 表示是否將裝置設定為混雜模式。 to_ms 為超時時間(單位為毫秒)。 errbuf 為存放錯誤資訊描述字串的緩衝區。 返回值為pcap_t型別的指標,pcap_t 定義是: typedef struct pcap pcap_t; struct pcap { /*typedef int (*read_op_t)(pcap_t *, int cnt, pcap_handler, u_char *); read_op為從網路介面讀取報文的函式指標,待其得到賦值後,呼叫實現函式*/ read_op_t read_op; //從檔案裡讀取報文的函式指標 int (*next_packet_op)(pcap_t *, struct pcap_pkthdr *, u_char **); //檔案描述符,即socket int fd; int selectable_fd; int bufsize; //read緩衝區大小 u_char *buffer; //read緩衝區指標 u_char *bp; int cc; ... int snapshot; int linktype; /* Network linktype */ int linktype_ext; int tzoff; /* timezone offset */ int offset; /* offset for proper alignment */ int activated; /* true if the capture is really started */ int oldstyle; /* if we're opening with pcap_open_live() */ struct pcap_opt opt; u_char *pkt; ... //啟用函式,啟用函式在得到呼叫後,會建立起與底層IPC的socket activate_op_t activate_op; ... }; pcap_t * pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf){ pcap_t *p; int status;
//建立捕獲報文的介面控制代碼
p = pcap_create(source, errbuf); if (p == NULL) return (NULL); //設定最大報文長度 status = pcap_set_snaplen(p, snaplen); if (status < 0) goto fail; //將裝置設為混雜模式 status = pcap_set_promisc(p, promisc); if (status < 0) goto fail; //設定超時時間 status = pcap_set_timeout(p, to_ms); if (status < 0) goto fail; p->oldstyle = 1; //pcap_avtivate呼叫pcap_t的activate_op, 建立起與底層IPC通道 status = pcap_activate(p); if (status < 0) goto fail; return (p); ... } pcap_t *pcap_create(const char *source, char *errbuf){ size_t i; int is_theirs; pcap_t *p; if (source == NULL) source = "any"; //在capture_source_types數組裡尋找是否有特定API集合的介面對應source for (i = 0; capture_source_types[i].create_op != NULL; i++) { is_theirs = 0; p = capture_source_types[i].create_op(source, errbuf, &is_theirs); if (is_theirs) { return (p); } } //如果沒有, 那麼就將source作為普通網路介面 return (pcap_create_interface(source, errbuf)); } pcap_create_interface() 函式在libpcap下有多個實現,可由編譯巨集來指定特定的pcap_create_interface來初始化read_op等函式指標。linux環境裡預設是libpcap/pcap-linux.c中的 pcap_create_interface(): pcap_t * pcap_create_interface(const char *device, char *ebuf) { pcap_t *handle; /*可將 pcap_create_common看做pcap_t結構的建構函式,初始化一個pcap_t*/ handle = pcap_create_common(device, ebuf, sizeof (struct pcap_linux)); if (handle == NULL) return NULL; //為pcap_t 的啟用函式指標填充具體實現函式 handle->activate_op = pcap_activate_linux; handle->can_set_rfmon_op = pcap_can_set_rfmon_linux; return handle; } 完成後回到pcap_open_live,設定snaplen,promisc,to_ms後,呼叫status = pcap_activate(p),該函式執行status = p->activate_op(p) , 進而呼叫 pcap_activate_linux(), 完成read_op等重要函式指標的具體賦值。 static int pcap_activate_linux(pcap_t *handle) { struct pcap_linux *handlep = handle->priv; const char *device; int status = 0; device = handle->opt.source; handle->inject_op = pcap_inject_linux; handle->setfilter_op = pcap_setfilter_linux; handle->setdirection_op = pcap_setdirection_linux; handle->set_datalink_op = pcap_set_datalink_linux; handle->getnonblock_op = pcap_getnonblock_fd; handle->setnonblock_op = pcap_setnonblock_fd; handle->cleanup_op 
= pcap_cleanup_linux; //最重要的函式指標read_op handle->read_op = pcap_read_linux; handle->stats_op = pcap_stats_linux; if (strcmp(device, "any") == 0) { if (handle->opt.promisc) { handle->opt.promisc = 0; /* Just a warning. */ snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Promiscuous mode not supported on the \"any\" device"); status = PCAP_WARNING_PROMISC_NOTSUP; } } handlep->device = strdup(device); if (handlep->device == NULL) { snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "strdup: %s", pcap_strerror(errno) ); return PCAP_ERROR; } handlep->timeout = handle->opt.timeout; if (handle->opt.promisc) handlep->proc_dropped = linux_if_drops(handlep->device); //先使用activete_new() status = activate_new(handle); if (status < 0) { goto fail; } //根據錯誤值具體處理 if (status == 1) { switch (activate_mmap(handle, &status)) { case 1: return status; case 0: break; case -1: goto fail; } } //如果status為0, 再嘗試使用activete_old()函式 else if (status == 0) { /* Non-fatal error; try old way */ if ((status = activate_old(handle)) != 1) { goto fail; } } status = 0; if (handle->opt.buffer_size != 0) { //設定socket的緩衝區和緩衝區長度 if (setsockopt(handle->fd, SOL_SOCKET, SO_RCVBUF, &handle->opt.buffer_size, sizeof(handle->opt.buffer_size)) == -1) { snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "SO_RCVBUF: %s", pcap_strerror(errno)); status = PCAP_ERROR; goto fail; } } handle->selectable_fd = handle->fd; return status; ... } static int activate_new(pcap_t *handle) { struct pcap_linux *handlep = handle->priv; const char *device = handle->opt.source; int is_any_device = (strcmp(device, "any") == 0); int sock_fd = -1, arptype; int err = 0; struct packet_mreq mr; /*指定網口情況下用PF_PACKET協議通訊得到原始乙太網資料幀資料 關於socket()函式,我個人認為可以將其理解為open(): open()開啟不同的檔案,這樣在返回的控制代碼裡就可使用這個檔案裝置模組提供的ops, socket()開啟不同的協議,返回控制代碼裡也包括了該協議的底層模組提供的ops. 只不過linux下面沒法將網路協議當作普通檔案(如/dev/xx)處理,所以才有了另一套socket特定的APIs*/ sock_fd = is_any_device ? socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL)) : socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); ... 
handlep->sock_packet = 0; /*iface_get_id()使用ioctl(fd, SIOCGIFINDEX, &ifr)獲取lo還回裝置的索引值*/ handlep->lo_ifindex = iface_get_id(sock_fd, "lo", handle->errbuf); handle->offset = 0; if (!is_any_device) { handlep->cooked = 0; if (handle->opt.rfmon) { err = enter_rfmon_mode(handle, sock_fd, device); if (err < 0) { close(sock_fd); return err; } if (err == 0) { close(sock_fd); return PCAP_ERROR_RFMON_NOTSUP; } if (handlep->mondevice != NULL) device = handlep->mondevice; } /*iface_get_arptype()呼叫ioctl(fd, SIOCGIFHWADDR, &ifr)獲取硬體型別 */ arptype = iface_get_arptype(sock_fd, device, handle->errbuf); if (arptype < 0) { close(sock_fd); return arptype; } map_arphrd_to_dlt(handle, arptype, 1); ... //獲取指定裝置的索引值 handlep->ifindex = iface_get_id(sock_fd, device, handle->errbuf); if (handlep->ifindex == -1) { close(sock_fd); return PCAP_ERROR; /*iface_bind()將裝置的索引值作為struct socketadd_ll的索引值與socket繫結 struct sockaddr_ll sll; sll.sll_family = AF_PACKET; sll.sll_ifindex = ifindex; sll.sll_protocol = htons(ETH_P_ALL); bind(fd, (struct sockaddr *) &sll, sizeof(sll)) == -1 */ if ((err = iface_bind(sock_fd, handlep->ifindex, handle->errbuf)) != 1) { close(sock_fd); if (err < 0) return err; else return 0; /* try old mechanism */ } ... } if (!is_any_device && handle->opt.promisc) { memset(&mr, 0, sizeof(mr)); mr.mr_ifindex = handlep->ifindex; mr.mr_type = PACKET_MR_PROMISC; if (setsockopt(sock_fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr)) == -1) { snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "setsockopt: %s", pcap_strerror(errno)); close(sock_fd); return PCAP_ERROR; } } if (handlep->cooked) { if (handle->snapshot < SLL_HDR_LEN + 1) handle->snapshot = SLL_HDR_LEN + 1; } handle->bufsize = handle->snapshot; //根據乙太網鏈路層型別決定VLAN Tag在報文中的偏移值 switch (handle->linktype) { case DLT_EN10MB: handlep->vlan_offset = 2 * ETH_ALEN; break; case DLT_LINUX_SLL: handlep->vlan_offset = 14; break; default: handlep->vlan_offset = -1; /* unknown */ break; } //將sock_fd作為pcap_t的fd handle->fd = sock_fd; ... 
} 至此,通過pcap_open_live完成全部準備階段的內容, 之後就可以使用pcap_loop()來獲取來自底層的資料並提交給callback函式進行應用處理, tcpdump.c 對pcap_loop的使用是: status = pcap_loop(pd, cnt, callback, pcap_userdata); //cnt 為指定捕獲報文的個數 在libpcap/pcap.c裡有pcap_loop的定義: int pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) { register int n; for (;;) { if (p->rfile != NULL) { //從檔案裡讀取報文 n = pcap_offline_read(p, cnt, callback, user); } else { //從指定網口讀取報文 do { //在linux下read_op即為pcap_read_linux,其內部呼叫pcap_read_packet n = p->read_op(p, cnt, callback, user); } while (n == 0); } //當n<=0時(出錯、被中斷,或離線檔案已讀完)退出迴圈,退出pcap_loop if (n <= 0) return (n); //如果達到捕獲報文個數,退出pcap_loop if (cnt > 0) { cnt -= n; if (cnt <= 0) return (0); } } }
在linux下,函式指標read_op指向的是pcap_read_linux,而pcap_read_linux直接呼叫pcap_read_packet完成實際的報文讀取:
static int
pcap_read_packet(pcap_t *handle, pcap_handler callback, u_char *userdata { struct pcap_linux *handlep = handle->priv; u_char *bp; struct sockaddr_ll from; if (handle->break_loop) { handle->break_loop = 0; return PCAP_ERROR_BREAK; } fromlen = sizeof(from); //從socket接收資料存入bp指向的快取區, 每次最大資料長度為bufsize,MSG_TRUNC表示返回報文的實際長度 packet_len = recvfrom( (struct sockaddr *) &from, &fromlen); } while (packet_len == -1 && errno == EINTR); ... caplen = packet_len; if (caplen > handle->snapshot) caplen = handle->snapshot; //捕獲報文時的資訊 pcap_header.caplen = caplen; pcap_header.len = packet_len; handlep->packets_read++; //將資料內容bp交給函式指標callback指向的函式處理 callback(userdata, &pcap_header, bp); return 1; } Linux核心對recvfrom 的響應: net/socket.c SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned, flags, struct sockaddr __user *, addr, int __user *, addr_len) { struct socket *sock; struct iovec iov; struct msghdr msg; struct sockaddr_storage address; int err, err2; int fput_needed; if (size > INT_MAX) size = INT_MAX; if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_iovlen = 1; //將iov作為msg的快取區資料結構,使得iov可以跟隨著msg一起作為引數傳遞下去 msg.msg_iov = &iov; iov.iov_base = ubuf; //將msg_name指標指向address, 後面呼叫中,為msg_name賦值時address便得到賦值 msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = sizeof(address); if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; /*使用者層的呼叫 packet_len = recvfrom(handle->fd, bp + offset, handle->bufsize - offset, MSG_TRUNC, //對recvfrom()裡from和fromlen的賦值,此時address已得到賦值 if (err >= 0 && addr != NULL) { err2 = move_addr_to_user((struct sockaddr *)&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } ... } sock_recvmsg()會呼叫sock裡的函式指標集合ops裡的recvmsg,這個函式指標在不同的模組下有不同的實現函式: int sock_recvmsg(struct socket *sock, struct msghdr *msg, ... 
} static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) int err; struct sock_iocb *si = kiocb_to_siocb(iocb); si->sock = sock; si->scm = NULL; si->msg = msg; si->size = size; si->flags = flags; err = security_socket_recvmsg(sock, msg, size, flags); if (err) return err; return sock->ops->recvmsg(iocb, sock, msg, size, flags); } 由於activate_new()裡面建立了 PF_PACKET協議的socket, 所以,linux會呼叫建立PF_PACKET的底層模組af_packet來響應recvmsg。 在linux啟動階段,af_packet模組初始化完成後,會填充ops->recvmsg等函式指標,對上層/net/sock完成介面對接。 net/packet/af_packet.c static int __init packet_init(void) sock_register(&packet_family_ops); register_pernet_subsys(&packet_net_ops); register_netdevice_notifier(&packet_netdev_notifier); } static struct net_proto_family packet_family_ops = { //PF_PACKET即AF_PACKET,數值為17 .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static int packet_create(struct net *net, struct socket *sock, int protocol) sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; //為socket的ops指標集合填充實現函式。完成介面對接。 sock->ops = &packet_ops; ... return 0; } 在packet_ops裡有對struct sock的函式指標recvmsg填充實現函式packet_recvmsg static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, .sendpage = sock_no_sendpage, }; packet_recvmsg 封裝了接受報文並並將資料拷貝到使用者層全部動作: static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; struct sockaddr_ll *sll; ... 
//第一步,從skb接收佇列裡取得資料交給skb快取 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out; ... copied = skb->len; if (copied > len) { copied = len; } //第二步, 將獲取到的資料skb拷貝到iov裡,即完成資料對使用者層的傳遞 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (err) goto out_free; sock_recv_timestamp(msg, sk, skb); /*將skb裡的cb拷貝給msg->msg_name, 這樣在net/socket.c的 move_addr_to_user((struct sockaddr *)&address, 就可以將此msg_name 傳給使用者層 。*/ if (msg->msg_name) memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, if (pkt_sk(sk)->auxdata) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; aux.tp_len = PACKET_SKB_CB(skb)->origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); aux.tp_vlan_tci = skb->vlan_tci; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); err = (flags&MSG_TRUNC) ? skb->len : copied; ... return err; } net/core/datagram.c struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err) int peeked; return __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), &peeked, err); } __skb_recv_datagram的作用就是接收一個數據報快取的資料結構,本文的分析就到__skb_recv_datagram從sk->sk_receive_queue 中取得skb結構資料為止,至於這個接收佇列是由誰建立的,傳送端在哪裡,後續介紹。 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *err) struct sk_buff *skb; long timeo; int error = sock_error(sk); if (error) goto no_packet; do { unsigned long cpu_flags; //保證程序動作唯一,上spin鎖 spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); //檢視skb的*next指標時候有值,即是否有報文來到,有的話返回指標,沒有返回NULL skb = skb_peek(&sk->sk_receive_queue); if (skb) { *peeked = skb->peeked; if (flags & MSG_PEEK) { skb->peeked = 1; } else //如果不是MSG_PEEK(檢視動作)的話,那麼在sk的接收佇列中後移skb,即操作新的skb __skb_unlink(skb, &sk->sk_receive_queue); } //解spin鎖 spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); //有資料的話返回資料的快取 if (skb) return skb; /*如果peek時沒有資料到到,在阻塞情況下,等待一定時間,當達到超時時間還沒有接收到資料,向err傳送錯誤型別報告,退出本函式; 在非阻塞情況下,timeo為0,直接報錯後退出*/ error = -EAGAIN; if (!timeo) goto no_packet; //按照timeo的數值阻塞本程序,在timeo時間內持續執行do...while } while (!wait_for_packet(sk, err, &timeo)); return NULL; no_packet: *err = error; return NULL; } 接收到skb後,呼叫skb_copy_datagram_iovec 將其拷貝到msg的iov裡 struct iovec{ void __user *iov_base; //快取的首地址 __kernel_size_t iov_len; //快取可用的大小 }; int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, struct iovec *to, int len) { //報文頭部長度 int start = skb_headlen(skb); int i, copy = start – offset; struct sk_buff *frag_iter; trace_skb_copy_datagram_iovec(skb, len); //複製報文頭部 if (copy > 0) { if (copy > len) copy = len; //將skb的copy長度(報文頭部)的資料快取複製到iov裡,完成對使用者層資料的傳遞 if (memcpy_toiovec(to, skb->data + offset, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } ... 
int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; //遞迴呼叫skb_copy_datagram_iovec,offset-start表示當前分片報文的長度 if (skb_copy_datagram_iovec(frag_iter,t, goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len){ while (len > 0) { if (iov->iov_len) { /*如果iov的iov_len大於len, 說明iov的快取區還可以接受資料,那麼設定本次拷貝大小為len //將kdata拷貝到iov的base地址,長度為len,即將資料拷貝到使用者層 if (copy_to_user(iov->iov_base, kdata, copy)) return -EFAULT; //每次拷貝後,kdata地址後移copy長度 kdata += copy; len -= copy; //每次拷貝後, 將iov_len減去已經使用的長度 iov->iov_len -= copy; //每次拷貝後,移動iov的base地址 iov->iov_base += copy; } iov++; } return 0; } 總結: pcap_open_live 呼叫pcap_create()來為pcap_t填充read_op等函式指標,並提供了啟用函式pcap_activate_linux,建立了socket與linux底層模組af_packet通訊。 pcap_loop 呼叫了read_op的實現函式 pcap_read_linux, pcap_read_linux 裡面使用了recvfrom 獲取乙太網原始資料,linux的af_packet模組會響應並完成recvfrom動作;recvfrom完成後呼叫callback指向的函式處理這些資料,callback指標的賦值是在tcpdump里根據具體鏈路層環境賦值的。 歡迎大家交流,不足之處請不吝指正,給予批評!