现在的位置: 首页 > 综合 > 正文

Linux环境下libpcap库源代码分析

2013年12月10日 ⁄ 综合 ⁄ 共 14163字 ⁄ 字号 评论关闭
linux环境下libpcap 源代码分析

韩大卫@吉林师范大学


libpcap 源代码官方下载地址:
git clone https://github.com/the-tcpdump-group/libpcap.git

tcpdump源代码官方下载地址:
git clone git://bpf.tcpdump.org/tcpdump

tcpdump.c使用libpcap里的pcap_open_live和pcap_loop完成两个最关键的动作:获取捕获报文的接口,以及捕获报文并将报文交给callback(关于tcpdump源代码的构架,请参考作者的tcpdump源代码分析)。
 
现结合libpcap源代码分析pcap_open_live和pcap_loop的实现机制,并进入linux内核,展示linux内核对这两个API的响应动作。

tcpdump.c里pcap_open_live的使用是:

pd = pcap_open_live(device, snaplen, !pflag, 1000, ebuf); 

pcap_open_live定义如下:

pcap_t *pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf)

source 	为指定的网络接口。
snaplen 	为最大报文长度。
promisc 	是否将设备设置为混杂模式。
to_ms 	超时时间。
errbuf 	为错误信息描述字符。

返回值为pcap_t类型的指针,pcap_t 定义是:

/* Capture handle (excerpt; "..." marks members omitted by the article). */
typedef struct pcap pcap_t;
struct pcap {
/* typedef int (*read_op_t)(pcap_t *, int cnt, pcap_handler, u_char *);
 * read_op is the function pointer used to read packets from a network
 * interface; once it has been assigned, calls go to the concrete
 * implementation. */
    read_op_t read_op;
 
// Function pointer used to read packets from a savefile
    int (*next_packet_op)(pcap_t *, struct pcap_pkthdr *, u_char **);
// File descriptor (the socket)
    int fd;
    int selectable_fd;   
    int bufsize;    // size of the read buffer
    u_char *buffer; // pointer to the read buffer
    u_char *bp;
    int cc;
...
    int snapshot;
    int linktype;       /* Network linktype */
    int linktype_ext;      
    int tzoff;      /* timezone offset */
    int offset;     /* offset for proper alignment */
    int activated;      /* true if the capture is really started */
    int oldstyle;       /* if we're opening with pcap_open_live() */
    struct pcap_opt opt; 
    u_char *pkt;
...
   // Activation hook: when called, it sets up the socket used to talk to
   // the kernel
    activate_op_t activate_op;
...
};

/*
 * Open a live capture on `source` (excerpt; the "fail" label and its
 * cleanup are omitted by the article, marked "...").
 *
 * source   interface name, passed to pcap_create()
 * snaplen  value passed to pcap_set_snaplen()
 * promisc  value passed to pcap_set_promisc()
 * to_ms    value passed to pcap_set_timeout()
 * errbuf   buffer that receives an error description
 *
 * Returns the activated handle, or NULL if pcap_create() fails.
 */
pcap_t *
pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf){   
    pcap_t *p;
    int status;
   // Create the capture handle for the interface
    p = pcap_create(source, errbuf);

    if (p == NULL)
        return (NULL);
    // Set the maximum captured packet length
    status = pcap_set_snaplen(p, snaplen);
    if (status < 0)
        goto fail;
	// Configure promiscuous mode
    status = pcap_set_promisc(p, promisc);
    if (status < 0)
        goto fail;
	// Set the read timeout
    status = pcap_set_timeout(p, to_ms);
    if (status < 0)
        goto fail;
    // Mark the handle as opened through the old-style pcap_open_live() API
    p->oldstyle = 1;
	// pcap_activate() invokes the handle's activate_op, which sets up the
	// channel to the kernel
    status = pcap_activate(p);

    if (status < 0)
        goto fail;
    return (p);
...
}

/*
 * Create (but do not activate) a capture handle for `source`.
 *
 * A NULL source is treated as the "any" device.  Each entry of
 * capture_source_types is asked in turn whether the source belongs to it;
 * the first entry that claims it (by setting its "is theirs" flag) decides
 * the result, even when that result is NULL.  If no entry claims the
 * source, it is handled as an ordinary network interface.
 *
 * source  device name, or NULL
 * errbuf  buffer that receives an error description
 */
pcap_t *pcap_create(const char *source, char *errbuf)
{
    const char *name = (source != NULL) ? source : "any";
    size_t idx = 0;

    /* Walk the table of special capture sources until the sentinel entry. */
    while (capture_source_types[idx].create_op != NULL) {
        int claimed = 0;
        pcap_t *handle = capture_source_types[idx].create_op(name, errbuf, &claimed);

        if (claimed)
            return handle;
        idx++;
    }

    /* Nobody claimed it: treat the source as a regular network interface. */
    return pcap_create_interface(name, errbuf);
}
pcap_create_interface() 函数在libpcap下有多个实现,可由编译宏来指定特定的pcap_create_interface来初始化read_op等函数指针。linux环境里默认是libpcap/pcap-linux.c中的 pcap_create_interface():

/*
 * Linux implementation of pcap_create_interface().
 *
 * Allocates a pcap_t through pcap_create_common() (with room for a
 * struct pcap_linux) and installs the Linux activation and rfmon-query
 * hooks on it.
 *
 * device  interface name
 * ebuf    buffer that receives an error description
 *
 * Returns the new handle, or NULL if pcap_create_common() fails.
 */
pcap_t *
pcap_create_interface(const char *device, char *ebuf)
{
    pcap_t *p = pcap_create_common(device, ebuf, sizeof (struct pcap_linux));

    if (p != NULL) {
        /* Hook up the Linux-specific implementations. */
        p->activate_op = pcap_activate_linux;
        p->can_set_rfmon_op = pcap_can_set_rfmon_linux;
    }
    return p;
}

完成后回到pcap_open_live,设置snaplen,promisc,to_ms后,调用status = pcap_activate(p),该函数执行status = p->activate_op(p) 
进而调用 pcap_activate_linux(), 完成read_op等重要函数指针的具体赋值。

/*
 * Activation hook for Linux capture handles (excerpt; the "fail" label and
 * its cleanup are omitted by the article, marked "...").
 *
 * Installs the Linux implementations of the handle's operation pointers
 * (read_op, inject_op, setfilter_op, ...), records the device name and
 * timeout in the private struct pcap_linux, then opens the capture socket
 * via activate_new(), falling back to activate_old().
 *
 * Returns 0 on success, PCAP_ERROR on the failures visible here; other
 * status values set by the helpers are passed through.
 */
static int
 pcap_activate_linux(pcap_t *handle)
{   
    struct pcap_linux *handlep = handle->priv;
    const char  *device;
    int     status = 0;
    
    device = handle->opt.source;
    
    handle->inject_op = pcap_inject_linux;
    handle->setfilter_op = pcap_setfilter_linux;
    handle->setdirection_op = pcap_setdirection_linux;
    handle->set_datalink_op = pcap_set_datalink_linux;
    handle->getnonblock_op = pcap_getnonblock_fd;
    handle->setnonblock_op = pcap_setnonblock_fd;
    handle->cleanup_op = pcap_cleanup_linux;
	// The most important hook: read_op, used by pcap_loop()
    handle->read_op = pcap_read_linux;

    handle->stats_op = pcap_stats_linux;

    // Promiscuous mode is dropped (with a warning status) on the "any" device
    if (strcmp(device, "any") == 0) {
        if (handle->opt.promisc) {
            handle->opt.promisc = 0;
            /* Just a warning. */
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                "Promiscuous mode not supported on the \"any\" device");
            status = PCAP_WARNING_PROMISC_NOTSUP;
        }
    }  
    // Keep a private copy of the device name
    handlep->device = strdup(device);
    if (handlep->device == NULL) {
        snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "strdup: %s",
             pcap_strerror(errno) );
        return PCAP_ERROR;
    }
    
    handlep->timeout = handle->opt.timeout;

    if (handle->opt.promisc)
        handlep->proc_dropped = linux_if_drops(handlep->device);

    // Try activate_new() first
    status = activate_new(handle);
    if (status < 0) {
        goto fail;
    }
    // status == 1: activate_new() succeeded; continue with activate_mmap(),
    // which may update status.
    // NOTE(review): the meaning of activate_mmap()'s 1/0/-1 results is not
    // shown in this excerpt.
    if (status == 1) {
        switch (activate_mmap(handle, &status)) {
        case 1:
            return status;
        case 0:

           break;
    
        case -1:
            goto fail;
        }
    }
	// status == 0: fall back to activate_old()
    else if (status == 0) {
        /* Non-fatal error; try old way */
        if ((status = activate_old(handle)) != 1) {
            goto fail;
        }
    }
    status = 0;
    if (handle->opt.buffer_size != 0) {
        // Apply the requested receive buffer size to the socket (SO_RCVBUF)
        if (setsockopt(handle->fd, SOL_SOCKET, SO_RCVBUF,
            &handle->opt.buffer_size,
            sizeof(handle->opt.buffer_size)) == -1) {
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                 "SO_RCVBUF: %s", pcap_strerror(errno));
            status = PCAP_ERROR;
            goto fail;
        }
    }
     // The capture socket is also the descriptor exposed for select()/poll()
     handle->selectable_fd = handle->fd;
   
    return status;
...
}     


/*
 * Open and configure a PF_PACKET capture socket for the handle (excerpt;
 * "..." marks code omitted by the article, including the final return).
 *
 * Return values visible in this excerpt: a negative PCAP_ERROR* code on
 * failure, or 0 when iface_bind() reports a non-fatal problem and the
 * caller should try the old mechanism.
 *
 * Fix relative to the original excerpt: the block opened by
 * "if (handlep->ifindex == -1) {" was never closed, which placed the
 * iface_bind() call inside the error branch after a return.
 */
static int
activate_new(pcap_t *handle)
{
    struct pcap_linux *handlep = handle->priv;
    const char  *device = handle->opt.source;
    int         is_any_device = (strcmp(device, "any") == 0);
    int         sock_fd = -1, arptype;
    int         err = 0;
    struct packet_mreq  mr;

    /*
     * A named interface gets a SOCK_RAW packet socket, which delivers the
     * raw link-layer frame; the "any" device gets SOCK_DGRAM instead.
     *
     * The author suggests thinking of socket() as an analogue of open():
     * open() on a file yields a handle backed by that device module's
     * ops, and socket() on a protocol yields a handle backed by the ops of
     * the module implementing that protocol.  Linux simply does not expose
     * network protocols as ordinary files (such as /dev/xx), hence the
     * separate socket API.
     */
    sock_fd = is_any_device ?
        socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL)) :
        socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

...
    handlep->sock_packet = 0;

    /* iface_get_id() uses ioctl(fd, SIOCGIFINDEX, &ifr); here it fetches
     * the index of the "lo" loopback device. */
    handlep->lo_ifindex = iface_get_id(sock_fd, "lo", handle->errbuf);

    handle->offset   = 0;

    if (!is_any_device) {
        handlep->cooked = 0;

        if (handle->opt.rfmon) {
            err = enter_rfmon_mode(handle, sock_fd, device);
            if (err < 0) {
                close(sock_fd);
                return err;
            }
            if (err == 0) {
                close(sock_fd);
                return PCAP_ERROR_RFMON_NOTSUP;
            }

            if (handlep->mondevice != NULL)
                device = handlep->mondevice;
        }
        /* iface_get_arptype() calls ioctl(fd, SIOCGIFHWADDR, &ifr) to get
         * the hardware type. */
        arptype = iface_get_arptype(sock_fd, device, handle->errbuf);
        if (arptype < 0) {
            close(sock_fd);
            return arptype;
        }
        map_arphrd_to_dlt(handle, arptype, 1);
  ...

        /* Look up the index of the requested device. */
        handlep->ifindex = iface_get_id(sock_fd, device,
            handle->errbuf);

        if (handlep->ifindex == -1) {
            close(sock_fd);
            return PCAP_ERROR;
        }

        /*
         * iface_bind() binds the socket to the device by index through a
         * struct sockaddr_ll:
         *     struct sockaddr_ll  sll;
         *     sll.sll_family      = AF_PACKET;
         *     sll.sll_ifindex     = ifindex;
         *     sll.sll_protocol    = htons(ETH_P_ALL);
         *     bind(fd, (struct sockaddr *) &sll, sizeof(sll)) == -1
         */
        if ((err = iface_bind(sock_fd, handlep->ifindex,
            handle->errbuf)) != 1) {
            close(sock_fd);
            if (err < 0)
                return err;
            else
                return 0;   /* try old mechanism */
        }
...
    }

    /* Ask the kernel for promiscuous mode on the bound interface. */
    if (!is_any_device && handle->opt.promisc) {
        memset(&mr, 0, sizeof(mr));
        mr.mr_ifindex = handlep->ifindex;
        mr.mr_type    = PACKET_MR_PROMISC;
        if (setsockopt(sock_fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
            &mr, sizeof(mr)) == -1) {
            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                "setsockopt: %s", pcap_strerror(errno));
            close(sock_fd);
            return PCAP_ERROR;
        }
    }

    /* In cooked mode the snapshot must cover the SLL header plus at least
     * one byte. */
    if (handlep->cooked) {
        if (handle->snapshot < SLL_HDR_LEN + 1)
            handle->snapshot = SLL_HDR_LEN + 1;
    }
    handle->bufsize = handle->snapshot;

    /* Offset of the VLAN tag within a packet, by link-layer type. */
    switch (handle->linktype) {

    case DLT_EN10MB:
        handlep->vlan_offset = 2 * ETH_ALEN;
        break;

    case DLT_LINUX_SLL:
        handlep->vlan_offset = 14;
        break;

    default:
        handlep->vlan_offset = -1; /* unknown */
        break;
    }

    /* The packet socket becomes the handle's file descriptor. */
    handle->fd = sock_fd;
...
}

至此,通过pcap_open_live完成全部准备阶段的内容, 之后就可以使用pcap_loop()来获取来自底层的数据并提交给callback函数进行应用处理, tcpdump.c里pcap_loop的使用是: 

status = pcap_loop(pd, cnt, callback, pcap_userdata); 
//cnt 为指定捕获报文的个数

libpcap/pcap.c里有pcap_loop的定义:  
  
/*
 * Read packets and hand them to `callback` until `cnt` packets have been
 * processed or a read returns an error/stop indication.
 *
 * p         capture handle
 * cnt       number of packets to capture; a value <= 0 disables the count
 *           check, so the loop only ends when a read returns <= 0
 * callback  function invoked for the packets that are read
 * user      opaque pointer forwarded to the callback
 *
 * Returns 0 once `cnt` packets have been handled, otherwise the
 * non-positive value returned by the read routine.
 */
int
pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
    for (;;) {
        int handled;

        if (p->rfile == NULL) {
            /* Live capture: call the handle's read_op (set during
             * activation) again for as long as it reports zero packets. */
            do {
                handled = p->read_op(p, cnt, callback, user);
            } while (handled == 0);
        } else {
            /* Offline capture: read from the savefile. */
            handled = pcap_offline_read(p, cnt, callback, user);
        }

        /* n <= 0 ends the loop: end of file, break request, or error. */
        if (handled <= 0)
            return (handled);

        /* Stop once the requested number of packets has been reached. */
        if (cnt > 0) {
            cnt -= handled;
            if (cnt <= 0)
                return (0);
        }
    }
}

函数指针read_op指向的是pcap_read_linux(见pcap_activate_linux中的赋值),pcap_read_linux再调用pcap_read_packet完成一次报文读取:

/*
 * Read a single packet from the capture socket and pass it to `callback`
 * (excerpt; "..." marks code omitted by the article, including the setup
 * of bp/offset and the error handling after recvfrom()).
 *
 * Returns 1 after delivering one packet, or PCAP_ERROR_BREAK when a break
 * was requested on the handle.
 *
 * Fixes relative to the original excerpt: "staticint" split into two
 * tokens, closing ")" of the parameter list restored, the opening "do {"
 * of the retry loop restored, the first four recvfrom() arguments restored,
 * and the locals used below declared.
 */
static int
pcap_read_packet(pcap_t *handle, pcap_handler callback, u_char *userdata)
{
    struct pcap_linux   *handlep = handle->priv;
    u_char              *bp;
    int                 offset;
    struct sockaddr_ll  from;
    socklen_t           fromlen;
    int                 packet_len, caplen;
    struct pcap_pkthdr  pcap_header;
...
    do {
        /* Honour a pending break request before blocking in recvfrom(). */
        if (handle->break_loop) {
            handle->break_loop = 0;
            return PCAP_ERROR_BREAK;
        }
        fromlen = sizeof(from);
        /*
         * Receive into the buffer at bp, at most bufsize - offset bytes.
         * MSG_TRUNC makes recvfrom() return the real packet length even
         * when it exceeds the buffer.  The call is retried when it is
         * interrupted by a signal (EINTR).
         */
        packet_len = recvfrom(handle->fd, bp + offset,
            handle->bufsize - offset, MSG_TRUNC,
            (struct sockaddr *) &from, &fromlen);
    } while (packet_len == -1 && errno == EINTR);

...
    /* The captured length is the real length clamped to the snapshot. */
    caplen = packet_len;
    if (caplen > handle->snapshot)
        caplen = handle->snapshot;

    /* Fill in the per-packet header handed to the callback. */
    pcap_header.caplen  = caplen;
    pcap_header.len     = packet_len;

    handlep->packets_read++;

    /* Deliver the packet data to the user's callback. */
    callback(userdata, &pcap_header, bp);

    return 1;
}
 
Linux内核对recvfrom 的响应:

net/socket.c

/*
 * recvfrom(2) system call entry point.
 *
 * Wraps the user buffer in a single-element iovec inside a msghdr, calls
 * sock_recvmsg(), and on success copies the source address back to the
 * caller's `addr`/`addr_len`.
 *
 * The user-space call this answers is:
 *     packet_len = recvfrom(handle->fd, bp + offset,
 *                           handle->bufsize - offset, MSG_TRUNC,
 *                           (struct sockaddr *) &from, &fromlen);
 *
 * Fixes relative to the original excerpt: the socket lookup that assigns
 * `sock`, the iov_len assignment and the sock_recvmsg() call were missing,
 * an unterminated comment swallowed the rest of the function, and the
 * function tail (fput_light / out label / return) was elided.
 */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
        unsigned, flags, struct sockaddr __user *, addr,
        int __user *, addr_len)
{
    struct socket *sock;
    struct iovec iov;
    struct msghdr msg;
    struct sockaddr_storage address;
    int err, err2;
    int fput_needed;

    if (size > INT_MAX)
        size = INT_MAX;
    /* Translate the file descriptor into its struct socket. */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto out;

    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_iovlen = 1;
    /* iov describes the user buffer; it travels down the call chain as
     * part of msg. */
    msg.msg_iov = &iov;
    iov.iov_len = size;
    iov.iov_base = ubuf;
    /* msg_name points at `address`, so whoever fills msg_name further down
     * is actually filling `address`. */
    msg.msg_name = (struct sockaddr *)&address;
    msg.msg_namelen = sizeof(address);
    if (sock->file->f_flags & O_NONBLOCK)
        flags |= MSG_DONTWAIT;
    err = sock_recvmsg(sock, &msg, size, flags);

    /* Copy the source address (recvfrom()'s from/fromlen) back to user
     * space; `address` has been filled in by this point. */
    if (err >= 0 && addr != NULL) {
        err2 = move_addr_to_user((struct sockaddr *)&address,
                     msg.msg_namelen, addr, addr_len);
        if (err2 < 0)
            err = err2;
    }

    fput_light(sock->file, fput_needed);
out:
    return err;
}

sock_recvmsg()会(经由__sock_recvmsg)调用sock里的函数指针集合ops里的recvmsg,这个函数指针在不同的模块下有不同的实现函数:

int sock_recvmsg(struct socket *sock, struct msghdr *msg,
...
}   
    
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
                 struct msghdr *msg, size_t size, int flags)
    int err;
    struct sock_iocb *si = kiocb_to_siocb(iocb);
          
    si->sock = sock;
    si->scm = NULL;
    si->msg = msg;
    si->size = size;
    si->flags = flags;
          
    err = security_socket_recvmsg(sock, msg, size, flags);
    if (err)
        return err;
          
    return sock->ops->recvmsg(iocb, sock, msg, size, flags);
} 

由于activate_new()里面建立了 PF_PACKET协议的socket, 所以,linux会调用建立PF_PACKET的底层模块af_packet来响应recvmsg。 在linux启动阶段,af_packet模块初始化完成后,会填充ops->recvmsg等函数指针,对上层/net/sock完成接口对接。

net/packet/af_packet.c

/*
 * Module init for af_packet: registers the PF_PACKET socket family, the
 * per-network-namespace operations and the netdevice notifier.
 *
 * Fix relative to the original excerpt: the opening brace was missing and
 * the int function returned nothing.
 * NOTE(review): the proto_register() step and return path are restored
 * from the 2.6.3x kernel sources; confirm against the kernel version being
 * discussed.
 */
static int __init packet_init(void)
{
    int rc = proto_register(&packet_proto, 0);

    if (rc != 0)
        goto out;

    sock_register(&packet_family_ops);
    register_pernet_subsys(&packet_net_ops);
    register_netdevice_notifier(&packet_netdev_notifier);
out:
    return rc;
}

/* Protocol-family descriptor registered by packet_init() through
 * sock_register(); its create hook is packet_create. */
static struct net_proto_family packet_family_ops = {
	// PF_PACKET is the same as AF_PACKET; its numeric value is 17
    .family =   PF_PACKET,
    .create =   packet_create,
    .owner  =   THIS_MODULE,
};

static int packet_create(struct net *net, struct socket *sock, int protocol)
    sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
    if (sk == NULL)
        goto out;
              
    //为socket的ops指针集合填充实现函数。完成接口对接。
    sock->ops = &packet_ops; 
...           
    return 0;
}              

在packet_ops里有对struct sock的函数指针recvmsg填充实现函数packet_recvmsg

/*
 * Socket operation table for PF_PACKET sockets; packet_create() installs
 * it as sock->ops.  Operations that the table does not implement itself
 * are routed to the generic sock_no_* handlers.
 */
static const struct proto_ops packet_ops = {
    .family =   PF_PACKET,
    .owner =    THIS_MODULE,
    .release =  packet_release,
    .bind =     packet_bind,
    .connect =  sock_no_connect,
    .socketpair =   sock_no_socketpair,
    .accept =   sock_no_accept,
    .getname =  packet_getname,
    .poll =     packet_poll,
    .ioctl =    packet_ioctl,
    .listen =   sock_no_listen,
    .shutdown = sock_no_shutdown,
    .setsockopt =   packet_setsockopt,
    .getsockopt =   packet_getsockopt,
    .sendmsg =  packet_sendmsg,
    /* The receive path reached from __sock_recvmsg() via
     * sock->ops->recvmsg */
    .recvmsg =  packet_recvmsg,
    .mmap =     packet_mmap,
    .sendpage = sock_no_sendpage,
};  

packet_recvmsg 封装了接收报文并将数据拷贝到用户层的全部动作:

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
              struct msghdr *msg, size_t len, int flags)
{ 
    struct sock *sk = sock->sk;
    struct sk_buff *skb;
    int copied, err;
    struct sockaddr_ll *sll;
   
...
  //第一步,从skb接收队列里取得数据交给skb缓存
    skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
    if (skb == NULL)        
        goto out;           
...
    copied = skb->len;
    if (copied > len) {
        copied = len;
    }       

   //第二步, 将获取到的数据skb拷贝到iov里,即完成数据对用户层的传递
    err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
    if (err)    
        goto out_free;
                
    sock_recv_timestamp(msg, sk, skb);
            
/*将skb里的cb拷贝给msg->msg_name, 这样在net/socket.c的
move_addr_to_user((struct sockaddr *)&address,
就可以将此msg_name 传给用户层 。*/    
    if (msg->msg_name)
        memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                
    if (pkt_sk(sk)->auxdata) {
        struct tpacket_auxdata aux;
                
        aux.tp_status = TP_STATUS_USER;
        if (skb->ip_summed == CHECKSUM_PARTIAL)
            aux.tp_status |= TP_STATUS_CSUMNOTREADY;
        aux.tp_len = PACKET_SKB_CB(skb)->origlen;
        aux.tp_snaplen = skb->len;
        aux.tp_mac = 0;
        aux.tp_net = skb_network_offset(skb);
        aux.tp_vlan_tci = skb->vlan_tci;
        put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
    err = (flags&MSG_TRUNC) ? skb->len : copied;
... 
    return err;
}

net/core/datagram.c

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
                  int noblock, int *err)
    int peeked;
    return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),                                                                
                   &peeked, err);
}                     
      
__skb_recv_datagram的作用就是接收一个数据报缓存的数据结构,本文的分析就到__skb_recv_datagram从sk->sk_receive_queue 中取得skb结构数据为止,至于这个接收队列是由谁建立的,发送端在哪里,后续介绍。       
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
                    int *peeked, int *err)
    struct sk_buff *skb;
    long timeo;
    int error = sock_error(sk);
    
    if (error)
        goto no_packet;
    

    do {
        unsigned long cpu_flags;
        //保证进程动作唯一,上spin锁   
        spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);

         //查看skb的*next指针时候有值,即是否有报文来到,有的话返回指针,没有返回NULL
        skb = skb_peek(&sk->sk_receive_queue);
        if (skb) {
            *peeked = skb->peeked;
            if (flags & MSG_PEEK) {
                skb->peeked = 1;
            } else
//如果不是MSG_PEEK(查看动作)的话,那么在sk的接收队列中后移skb,即操作新的skb
                __skb_unlink(skb, &sk->sk_receive_queue);
        }
       //解spin锁
        spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
       //有数据的话返回数据的缓存
        if (skb)
            return skb;
   
/*如果peek时没有数据到到,在阻塞情况下,等待一定时间,当达到超时时间还没有接收到数据,向err传送错误类型报告,退出本函数; 在非阻塞情况下,timeo为0,直接报错后退出*/
        error = -EAGAIN;
        if (!timeo)
            goto no_packet;
     //按照timeo的数值阻塞本进程,在timeo时间内持续执行do...while
    } while (!wait_for_packet(sk, err, &timeo));
   
    return NULL;
   
no_packet:
    *err = error;
    return NULL;
}


接收到skb后,调用skb_copy_datagram_iovec 将其拷贝到msg的iov里

/* One user-space buffer segment. */
struct iovec{  
    void __user *iov_base;  		// start address of the buffer
    __kernel_size_t iov_len; 		// number of bytes available in the buffer
}; 
 
/*
 * Copy `len` bytes of `skb`, starting at `offset`, into the iovec `to`
 * (excerpt; "..." marks the part omitted by the article).
 *
 * Returns 0 once all requested bytes have been copied, -EFAULT otherwise.
 *
 * Fixes relative to the original excerpt: "start – offset" used an en
 * dash instead of the minus operator, the header of the fragment loop was
 * missing although its closing brace was present, and the recursive call
 * was truncated to "skb_copy_datagram_iovec(frag_iter,t,".
 * NOTE(review): the skb_walk_frags() loop header and the recursive call
 * arguments are restored from the 2.6.3x kernel sources; confirm against
 * the kernel version being discussed.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
                struct iovec *to, int len)
{
    /* Length of the skb's head data area, as returned by skb_headlen(). */
    int start = skb_headlen(skb);
    int i, copy = start - offset;
    struct sk_buff *frag_iter;

    trace_skb_copy_datagram_iovec(skb, len);

    /* Copy from the head area first. */
    if (copy > 0) {
        if (copy > len)
            copy = len;
        /* memcpy_toiovec() moves the bytes into the iovec, i.e. into the
         * user buffer. */
        if (memcpy_toiovec(to, skb->data + offset, copy))
            goto fault;
        if ((len -= copy) == 0)
            return 0;
        offset += copy;
    }
...
    /* Then walk the chained fragment skbs. */
    skb_walk_frags(skb, frag_iter) {
        int end;

        WARN_ON(start > offset + len);

        end = start + frag_iter->len;
        if ((copy = end - offset) > 0) {
            if (copy > len)
                copy = len;
            /* Recurse into the fragment; offset - start is the position
             * inside this fragment at which copying starts. */
            if (skb_copy_datagram_iovec(frag_iter,
                            offset - start,
                            to, copy))
                goto fault;
            if ((len -= copy) == 0)
                return 0;
            offset += copy;
        }
        start = end;
    }
    if (!len)
        return 0;
fault:
    return -EFAULT;
}

int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len){       
    while (len > 0) { 
        if (iov->iov_len) {
/*如果iov的iov_len大于len, 说明iov的缓存区还可以接受数据,那么设置本次拷贝大小为len
  		//将kdata拷贝到iov的base地址,长度为len,即将数据拷贝到用户层
            if (copy_to_user(iov->iov_base, kdata, copy))
                return -EFAULT;
		//每次拷贝后,kdata地址后移copy长度
            kdata += copy;
            len -= copy;
		//每次拷贝后, 将iov_len减去已经使用的长度
            iov->iov_len -= copy;
  		//每次拷贝后,移动iov的base地址
            iov->iov_base += copy;
        } 
        iov++;              
    }
    return 0;
}


总结:

pcap_open_live 调用pcap_create()创建pcap_t并指定激活函数pcap_activate_linux;随后pcap_activate()调用该激活函数,为pcap_t填充read_op等函数指针,并建立socket与linux底层模块af_packet通信。 

pcap_loop 调用了read_op的实现函数 pcap_read_linux, pcap_read_linux 里面使用了recvfrom 获取以太网原始数据,linux的af_packet模块会响应并完成recvfrom动作;recvfrom完成后调用callback指向的函数处理这些数据,callback指针的赋值是在tcpdump里根据具体链路层环境赋值的。

欢迎大家交流,不足之处请不吝指正,给予批评!

抱歉!评论已关闭.