• tcpdump实现和run_filter


    一、tcpdump

    对于本机中进程的系统行为调用跟踪,strace是一个很好的工具,而在网络问题的调试中,tcpdump应该说是一个必不可少的工具,和大部分linux下优秀工具一样,它的特点就是简单而强大。
    默认情况下,tcpdump不会抓取本机内部通讯的报文。根据网络协议栈的规定,对于报文,即使是目的地是本机,也需要经过本机的网络协议层,所以本机通讯肯定是通过API进入了内核,并且完成了路由选择。
    二、linux下抓包原理
    linux下的抓包是通过注册一种虚拟的底层网络协议来获得对网络报文(准确的说是网络设备消息)的处理权。当网卡接收到一个网络报文之后,内核会遍历系统中所有已经注册的网络协议(例如以太网协议、x25协议处理模块)来尝试进行报文的解析处理,这一点和一些文件系统的挂载相似,就是让系统中所有已经注册的文件系统来尝试挂载,如果哪一个认为自己可以处理,那么就完成挂载。
    当抓包模块把自己伪装成一个网络协议的时候,系统在收到报文的时候就会给这个伪协议一次机会,让它来对网卡收到的报文进行一次处理,此时该模块就会趁机对报文进行窥探,也就是把这个报文完完整整的复制一份,假装是自己接收到的报文,汇报给抓包模块。
    先看一下网络层对于接收到的报文的处理方法
    static int process_backlog(struct net_device *backlog_dev, int *budget)---netif_receive_skb
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
            if (!ptype->dev || ptype->dev == skb->dev) {
                if (pt_prev)
                    ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
            }
        }
    三、协议族的注册
    对于这种协议,也只有在需要的时候才注册,因为它毕竟降低了系统报文的处理速度并且会消耗大量的系统skb。当抓包开始的时候,它会创建一个对应的网络套接口,这种套接口的类型就是af_packet类型。相关实现为
    linux-2.6.21/net/packet/af_packet.c
    static int packet_create(struct socket *sock, int protocol)
        sk->sk_family = PF_PACKET;
        po->num = proto;
    ……
        po->prot_hook.func = packet_rcv;
    ……

        if (proto) {
            po->prot_hook.type = proto;
            dev_add_pack(&po->prot_hook);这个接口会将prot_hook注册到前面看到的ptype_all队列中
            sock_hold(sk);
            po->running = 1;
        }
    当一个网卡上真正有报文到来的时候,它就会调用这里注册的packet_rcv函数(下面贴出的是较新内核中套接口使用接收环形缓冲区rx_ring时对应的实现tpacket_rcv)
     
    static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                   struct packet_type *pt, struct net_device *orig_dev)
    {
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union tpacket_uhdr h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timespec ts;
        __u32 ts_status;
        bool is_drop_n_account = false;
        bool do_vnet = false;
    
        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
         * We may add members to them until current aligned size without forcing
         * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
         */
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
    
        if (skb->pkt_type == PACKET_LOOPBACK)
            goto drop;
    
        sk = pt->af_packet_priv;
        po = pkt_sk(sk);
    
        if (!net_eq(dev_net(dev), sock_net(sk)))
            goto drop;
    
        if (dev->header_ops) {
            if (sk->sk_type != SOCK_DGRAM)
                skb_push(skb, skb->data - skb_mac_header(skb));
            else if (skb->pkt_type == PACKET_OUTGOING) {
                /* Special case: outgoing packets have ll header at head */
                skb_pull(skb, skb_network_offset(skb));
            }
        }
    
        snaplen = skb->len;
    
        res = run_filter(skb, sk, snaplen);
        if (!res)
            goto drop_n_restore;
    
        /* If we are flooded, just give up */
        if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
            atomic_inc(&po->tp_drops);
            goto drop_n_restore;
        }
    
        if (skb->ip_summed == CHECKSUM_PARTIAL)
            status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
             (skb->ip_summed == CHECKSUM_COMPLETE ||
              skb_csum_unnecessary(skb)))
            status |= TP_STATUS_CSUM_VALID;
    
        if (snaplen > res)
            snaplen = res;
    
        if (sk->sk_type == SOCK_DGRAM) {
            macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                      po->tp_reserve;
        } else {
            unsigned int maclen = skb_network_offset(skb);
            netoff = TPACKET_ALIGN(po->tp_hdrlen +
                           (maclen < 16 ? 16 : maclen)) +
                           po->tp_reserve;
            if (po->has_vnet_hdr) {
                netoff += sizeof(struct virtio_net_hdr);
                do_vnet = true;
            }
            macoff = netoff - maclen;
        }
        if (po->tp_version <= TPACKET_V2) {
            if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                    if (skb_shared(skb)) {
                        copy_skb = skb_clone(skb, GFP_ATOMIC);
                    } else {
                        copy_skb = skb_get(skb);
                        skb_head = skb->data;
                    }
                    if (copy_skb)
                        skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0) {
                    snaplen = 0;
                    do_vnet = false;
                }
            }
        } else if (unlikely(macoff + snaplen >
                    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
            u32 nval;
    
            nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
            pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u
    ",
                    snaplen, nval, macoff);
            snaplen = nval;
            if (unlikely((int)snaplen < 0)) {
                snaplen = 0;
                macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
                do_vnet = false;
            }
        }
        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_rx_frame(po, skb,
                        TP_STATUS_KERNEL, (macoff+snaplen));
        if (!h.raw)
            goto drop_n_account;
        if (po->tp_version <= TPACKET_V2) {
            packet_increment_rx_head(po, &po->rx_ring);
        /*
         * LOSING will be reported till you read the stats,
         * because it's COR - Clear On Read.
         * Anyways, moving it for V1/V2 only as V3 doesn't need this
         * at packet level.
         */
            if (atomic_read(&po->tp_drops))
                status |= TP_STATUS_LOSING;
        }
    
        if (do_vnet &&
            virtio_net_hdr_from_skb(skb, h.raw + macoff -
                        sizeof(struct virtio_net_hdr),
                        vio_le(), true, 0))
            goto drop_n_account;
    
        po->stats.stats1.tp_packets++;
        if (copy_skb) {
            status |= TP_STATUS_COPY;
            __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        spin_unlock(&sk->sk_receive_queue.lock);
    
        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
    
        if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
            getnstimeofday(&ts);
    
        status |= ts_status;
    
        switch (po->tp_version) {
        case TPACKET_V1:
            h.h1->tp_len = skb->len;
            h.h1->tp_snaplen = snaplen;
            h.h1->tp_mac = macoff;
            h.h1->tp_net = netoff;
            h.h1->tp_sec = ts.tv_sec;
            h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
            hdrlen = sizeof(*h.h1);
            break;
        case TPACKET_V2:
            h.h2->tp_len = skb->len;
            h.h2->tp_snaplen = snaplen;
            h.h2->tp_mac = macoff;
            h.h2->tp_net = netoff;
            h.h2->tp_sec = ts.tv_sec;
            h.h2->tp_nsec = ts.tv_nsec;
            if (skb_vlan_tag_present(skb)) {
                h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
                h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
                status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
            } else {
                h.h2->tp_vlan_tci = 0;
                h.h2->tp_vlan_tpid = 0;
            }
            memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
            hdrlen = sizeof(*h.h2);
            break;
        case TPACKET_V3:
            /* tp_nxt_offset,vlan are already populated above.
             * So DONT clear those fields here
             */
            h.h3->tp_status |= status;
            h.h3->tp_len = skb->len;
            h.h3->tp_snaplen = snaplen;
            h.h3->tp_mac = macoff;
            h.h3->tp_net = netoff;
            h.h3->tp_sec  = ts.tv_sec;
            h.h3->tp_nsec = ts.tv_nsec;
            memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
            hdrlen = sizeof(*h.h3);
            break;
        default:
            BUG();
        }
    
        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
            sll->sll_ifindex = orig_dev->ifindex;
        else
            sll->sll_ifindex = dev->ifindex;
    
        smp_mb();
    
    #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        if (po->tp_version <= TPACKET_V2) {
            u8 *start, *end;
    
            end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
                        macoff + snaplen);
    
            for (start = h.raw; start < end; start += PAGE_SIZE)
                flush_dcache_page(pgv_to_page(start));
        }
        smp_wmb();
    #endif
    
        if (po->tp_version <= TPACKET_V2) {
            __packet_set_status(po, h.raw, status);
            sk->sk_data_ready(sk);
        } else {
            prb_clear_blk_fill_status(&po->rx_ring);
        }
    
    drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
            skb->data = skb_head;
            skb->len = skb_len;
        }
    drop:
        if (!is_drop_n_account)
            consume_skb(skb);
        else
            kfree_skb(skb);
        return 0;
    
    drop_n_account:
        spin_unlock(&sk->sk_receive_queue.lock);
        atomic_inc(&po->tp_drops);
        is_drop_n_account = true;
    
        sk->sk_data_ready(sk);
        kfree_skb(copy_skb);
        goto drop_n_restore;
    }
    /*
     * run_filter - run the socket's attached filter program, if any, on a packet.
     *
     * @skb: packet handed to the filter program
     * @sk:  socket whose sk_filter is consulted
     * @res: value returned unchanged when no filter is attached
     *
     * The filter pointer is read under rcu_read_lock(). When sk->sk_filter is
     * non-NULL the return value is whatever bpf_prog_run_clear_cb() yields for
     * the filter's program; otherwise @res is returned as passed in.
     */
    static unsigned int run_filter(struct sk_buff *skb,
                       const struct sock *sk,
                       unsigned int res)
    {
        struct sk_filter *attached;
        unsigned int verdict = res;

        rcu_read_lock();
        attached = rcu_dereference(sk->sk_filter);
        if (attached)
            verdict = bpf_prog_run_clear_cb(attached->prog, skb);
        rcu_read_unlock();

        return verdict;
    }

    ……
        res = run_filter(skb, sk, snaplen);如果filter的返回值为零,说明是抓包不关心的报文,直接放行;返回值非零表示需要抓取,其值同时是允许截取的报文长度上限。
        if (!res)
            goto drop_n_restore;
    ……
        if (skb_shared(skb)) {
            struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);自己复制一份。
            if (nskb == NULL)
                goto drop_n_acct;

            if (skb_head != skb->data) {
                skb->data = skb_head;
                skb->len = skb_len;
            }
            kfree_skb(skb);
            skb = nskb;
        }
    四、filter的执行
    run_filter--->>sk_run_filter
    ……

        for (pc = 0; pc < flen; pc++) {
            fentry = &filter[pc];

            switch (fentry->code) {
            case BPF_ALU|BPF_ADD|BPF_X:
                A += X;
                continue;
            case BPF_ALU|BPF_ADD|BPF_K:
                A += fentry->k;
                continue;
    ……
           
            switch (k-SKF_AD_OFF) {
            case SKF_AD_PROTOCOL:
                A = ntohs(skb->protocol);
                continue;
            case SKF_AD_PKTTYPE:
                A = skb->pkt_type;
                continue;
            case SKF_AD_IFINDEX:
                A = skb->dev->ifindex;
                continue;
            default:
                return 0;
            }
    这个函数是执行了一个自己定义的指令集。用户通过setsockopt(SO_ATTACH_FILTER)来注册这段指令,内核在内核态执行这些指令,完成匹配,其中包含了报文某些字段的加载、条件跳转、加减乘除以及返回等指令。当抓包套接口接收到报文之后,对这个报文执行这段虚拟程序,直到遇到ret指令,并以ret的操作数作为返回值。通过tcpdump -d 可以显示出编译之后生成的指令,下面是一个测试输出
    [root@Harry bash-4.1]# tcpdump -d host 1.2.3.4
    tcpdump: WARNING: eth0: no IPv4 address assigned
    (000) ldh      [12]
    (001) jeq      #0x800           jt 2    jf 6
    (002) ld       [26]  加载报文中从偏移26字节处开始的4个字节(即IPv4首部中的源地址)。
    (003) jeq      #0x1020304       jt 12    jf 4 如果和0x01020304(即1.2.3.4)相等,跳转到第12条指令,不等则继续执行第4条指令。
    (004) ld       [30]
    (005) jeq      #0x1020304       jt 12    jf 13
    (006) jeq      #0x806           jt 8    jf 7
    (007) jeq      #0x8035          jt 8    jf 13
    (008) ld       [28]
    (009) jeq      #0x1020304       jt 12    jf 10
    (010) ld       [38]
    (011) jeq      #0x1020304       jt 12    jf 13
    (012) ret      #65535
    (013) ret      #0
     
     
  • 相关阅读:
    BZOJ3197:[SDOI2013]刺客信条——题解
    C 程序与 C++ 程序之间的相互调用
    使用Dev C++调试(debug)程序
    ARM 汇编指令 ADR 与 LDR 使用
    华为交换机以 LACP 模式实现链路聚合
    DLCI 简介
    华为路由器帧中继 FR 实验
    GVRP 的工作机制和工作模式
    华为路由器 HDLC 实验
    华为路由器 IPSec 与 GRE 结合实验
  • 原文地址:https://www.cnblogs.com/dream397/p/12222837.html
Copyright © 2020-2023  润新知