• TSO-GSO reading


    对 TCP,在网卡不支持 TSO 时,使用和不使用 GSO 的情形

     

    TSO  :

    分析:IP 层发包时,如果是 GSO 报文,会调用 ip_finish_output_gso 来处理:

    /*
     * Final dispatch step of the IPv4 output path.
     *
     * Chooses one of four ways to continue with @skb:
     *   - re-run dst output when the route carries an xfrm state,
     *   - GSO handling for GSO packets,
     *   - IP fragmentation when the packet is longer than the dst MTU,
     *   - plain transmission through ip_finish_output2() otherwise.
     *
     * Returns whatever the selected handler returns.
     */
    static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
    {
    #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /*
         * Policy lookup after SNAT yielded a new policy: flag the skb as
         * rerouted (the flag is consulted by later GSO handling, per the
         * article author) and send it through dst output again.
         */
        if (skb_dst(skb)->xfrm) {
            IPCB(skb)->flags |= IPSKB_REROUTED;
            return dst_output_sk(sk, skb);
        }
    #endif
        if (skb_is_gso(skb))
            return ip_finish_output_gso(sk, skb);    /* GSO packet */
        else if (skb->len <= ip_skb_dst_mtu(skb))
            return ip_finish_output2(sk, skb);    /* fits: send as is */
        else
            /* Non-GSO packet larger than the dst MTU: fragment it. */
            return ip_fragment(sk, skb, ip_finish_output2);
    }
    /*
     * Output a GSO skb.
     *
     * Fast path: the skb was not forwarded, or its per-segment network length
     * already fits the dst MTU -> hand it to ip_finish_output2() unchanged.
     *
     * Slow path: segment in software and push every resulting segment through
     * ip_fragment().  Returns 0 on success, -ENOMEM if segmentation produced
     * nothing usable, otherwise the first error reported by ip_fragment().
     */
    static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
    {
        struct sk_buff *seg, *following;
        int first_err = 0;

        /* common case: locally created skb or seglen is <= mtu */
        if (!(IPCB(skb)->flags & IPSKB_FORWARDED))
            return ip_finish_output2(sk, skb);
        if (skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
            return ip_finish_output2(sk, skb);

        /* Slowpath -  GSO segment length is exceeding the dst MTU.
         *
         * This can happen in two cases:
         * 1) TCP GRO packet, DF bit not set
         * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
         * from host network stack.
         *
         * The device's offload features are fetched and every GSO bit is
         * masked out before asking for segmentation.
         */
        seg = skb_gso_segment(skb, netif_skb_features(skb) & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(seg)) {
            kfree_skb(skb);
            return -ENOMEM;
        }

        /* The original super-packet is no longer needed. */
        consume_skb(skb);

        /*
         * Detach each segment from the list before sending it, and keep
         * going after a failure so that every segment is handed off; only
         * the first error is remembered.
         */
        for (; seg; seg = following) {
            int err;

            following = seg->next;
            seg->next = NULL;
            err = ip_fragment(sk, seg, ip_finish_output2);
            if (err && !first_err)
                first_err = err;
        }

        return first_err;
    }

    看下 gso 的处理方式:入口函数skb_gso_segment

    这个函数将skb分片,并返回一个skb list。如果skb不需要分片则返回NULL。

    /**
     *    __skb_gso_segment - Perform segmentation on skb.
     *    @skb: buffer to segment
     *    @features: features for the output path (see dev->features)
     *    @tx_path: whether it is called in TX path
     *
     *    Segments @skb and returns the resulting list of segments.
     *
     *    NULL may be returned when the skb requires no segmentation; this is
     *    only possible when GSO is used for verifying header integrity.
     *    An ERR_PTR() is returned when the header could not be made writable
     *    or when the lower-level handler fails.
     *
     *    Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
     */
    struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                      netdev_features_t features, bool tx_path)
    {
        struct sk_buff *result;

        if (unlikely(skb_needs_check(skb, tx_path))) {
            /*
             * We're going to init the ->check field in the TCP or UDP
             * header, so the header must be privately writable.  Per the
             * skb_cow_head() description quoted by the article: data is
             * reallocated if headroom is insufficient or the data part is
             * shared; on failure an error is returned and the skb is left
             * unchanged.
             */
            int rc = skb_cow_head(skb, 0);

            if (rc < 0)
                return ERR_PTR(rc);
        }

        /* Only report GSO partial support if it will enable us to
         * support segmentation on this frame without needing additional
         * work.
         */
        if (features & NETIF_F_GSO_PARTIAL) {
            struct net_device *netdev = skb->dev;
            netdev_features_t wanted = features | NETIF_F_GSO_ROBUST |
                (netdev->features & netdev->gso_partial_features);

            if (!skb_gso_ok(skb, wanted))
                features &= ~NETIF_F_GSO_PARTIAL;
        }

        /* The GSO control block must fit inside skb->cb. */
        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
                 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

        /*
         * Record the current headroom as mac_offset and start with
         * encap_level 0, i.e. we are at the outermost header.  The article
         * notes mac_offset is later used by skb_segment when copying outer
         * headers into each segment.
         */
        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
        SKB_GSO_CB(skb)->encap_level = 0;

        /* Re-anchor the MAC header and MAC length at the current data. */
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        result = skb_mac_gso_segment(skb, features);

        /* Still needing a check after a non-error result is a bad offload. */
        if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(result)))
            skb_warn_bad_offload(skb);

        return result;
    }
    /**
     *    skb_mac_gso_segment - mac layer segmentation handler.
     *    @skb: buffer to segment
     *    @features: features for the output path (see dev->features)
     *
     *    Looks up the packet_offload registered for the skb's network
     *    protocol and delegates segmentation to its gso_segment callback.
     *    Returns the callback's result, ERR_PTR(-EINVAL) when the protocol
     *    cannot be determined, or ERR_PTR(-EPROTONOSUPPORT) when no matching
     *    handler is registered.
     */
    struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                        netdev_features_t features)
    {
        struct sk_buff *result;
        struct packet_offload *handler;
        int l2_len;
        __be16 proto;

        /* Start from mac_len (set by the caller's skb_reset_mac_len()). */
        l2_len = skb->mac_len;
        proto = skb_network_protocol(skb, &l2_len);
        if (unlikely(!proto))
            return ERR_PTR(-EINVAL);

        result = ERR_PTR(-EPROTONOSUPPORT);

        /* Step over the link-layer header so data points at the L3 header. */
        __skb_pull(skb, l2_len);

        rcu_read_lock();
        list_for_each_entry_rcu(handler, &offload_base, list) {
            if (handler->type != proto)
                continue;
            if (!handler->callbacks.gso_segment)
                continue;

            /* First matching handler wins (e.g. the IP layer's GSO hook). */
            result = handler->callbacks.gso_segment(skb, features);
            break;
        }
        rcu_read_unlock();

        /* Restore the data pointer back to the MAC header. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return result;
    }

      需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum

    /*
     * IPv4 GSO segmentation handler.
     *
     * Validates the IPv4 header, pulls it, delegates the real segmentation to
     * the transport protocol's gso_segment callback (looked up in
     * inet_offloads[]), then walks the returned segment list and rewrites each
     * segment's IP header: id, frag_off (UDP fragmentation case only), tot_len
     * and header checksum.
     *
     * Returns the segment list, NULL/ERR_PTR from the transport callback, or
     * ERR_PTR(-EINVAL) / ERR_PTR(-EPROTONOSUPPORT) on validation failure.
     */
    static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                        netdev_features_t features)
    {
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        const struct net_offload *ops;
        unsigned int offset = 0;
        bool udpfrag, encap;
        struct iphdr *iph;
        int proto;
        int nhoff;
        int ihl;
        int id;
     /* Reject the skb if gso_type carries any bit outside the set handled here. */
        if (unlikely(skb_shinfo(skb)->gso_type &
                 ~(SKB_GSO_TCPV4 |
                   SKB_GSO_UDP |
                   SKB_GSO_DODGY |
                   SKB_GSO_TCP_ECN |
                   SKB_GSO_GRE |
                   SKB_GSO_GRE_CSUM |
                   SKB_GSO_IPIP |
                   SKB_GSO_SIT |
                   SKB_GSO_TCPV6 |
                   SKB_GSO_UDP_TUNNEL |
                   SKB_GSO_UDP_TUNNEL_CSUM |
                   SKB_GSO_TUNNEL_REMCSUM |
                   0)))
            goto out;
     
        skb_reset_network_header(skb);
        nhoff = skb_network_header(skb) - skb_mac_header(skb);    /* offset of the IP header relative to the MAC header */
        if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))    /* need at least a minimal IP header in the linear area */
            goto out;
     
        iph = ip_hdr(skb);
        /* Validate the header-length field. */
        ihl = iph->ihl * 4;        /* real IP header length in bytes; locates the L4 header */
        if (ihl < sizeof(*iph))
            goto out;
     
        id = ntohs(iph->id);    /* IP id of the original packet (host order) */
        proto = iph->protocol;        /* L4 protocol number, used below to pick the transport offload (TCP, UDP, ...) */
     
        /* Warning: after this point, iph might be no longer valid */
        /* Re-check, this time against the full header length taken from ihl. */
        if (unlikely(!pskb_may_pull(skb, ihl)))    /* can data be advanced to the L4 header? */
            goto out;
        __skb_pull(skb, ihl);        /* data now points at the transport header */
     
        encap = SKB_GSO_CB(skb)->encap_level > 0;
        if (encap)
            features &= skb->dev->hw_enc_features;        /* when encapsulated, restrict features to hw_enc_features */
        SKB_GSO_CB(skb)->encap_level += ihl;    /* accumulate this header's length; a nested call will then see encap_level > 0 */
     
        skb_reset_transport_header(skb);    /* transport header = current data */
     
        segs = ERR_PTR(-EPROTONOSUPPORT);
     
        if (skb->encapsulation &&
            skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
            udpfrag = proto == IPPROTO_UDP && encap;
        else
            udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;        /* NOTE(review): per the article, VXLAN-encapsulated packets take this branch and end up with udpfrag == false — confirm */
     
        ops = rcu_dereference(inet_offloads[proto]);    /* transport-layer offload ops for this protocol */
        if (likely(ops && ops->callbacks.gso_segment))
            segs = ops->callbacks.gso_segment(skb, features);    /* transport (e.g. UDP or TCP) segmentation */
     
        if (IS_ERR_OR_NULL(segs))
            goto out;
     
        skb = segs;    /* from here on, walk the freshly created segments */
        do {
            iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);    /* this segment's IP header: MAC header + saved offset */
            if (udpfrag) {                /* segments are emitted as IP fragments */
                iph->id = htons(id);
                iph->frag_off = htons(offset >> 3);    /* fragment offset in 8-byte units */
                if (skb->next)
                    iph->frag_off |= htons(IP_MF);    /* more segments follow: set More Fragments */
                offset += skb->len - nhoff - ihl;    /* advance by this segment's payload, for the next one */
            } else {
                iph->id = htons(id++);        /* each segment is a complete IP packet with its own, incrementing id */
            }
            iph->tot_len = htons(skb->len - nhoff);
            ip_send_check(iph);                /* recompute the IP header checksum */
            if (encap)        /* encap_level was already > 0 on entry, so also reset the inner header offsets */
                skb_reset_inner_headers(skb);
            skb->network_header = (u8 *)iph - skb->head;    /* point network_header at this IP header */
        } while ((skb = skb->next));
     
    out:
        return segs;
    }

      UDP 经过 GSO 分片后,每个分片的 IP 头部 id 是一样的,这符合 IP 分片的逻辑;但是为什么 TCP 的 GSO 分片,IP 头部的 id 会依次加 1 呢?原因是:TCP 在三次握手建立连接的过程中会协商出合适的 mss(具体的处理机制参见《TCP/IP详解》P257),这个 mss 肯定 <= 网络层的路径 MTU;之后 TCP 数据封装成 IP 数据包通过网络层发送,对端传输层接收到 TCP 数据之后再进行 TCP 重组。所以正常情况下,TCP 产生的 IP 数据包在传输过程中是不会发生分片的!由于 GSO 应该保证对外透明,其效果应该和在 TCP 层直接分段的效果一样。因此这里对 UDP 的处理是 IP 分片逻辑,而对 TCP 的处理是构造新的 skb(每个 skb 都是独立完整的 IP 报文)的逻辑。

    l  对于GSO

        UDP:所有分片 IP 头部的 id 都相同,除最后一片外都设置 IP_MF 分片标志(等同于 IP 分片)

        TCP:分片后,每个分片 IP 头部中的 id 依次加 1(等同于 TCP 分段)

    /*
     * TCP-over-IPv4 GSO entry point.
     *
     * Makes sure a basic TCP header is present, brings the skb into
     * CHECKSUM_PARTIAL state if it is not already, then delegates to the
     * protocol-independent tcp_gso_segment().
     *
     * Returns ERR_PTR(-EINVAL) when the TCP header cannot be pulled, otherwise
     * the result of tcp_gso_segment().
     */
    static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
                        netdev_features_t features)
    {
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
            return ERR_PTR(-EINVAL);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
            /*
             * Set up checksum pseudo header, usually expect stack to
             * have done this already.  The transport header offset was
             * established by the IP layer before calling us.
             */
            tcp_hdr(skb)->check = 0;
            skb->ip_summed = CHECKSUM_PARTIAL;
            __tcp_v4_send_check(skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
        }

        /* Do the actual TCP segmentation. */
        return tcp_gso_segment(skb, features);
    }
    /*
     * Protocol-independent TCP GSO segmentation.
     *
     * Validates the TCP header, pulls it, splits the payload into gso_size
     * (mss) sized pieces via skb_segment(), and then fixes up every segment's
     * TCP header: sequence number, FIN/PSH/CWR flags and checksum.
     *
     * Returns the list of segments; NULL when the skb passes skb_gso_ok() and
     * only gso_segs is recomputed; ERR_PTR(-EINVAL) on a malformed header,
     * a payload not larger than mss, or an unexpected gso_type; or the error
     * returned by skb_segment().
     */
    struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
                    netdev_features_t features)
    {
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        unsigned int sum_truesize = 0;
        struct tcphdr *th;
        unsigned int thlen;
        unsigned int seq;
        __be32 delta;
        unsigned int oldlen;
        unsigned int mss;
        struct sk_buff *gso_skb = skb;
        __sum16 newcheck;
        bool ooo_okay, copy_destructor;
     
        th = tcp_hdr(skb);
        thlen = th->doff * 4;        /* TCP header length in bytes */
        if (thlen < sizeof(*th))
            goto out;
     
        if (!pskb_may_pull(skb, thlen)) /* re-check using the length taken from the header's doff field */
            goto out;
     /* Save the 16-bit complement of skb->len (taken before the TCP header is pulled) for the checksum deltas below. */
        oldlen = (u16)~skb->len;
        __skb_pull(skb, thlen);        /* data now points at the TCP payload */
     
        mss = tcp_skb_mss(skb);        /* segment payload size */
        if (unlikely(skb->len <= mss))
            goto out;
     
        if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
            /* Packet is from an untrusted source, reset gso_segs. */
            int type = skb_shinfo(skb)->gso_type;
     
            if (unlikely(type &
                     ~(SKB_GSO_TCPV4 |
                       SKB_GSO_DODGY |
                       SKB_GSO_TCP_ECN |
                       SKB_GSO_TCPV6 |
                       SKB_GSO_GRE |
                       SKB_GSO_GRE_CSUM |
                       SKB_GSO_IPIP |
                       SKB_GSO_SIT |
                       SKB_GSO_UDP_TUNNEL |
                       SKB_GSO_UDP_TUNNEL_CSUM |
                       SKB_GSO_TUNNEL_REMCSUM |
                       0) ||
                     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
                goto out; /* gso_type has a bit outside the accepted set, or is not TCP at all */
     
            skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); /* recompute the segment count, then return without segmenting */
     
            segs = NULL;
            goto out;
        }
     
        copy_destructor = gso_skb->destructor == tcp_wfree;
        ooo_okay = gso_skb->ooo_okay;
        /* All segments but the first should have ooo_okay cleared */
        skb->ooo_okay = 0;
     
        segs = skb_segment(skb, features);    /* split the payload into mss-sized segments */
        if (IS_ERR(segs))
            goto out;
     
        /* Only first segment might have ooo_okay set */
        segs->ooo_okay = ooo_okay;
     
        delta = htonl(oldlen + (thlen + mss));    /* oldlen is ~original length, so this is (thlen + mss) minus the original length in one's-complement terms */
     
        skb = segs;
        th = tcp_hdr(skb);    /* NOTE(review): relies on skb_segment having left a valid transport header in each segment — confirm */
        seq = ntohl(th->seq);
     
        if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
            tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
     
        newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +    /* adjust the original checksum by delta instead of recomputing from scratch */
                               (__force u32)delta));
     
        do {    /* fix up the TCP header of every segment except the last */
            th->fin = th->psh = 0;
            th->check = newcheck;
     /* Without CHECKSUM_PARTIAL the full checksum is computed here in software. */
            if (skb->ip_summed != CHECKSUM_PARTIAL)       
                th->check = gso_make_checksum(skb, ~th->check);     /* finalize the checksum */
     
            seq += mss; /* sequence number for the next segment */
            if (copy_destructor) {
                skb->destructor = gso_skb->destructor;
                skb->sk = gso_skb->sk;
                sum_truesize += skb->truesize;
            }
            skb = skb->next;
            th = tcp_hdr(skb);
     
            th->seq = htonl(seq);
            th->cwr = 0;
        } while (skb->next);
     
        /* Following permits TCP Small Queues to work well with GSO :
         * The callback to TCP stack will be called at the time last frag
         * is freed at TX completion, and not right now when gso_skb
         * is freed by GSO engine
         */
        if (copy_destructor) {
            swap(gso_skb->sk, skb->sk);
            swap(gso_skb->destructor, skb->destructor);
            sum_truesize += skb->truesize;
            atomic_add(sum_truesize - gso_skb->truesize,
                   &skb->sk->sk_wmem_alloc);
        }
     
        delta = htonl(oldlen + (skb_tail_pointer(skb) -
                    skb_transport_header(skb)) +    /* the last segment may be shorter, so it gets its own delta */
                  skb->data_len);
        th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
                    (__force u32)delta));
        if (skb->ip_summed != CHECKSUM_PARTIAL)
            th->check = gso_make_checksum(skb, ~th->check);    /* finalize the checksum */
    out:
        return segs;
    }

    skb_segment 是实现封装报文 GSO 分段的基础

    /**
     *    skb_segment - Perform protocol segmentation on skb.
     *    @head_skb: buffer to segment
     *    @features: features for the output path (see dev->features)
     *
     *    This function performs segmentation on the given skb.  It returns
     *    a pointer to the first in a list of new skbs for the segments.
     *    In case of error it returns ERR_PTR(err).
     *
     *    Each segment receives a copy of the headers that precede the payload
     *    and up to gso_size bytes of payload, taken in turn from the linear
     *    area, the page frags and the frag_list of @head_skb.
     */
    struct sk_buff *skb_segment(struct sk_buff *head_skb,
                    netdev_features_t features)
    {
        struct sk_buff *segs = NULL;
        struct sk_buff *tail = NULL;
        struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
        skb_frag_t *frag = skb_shinfo(head_skb)->frags;    
        unsigned int mss = skb_shinfo(head_skb)->gso_size;
        /*
         * doffset: number of header bytes between the MAC header and the
         * current data pointer.  Per the article: MAC + IP + TCP headers for
         * TCP, or MAC + IP only for UDP (whose header was not pulled by the
         * caller).
         */
        unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
        struct sk_buff *frag_skb = head_skb;
        unsigned int offset = doffset;
        /* Tunnel (outer) header length; presumably 0 for non-encapsulated packets — confirm. */
        unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
        unsigned int headroom;
        unsigned int len;
        __be16 proto;
        bool csum;
        int sg = !!(features & NETIF_F_SG);    /* does the output path support scatter-gather? */
        int nfrags = skb_shinfo(head_skb)->nr_frags;
        int err = -ENOMEM;
        int i = 0;
        int pos;
        int dummy;
     
        __skb_push(head_skb, doffset);        /* move data back to the MAC header */
        proto = skb_network_protocol(head_skb, &dummy);    /* network protocol of the packet */
        if (unlikely(!proto))
            return ERR_PTR(-EINVAL);
     
        csum = !head_skb->encap_hdr_csum &&
            !!can_checksum_protocol(features, proto);
     
        headroom = skb_headroom(head_skb);    /* headroom to reproduce in each segment */
        pos = skb_headlen(head_skb);        /* length of the linear area */
     
        do {
            struct sk_buff *nskb;
            skb_frag_t *nskb_frag;
            int hsize;
            int size;
     /*
      * offset is how much of head_skb has been consumed so far; it starts at
      * doffset (the header length) and grows by len after every segment.
      * len is the payload length of the segment being built: the remaining
      * bytes, capped at mss (so only the last segment can be shorter).
      */
            len = head_skb->len - offset;    /* bytes still to be emitted, headers excluded */
            if (len > mss)
                len = mss;        /* never more than one mss per segment */
     /*
      * hsize is how much of this segment's payload still lives in the linear
      * area of head_skb.  Once offset has moved past the linear area it is
      * clamped to 0 and the payload comes from frags / frag_list instead.
      */
            hsize = skb_headlen(head_skb) - offset;    /* linear bytes available from offset */
            if (hsize < 0)
                hsize = 0;    /* linear area is exhausted */
            if (hsize > len || !sg)
                hsize = len;    /* without SG, or when the linear area holds the whole segment, everything goes into the new linear area */
      /*
       * Nothing left in the linear area and all page frags consumed
       * (i >= nfrags): continue with the frag_list.  If the next list skb has
       * linear data of its own, clone it rather than copy it.
       */
            if (!hsize && i >= nfrags && skb_headlen(list_skb) && 
                (skb_headlen(list_skb) == len || sg)) {
                BUG_ON(skb_headlen(list_skb) > len);    /* a frag_list skb's linear area must not exceed len (at most mss) */
     
                i = 0;
                nfrags = skb_shinfo(list_skb)->nr_frags;
                frag = skb_shinfo(list_skb)->frags;
                frag_skb = list_skb;
                pos += skb_headlen(list_skb);    /* account for the list skb's linear area */
     
                while (pos < offset + len) {    /* consume whole frags that still fit into this segment */
                    BUG_ON(i >= nfrags);
     
                    size = skb_frag_size(frag);
                    if (pos + size > offset + len)
                        break;
     
                    i++;
                    pos += size;        /* account for this frag */
                    frag++;
                }
                /* No data copy is needed for frag_list members: cloning the skb shares its data area. */
                nskb = skb_clone(list_skb, GFP_ATOMIC);    /* the clone covers the full list skb and is trimmed below */
                list_skb = list_skb->next;
     
                if (unlikely(!nskb))
                    goto err;
     
                if (unlikely(pskb_trim(nskb, len))) {    /* cut the clone down to len bytes */
                    kfree_skb(nskb);
                    goto err;
                }
     
                hsize = skb_end_offset(nskb);    /* remember the end offset before the head may be reallocated */
                if (skb_cow_head(nskb, doffset + headroom)) {    /* ensure room in front of the data for the headers plus headroom */
                    kfree_skb(nskb);
                    goto err;
                }
                /* Adjust truesize by however much the head buffer grew. */
                nskb->truesize += skb_end_offset(nskb) - hsize;
                skb_release_head_state(nskb);
                __skb_push(nskb, doffset);    /* open up space for the headers in front of the payload */
            } else {
                /* Allocate a fresh skb: hsize payload bytes will be copied into its linear area. */
                nskb = __alloc_skb(hsize + doffset + headroom,
                           GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                           NUMA_NO_NODE);
     
                if (unlikely(!nskb))
                    goto err;
     
                skb_reserve(nskb, headroom);    /* reproduce the original headroom */
                __skb_put(nskb, doffset);    /* make room for the headers */
            }
     
            if (segs)
                tail->next = nskb;
            else
                segs = nskb;
            tail = nskb;
     
            __copy_skb_header(nskb, head_skb);    /* copy skb metadata, including the header offsets */
     
            skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);    /* shift header offsets by the headroom difference */
            skb_reset_mac_len(nskb);    /* recompute mac_len */
      /* Copy doffset header bytes (plus tnl_hlen bytes of tunnel header in front of them) into nskb. */
            skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                             nskb->data - tnl_hlen,
                             doffset + tnl_hlen);
     
            if (nskb->len == len + doffset)        /* already complete: true for the cloned frag_list case */
                goto perform_csum_check;
     
            if (!sg && !nskb->remcsum_offload) {    /* no SG: copy the payload linearly and checksum it in the same pass */
                nskb->ip_summed = CHECKSUM_NONE;
                nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
                                    skb_put(nskb, len),
                                    len, 0);
                SKB_GSO_CB(nskb)->csum_start =
                    skb_headroom(nskb) + doffset;
                continue;
            }
     
            nskb_frag = skb_shinfo(nskb)->frags;
            /* Copy the hsize bytes (possibly 0) that come from the linear area. */
            skb_copy_from_linear_data_offset(head_skb, offset,
                             skb_put(nskb, hsize), hsize);
     
            skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
                SKBTX_SHARED_FRAG;
                
     /* The remaining (len - hsize) bytes of this segment are taken from page frags by reference. */
            while (pos < offset + len) {    
                if (i >= nfrags) {
                    BUG_ON(skb_headlen(list_skb));
     
                    i = 0;
                    nfrags = skb_shinfo(list_skb)->nr_frags;
                    frag = skb_shinfo(list_skb)->frags;
                    frag_skb = list_skb;
     
                    BUG_ON(!nfrags);
     
                    list_skb = list_skb->next;    /* frags of head exhausted: continue with the next frag_list skb */
                }
     
                if (unlikely(skb_shinfo(nskb)->nr_frags >=
                         MAX_SKB_FRAGS)) {
                    net_warn_ratelimited(
                        "skb_segment: too many frags: %u %u\n",
                        pos, mss);
                    goto err;
                }
     
                if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
                    goto err;
     
                *nskb_frag = *frag;    /* share the page: copy the frag descriptor and take a reference */
                __skb_frag_ref(nskb_frag);
                size = skb_frag_size(nskb_frag);
     
                if (pos < offset) {    /* this frag was partly used by the previous segment: skip the consumed part */
                    nskb_frag->page_offset += offset - pos;
                    skb_frag_size_sub(nskb_frag, offset - pos);
                }
     
                skb_shinfo(nskb)->nr_frags++;
     
                if (pos + size <= offset + len) {
                    i++;
                    frag++;
                    pos += size;
                } else {
                
                    skb_frag_size_sub(nskb_frag, pos + size - (offset + len));    /* frag extends past this segment: trim the tail, keep it for the next one */
                    goto skip_fraglist;
                }
     
                nskb_frag++;
            }
     
    skip_fraglist:
            nskb->data_len = len - hsize;
            nskb->len += nskb->data_len;
            nskb->truesize += nskb->data_len;
     
    perform_csum_check:
            if (!csum && !nskb->remcsum_offload) {
                nskb->csum = skb_checksum(nskb, doffset,
                              nskb->len - doffset, 0);    /* checksum the payload in software */
                nskb->ip_summed = CHECKSUM_NONE;
                SKB_GSO_CB(nskb)->csum_start =
                    skb_headroom(nskb) + doffset;
            }
        } while ((offset += len) < head_skb->len);
     
        /* Some callers want to get the end of the list.
         * Put it in segs->prev to avoid walking the list.
         * (see validate_xmit_skb_list() for example)
         */
        segs->prev = tail;
     
        /* Following permits correct backpressure, for protocols
         * using skb_set_owner_w().
         * Idea is to tranfert ownership from head_skb to last segment.
         */
        if (head_skb->destructor == sock_wfree) {
            swap(tail->truesize, head_skb->truesize);
            swap(tail->destructor, head_skb->destructor);
            swap(tail->sk, head_skb->sk);
        }
        return segs;
     
    err:
        kfree_skb_list(segs);
        return ERR_PTR(err);
    }

    输出报文 分片:

    /*
     * Fragment an IPv4 datagram that is too large for the path and send every
     * fragment through @output.
     *
     * Fast path: when the skb already carries a frag_list whose geometry is
     * suitable, each list member is turned into a fragment in place.
     * Slow path: otherwise new skbs are allocated and the data is copied out
     * in MTU-sized, 8-byte-aligned chunks.
     *
     * Returns 0 on success or a negative error code; the FRAGCREATES, FRAGOKS
     * and FRAGFAILS MIB counters are updated accordingly.
     */
    int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
               int (*output)(struct net *, struct sock *, struct sk_buff *))
    {
        struct iphdr *iph;
        int ptr;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;
    
        /* for offloaded checksums cleanup checksum before fragmentation */
        /* A CHECKSUM_PARTIAL skb is resolved via skb_checksum_help() first. */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
            goto fail;
    
        /*
         *    Point into the IP datagram header.
         */
    
        iph = ip_hdr(skb);
    
        /* Fetch the MTU. */
        mtu = ip_skb_dst_mtu(sk, skb);
    
        /* If a smaller maximum fragment size was recorded for this skb, use it instead. */
        if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
            mtu = IPCB(skb)->frag_max_size;
    
        /*
         *    Setup starting values.
         */
    
        hlen = iph->ihl * 4;
        mtu = mtu - hlen;    /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
    
        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create wrong frag_list or break existing
         * one, it is not prohibited. In this case fall back to copying.
         *
         * LATER: this step can be merged to real generation of fragments,
         * we can switch to copy when see the first bad fragment.
         */
        /* The skb has a frag_list. */
        if (skb_has_frag_list(skb)) {
            struct sk_buff *frag, *frag2;
    
            /* Length of the head skb: linear area plus paged data. */
            unsigned int first_len = skb_pagelen(skb);
    
            /* Any of the following forces the slow path. */
            if (first_len - hlen > mtu || /* first piece larger than the MTU */
                ((first_len - hlen) & 7) || /* payload not a multiple of 8 bytes */
                ip_is_fragment(iph) || /* already a fragment */
                skb_cloned(skb)) /* cloned skb */
                goto slow_path;
    
            /* Walk the frag_list. */
            skb_walk_frags(skb, frag) {
                /* Correct geometry. */
                /* Otherwise undo the changes made so far and take the slow path. */
                if (frag->len > mtu || /* fragment larger than the MTU */
                    ((frag->len & 7) && frag->next) || /* non-final fragment not a multiple of 8 bytes */
                    skb_headroom(frag) < hlen) /* not enough headroom for the IP header */
                    goto slow_path_clean;
    
                /* Partially cloned skb? */
                /* Shared fragment: undo and take the slow path. */
                if (skb_shared(frag))
                    goto slow_path_clean;
    
                BUG_ON(frag->sk);
    
                /* Attach the fragment to the owning socket. */
                if (skb->sk) {
                    frag->sk = skb->sk;
                    frag->destructor = sock_wfree;
                }
    
                /* The head skb no longer accounts for this fragment's truesize. */
                skb->truesize -= frag->truesize;
            }
    
            /* Everything is OK. Generate! */
    
            /* Geometry verified: set up the first fragment's header. */
            err = 0;
            offset = 0;
            frag = skb_shinfo(skb)->frag_list;
            skb_frag_list_init(skb);
            skb->data_len = first_len - skb_headlen(skb);
            skb->len = first_len;
            iph->tot_len = htons(first_len);
            iph->frag_off = htons(IP_MF);
            ip_send_check(iph);
    
            /* Prepare each following fragment, then send the current one. */
            for (;;) {
                /* Prepare header of the next frame,
                 * before previous one went down. */
                 /* Copy the IP header into the fragment and set its offset fields. */
                if (frag) {
                    frag->ip_summed = CHECKSUM_NONE;
                    skb_reset_transport_header(frag);
                    __skb_push(frag, hlen);
                    skb_reset_network_header(frag);
                    memcpy(skb_network_header(frag), iph, hlen);
                    iph = ip_hdr(frag);
                    iph->tot_len = htons(frag->len);
                    ip_copy_metadata(frag, skb);
                    if (offset == 0)
                        ip_options_fragment(frag);
                    offset += skb->len - hlen;
                    iph->frag_off = htons(offset>>3);
                    if (frag->next)
                        iph->frag_off |= htons(IP_MF);
                    /* Ready, complete checksum */
                    ip_send_check(iph);
                }
    
                /* Hand the current fragment to the output callback. */
                err = output(net, sk, skb);
    
                if (!err)
                    IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
                if (err || !frag)
                    break;
    
                skb = frag;
                frag = skb->next;
                skb->next = NULL;
            }
    
            if (err == 0) {
                IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
                return 0;
            }
    
            /* Failure: free the fragments that were not sent. */
            while (frag) {
                skb = frag->next;
                kfree_skb(frag);
                frag = skb;
            }
            IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
            return err;
    
    slow_path_clean:
            /* Undo the ownership/truesize changes on the fragments visited before the bad one. */
            skb_walk_frags(skb, frag2) {
                if (frag2 == frag)
                    break;
                frag2->sk = NULL;
                frag2->destructor = NULL;
                skb->truesize += frag2->truesize;
            }
        }
    
    slow_path:
        /* Slow path: copy the data into newly allocated fragments. */
    
    
        iph = ip_hdr(skb);
    
        /* Payload bytes remaining after the IP header. */
        left = skb->len - hlen;        /* Space per frame */
        ptr = hlen;        /* Where to start from */
    
        /* Room to reserve for the link-layer header. */
        ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
    
        /*
         *    Fragment the datagram.
         */
    
        /* Initialise offset and the "more fragments" state from the existing header. */
        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);
    
        /*
         *    Keep copying data until we run out.
         */
    
        /* Fragmentation loop. */
        while (left > 0) {
            /* Start with everything that is left. */
            len = left;
            /* IF: it doesn't fit, use 'mtu' - the data space left */
            /* Cap at the per-fragment data space. */
            if (len > mtu)
                len = mtu;
            /* IF: we are not sending up to and including the packet end
               then align the next start on an eight byte boundary */
            /* Every fragment except the last is rounded down to a multiple of 8 bytes. */
            if (len < left)    {
                len &= ~7;
            }
    
            /* Allocate buffer */
            /* Room for payload, IP header and link-layer reserve. */
            skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
            if (!skb2) {
                err = -ENOMEM;
                goto fail;
            }
    
            /*
             *    Set up data on packet
             */
    
            /* Copy the skb metadata. */
            ip_copy_metadata(skb2, skb);
    
            /* Reserve link-layer space and set the header offsets. */
            skb_reserve(skb2, ll_rs);
            skb_put(skb2, len + hlen);
            skb_reset_network_header(skb2);
            skb2->transport_header = skb2->network_header + hlen;
    
            /*
             *    Charge the memory for the fragment to any owner
             *    it might possess
             */
            /* Associate the fragment with the owning socket. */
            if (skb->sk)
                skb_set_owner_w(skb2, skb->sk);
    
            /*
             *    Copy the packet header into the new buffer.
             */
    
            /* Copy the IP header. */
            skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
    
            /*
             *    Copy a block of the IP datagram.
             */
            /* Copy the payload chunk. */
            if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                BUG();
            left -= len;
    
            /*
             *    Fill in the new header fields.
             */
            iph = ip_hdr(skb2);
    
            /* Set the fragment offset (in 8-byte units). */
            iph->frag_off = htons((offset >> 3));
    
            /* If the skb is flagged IPSKB_FRAG_PMTU, set DF on the fragment. */
            if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
                iph->frag_off |= htons(IP_DF);
    
            /* ANK: dirty, but effective trick. Upgrade options only if
             * the segment to be fragmented was THE FIRST (otherwise,
             * options are already fixed) and make it ONCE
             * on the initial skb, so that all the following fragments
             * will inherit fixed options.
             */
            /* Only the first fragment keeps the full set of IP options. */
            if (offset == 0)
                ip_options_fragment(skb);
    
            /*
             *    Added AC : If we are fragmenting a fragment that's not the
             *           last fragment then keep MF on each bit
             */
            /* Anything but the very last fragment carries MF. */
            if (left > 0 || not_last_frag)
                iph->frag_off |= htons(IP_MF);
    
            /* Advance the read pointer and the fragment offset. */
            ptr += len;
            offset += len;
    
            /*
             *    Put this fragment into the sending queue.
             */
            /* Total length of this fragment. */
            iph->tot_len = htons(len + hlen);
    
            /* Header checksum. */
            ip_send_check(iph);
    
            /* Send the fragment. */
            err = output(net, sk, skb2);
            if (err)
                goto fail;
    
            IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
        }
    
        /* All fragments sent: release the original skb. */
        consume_skb(skb);
        IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
        return err;
    
    fail:
    
        /* Error: drop the original skb. */
        kfree_skb(skb);
        IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
        return err;
    }
    http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!! 但行好事 莫问前程 --身高体重180的胖子
  • 相关阅读:
    go_接口
    go_封装
    go_结构体和方法
    go_字符和字符串处理
    go_Map
    为啥别人运行程序那么快,而你的却是龟速?
    大一新生开发的小工具火了!不一样的Python编程体验,现在的新生都这么厉害的吗
    十七种方法轻松解决PyTorch训练速度慢!
    Leetcode 1577 数的平方等于两数乘积的方法数
    C++11的decltype关键字
  • 原文地址:https://www.cnblogs.com/codestack/p/14890786.html
Copyright © 2020-2023  润新知