• Linux TCP/IP Stack Study (2): Key Frame Receive/Transmit Functions and the net_device Structure


     

    /**
     *    netif_rx - post buffer to the network code
     *    @skb: buffer to post
     *
     *    This function receives a packet from a device driver and queues it for
     *    the upper (protocol) levels to process.  It always succeeds. The buffer
     *    may be dropped during processing for congestion control or by the
     *    protocol layers.
     *
     *    return values:
     *    NET_RX_SUCCESS    (no congestion)
     *    NET_RX_DROP     (packet was dropped)
     *
     */
     
    int netif_rx(struct sk_buff *skb)
    {
        struct softnet_data *queue;
        unsigned long flags;

        /* if netpoll wants it, pretend we never saw it */
        if (netpoll_rx(skb))
            return NET_RX_DROP;

        if (!skb->tstamp.tv64)          /* stamp the frame with its receive time */
            net_timestamp(skb);

        /*
         * The code is rearranged so that the path is the most
         * short when CPU is congested, but is still operating.
         */
        local_irq_save(flags);
        queue = &__get_cpu_var(softnet_data);   /* this CPU's softnet_data */

        __get_cpu_var(netdev_rx_stat).total++;  /* one more frame received on this CPU */
        if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
            /*
             * Check whether the per-CPU input queue still has room for the
             * frame. If it is full the network is badly congested: an error
             * is returned and the CPU keeps dropping frames that arrive
             * until the queue drains.
             */
            if (queue->input_pkt_queue.qlen) {
    enqueue:
                /* append the frame to the softnet_data input queue */
                __skb_queue_tail(&queue->input_pkt_queue, skb);
                local_irq_restore(flags);
                return NET_RX_SUCCESS;
            }

            /*
             * The queue is empty, so it has not yet been scheduled by the
             * softirq; add it to the softirq poll list. Note that what is
             * scheduled is the backlog napi_struct: netif_rx() is called by
             * non-NAPI drivers, so backlog's poll function is the
             * process_backlog() installed at initialization time.
             */
            napi_schedule(&queue->backlog);
            goto enqueue;
        }

        __get_cpu_var(netdev_rx_stat).dropped++;
        local_irq_restore(flags);

        kfree_skb(skb);
        return NET_RX_DROP;
    }
The code above relies on a key data structure, softnet_data. While a NIC is receiving and transmitting, a buffer queue is needed to absorb bursts of traffic; in the protocol stack this buffer is represented by a queue layer that sits between the data link layer and the network layer. softnet_data is that link-layer structure, and it is a per-CPU variable: every CPU owns its own instance.
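For reference, here is a sketch of the structure roughly as it appears in 2.6.32-era kernels (the exact field set varies across versions). Note the input_pkt_queue filled by netif_rx() above, and the backlog napi_struct whose poll function is process_backlog():

    /* Per-CPU queue layer between the link layer and the network layer
     * (approximate 2.6.32 layout; fields differ in other versions). */
    struct softnet_data {
        struct Qdisc        *output_queue;     /* devices with pending transmissions */
        struct sk_buff_head input_pkt_queue;   /* frames queued by netif_rx() */
        struct list_head    poll_list;         /* napi_structs waiting to be polled */
        struct sk_buff      *completion_queue; /* skbs waiting to be freed */
        struct napi_struct  backlog;           /* non-NAPI backlog; poll = process_backlog() */
    };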
     
     
    /**
     *    netif_receive_skb - process receive buffer from network
     *    @skb: buffer to process
     *
     *    netif_receive_skb() is the main receive data processing function.
     *    It always succeeds. The buffer may be dropped during processing
     *    for congestion control or by the protocol layers.
     *
     *    This function may only be called from softirq context and interrupts
     *    should be enabled.
     *
     *    Return values (usually ignored):
     *    NET_RX_SUCCESS: no congestion
     *    NET_RX_DROP: packet was dropped
     */
    //netif_receive_skb is the NAPI counterpart of netif_rx: it hands a packet to the kernel. When a NAPI-compliant driver has exhausted its supply of received packets, it should re-enable interrupts and call netif_rx_complete (now __napi_complete()) to stop polling.
    int netif_receive_skb(struct sk_buff *skb)
    {
        struct packet_type *ptype, *pt_prev;
        struct net_device *orig_dev;
        struct net_device *master;
        struct net_device *null_or_orig;
        struct net_device *null_or_bond;
        int ret = NET_RX_DROP;
        __be16 type;

        if (!skb->tstamp.tv64)
            net_timestamp(skb);

        if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
            return NET_RX_SUCCESS;

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
            return NET_RX_DROP;

        if (!skb->skb_iif)
            skb->skb_iif = skb->dev->ifindex;   /* record the frame's ingress interface */

        null_or_orig = NULL;
        orig_dev = skb->dev;
        master = ACCESS_ONCE(orig_dev->master);
        if (master) {
            if (skb_bond_should_drop(skb, master))
                null_or_orig = orig_dev; /* deliver only exact match */
            else
                skb->dev = master;
        }

        __get_cpu_var(netdev_rx_stat).total++;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

    #ifdef CONFIG_NET_CLS_ACT
        if (skb->tc_verd & TC_NCLS) {
            skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
            goto ncls;
        }
    #endif

        /*
         * Run every packet_type->func() registered on the ptype_all list.
         * Linux registers these handlers with dev_add_pack(); how
         * registration works, and which handler serves which packet type,
         * is shown after this function.
         *     static struct list_head ptype_all __read_mostly;
         */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
            if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                ptype->dev == orig_dev) {
                if (pt_prev)
                    ret = deliver_skb(skb, pt_prev, orig_dev); /* invoke the matching handler */
                pt_prev = ptype;
            }
        }

    #ifdef CONFIG_NET_CLS_ACT
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
            goto out;
    ncls:
    #endif

        /* runs only if the kernel was built with bridging (CONFIG_BRIDGE) */
        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
            goto out;

        /* runs only if the kernel was built with the MACVLAN module */
        skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
            goto out;

        /*
         * Make sure frames received on VLAN interfaces stacked on
         * bonding interfaces still make their way to any base bonding
         * device that may have registered for a specific ptype.  The
         * handler may have to adjust skb->dev and orig_dev.
         */
        null_or_bond = NULL;
        if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
            (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
            null_or_bond = vlan_dev_real_dev(skb->dev);
        }

        /*
         * Finally, run every packet_type->func() on the hash chain
         * ptype_base[ntohs(type) & PTYPE_HASH_MASK], dispatching by the
         * layer-2 protocol field into the matching layer-3 entry point;
         * the important ones are ip_rcv() and arp_rcv().
         */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
            if (ptype->type == type && (ptype->dev == null_or_orig ||
                ptype->dev == skb->dev || ptype->dev == orig_dev ||
                ptype->dev == null_or_bond)) {
                if (pt_prev)
                    ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
            }
        }

        if (pt_prev) {
            ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
            kfree_skb(skb);
            /* Jamal, now you will not able to escape explaining
             * me how you were going to use this. :-)
             */
            ret = NET_RX_DROP;
        }

    out:
        rcu_read_unlock();
        return ret;
    }
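To make the dispatch above concrete: a layer-3 protocol registers its handler by filling in a struct packet_type and calling dev_add_pack(), which hashes the entry onto ptype_base[] (or appends it to ptype_all when .type is ETH_P_ALL, as packet sockets do). A simplified excerpt showing how IPv4 hooks ip_rcv(), modeled on net/ipv4/af_inet.c of this kernel generation and abridged here:

    /* Abridged from net/ipv4/af_inet.c (2.6.32-era); GSO/GRO hooks omitted. */
    static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),  /* match frames whose L2 protocol field is IPv4 */
        .func = ip_rcv,                 /* invoked via pt_prev->func() in netif_receive_skb() */
    };

    static int __init inet_init(void)
    {
        /* ... socket and protocol registration elided ... */
        dev_add_pack(&ip_packet_type);  /* hash onto ptype_base[] */
        return 0;
    }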
     
     
    /**
     *    dev_queue_xmit - transmit a buffer
     *    @skb: buffer to transmit
     *
     *    Queue a buffer for transmission to a network device. The caller must
     *    have set the device and priority and built the buffer before calling
     *    this function. The function can be called from an interrupt.
     *
     *    A negative errno code is returned on a failure. A success does not
     *    guarantee the frame will be transmitted as it may be dropped due
     *    to congestion or traffic shaping.
     *
     * -----------------------------------------------------------------------------------
     *      I notice this method can also return errors from the queue disciplines,
     *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
     *      be positive.
     *
     *      Regardless of the return value, the skb is consumed, so it is currently
     *      difficult to retry a send to this method.  (You can bump the ref count
     *      before sending to hold a reference for retry if you are careful.)
     *
     *      When calling this method, interrupts MUST be enabled.  This is because
     *      the BH enable code must have IRQs enabled so that it will not deadlock.
     *          --BLG
     */
    int dev_queue_xmit(struct sk_buff *skb)
    {
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        /* GSO will handle the following emulations directly. */
        if (netif_needs_gso(dev, skb))  /* GSO skb the device cannot segment itself:
                                         * skip the fixups below, the GSO path handles them */
            goto gso;

        /* Convert a paged skb to linear, if required */
        if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
            goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
            skb_set_transport_header(skb, skb->csum_start -
                                          skb_headroom(skb));
            if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
                goto out_kfree_skb;
        }

    gso:
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

    #ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
    #endif
        if (q->enqueue) {
            rc = __dev_xmit_skb(skb, q, dev, txq);
            goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
            int cpu = smp_processor_id(); /* ok because BHs are off */

            if (txq->xmit_lock_owner != cpu) {

                HARD_TX_LOCK(dev, txq, cpu);

                if (!netif_tx_queue_stopped(txq)) {
                    rc = dev_hard_start_xmit(skb, dev, txq);
                    if (dev_xmit_complete(rc)) {
                        HARD_TX_UNLOCK(dev, txq);
                        goto out;
                    }
                }
                HARD_TX_UNLOCK(dev, txq);
                if (net_ratelimit())
                    printk(KERN_CRIT "Virtual device %s asks to "
                           "queue packet!\n", dev->name);
            } else {
                /* Recursion is detected! It is possible,
                 * unfortunately */
                if (net_ratelimit())
                    printk(KERN_CRIT "Dead loop on virtual device "
                           "%s, fix it urgently!\n", dev->name);
            }
        }

        rc = -ENETDOWN;
        rcu_read_unlock_bh();

    out_kfree_skb:
        kfree_skb(skb);
        return rc;
    out:
        rcu_read_unlock_bh();
        return rc;
    }
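Past the qdisc, dev_hard_start_xmit() finally hands the frame to the driver through the device's ndo_start_xmit() method. As a rough illustration of that contract (stop the queue when the hardware is busy, consume the skb on success), here is a minimal hypothetical transmit handler; struct my_priv, my_hw_tx_ring_full(), and my_hw_tx() are invented names, not kernel APIs:

    /* Hypothetical driver TX handler; the my_* helpers are illustrative only. */
    static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
    {
        struct my_priv *priv = netdev_priv(dev);

        if (my_hw_tx_ring_full(priv)) {
            netif_stop_queue(dev);      /* tell the qdisc layer to stop feeding us */
            return NETDEV_TX_BUSY;      /* the skb is NOT consumed in this case */
        }

        my_hw_tx(priv, skb->data, skb->len);  /* push the frame to the NIC */
        dev->stats.tx_packets++;
        dev->stats.tx_bytes += skb->len;
        dev_kfree_skb(skb);             /* frame handed off: release the skb */
        return NETDEV_TX_OK;
    }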
     
Any discussion of the data link layer has to touch on struct net_device. After 2.6.29 the structure was reorganized, and its operation callbacks were refactored into net_device_ops (a sketch of that table follows the structure below). A brief walk-through:
    struct net_device
    {
        /*
         * This first field, name, is the beginning of the visible part of
         * this structure. It contains the string that is the name of the
         * interface. By "visible" we mean that this part of the data
         * structure is generic and doesn't contain any private areas
         * specific to a particular type of device.
         */
        char                name[IFNAMSIZ];
        /* device name hash chain */
        struct hlist_node   name_hlist;
        /* snmp alias */
        char                *ifalias;

        /*
         *  I/O specific fields
         *  FIXME: Merge these and struct ifmap into one
         */
        unsigned long       mem_end;     /* shared mem end       */
        unsigned long       mem_start;   /* shared mem start     */
        unsigned long       base_addr;   /* device I/O address   */
        unsigned int        irq;         /* device IRQ number    */

        /*
         *  Some hardware also needs these fields, but they are not
         *  part of the usual set specified in Space.c.
         */

        unsigned char       if_port;    /* Selectable AUI, TP,..*/
        unsigned char       dma;        /* DMA channel          */

        unsigned long       state;

        struct list_head    dev_list;
        struct list_head    napi_list;
        struct list_head    unreg_list;

        /* Net device features */
        unsigned long       features;

    #define NETIF_F_SG            1     /* Scatter/gather IO. */
    #define NETIF_F_IP_CSUM       2     /* Can checksum TCP/UDP over IPv4. */
    #define NETIF_F_NO_CSUM       4     /* Does not require checksum. F.e. loopback. */
    #define NETIF_F_HW_CSUM       8     /* Can checksum all the packets. */
    #define NETIF_F_IPV6_CSUM     16    /* Can checksum TCP/UDP over IPV6 */
    #define NETIF_F_HIGHDMA       32    /* Can DMA to high memory. */
    #define NETIF_F_FRAGLIST      64    /* Scatter/gather IO. */
    #define NETIF_F_HW_VLAN_TX    128   /* Transmit VLAN hw acceleration */
    #define NETIF_F_HW_VLAN_RX    256   /* Receive VLAN hw acceleration */
    #define NETIF_F_HW_VLAN_FILTER      512   /* Receive filtering on VLAN */
    #define NETIF_F_VLAN_CHALLENGED     1024  /* Device cannot handle VLAN packets */
    #define NETIF_F_GSO           2048  /* Enable software GSO. */
    #define NETIF_F_LLTX          4096  /* LockLess TX - deprecated. Please */
                                        /* do not use LLTX in new drivers */
    #define NETIF_F_NETNS_LOCAL   8192  /* Does not change network namespaces */
    #define NETIF_F_GRO           16384 /* Generic receive offload */
    #define NETIF_F_LRO           32768 /* large receive offload */

    /* the GSO_MASK reserves bits 16 through 23 */
    #define NETIF_F_FCOE_CRC      (1 << 24) /* FCoE CRC32 */
    #define NETIF_F_SCTP_CSUM     (1 << 25) /* SCTP checksum offload */
    #define NETIF_F_FCOE_MTU      (1 << 26) /* Supports max FCoE MTU, 2158 bytes */
    #define NETIF_F_NTUPLE        (1 << 27) /* N-tuple filters supported */

        /* Segmentation offload features */
    #define NETIF_F_GSO_SHIFT     16
    #define NETIF_F_GSO_MASK      0x00ff0000
    #define NETIF_F_TSO           (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
    #define NETIF_F_UFO           (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
    #define NETIF_F_GSO_ROBUST    (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
    #define NETIF_F_TSO_ECN       (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
    #define NETIF_F_TSO6          (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
    #define NETIF_F_FSO           (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)

        /* List of features with software fallbacks. */
    #define NETIF_F_GSO_SOFTWARE  (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)

    #define NETIF_F_GEN_CSUM      (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
    #define NETIF_F_V4_CSUM       (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
    #define NETIF_F_V6_CSUM       (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
    #define NETIF_F_ALL_CSUM      (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)

        /*
         * If one device supports one of these features, then enable them
         * for all in netdev_increment_features.
         */
    #define NETIF_F_ONE_FOR_ALL   (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
                                   NETIF_F_SG | NETIF_F_HIGHDMA |              \
                                   NETIF_F_FRAGLIST)

        /* Interface index. Unique device identifier    */
        int                 ifindex;
        int                 iflink;

        struct net_device_stats stats;

    #ifdef CONFIG_WIRELESS_EXT
        /* List of functions to handle Wireless Extensions (instead of ioctl).
         * See <net/iw_handler.h> for details. Jean II */
        const struct iw_handler_def *wireless_handlers;
        /* Instance data managed by the core of Wireless Extensions. */
        struct iw_public_data   *wireless_data;
    #endif
        /* Management operations */
        const struct net_device_ops *netdev_ops;
        const struct ethtool_ops *ethtool_ops;

        /* Hardware header description */
        const struct header_ops *header_ops;

        unsigned int        flags;      /* interface flags (a la BSD)   */
        unsigned short      gflags;
        unsigned short      priv_flags; /* Like 'flags' but invisible to userspace. */
        unsigned short      padded;     /* How much padding added by alloc_netdev() */

        unsigned char       operstate;  /* RFC2863 operstate */
        unsigned char       link_mode;  /* mapping policy to operstate */

        unsigned            mtu;        /* interface MTU value        */
        unsigned short      type;       /* interface hardware type    */
        unsigned short      hard_header_len;  /* hardware hdr length  */

        /* extra head- and tailroom the hardware may need, but not in all cases
         * can this be guaranteed, especially tailroom. Some cases also use
         * LL_MAX_HEADER instead to allocate the skb.
         */
        unsigned short      needed_headroom;
        unsigned short      needed_tailroom;

        struct net_device   *master;    /* Pointer to master device of a group,
                                         * which this device is member of.
                                         */

        /* Interface address info. */
        unsigned char       perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
        unsigned char       addr_len;   /* hardware address length      */
        unsigned short      dev_id;     /* for shared network cards */

        struct netdev_hw_addr_list  uc; /* Secondary unicast
                                           mac addresses */
        int                 uc_promisc;
        spinlock_t          addr_list_lock;
        struct dev_addr_list *mc_list;  /* Multicast mac addresses      */
        int                 mc_count;   /* Number of installed mcasts   */
        unsigned int        promiscuity;
        unsigned int        allmulti;


        /* Protocol specific pointers */

    #ifdef CONFIG_NET_DSA
        void                *dsa_ptr;   /* dsa specific data */
    #endif
        void                *atalk_ptr; /* AppleTalk link       */
        void                *ip_ptr;    /* IPv4 specific data   */
        void                *dn_ptr;    /* DECnet specific data */
        void                *ip6_ptr;   /* IPv6 specific data   */
        void                *ec_ptr;    /* Econet specific data */
        void                *ax25_ptr;  /* AX.25 specific data  */
        struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
                                               assign before registering */

    /*
     * Cache line mostly used on receive path (including eth_type_trans())
     */
        unsigned long       last_rx;    /* Time of last Rx      */
        /* Interface address info used in eth_type_trans() */
        unsigned char       *dev_addr;  /* hw address, (before bcast
                                           because most packets are
                                           unicast) */

        struct netdev_hw_addr_list  dev_addrs; /* list of device
                                                  hw addresses */

        unsigned char       broadcast[MAX_ADDR_LEN];    /* hw bcast add */

        struct netdev_queue rx_queue;

        struct netdev_queue *_tx ____cacheline_aligned_in_smp;

        /* Number of TX queues allocated at alloc_netdev_mq() time  */
        unsigned int        num_tx_queues;

        /* Number of TX queues currently active in device  */
        unsigned int        real_num_tx_queues;

        /* root qdisc from userspace point of view */
        struct Qdisc        *qdisc;

        unsigned long       tx_queue_len;   /* Max frames per queue allowed */
        spinlock_t          tx_global_lock;
    /*
     * One part is mostly used on xmit path (device)
     */
        /* These may be needed for future network-power-down code. */

        /*
         * trans_start here is expensive for high speed devices on SMP,
         * please use netdev_queue->trans_start instead.
         */
        unsigned long       trans_start;    /* Time (in jiffies) of last Tx */

        int                 watchdog_timeo; /* used by dev_watchdog() */
        struct timer_list   watchdog_timer;

        /* Number of references to this device */
        atomic_t            refcnt ____cacheline_aligned_in_smp;

        /* delayed register/unregister */
        struct list_head    todo_list;
        /* device index hash chain */
        struct hlist_node   index_hlist;

        struct list_head    link_watch_list;

        /* register/unregister state machine */
        enum { NETREG_UNINITIALIZED = 0,
               NETREG_REGISTERED,      /* completed register_netdevice */
               NETREG_UNREGISTERING,   /* called unregister_netdevice */
               NETREG_UNREGISTERED,    /* completed unregister todo */
               NETREG_RELEASED,        /* called free_netdev */
               NETREG_DUMMY,           /* dummy device for NAPI poll */
        } reg_state:16;

        enum {
            RTNL_LINK_INITIALIZED,
            RTNL_LINK_INITIALIZING,
        } rtnl_link_state:16;

        /* Called from unregister, can be used to call free_netdev */
        void (*destructor)(struct net_device *dev);

    #ifdef CONFIG_NETPOLL
        struct netpoll_info *npinfo;
    #endif

    #ifdef CONFIG_NET_NS
        /* Network namespace this network device is inside */
        struct net          *nd_net;
    #endif

        /* mid-layer private */
        void                *ml_priv;

        /* bridge stuff */
        struct net_bridge_port  *br_port;
        /* macvlan */
        struct macvlan_port *macvlan_port;
        /* GARP */
        struct garp_port    *garp_port;

        /* class/net/name entry */
        struct device       dev;
        /* space for optional device, statistics, and wireless sysfs groups */
        const struct attribute_group *sysfs_groups[4];

        /* rtnetlink link ops */
        const struct rtnl_link_ops *rtnl_link_ops;

        /* VLAN feature mask */
        unsigned long       vlan_features;

        /* for setting kernel sock attribute on TCP connection setup */
    #define GSO_MAX_SIZE          65536
        unsigned int        gso_max_size;

    #ifdef CONFIG_DCB
        /* Data Center Bridging netlink ops */
        const struct dcbnl_rtnl_ops *dcbnl_ops;
    #endif

    #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
        /* max exchange id for FCoE LRO by ddp */
        unsigned int        fcoe_ddp_xid;
    #endif
        /* n-tuple filter list attached to this device */
        struct ethtool_rx_ntuple_list ethtool_ntuple_list;
    };
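As promised above, here is a sketch of the operations table that post-2.6.29 kernels split out of net_device. The list is abridged to the most commonly implemented callbacks; see include/linux/netdevice.h for the full set:

    /* Abridged sketch of struct net_device_ops (2.6.32-era); many optional
     * callbacks are omitted. */
    struct net_device_ops {
        int         (*ndo_init)(struct net_device *dev);
        void        (*ndo_uninit)(struct net_device *dev);
        int         (*ndo_open)(struct net_device *dev);    /* device brought up */
        int         (*ndo_stop)(struct net_device *dev);    /* device taken down */
        netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
                                      struct net_device *dev);
        u16         (*ndo_select_queue)(struct net_device *dev,
                                        struct sk_buff *skb);
        void        (*ndo_set_multicast_list)(struct net_device *dev);
        int         (*ndo_set_mac_address)(struct net_device *dev, void *addr);
        int         (*ndo_do_ioctl)(struct net_device *dev,
                                    struct ifreq *ifr, int cmd);
        int         (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
        void        (*ndo_tx_timeout)(struct net_device *dev); /* watchdog hook */
        struct net_device_stats *(*ndo_get_stats)(struct net_device *dev);
    };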
     
     
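Finally, a hedged sketch of how a driver ties the two structures together: allocate the net_device, point netdev_ops at a static ops table, and register. Every my_* identifier below is invented for illustration (my_start_xmit is the TX handler sketched earlier); the kernel APIs used are alloc_etherdev(), register_netdev(), and free_netdev():

    struct my_priv { int tx_ring_head; };  /* hypothetical driver-private state */

    static int my_open(struct net_device *dev) { netif_start_queue(dev); return 0; }
    static int my_stop(struct net_device *dev) { netif_stop_queue(dev);  return 0; }

    static const struct net_device_ops my_netdev_ops = {
        .ndo_open       = my_open,
        .ndo_stop       = my_stop,
        .ndo_start_xmit = my_start_xmit,
    };

    static int my_probe(void)
    {
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct my_priv)); /* net_device + private area */
        if (!dev)
            return -ENOMEM;

        dev->netdev_ops = &my_netdev_ops;   /* hook the operations table */

        err = register_netdev(dev);         /* reg_state -> NETREG_REGISTERED */
        if (err)
            free_netdev(dev);               /* reg_state -> NETREG_RELEASED */
        return err;
    }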