• linux内核数据包转发流程(三)网卡帧接收分析


    【版权声明:转载请保留出处:blog.csdn.net/gentleliu。邮箱:shallnew*163.com】

    每一个cpu都有队列来处理接收到的帧,都有其数据结构来处理入口和出口流量,因此,不同cpu之间没有必要使用上锁机制。此队列数据结构为softnet_data(定义在include/linux/netdevice.h中):

    /*
     * Incoming packets are placed on per-cpu queues so that
     * no locking is needed.
     */
    struct softnet_data
    {
    struct Qdisc *output_queue; 
    struct sk_buff_headinput_pkt_queue;//有数据要传输的设备列表
    struct list_headpoll_list; //双向链表,当中的设备有输入帧等着被处理。
    struct sk_buff*completion_queue;//缓冲区列表,当中缓冲区已成功传输,能够释放掉
    
    
    struct napi_structbacklog;
    };

    此结构字段可用于传输和接收。换而言之,NET_RX_SOFTIRQ和NET_TX_SOFTIRQ软IRQ都引用此结构。入口帧会排入input_pkt_queue(NAPI有所不同)。


    softnet_data是在net_dev_init函数中初始化的:
    /*
     *       This is called single threaded during boot, so no need
     *       to take the rtnl semaphore.
     */
    /*
     * Excerpt of net_dev_init(): sets up every possible CPU's softnet_data
     * and registers the network TX/RX softirq handlers. The "......" lines
     * mark code omitted from this excerpt.
     */
    static int __init net_dev_init(void)
    {
    int i, rc = -ENOMEM;
    
    ......
    
    /*
    * Initialise the packet receive queues.
    */
    
    for_each_possible_cpu(i) {
    struct softnet_data *queue;
    
    queue = &per_cpu(softnet_data, i);
    /* Start with an empty input queue, no completed buffers, empty poll list. */
    skb_queue_head_init(&queue->input_pkt_queue);
    queue->completion_queue = NULL;
    INIT_LIST_HEAD(&queue->poll_list);
    
    /* The backlog NAPI instance polls via process_backlog(). */
    queue->backlog.poll = process_backlog;
    queue->backlog.weight = weight_p;
    queue->backlog.gro_list = NULL;
    queue->backlog.gro_count = 0;
    }
    
    ......
    
    /* Register the handlers run for the transmit and receive softirqs. */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);
    
    ......
    }
    非NAPI设备驱动会为其所接收的每个帧产生一个中断事件,在高流量负载下,会花掉大量时间处理中断事件,造成资源浪费。而NAPI驱动混合了中断事件和轮询,在高流量负载下其性能会比旧方法要好。
    NAPI主要思想是混合使用中断事件和轮询,而不是只使用中断事件驱动模型。当收到新的帧时,关中断,再一次性处理完入口队列中的全部帧。从内核观点来看,NAPI方法由于中断事件少了,降低了cpu负载。
    使用非NAPI的驱动程序的xx_rx()函数一般如下:
    /*
     * Sketch of a non-NAPI driver receive routine: allocate an skb, fill it,
     * set its protocol and hand the frame to netif_rx().
     * NOTE(review): pkt_len, pkt and dev are not declared in this excerpt.
     */
    void xx_rx()
    {
    struct sk_buff *skb;
    
    skb = dev_alloc_skb(pkt_len + 5);
    if (skb != NULL) {
    skb_reserve(skb, 2);/* Align IP on 16 byte boundaries */
    
    /* NOTE(review): the disabled copy below passes 2 to skb_put();
     * presumably pkt_len was intended - confirm. */
    /*memcpy(skb_put(skb, 2), pkt, pkt_len);*/ //copy data to skb
    
    skb->protocol = eth_type_trans(skb, dev);
    netif_rx(skb);
    }
    /* When allocation fails nothing is queued. */
    }
    第一步是分配一个缓存区来保存报文。 注意缓存分配函数 (dev_alloc_skb) 须要知道数据长度。

    第二步将报文数据复制到缓存区; skb_put  函数更新缓存中的数据末尾指针并返回指向新建空间的指针。

    第三步提取协议标识及获取其它信息。

    最后调用netif_rx(skb)做进一步处理,该函数一般定义在net/core/dev.c中。

    /*
     * netif_rx - queue a received frame on the current CPU's input_pkt_queue.
     * @skb: buffer to queue
     *
     * Returns NET_RX_SUCCESS once the skb is queued. Returns NET_RX_DROP when
     * netpoll_rx() claims the skb, or when the queue length exceeds
     * netdev_max_backlog (in that case the skb is freed here).
     */
    int netif_rx(struct sk_buff *skb)
    {
    struct softnet_data *queue;
    unsigned long flags;
    
    /* if netpoll wants it, pretend we never saw it */
    if (netpoll_rx(skb))
    return NET_RX_DROP;
    
    /* Timestamp the skb if it does not carry one yet. */
    if (!skb->tstamp.tv64)
    net_timestamp(skb);
    
    /*
    * The code is rearranged so that the path is the most
    * short when CPU is congested, but is still operating.
    */
    local_irq_save(flags);
    queue = &__get_cpu_var(softnet_data);
    
    __get_cpu_var(netdev_rx_stat).total++;
    // Is there still room in the queue?
    // NOTE(review): the original note cites 300 as the usual netdev_max_backlog - confirm.
    if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
    // The backlog is scheduled (napi_schedule()) only when the queue is empty;
    // a non-empty queue means it was already scheduled, so just enqueue.
    if (queue->input_pkt_queue.qlen) {
    enqueue:
    __skb_queue_tail(&queue->input_pkt_queue, skb);// key step: append skb to input_pkt_queue
    local_irq_restore(flags);
    return NET_RX_SUCCESS;
    }
    
    napi_schedule(&queue->backlog);// schedule the backlog NAPI instance, then enqueue
    goto enqueue;
    }
    
    /* Queue is over the limit: count the drop and free the skb. */
    __get_cpu_var(netdev_rx_stat).dropped++;
    local_irq_restore(flags);
    
    kfree_skb(skb);
    return NET_RX_DROP;
    }
    EXPORT_SYMBOL(netif_rx);

    /*
     * napi_schedule - schedule NAPI instance @n for polling.
     *
     * Does nothing unless napi_schedule_prep() accepts the instance; when it
     * does, the instance is handed to __napi_schedule().
     */
    static inline void napi_schedule(struct napi_struct *n)
    {
    	if (!napi_schedule_prep(n))
    		return;

    	__napi_schedule(n);
    }
    

    /*
     * __napi_schedule - put NAPI instance @n on this CPU's poll list and
     * raise NET_RX_SOFTIRQ. Both steps run with local interrupts disabled.
     */
    void __napi_schedule(struct napi_struct *n)
    {
    	unsigned long flags;
    
    	local_irq_save(flags);
    	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);// append to the poll list so its frames get processed
    	__raise_softirq_irqoff(NET_RX_SOFTIRQ);// finally raise the receive softirq
    	local_irq_restore(flags);
    }
    EXPORT_SYMBOL(__napi_schedule);

    至此中断的上半部完成,其它的工作交由下半部来实现。napi_schedule(&queue->backlog)函数将有等待的接收数据包的NIC链入softnet_data的poll_list队列,然后触发软中断,让下半部去完成数据的处理工作。
    而使用NAPI的设备在接收数据时直接触发软中断,不需要通过netif_rx()函数设置好接收队列再触发软中断。比如e100硬中断处理函数为:

    static irqreturn_t e100_intr(int irq, void *dev_id)
    {
    	struct net_device *netdev = dev_id;
    	struct nic *nic = netdev_priv(netdev);
    	u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);
    
    	DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X
    ", stat_ack);
    
    	if (stat_ack == stat_ack_not_ours ||	/* Not our interrupt */
    	   stat_ack == stat_ack_not_present)	/* Hardware is ejected */
    		return IRQ_NONE;
    
    	/* Ack interrupt(s) */
    	iowrite8(stat_ack, &nic->csr->scb.stat_ack);
    
    	/* We hit Receive No Resource (RNR); restart RU after cleaning */
    	if (stat_ack & stat_ack_rnr)
    		nic->ru_running = RU_SUSPENDED;
    
    	if (likely(napi_schedule_prep(&nic->napi))) {
    		e100_disable_irq(nic);
    		__napi_schedule(&nic->napi);//此处触发软中断
    	}
    
    	return IRQ_HANDLED;
    }
    
    在前面我们已经知道在net_dev_init()函数中注册了收包软中断处理函数net_rx_action(),当软中断被触发之后,该函数将被调用。
    net_rx_action()函数为:

    /*
     * net_rx_action - handler opened for NET_RX_SOFTIRQ.
     *
     * Walks this CPU's softnet_data poll_list and calls ->poll() on each
     * scheduled NAPI instance with that instance's weight. The loop ends
     * when the list is empty, or when the budget (netdev_budget) is used up
     * or jiffies has passed the start value plus 2; in the last two cases
     * time_squeeze is incremented and the softirq is raised again.
     */
    static void net_rx_action(struct softirq_action *h)
    {
    	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
    	unsigned long time_limit = jiffies + 2;
    	int budget = netdev_budget;
    	void *have;
    
    	local_irq_disable();
    
    	while (!list_empty(list)) {
    		struct napi_struct *n;
    		int work, weight;
    
    		/* If softirq window is exhausted then punt.
    		 * Allow this to run for 2 jiffies since which will allow
    		 * an average latency of 1.5/HZ.
    		 */
    		/* The list is still non-empty here, so softnet_break raises
    		 * the softirq again to continue later.
    		 */
    		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
    			goto softnet_break;
    
    		local_irq_enable();
    
    		/* Even though interrupts have been re-enabled, this
    		 * access is safe because interrupts can only add new
    		 * entries to the tail of this list, and only ->poll()
    		 * calls can remove this head entry from the list.
    		 */
    		n = list_entry(list->next, struct napi_struct, poll_list);
    
    		have = netpoll_poll_lock(n);
    
    		weight = n->weight;
    
    		/* This NAPI_STATE_SCHED test is for avoiding a race
    		 * with netpoll's poll_napi().  Only the entity which
    		 * obtains the lock and sees NAPI_STATE_SCHED set will
    		 * actually make the ->poll() call.  Therefore we avoid
    		 * accidentally calling ->poll() when NAPI is not scheduled.
    		 */
    		work = 0;
    		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
    			work = n->poll(n, weight);/* run the poll callback; returns the amount of work done */
    			trace_napi_poll(n);
    		}
    
    		WARN_ON_ONCE(work > weight);
    
    		budget -= work;
    
    		local_irq_disable();
    
    		/* Drivers must not modify the NAPI state if they
    		 * consume the entire weight.  In such cases this code
    		 * still "owns" the NAPI instance and therefore can
    		 * move the instance around on the list at-will.
    		 */
    		/* Whole weight consumed: complete the instance if a disable
    		 * is pending, otherwise move it to the tail of the list.
    		 */
    		if (unlikely(work == weight)) {
    			if (unlikely(napi_disable_pending(n))) {
    				local_irq_enable();
    				napi_complete(n);
    				local_irq_disable();
    			} else
    				list_move_tail(&n->poll_list, list);
    		}
    
    		netpoll_poll_unlock(have);
    	}
    out:
    	local_irq_enable();
    
    #ifdef CONFIG_NET_DMA
    	/*
    	 * There may not be any more sk_buffs coming right now, so push
    	 * any pending DMA copies to hardware
    	 */
    	dma_issue_pending_all();
    #endif
    
    	return;
    
    softnet_break:
    	/* Out of budget or time: count the squeeze and re-raise the softirq. */
    	__get_cpu_var(netdev_rx_stat).time_squeeze++;
    	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
    	goto out;
    }
    
    由上可见,下半部的主要工作是遍历有数据帧等待接收的设备链表,对于每一个设备,运行它对应的poll函数。
    对非NAPI设备来说,poll函数在net_dev_init()函数中初始化为process_backlog()。
    process_backlog()函数定义为:

    /*
     * process_backlog - poll callback installed on softnet_data.backlog.
     * @napi:  the backlog NAPI instance
     * @quota: upper bound on the number of skbs handled in this call
     *
     * Dequeues skbs from this CPU's input_pkt_queue and passes each one to
     * netif_receive_skb(). Stops when the queue is empty (the instance is
     * then completed), when @quota skbs were handled, or when jiffies has
     * changed since entry.
     *
     * Returns the number of skbs handled.
     */
    static int process_backlog(struct napi_struct *napi, int quota)
    {
    	int work = 0;
    	struct softnet_data *queue = &__get_cpu_var(softnet_data);
    	unsigned long start_time = jiffies;
    
    	napi->weight = weight_p;
    	do {
    		struct sk_buff *skb;
    
    		/* Dequeue with local interrupts disabled. */
    		local_irq_disable();
    		skb = __skb_dequeue(&queue->input_pkt_queue);
    		if (!skb) {
    			/* Queue drained: complete the instance before re-enabling interrupts. */
    			__napi_complete(napi);
    			local_irq_enable();
    			break;
    		}
    		local_irq_enable();
    
    		netif_receive_skb(skb);
    	} while (++work < quota && jiffies == start_time);
    
    	return work;
    }
    

    对NAPI设备来说,驱动程序必须提供一个poll方法,poll 方法有以下原型:
    /* NOTE(review): net_rx_action() calls n->poll(n, weight) with an int
     * weight, and process_backlog() takes (struct napi_struct *, int); the
     * int * second parameter shown here does not match - confirm. */
    int (*poll)(struct napi_struct *dev, int *budget); 
    在初始化时须要加入该方法:
    /* Attach xx_poll as the poll method of nic->napi for netdev, passing
     * XX_NAPI_WEIGHT (presumably the per-poll weight - confirm). */
    netif_napi_add(netdev, &nic->napi, xx_poll, XX_NAPI_WEIGHT);

    NAPI驱动的 poll 方法实现一般如下(借用《Linux设备驱动程序》中代码,与当前内核接口不完全一致,懒得去改写了):
    static int xx_poll(struct net_device *dev, int *budget)
    {
        int npackets = 0, quota = min(dev->quota, *budget);
        struct sk_buff *skb;
        struct xx_priv *priv = netdev_priv(dev);
        struct xx_packet *pkt;
    
        while (npackets < quota && priv->rx_queue) {
            pkt = xx_dequeue_buf(dev);
            skb = dev_alloc_skb(pkt->datalen + 2);
            if (! skb) {
    
                if (printk_ratelimit())
                    printk(KERN_NOTICE "xx: packet dropped
    "); priv->stats.rx_dropped++; xx_release_buffer(pkt); continue;
            }
            memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);
            skb->dev = dev;
            skb->protocol = eth_type_trans(skb, dev);
            skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
            netif_receive_skb(skb);
    
            /* Maintain stats */
            npackets++;
            priv->stats.rx_packets++;
            priv->stats.rx_bytes += pkt->datalen;
            xx_release_buffer(pkt);
    
        }
        /* If we processed all packets, we're done; tell the kernel and reenable ints */
        *budget -= npackets;
        dev->quota -= npackets;
        if (! priv->rx_queue) {
    
            netif_rx_complete(dev);
            xx_rx_ints(dev, 1);
            return 0;
    
        }
        /* We couldn't process everything. */
        return 1;
    
    }

    NAPI驱动提供自己的poll函数和私有队列。
    无论是非NAPI或NAPI,它们的poll函数最后都会调用netif_receive_skb(skb)来处理接收到的帧。该函数会向各个已注册的协议例程发送一个skb,之后数据进入Linux内核协议栈处理。



  • 相关阅读:
    315,谁来保护手游开发者的利益
    微信小程序之提高应用速度小技巧
    Python-爬虫-Beautifulsoup解析
    Python-爬虫-requests
    Python-form表单标签
    设计模式のTemplatePattern(模板模式)----行为模式
    链接
    python入门007
    007作业
    005作业
  • 原文地址:https://www.cnblogs.com/zfyouxi/p/4342564.html
Copyright © 2020-2023  润新知