• dpdk kni exception path


    The DPDK examples provide two ways to exchange packets with the Linux kernel protocol stack: TAP and KNI. Both approaches create a virtual network device that is used to pass packets back and forth.

    Creating a TAP/TUN device

    static int tap_create(char *name)
    {
        struct ifreq ifr;
        int fd, ret;
    
        fd = open("/dev/net/tun", O_RDWR);
        if (fd < 0)
            return fd;
    
        memset(&ifr, 0, sizeof(ifr));
    
        /* TAP device without packet information */
        ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
    
        if (name && *name)
            rte_snprintf(ifr.ifr_name, IFNAMSIZ, name);
    
        ret = ioctl(fd, TUNSETIFF, (void *) &ifr);
        if (ret < 0) {
            close(fd);
            return ret;
        }
    
        if (name)
            rte_snprintf(name, IFNAMSIZ, ifr.ifr_name);
    
        return fd;
    }
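    A TAP device created this way starts out administratively down and unconfigured; before the example can push packets through it, it has to be brought up (and usually given an address or attached to a bridge), either from the shell or programmatically. A minimal sketch using the standard SIOCGIFFLAGS/SIOCSIFFLAGS ioctls; tap_bring_up and the device name are illustrative helpers, not part of the example code:

    /* Needs <sys/socket.h>, <sys/ioctl.h>, <net/if.h>, <string.h>, <stdio.h>, <unistd.h> */
    static int tap_bring_up(const char *name)
    {
        struct ifreq ifr;
        int sock, ret;

        /* Any AF_INET datagram socket can carry interface ioctls */
        sock = socket(AF_INET, SOCK_DGRAM, 0);
        if (sock < 0)
            return sock;

        memset(&ifr, 0, sizeof(ifr));
        snprintf(ifr.ifr_name, IFNAMSIZ, "%s", name);

        /* Read the current flags, then set IFF_UP */
        ret = ioctl(sock, SIOCGIFFLAGS, &ifr);
        if (ret == 0) {
            ifr.ifr_flags |= IFF_UP;
            ret = ioctl(sock, SIOCSIFFLAGS, &ifr);
        }

        close(sock);
        return ret;
    }

    /* Usage (name is just an illustration):
     *   char name[IFNAMSIZ] = "dpdk_tap0";
     *   int fd = tap_create(name);
     *   tap_bring_up(name);
     */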

    Sending packets to the TAP device

    for (;;) {
        struct rte_mbuf *pkts_burst[PKT_BURST_SZ];
        unsigned i;
        /* Receive a burst of packets from the PMD */
        const unsigned nb_rx = rte_eth_rx_burst(port_ids[lcore_id], 0, pkts_burst, PKT_BURST_SZ);
        lcore_stats[lcore_id].rx += nb_rx;
    
        for (i = 0; likely(i < nb_rx); i++) {
            struct rte_mbuf *m = pkts_burst[i];
            /* Write the packet contents to the TAP device */
            /* Ignore return val from write() */
            int ret = write(tap_fd, rte_pktmbuf_mtod(m, void*), rte_pktmbuf_data_len(m));
    
            /* Free the mbuf */
            rte_pktmbuf_free(m);
            if (unlikely(ret < 0))
                lcore_stats[lcore_id].dropped++;
            else
                lcore_stats[lcore_id].tx++;
        }
    }

    Once a packet has been injected into the kernel through the TAP/TUN device, it still has to be forwarded at L2/L3 (bridging or routing, e.g. attaching two TAP devices to the same bridge, or enabling IP forwarding between their subnets) before it reaches another TAP/TUN device where an application can read it back.

    for (;;) {
        int ret;
        struct rte_mbuf *m = rte_pktmbuf_alloc(pktmbuf_pool);
        if (m == NULL)
            continue;
    
        /* Read a packet from the TAP device */
        ret = read(tap_fd, m->pkt.data, MAX_PACKET_SZ);
        lcore_stats[lcore_id].rx++;
        if (unlikely(ret < 0)) {
            FATAL_ERROR("Reading from %s interface failed", tap_name);
        }
    
        /* Fill in the mbuf metadata */
        m->pkt.nb_segs = 1;
        m->pkt.next = NULL;
        m->pkt.pkt_len = (uint16_t)ret;
        m->pkt.data_len = (uint16_t)ret;
    
        /* Transmit the packet via the PMD */
        ret = rte_eth_tx_burst(port_ids[lcore_id], 0, &m, 1);
        if (unlikely(ret < 1)) {
            rte_pktmbuf_free(m);
            lcore_stats[lcore_id].dropped++;
        }
        else {
            lcore_stats[lcore_id].tx++;
        }
    }

    This approach is fairly simple, but it is probably not very efficient: the packet data must be copied from user space into the kernel on every write(), and building the skb may involve yet another copy. The TUN/TAP kernel code can be studied in more detail later if this path ever matters.

    The KNI implementation. The example application implements the following:

    [Figure: KNI example architecture - RX/TX data paths through the rx_q, alloc_q, tx_q and free_q FIFOs shared with the kernel module]

    RX direction: the PMD allocates mbufs; when the user-space RX thread receives packets it puts the mbufs into the rx_q FIFO; the KNI kernel thread takes the mbufs out of rx_q, converts each one into an skb and pushes it into the protocol stack (netif_receive_skb() in the code below); the mbufs taken from rx_q are then placed into free_q and freed by the user-space RX thread.

    TX direction: for packets the protocol stack sends to the KNI device, the KNI transmit function kni_net_tx() takes a spare mbuf from alloc_q, copies the skb into it and enqueues the mbuf on tx_q; the user-space TX thread dequeues the mbufs from tx_q and transmits them through the PMD.
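    Putting the two directions together, the user-space context object keeps one pointer per shared FIFO. A condensed view of struct rte_kni, keeping only the fields that actually appear in the code walked through below (the real definition in librte_kni has a few more members):

    struct rte_kni {
        char name[RTE_KNI_NAMESIZE];      /* KNI interface name (vEthX or vEthX_Y) */
        uint16_t group_id;                /* Group id, set to the port id in the example */
        struct rte_mempool *pktmbuf_pool; /* mbuf pool used to refill alloc_q */
        unsigned mbuf_size;               /* Data size of each mbuf */

        struct rte_kni_fifo *rx_q;        /* user -> kernel: packets towards the stack */
        struct rte_kni_fifo *tx_q;        /* kernel -> user: packets sent by the stack */
        struct rte_kni_fifo *alloc_q;     /* user -> kernel: empty mbufs for kni_net_tx */
        struct rte_kni_fifo *free_q;      /* kernel -> user: consumed mbufs to be freed */

        struct rte_kni_fifo *req_q;       /* kernel -> user: control requests (MTU, link) */
        struct rte_kni_fifo *resp_q;      /* user -> kernel: responses to those requests */
        void *sync_addr;                  /* Shared request/response buffer */

        struct rte_kni_ops ops;           /* port_id + change_mtu/config_network_if callbacks */
        uint8_t in_use;                   /* Slot already taken */
    };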

    /* Initialise each port */
    for (port = 0; port < nb_sys_ports; port++) {
        /* Skip ports that are not enabled */
        if (!(ports_mask & (1 << port)))
            continue;
    
        /* Initialize the port's RX/TX queues and start it */
        init_port(port);
    
        if (port >= RTE_MAX_ETHPORTS)
            rte_exit(EXIT_FAILURE, "Can not use more than "
                "%d ports for kni\n", RTE_MAX_ETHPORTS);
        /* Allocate KNI device(s) for this physical port */
        kni_alloc(port);
    }
    static int
    kni_alloc(uint8_t port_id)
    {
        uint8_t i;
        struct rte_kni *kni;
        struct rte_kni_conf conf;
        struct kni_port_params **params = kni_port_params_array;
    
        if (port_id >= RTE_MAX_ETHPORTS || !params[port_id])
            return -1;
    
        /* The number of KNI devices for this port depends on how many kernel
           threads are configured for it. */
        /* Multi-thread mode:
           if no kernel lcores are specified, create one KNI device for the port
           and do not bind its kernel thread to any lcore;
           if kernel lcores are specified, create one KNI device per kernel lcore
           for the port and bind each kernel thread to its lcore.

           Single-thread mode:
           if no kernel lcores are specified, create one KNI device for the port;
           if kernel lcores are specified, create that many KNI devices for the port.
        */
        params[port_id]->nb_kni = params[port_id]->nb_lcore_k ?
                    params[port_id]->nb_lcore_k : 1;
    
        for (i = 0; i < params[port_id]->nb_kni; i++) {
            /* Clear conf at first */
            memset(&conf, 0, sizeof(conf));
            if (params[port_id]->nb_lcore_k) {
                rte_snprintf(conf.name, RTE_KNI_NAMESIZE,
                        "vEth%u_%u", port_id, i);
                /* In multi-thread mode, force-bind the kernel thread to an lcore */
                conf.core_id = params[port_id]->lcore_k[i];
                conf.force_bind = 1;
            } else
                rte_snprintf(conf.name, RTE_KNI_NAMESIZE,
                            "vEth%u", port_id);
            conf.group_id = (uint16_t)port_id;
            conf.mbuf_size = MAX_PACKET_SZ;
    
            /* Create the KNI device */
            /*
             * The first KNI device associated to a port
             * is the master, for multiple kernel thread
             * environment.
             */
            if (i == 0) {
                struct rte_kni_ops ops;
                struct rte_eth_dev_info dev_info;
    
                memset(&dev_info, 0, sizeof(dev_info));
                rte_eth_dev_info_get(port_id, &dev_info);
                conf.addr = dev_info.pci_dev->addr;
                conf.id = dev_info.pci_dev->id;
    
                memset(&ops, 0, sizeof(ops));
                ops.port_id = port_id;
                ops.change_mtu = kni_change_mtu;
                ops.config_network_if = kni_config_network_interface;
    
                kni = rte_kni_alloc(pktmbuf_pool, &conf, &ops);
            } else
                kni = rte_kni_alloc(pktmbuf_pool, &conf, NULL);
    
            if (!kni)
            rte_exit(EXIT_FAILURE, "Fail to create kni for "
                        "port: %d\n", port_id);
            params[port_id]->kni[i] = kni;
        }
    
        return 0;
    }

    struct rte_kni *
    rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
              const struct rte_kni_conf *conf,
              struct rte_kni_ops *ops)
    {
        int ret;
        struct rte_kni_device_info dev_info;
        struct rte_kni *ctx;
        char intf_name[RTE_KNI_NAMESIZE];
    #define OBJNAMSIZ 32
        char obj_name[OBJNAMSIZ];
        char mz_name[RTE_MEMZONE_NAMESIZE];
        const struct rte_memzone *mz;
    
        if (!pktmbuf_pool || !conf || !conf->name[0])
            return NULL;
    
        /* Ask the kernel to create the KNI device via /dev/kni */
        /* Check FD and open once */
        if (kni_fd < 0) {
            kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
            if (kni_fd < 0) {
                RTE_LOG(ERR, KNI, "Can not open /dev/%s\n",
                                KNI_DEVICE);
                return NULL;
            }
        }
    
        /* Interface name: vEthX_Y (port_thread) or vEthX (port only) */
        rte_snprintf(intf_name, RTE_KNI_NAMESIZE, conf->name);
        rte_snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%s", intf_name);
    
        /* Memzone holding the struct rte_kni context itself */
        mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni), 
                    SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx = mz->addr;
    
        if (ctx->in_use) {
            RTE_LOG(ERR, KNI, "KNI %s is in use\n", ctx->name);
            goto fail;
        }
        memset(ctx, 0, sizeof(struct rte_kni));
        if (ops)
            memcpy(&ctx->ops, ops, sizeof(struct rte_kni_ops));
    
        memset(&dev_info, 0, sizeof(dev_info));
        dev_info.bus = conf->addr.bus;
        dev_info.devid = conf->addr.devid;
        dev_info.function = conf->addr.function;
        dev_info.vendor_id = conf->id.vendor_id;
        dev_info.device_id = conf->id.device_id;
        dev_info.core_id = conf->core_id;
        dev_info.force_bind = conf->force_bind;
        dev_info.group_id = conf->group_id;
        dev_info.mbuf_size = conf->mbuf_size;
    
        rte_snprintf(ctx->name, RTE_KNI_NAMESIZE, intf_name);
        rte_snprintf(dev_info.name, RTE_KNI_NAMESIZE, intf_name);
    
        RTE_LOG(INFO, KNI, "pci: %02x:%02x:%02x \t %02x:%02x\n",
            dev_info.bus, dev_info.devid, dev_info.function,
                dev_info.vendor_id, dev_info.device_id);
    
        /* Set up the shared memzones: TX, RX, ALLOC, FREE, REQ and RESP FIFOs plus the REQ/RESP sync area */
    
        /* TX RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_tx_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->tx_q = mz->addr;
        kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
        dev_info.tx_phys = mz->phys_addr;
    
        /* RX RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_rx_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->rx_q = mz->addr;
        kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
        dev_info.rx_phys = mz->phys_addr;
    
        /* ALLOC RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->alloc_q = mz->addr;
        kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
        dev_info.alloc_phys = mz->phys_addr;
    
        /* FREE RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_free_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->free_q = mz->addr;
        kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
        dev_info.free_phys = mz->phys_addr;
    
        /* Request RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_req_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->req_q = mz->addr;
        kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
        dev_info.req_phys = mz->phys_addr;
    
        /* Response RING */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_resp_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->resp_q = mz->addr;
        kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
        dev_info.resp_phys = mz->phys_addr;
    
        /* Req/Resp sync mem area */
        rte_snprintf(obj_name, OBJNAMSIZ, "kni_sync_%s", intf_name);
        mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
        KNI_MZ_CHECK(mz == NULL);
        ctx->sync_addr = mz->addr;
        dev_info.sync_va = mz->addr;
        dev_info.sync_phys = mz->phys_addr;
    
        /* MBUF mempool */
        rte_snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_OBJ_NAME,
            pktmbuf_pool->name);
        mz = rte_memzone_lookup(mz_name);
        KNI_MZ_CHECK(mz == NULL);
    
        /* Record the mbuf pool's virtual and physical addresses so the kernel thread can compute the va->kva offset */
        dev_info.mbuf_va = mz->addr;
        dev_info.mbuf_phys = mz->phys_addr;
        ctx->pktmbuf_pool = pktmbuf_pool;
        ctx->group_id = conf->group_id;
        ctx->mbuf_size = conf->mbuf_size;
    
        /* Issue the ioctl that creates the KNI device */
        ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
        KNI_MZ_CHECK(ret < 0);
    
        ctx->in_use = 1;
    
        return ctx;
    
    fail:
    
        return NULL;
    }
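    kni_memzone_reserve(), used repeatedly above, is essentially a lookup-or-reserve wrapper so that an existing memzone with the same name is reused instead of being reserved again; roughly (paraphrased from the same librte_kni source):

    static const struct rte_memzone *
    kni_memzone_reserve(const char *name, size_t len, int socket_id,
                        unsigned flags)
    {
        /* Reuse the memzone if it already exists, otherwise reserve it */
        const struct rte_memzone *mz = rte_memzone_lookup(name);

        if (mz == NULL)
            mz = rte_memzone_reserve(name, len, socket_id, flags);

        return mz;
    }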

    When the kernel KNI module receives the ioctl, kni_ioctl_create() is called to create the KNI device:

    static int
    kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param)
    {
        int ret;
        struct rte_kni_device_info dev_info;
        struct pci_dev *pci = NULL;
        struct pci_dev *found_pci = NULL;
        struct net_device *net_dev = NULL;
        struct net_device *lad_dev = NULL;
        struct kni_dev *kni, *dev, *n;
    
        printk(KERN_INFO "KNI: Creating kni...\n");
        /* Check the buffer size, to avoid warning */
        if (_IOC_SIZE(ioctl_num) > sizeof(dev_info))
            return -EINVAL;
    
        /* Copy kni info from user space */
        ret = copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info));
        if (ret) {
            KNI_ERR("copy_from_user in kni_ioctl_create");
            return -EIO;
        }
    
        /**
         * Check if the cpu core id is valid for binding,
         * for multiple kernel thread mode.
         */
        if (multiple_kthread_on && dev_info.force_bind &&
                    !cpu_online(dev_info.core_id)) {
            KNI_ERR("cpu %u is not online
    ", dev_info.core_id);
            return -EINVAL;
        }
    
        /* Walk the list of existing KNI devices and check by name whether this one already exists */
        /* Check if it has been created */
        down_read(&kni_list_lock);
        list_for_each_entry_safe(dev, n, &kni_list_head, list) {
            if (kni_check_param(dev, &dev_info) < 0) {
                up_read(&kni_list_lock);
                return -EINVAL;
            }
        }
        up_read(&kni_list_lock);
    
        /* Create the virtual network device */
        net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
                                kni_net_init);
        if (net_dev == NULL) {
            KNI_ERR("error allocating device "%s"
    ", dev_info.name);
            return -EBUSY;
        }
    
        kni = netdev_priv(net_dev);
    
        /* The parameters are kept in the netdev's private area */
        kni->net_dev = net_dev;
        kni->group_id = dev_info.group_id;
        kni->core_id = dev_info.core_id;
        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
    
        /* Translate user space info into kernel space info */
        kni->tx_q = phys_to_virt(dev_info.tx_phys);
        kni->rx_q = phys_to_virt(dev_info.rx_phys);
        kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
        kni->free_q = phys_to_virt(dev_info.free_phys);
    
        kni->req_q = phys_to_virt(dev_info.req_phys);
        kni->resp_q = phys_to_virt(dev_info.resp_phys);
        kni->sync_va = dev_info.sync_va;
        kni->sync_kva = phys_to_virt(dev_info.sync_phys);
    
        kni->mbuf_kva = phys_to_virt(dev_info.mbuf_phys);
        kni->mbuf_va = dev_info.mbuf_va;
    
    #ifdef RTE_KNI_VHOST
        kni->vhost_queue = NULL;
        kni->vq_status = BE_STOP;
    #endif
        kni->mbuf_size = dev_info.mbuf_size;
    
        KNI_PRINT("tx_phys:      0x%016llx, tx_q addr:      0x%p
    ",
            (unsigned long long) dev_info.tx_phys, kni->tx_q);
        KNI_PRINT("rx_phys:      0x%016llx, rx_q addr:      0x%p
    ",
            (unsigned long long) dev_info.rx_phys, kni->rx_q);
        KNI_PRINT("alloc_phys:   0x%016llx, alloc_q addr:   0x%p
    ",
            (unsigned long long) dev_info.alloc_phys, kni->alloc_q);
        KNI_PRINT("free_phys:    0x%016llx, free_q addr:    0x%p
    ",
            (unsigned long long) dev_info.free_phys, kni->free_q);
        KNI_PRINT("req_phys:     0x%016llx, req_q addr:     0x%p
    ",
            (unsigned long long) dev_info.req_phys, kni->req_q);
        KNI_PRINT("resp_phys:    0x%016llx, resp_q addr:    0x%p
    ",
            (unsigned long long) dev_info.resp_phys, kni->resp_q);
        KNI_PRINT("mbuf_phys:    0x%016llx, mbuf_kva:       0x%p
    ",
            (unsigned long long) dev_info.mbuf_phys, kni->mbuf_kva);
        KNI_PRINT("mbuf_va:      0x%p
    ", dev_info.mbuf_va);
        KNI_PRINT("mbuf_size:    %u
    ", kni->mbuf_size);
    
        KNI_DBG("PCI: %02x:%02x.%02x %04x:%04x
    ",
                        dev_info.bus,
                        dev_info.devid,
                        dev_info.function,
                        dev_info.vendor_id,
                        dev_info.device_id);
    
        pci = pci_get_device(dev_info.vendor_id, dev_info.device_id, NULL);
    
        /* Support Ethtool */
        while (pci) {
            KNI_PRINT("pci_bus: %02x:%02x:%02x 
    ",
                        pci->bus->number,
                        PCI_SLOT(pci->devfn),
                        PCI_FUNC(pci->devfn));
    
            if ((pci->bus->number == dev_info.bus) &&
                (PCI_SLOT(pci->devfn) == dev_info.devid) &&
                (PCI_FUNC(pci->devfn) == dev_info.function)) {
                found_pci = pci;
                switch (dev_info.device_id) {
                #define RTE_PCI_DEV_ID_DECL_IGB(vend, dev) case (dev):
                #include <rte_pci_dev_ids.h>
                    ret = igb_kni_probe(found_pci, &lad_dev);
                    break;
                #define RTE_PCI_DEV_ID_DECL_IXGBE(vend, dev) \
                                case (dev):
                #include <rte_pci_dev_ids.h>
                    ret = ixgbe_kni_probe(found_pci, &lad_dev);
                    break;
                default:
                    ret = -1;
                    break;
                }
    
                KNI_DBG("PCI found: pci=0x%p, lad_dev=0x%p
    ",
                                pci, lad_dev);
                if (ret == 0) {
                    kni->lad_dev = lad_dev;
                    kni_set_ethtool_ops(kni->net_dev);
                } else {
                    KNI_ERR("Device not supported by ethtool");
                    kni->lad_dev = NULL;
                }
    
                kni->pci_dev = found_pci;
                kni->device_id = dev_info.device_id;
                break;
            }
            pci = pci_get_device(dev_info.vendor_id,
                    dev_info.device_id, pci);
        }
        if (pci)
            pci_dev_put(pci);
    
        /* Register the virtual network device */
        ret = register_netdev(net_dev);
        if (ret) {
            KNI_ERR("error %i registering device "%s"
    ",
                        ret, dev_info.name);
            kni_dev_remove(kni);
            return -ENODEV;
        }
    
    #ifdef RTE_KNI_VHOST
        kni_vhost_init(kni);
    #endif
    
        /**
         * Create a new kernel thread for multiple mode, set its core affinity,
         * and finally wake it up.
         */
        if (multiple_kthread_on) {
            /* In multi-thread mode, create one kernel thread per KNI device */
            kni->pthread = kthread_create(kni_thread_multiple,
                              (void *)kni,
                              "kni_%s", kni->name);
            if (IS_ERR(kni->pthread)) {
                kni_dev_remove(kni);
                return -ECANCELED;
            }
    
            /* Bind the kernel thread to the requested lcore */
            if (dev_info.force_bind)
                kthread_bind(kni->pthread, kni->core_id);
    
            /* Wake up the RX kernel thread */
            wake_up_process(kni->pthread);
        }
    
        down_write(&kni_list_lock);
        list_add(&kni->list, &kni_list_head);
        up_write(&kni_list_lock);
    
        return 0;
    }
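    One detail worth noting: alloc_netdev() above was given kni_net_init as its setup callback, and that is where the KNI net_device ops, including kni_net_tx() shown further below, get wired into the device. A trimmed sketch of that wiring (paraphrased from kni_net.c of the same module; the real file carries additional compatibility #ifdefs for older kernels, so treat the exact member list as approximate):

    static const struct net_device_ops kni_net_netdev_ops = {
        .ndo_open = kni_net_open,
        .ndo_stop = kni_net_release,
        .ndo_start_xmit = kni_net_tx,         /* stack TX path enters here */
        .ndo_change_mtu = kni_net_change_mtu,
        .ndo_get_stats = kni_net_stats,
        .ndo_tx_timeout = kni_net_tx_timeout,
    };

    void
    kni_net_init(struct net_device *dev)
    {
        struct kni_dev *kni = netdev_priv(dev);

        init_waitqueue_head(&kni->wq);
        mutex_init(&kni->sync_lock);

        ether_setup(dev);                     /* Ethernet defaults (MTU, type, ...) */
        dev->netdev_ops = &kni_net_netdev_ops;
        dev->watchdog_timeo = WD_TIMEOUT;
    }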

    Back to the RX/TX loops on the application lcores:

    if (flag == LCORE_RX) {
        RTE_LOG(INFO, APP, "Lcore %u is reading from port %d\n",
                    kni_port_params_array[i]->lcore_rx,
                    kni_port_params_array[i]->port_id);
        while (1) {
            f_stop = rte_atomic32_read(&kni_stop);
            if (f_stop)
                break;
            kni_ingress(kni_port_params_array[i]);
        }
    } else if (flag == LCORE_TX) {
        RTE_LOG(INFO, APP, "Lcore %u is writing to port %d\n",
                    kni_port_params_array[i]->lcore_tx,
                    kni_port_params_array[i]->port_id);
        while (1) {
            f_stop = rte_atomic32_read(&kni_stop);
            if (f_stop)
                break;
            kni_egress(kni_port_params_array[i]);
        }
    }

    The RX lcore receives packets from the PMD and hands them to KNI, then checks whether there are any request messages from the kernel to handle:

    static void
    kni_ingress(struct kni_port_params *p)
    {
        uint8_t i, port_id;
        unsigned nb_rx, num;
        uint32_t nb_kni;
        struct rte_mbuf *pkts_burst[PKT_BURST_SZ];
    
        if (p == NULL)
            return;
    
        nb_kni = p->nb_kni;
        port_id = p->port_id;
        for (i = 0; i < nb_kni; i++) {
            /* Receive a burst of packets from the PMD driver */
            /* Burst rx from eth */
            nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ);
            if (unlikely(nb_rx > PKT_BURST_SZ)) {
                RTE_LOG(ERR, APP, "Error receiving from eth\n");
                return;
            }
     
            /* Enqueue all received mbufs onto rx_q */
            /* Burst tx to kni */
            num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx);
            kni_stats[port_id].rx_packets += num;
    
            /* Handle port state change requests from the kernel */
            rte_kni_handle_request(p->kni[i]);
            if (unlikely(num < nb_rx)) {
                /* Free mbufs not tx to kni interface */
                kni_burst_free_mbufs(&pkts_burst[num], nb_rx - num);
                kni_stats[port_id].rx_dropped += nb_rx - num;
            }
        }
    }
    unsigned
    rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
    {
        /* Enqueue the mbufs onto rx_q (named from the kernel's receive side) */
        unsigned ret = kni_fifo_put(kni->rx_q, (void **)mbufs, num);
    
        /* Get mbufs from free_q and then free them */
        kni_free_mbufs(kni);
    
        return ret;
    }
    
    static void
    kni_free_mbufs(struct rte_kni *kni)
    {
        int i, ret;
        struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM];
    
        /* Dequeue mbufs from free_q and free them */
        ret = kni_fifo_get(kni->free_q, (void **)pkts, MAX_MBUF_BURST_NUM);
        if (likely(ret > 0)) {
            for (i = 0; i < ret; i++)
                rte_pktmbuf_free(pkts[i]);
        }
    }
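    rte_kni_handle_request(), called from kni_ingress() above, services the req_q/resp_q pair that rte_kni_alloc() set up: the kernel posts MTU-change and interface up/down requests there, and user space answers them through the registered ops (kni_change_mtu / kni_config_network_interface in this example). Roughly, paraphrased from the same librte_kni release:

    int
    rte_kni_handle_request(struct rte_kni *kni)
    {
        unsigned ret;
        struct rte_kni_request *req = NULL;

        if (kni == NULL)
            return -1;

        /* Pick up a pending request, if any (req points into the sync area) */
        ret = kni_fifo_get(kni->req_q, (void **)&req, 1);
        if (ret != 1)
            return 0; /* No request pending */

        /* Dispatch to the callbacks registered in rte_kni_alloc() */
        switch (req->req_id) {
        case RTE_KNI_REQ_CHANGE_MTU:
            if (kni->ops.change_mtu)
                req->result = kni->ops.change_mtu(kni->ops.port_id,
                                req->new_mtu);
            break;
        case RTE_KNI_REQ_CFG_NETWORK_IF:
            if (kni->ops.config_network_if)
                req->result = kni->ops.config_network_if(
                                kni->ops.port_id, req->if_up);
            break;
        default:
            RTE_LOG(ERR, KNI, "Unknown request id %u\n", req->req_id);
            req->result = -EINVAL;
            break;
        }

        /* Hand the result back to the kernel thread through resp_q */
        ret = kni_fifo_put(kni->resp_q, (void **)&req, 1);
        if (ret != 1) {
            RTE_LOG(ERR, KNI, "Fail to put the request back to resp_q\n");
            return -1;
        }

        return 0;
    }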

    Because each FIFO has exactly one producer and one consumer (one side in user space, one in the kernel), the FIFO implementation gets away without any locking or synchronization. Note that the index arithmetic assumes the FIFO length is a power of two; see the init sketch after the listing.

    /**
     * Adds num elements into the fifo. Return the number actually written
     */
    static inline unsigned
    kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
    {
        unsigned i = 0;
        /* Take local snapshots of the write and read indexes */
        unsigned fifo_write = fifo->write;
        unsigned fifo_read = fifo->read;
        unsigned new_write = fifo_write;
    
        for (i = 0; i < num; i++) {
            new_write = (new_write + 1) & (fifo->len - 1);
    
            /* FIFO is full */
            if (new_write == fifo_read)
                break;
    
            /* Store the element */
            fifo->buffer[fifo_write] = data[i];
            fifo_write = new_write;
        }
    
        /* Publish the new write index */
        fifo->write = fifo_write;
        return i;
    }
    
    /**
     * Get up to num elements from the fifo. Return the number actually read
     */
    static inline unsigned
    kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
    {
        unsigned i = 0;
    
        /* Take local snapshots of the write and read indexes */
        unsigned new_read = fifo->read;
        unsigned fifo_write = fifo->write;
    
    
        for (i = 0; i < num; i++) {
    
            /* Nothing to read */
            if (new_read == fifo_write)
                break;
    
            /* Read the element */
            data[i] = fifo->buffer[new_read];
            new_read = (new_read + 1) & (fifo->len - 1);
        }
    
        /* Publish the new read index */
        fifo->read = new_read;
        return i;
    }
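    The index arithmetic above only works because fifo->len is a power of two, so that (x + 1) & (fifo->len - 1) wraps around correctly. For reference, a sketch of what kni_fifo_init() (called from rte_kni_alloc() above) sets up; the explicit power-of-two check is added here for illustration and is not necessarily in the original source:

    static void
    kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
    {
        /* Illustrative check: the wrap-around mask assumes a power-of-two size */
        if (size & (size - 1))
            rte_panic("KNI fifo size must be a power of 2\n");

        fifo->write = 0;
        fifo->read = 0;
        fifo->len = size;
        fifo->elem_size = sizeof(void *);
    }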

    The TX lcore:

    /**
     * Interface to dequeue mbufs from tx_q and burst tx
     */
    static void
    kni_egress(struct kni_port_params *p)
    {
        uint8_t i, port_id;
        unsigned nb_tx, num;
        uint32_t nb_kni;
        struct rte_mbuf *pkts_burst[PKT_BURST_SZ];
    
        if (p == NULL)
            return;
    
        nb_kni = p->nb_kni;
        port_id = p->port_id;
        for (i = 0; i < nb_kni; i++) {
            /* Dequeue packets the kernel queued on this KNI device */
            /* Burst rx from kni */
            num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ);
            if (unlikely(num > PKT_BURST_SZ)) {
                RTE_LOG(ERR, APP, "Error receiving from KNI\n");
                return;
            }
    
            /* Transmit the packets through the PMD */
            /* Burst tx to eth */
            nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num);
            kni_stats[port_id].tx_packets += nb_tx;
            if (unlikely(nb_tx < num)) {
                /* Free mbufs not tx to NIC */
                kni_burst_free_mbufs(&pkts_burst[nb_tx], num - nb_tx);
                kni_stats[port_id].tx_dropped += num - nb_tx;
            }
        }
    }
    unsigned
    rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
    {
        /* Dequeue mbufs from tx_q */
        unsigned ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num);
    
        /* Refill alloc_q with fresh mbufs */
        /* Allocate mbufs and then put them into alloc_q */
        kni_allocate_mbufs(kni);
    
        return ret;
    }
    
    static void
    kni_allocate_mbufs(struct rte_kni *kni)
    {
        int i, ret;
        struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM];
    
        /* Check if pktmbuf pool has been configured */
        if (kni->pktmbuf_pool == NULL) {
            RTE_LOG(ERR, KNI, "No valid mempool for allocating mbufs\n");
            return;
        }
    
        /* Allocate up to MAX_MBUF_BURST_NUM mbufs at a time */
        for (i = 0; i < MAX_MBUF_BURST_NUM; i++) {
            pkts[i] = rte_pktmbuf_alloc(kni->pktmbuf_pool);
            if (unlikely(pkts[i] == NULL)) {
                /* Out of memory */
                RTE_LOG(ERR, KNI, "Out of memory\n");
                break;
            }
        }
    
        /* No pkt mbuf allocated */
        if (i <= 0)
            return;
    
        /* Enqueue them onto alloc_q */
        ret = kni_fifo_put(kni->alloc_q, (void **)pkts, i);
    
        /* alloc_q was full: free the mbufs that were not enqueued */
        /* Check if any mbufs not put into alloc_q, and then free them */
        if (ret >= 0 && ret < i && ret < MAX_MBUF_BURST_NUM) {
            int j;
    
            for (j = ret; j < i; j++)
                rte_pktmbuf_free(pkts[j]);
        }
    }

    Now let's look at the kernel-side RX and TX functions of the KNI device.

    For the KNI receive path: in single-thread mode, the kni_thread_single thread is started when the /dev/kni device is opened; in multi-thread mode, a kni_thread_multiple thread is started for each KNI device when it is created (a sketch of the multi-thread variant follows the listing below).

    static int
    kni_thread_single(void *unused)
    {
        int j;
        struct kni_dev *dev, *n;
    
        while (!kthread_should_stop()) {
            down_read(&kni_list_lock);
            for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
                /* In single-thread mode, iterate over all KNI devices */
                list_for_each_entry_safe(dev, n,
                        &kni_list_head, list) {
    #ifdef RTE_KNI_VHOST
                    kni_chk_vhost_rx(dev);
    #else
                    /* Dequeue packets from rx_q */
                    kni_net_rx(dev);
    #endif
                    /* Check for responses from user space to pending requests */
                    kni_net_poll_resp(dev);
                }
            }
            up_read(&kni_list_lock);
            /* reschedule out for a while */
            schedule_timeout_interruptible(usecs_to_jiffies( 
                    KNI_KTHREAD_RESCHEDULE_INTERVAL));
        }
    
        return 0;
    }
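    For comparison, the multi-thread variant started from kni_ioctl_create() polls only the single device it was created for, so each KNI device gets a dedicated kernel thread; roughly (paraphrased from the same kni_misc.c):

    static int
    kni_thread_multiple(void *param)
    {
        int j;
        struct kni_dev *dev = (struct kni_dev *)param;

        while (!kthread_should_stop()) {
            /* Poll only the device this thread was created for */
            for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
    #ifdef RTE_KNI_VHOST
                kni_chk_vhost_rx(dev);
    #else
                kni_net_rx(dev);
    #endif
                kni_net_poll_resp(dev);
            }
            /* Yield the CPU before the next polling round */
            schedule_timeout_interruptible(usecs_to_jiffies(
                    KNI_KTHREAD_RESCHEDULE_INTERVAL));
        }

        return 0;
    }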

    The receive function: when no loopback (lo) mode is configured, it is kni_net_rx_normal.

    static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
    
    /* rx interface */
    void
    kni_net_rx(struct kni_dev *kni)
    {
        /**
         * It doesn't need to check if it is NULL pointer,
         * as it has a default value
         */
        (*kni_net_rx_func)(kni);
    }

    /*
     * RX: normal working mode
     */
    static void
    kni_net_rx_normal(struct kni_dev *kni)
    {
        unsigned ret;
        uint32_t len;
        unsigned i, num, num_rq, num_fq;
        struct rte_kni_mbuf *kva;
        struct rte_kni_mbuf *va[MBUF_BURST_SZ];
        void * data_kva;
    
        struct sk_buff *skb;
        struct net_device *dev = kni->net_dev;
    
        /* The number to dequeue is bounded by the entries in rx_q, the free slots in free_q, and MBUF_BURST_SZ */
    
        /* Get the number of entries in rx_q */
        num_rq = kni_fifo_count(kni->rx_q);
    
        /* Get the number of free entries in free_q */
        num_fq = kni_fifo_free_count(kni->free_q);
    
        /* Calculate the number of entries to dequeue in rx_q */
        num = min(num_rq, num_fq);
        num = min(num, (unsigned)MBUF_BURST_SZ);
    
        /* Return if no entry in rx_q and no free entry in free_q */
        if (num == 0)
            return;
    
        /* Burst dequeue from rx_q */
        ret = kni_fifo_get(kni->rx_q, (void **)va, num);
        if (ret == 0)
            return; /* Failing should not happen */
    
        /* Convert each mbuf into an skb */
        /* Transfer received packets to netif */
        for (i = 0; i < num; i++) {
            /* mbuf kva */
            kva = (void *)va[i] - kni->mbuf_va + kni->mbuf_kva;
            len = kva->data_len;
            /* data kva */
            data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva;
    
            skb = dev_alloc_skb(len + 2);
            if (!skb) {
                KNI_ERR("Out of mem, dropping pkts
    ");
                /* Update statistics */
                kni->stats.rx_dropped++;
            }
            else {
                /* Align IP on 16B boundary */
                skb_reserve(skb, 2);
                memcpy(skb_put(skb, len), data_kva, len);
                skb->dev = dev;
                skb->protocol = eth_type_trans(skb, dev);
                skb->ip_summed = CHECKSUM_UNNECESSARY;
    
                /* Hand the skb to the protocol stack */
                /* Call netif interface */
                netif_receive_skb(skb);
    
                /* Update statistics */
                kni->stats.rx_bytes += len;
                kni->stats.rx_packets++;
            }
        }
    
        /* Tell user space it can free these mbufs */
        /* Burst enqueue mbufs into free_q */
        ret = kni_fifo_put(kni->free_q, (void **)va, num);
        if (ret != num)
            /* Failing should not happen */
            KNI_ERR("Fail to enqueue entries into free_q
    ");
    }

    The KNI transmit function (called by the kernel stack when it sends a packet out of the KNI device):

    static int
    kni_net_tx(struct sk_buff *skb, struct net_device *dev)
    {
        int len = 0;
        unsigned ret;
        struct kni_dev *kni = netdev_priv(dev);
        struct rte_kni_mbuf *pkt_kva = NULL;
        struct rte_kni_mbuf *pkt_va = NULL;
    
        dev->trans_start = jiffies; /* save the timestamp */
    
        /* Check if the length of skb is less than mbuf size */
        if (skb->len > kni->mbuf_size)
            goto drop;
    
        /**
         * Check if it has at least one free entry in tx_q and
         * one entry in alloc_q.
         */
        if (kni_fifo_free_count(kni->tx_q) == 0 ||
                kni_fifo_count(kni->alloc_q) == 0) {
            /**
             * If no free entry in tx_q or no entry in alloc_q,
             * drops skb and goes out.
             */
            goto drop;
        }
    
        /* Convert the skb into an mbuf */
        /* dequeue a mbuf from alloc_q */
        ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
        if (likely(ret == 1)) {
            void *data_kva;
    
            pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
            data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva;
    
            len = skb->len;
            memcpy(data_kva, skb->data, len);
            if (unlikely(len < ETH_ZLEN)) {
                memset(data_kva + len, 0, ETH_ZLEN - len);
                len = ETH_ZLEN;
            }
            pkt_kva->pkt_len = len;
            pkt_kva->data_len = len;
    
            /* enqueue mbuf into tx_q */
            ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
            if (unlikely(ret != 1)) {
                /* Failing should not happen */
                KNI_ERR("Fail to enqueue mbuf into tx_q
    ");
                goto drop;
            }
        } else {
            /* Failing should not happen */
            KNI_ERR("Fail to dequeue mbuf from alloc_q
    ");
            goto drop;
        }
    
        /* Free skb and update statistics */
        dev_kfree_skb(skb);
        kni->stats.tx_bytes += len;
        kni->stats.tx_packets++;
    
        return NETDEV_TX_OK;
    
    drop:
        /* Free skb and update statistics */
        dev_kfree_skb(skb);
        kni->stats.tx_dropped++;
    
        return NETDEV_TX_OK;
    }