网络虚拟化有和存储虚拟化类似的地方,例如它们都是基于virtio 的,因而在看网络虚拟化的过程中,会看到和存储虚拟化很像的数据结构和原理。但是网络虚拟化也有自己的特殊性。例如,存储虚拟化是将宿主机上的文件作为客户机上的硬盘,而网络虚拟化需要依赖于内核协议栈进行网络包的封装与解封装。那怎么实现客户机和宿主机之间的互通呢?就来看一看解析初始化的过程。还是从Virtio Network Device这个设备的初始化讲起,如下所示:
static const TypeInfo device_type_info = { .name = TYPE_DEVICE, .parent = TYPE_OBJECT, .instance_size = sizeof(DeviceState), .instance_init = device_initfn, .instance_post_init = device_post_init, .instance_finalize = device_finalize, .class_base_init = device_class_base_init, .class_init = device_class_init, .abstract = true, .class_size = sizeof(DeviceClass), }; static const TypeInfo virtio_device_info = { .name = TYPE_VIRTIO_DEVICE, .parent = TYPE_DEVICE, .instance_size = sizeof(VirtIODevice), .class_init = virtio_device_class_init, .instance_finalize = virtio_device_instance_finalize, .abstract = true, .class_size = sizeof(VirtioDeviceClass), }; static const TypeInfo virtio_net_info = { .name = TYPE_VIRTIO_NET, .parent = TYPE_VIRTIO_DEVICE, .instance_size = sizeof(VirtIONet), .instance_init = virtio_net_instance_init, .class_init = virtio_net_class_init, }; static void virtio_register_types(void) { type_register_static(&virtio_net_info); } type_init(virtio_register_types)
Virtio Network Device这种类的定义是有多层继承关系的,TYPE_VIRTIO_NET的父类是TYPE_VIRTIO_DEVICE,TYPE_VIRTIO_DEVICE的父类是TYPE_DEVICE,TYPE_DEVICE的父类是TYPE_OBJECT,继承关系就到头了。type_init用于注册这种类,这里面每一层都有class_init,用于从TypeImpl生成xxxClass,也有instance_init,会将xxxClass初始化为实例。TYPE_VIRTIO_NET层的class_init函数是virtio_net_class_init,它定义了DeviceClass的realize函数为virtio_net_device_realize,这一点和存储块设备是一样的,如下所示:
static void virtio_net_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIONet *n = VIRTIO_NET(dev); NetClientState *nc; int i; ...... virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size); /* * We set a lower limit on RX queue size to what it always was. * Guests that want a smaller ring can always resize it without * help from us (using virtio 1 and up). */ if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE || n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE || !is_power_of_2(n->net_conf.rx_queue_size)) { ...... return; } if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE || n->net_conf.tx_queue_size > VIRTQUEUE_MAX_SIZE || !is_power_of_2(n->net_conf.tx_queue_size)) { ...... return; } n->max_queues = MAX(n->nic_conf.peers.queues, 1); if (n->max_queues * 2 + 1 > VIRTIO_QUEUE_MAX) { ...... return; } n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues); n->curr_queues = 1; ...... n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n), n->net_conf.tx_queue_size); for (i = 0; i < n->max_queues; i++) { virtio_net_add_queue(n, i); } n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl); qemu_macaddr_default_if_unset(&n->nic_conf.macaddr); memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac)); n->status = VIRTIO_NET_S_LINK_UP; if (n->netclient_type) { n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf, n->netclient_type, n->netclient_name, n); } else { n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf, object_get_typename(OBJECT(dev)), dev->id, n); } ...... }
static void virtio_net_add_queue(VirtIONet *n, int index)
{
VirtIODevice *vdev = VIRTIO_DEVICE(n);
n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
virtio_net_handle_rx);
if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
n->vqs[index].tx_vq =
virtio_add_queue(vdev, n->net_conf.tx_queue_size,
virtio_net_handle_tx_timer);
n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
virtio_net_tx_timer,
&n->vqs[index]);
} else {
n->vqs[index].tx_vq =
virtio_add_queue(vdev, n->net_conf.tx_queue_size,
virtio_net_handle_tx_bh);
n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
}
n->vqs[index].tx_waiting = 0;
n->vqs[index].n = n;
}
static void virtio_net_add_queue(VirtIONet *n, int index) { VirtIODevice *vdev = VIRTIO_DEVICE(n); n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size, virtio_net_handle_rx); if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) { n->vqs[index].tx_vq = virtio_add_queue(vdev, n->net_conf.tx_queue_size, virtio_net_handle_tx_timer); n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_tx_timer, &n->vqs[index]); } else { n->vqs[index].tx_vq = virtio_add_queue(vdev, n->net_conf.tx_queue_size, virtio_net_handle_tx_bh); n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]); } n->vqs[index].tx_waiting = 0; n->vqs[index].n = n; }
这里面创建了一个VirtIODevice,这一点和存储虚拟化也是一样的。virtio_init用来初始化这个设备。VirtIODevice结构里面有一个VirtQueue数组,这就是virtio前端和后端互相传数据的队列,最多有VIRTIO_QUEUE_MAX个。
刚才说的都是一样的地方,其实也有不一样的地方。会发现这里面有这样的语句n->max_queues * 2 + 1 > VIRTIO_QUEUE_MAX。为什么要乘以2呢?这是因为对于网络设备来讲,应该分发送队列和接收队列两个方向。接下来调用virtio_net_add_queue来初始化队列,可以看出这里面就有发送tx_vq和接收rx_vq两个队列,如下所示:
typedef struct VirtIONetQueue { VirtQueue *rx_vq; VirtQueue *tx_vq; QEMUTimer *tx_timer; QEMUBH *tx_bh; uint32_t tx_waiting; struct { VirtQueueElement *elem; } async_tx; struct VirtIONet *n; } VirtIONetQueue; static void virtio_net_add_queue(VirtIONet *n, int index) { VirtIODevice *vdev = VIRTIO_DEVICE(n); n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size, virtio_net_handle_rx); ...... n->vqs[index].tx_vq = virtio_add_queue(vdev, n->net_conf.tx_queue_size, virtio_net_handle_tx_bh); n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]); n->vqs[index].n = n;
每个VirtQueue中,都有一个vring用来维护这个队列里面的数据;另外还有函数virtio_net_handle_rx用于处理网络包的接收;函数virtio_net_handle_tx_bh用于网络包的发送,这个函数后面会用到。接下来,qemu_new_nic会创建一个虚拟机里面的网卡,如下所示
NICState *qemu_new_nic(NetClientInfo *info, NICConf *conf, const char *model, const char *name, void *opaque) { NetClientState **peers = conf->peers.ncs; NICState *nic; int i, queues = MAX(1, conf->peers.queues); ...... nic = g_malloc0(info->size + sizeof(NetClientState) * queues); nic->ncs = (void *)nic + info->size; nic->conf = conf; nic->opaque = opaque; for (i = 0; i < queues; i++) { qemu_net_client_setup(&nic->ncs[i], info, peers[i], model, name, NULL); nic->ncs[i].queue_index = i; } return nic; } static void qemu_net_client_setup(NetClientState *nc, NetClientInfo *info, NetClientState *peer, const char *model, const char *name, NetClientDestructor *destructor) { nc->info = info; nc->model = g_strdup(model); if (name) { nc->name = g_strdup(name); } else { nc->name = assign_name(nc, model); } QTAILQ_INSERT_TAIL(&net_clients, nc, next); nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc); nc->destructor = destructor; QTAILQ_INIT(&nc->filters); }
初始化过程解析完毕以后,接下来从qemu的启动过程看起。对于网卡的虚拟化,qemu的启动参数里面有关的是下面两行
-netdev tap,fd=32,id=hostnet0,vhost=on,vhostfd=37 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=fa:16:3e:d1:2d:99,bus=pci.0,addr=0x3
qemu的main函数会调用net_init_clients进行网络设备的初始化,可以解析net参数,也可以解析netdev参数,如下所示:
int net_init_clients(Error **errp) { QTAILQ_INIT(&net_clients); if (qemu_opts_foreach(qemu_find_opts("netdev"), net_init_netdev, NULL, errp)) { return -1; } if (qemu_opts_foreach(qemu_find_opts("nic"), net_param_nic, NULL, errp)) { return -1; } if (qemu_opts_foreach(qemu_find_opts("net"), net_init_client, NULL, errp)) { return -1; } return 0; }
net_init_clients会解析参数。上面的参数netdev会调用net_init_netdev->net_client_init->net_client_init1。net_client_init1会根据不同的driver类型,调用不同的初始化函数,如下所示:
static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])( const Netdev *netdev, const char *name, NetClientState *peer, Error **errp) = { [NET_CLIENT_DRIVER_NIC] = net_init_nic, [NET_CLIENT_DRIVER_TAP] = net_init_tap, [NET_CLIENT_DRIVER_SOCKET] = net_init_socket, [NET_CLIENT_DRIVER_HUBPORT] = net_init_hubport, ...... };
由于配置的driver类型是tap,因而这里会调用net_init_tap->net_tap_init->tap_open,如下所示:
#define PATH_NET_TUN "/dev/net/tun" int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required, int mq_required, Error **errp) { struct ifreq ifr; int fd, ret; int len = sizeof(struct virtio_net_hdr); unsigned int features; TFR(fd = open(PATH_NET_TUN, O_RDWR)); memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI; if (ioctl(fd, TUNGETFEATURES, &features) == -1) { features = 0; } if (features & IFF_ONE_QUEUE) { ifr.ifr_flags |= IFF_ONE_QUEUE; } if (*vnet_hdr) { if (features & IFF_VNET_HDR) { *vnet_hdr = 1; ifr.ifr_flags |= IFF_VNET_HDR; } else { *vnet_hdr = 0; } ioctl(fd, TUNSETVNETHDRSZ, &len); } ...... ret = ioctl(fd, TUNSETIFF, (void *) &ifr); ...... fcntl(fd, F_SETFL, O_NONBLOCK); return fd;
在tap_open中打开一个文件"/dev/net/tun",然后通过ioctl操作这个文件。这是Linux内核的一项机制,和KVM机制很像,其实这就是一种通过打开这个字符设备文件,然后通过ioctl操作这个文件和内核打交道,来使用内核的这么一种能力,如下图所示:
为什么需要使用内核的机制呢?因为网络包需要从虚拟机里面发送到虚拟机外面,发送到宿主机上的时候,必须是一个正常的网络包才能被转发。要形成一个网络包,那就需要经过复杂的协议栈。客户机会将网络包发送给qemu,qemu自己没有网络协议栈,现去实现一个也不可能,太复杂了,于是它就要借助内核的力量。qemu会将客户机发送给它的网络包转换成为文件流,写入"/dev/net/tun"字符设备,就像写一个文件一样。内核中TUN/TAP字符设备驱动会收到这个写入的文件流,然后交给TUN/TAP的虚拟网卡驱动,这个驱动会将文件流再次转成网络包,交给TCP/IP栈,最终从虚拟TAP网卡tap0发出来,成为标准的网络包。后面会看到这个过程。
现在到内核里面,看一看打开"/dev/net/tun"字符设备后,内核会发生什么事情。内核的实现在drivers/net/tun.c文件中,这是一个字符设备驱动程序,应该符合字符设备的格式,如下所示:
module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); static int __init tun_init(void) { ...... ret = rtnl_link_register(&tun_link_ops); ...... ret = misc_register(&tun_miscdev); ...... ret = register_netdevice_notifier(&tun_notifier_block); ...... }
这里面注册了一个tun_miscdev字符设备,从它的定义可以看出,这就是"/dev/net/tun"字符设备,如下所示:
static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; static const struct file_operations tun_fops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, };
qemu的tap_open函数会打开这个字符设备PATH_NET_TUN。打开字符设备的过程这里不再重复,总之到了驱动这一层,调用的是tun_chr_open,如下所示:
static int tun_chr_open(struct inode *inode, struct file * file) { struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->wq.wait); RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data(&tfile->socket, &tfile->sk); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); return 0; }
在tun_chr_open的参数里面有一个struct file,它代表的就是打开的字符设备文件"/dev/net/tun",因而往这个字符设备文件中写数据,就会通过这个struct file写入。这个struct file里面的file_operations,按照字符设备打开的规则,指向的就是tun_fops。另外还需要在tun_chr_open创建一个结构struct tun_file,并且将struct file的private_data指向它,如下所示:
/* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures * to serve as one transmit queue for tuntap device. */ struct tun_file { struct sock sk; struct socket socket; struct socket_wq wq; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct list_head next; struct tun_struct *detached; struct skb_array tx_array; }; struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; }; static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }
在struct tun_file中有一个成员struct tun_struct,它里面有一个struct net_device,这个用来表示宿主机上的tuntap网络设备。在struct tun_file中,还有struct socket和struct sock,因为要用到内核的网络协议栈,所以就需要这两个结构,这在以前网络协议部分已经分析过了。所以按照struct tun_file的注释所说,这是一个很重要的数据结构,"/dev/net/tun"对应的struct file的private_data指向它,因而可以接收qemu发过来的数据。除此之外,它还可以通过struct sock来操作内核协议栈,然后将网络包从宿主机上的tuntap网络设备发出去,宿主机上的tuntap网络设备对应的struct net_device也归它管。
32. 在qemu的tap_open函数中,打开这个字符设备文件之后,接下来要做的事情是,通过ioctl来设置宿主机的网卡 TUNSETIFF。接下来,ioctl到了内核里面会调用tun_chr_ioctl,如下所示:
static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; void __user* argp = (void __user*)arg; struct ifreq ifr; kuid_t owner; kgid_t group; int sndbuf; int vnet_hdr_sz; unsigned int ifindex; int le; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == SOCK_IOC_TYPE) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } ...... tun = __tun_get(tfile); if (cmd == TUNSETIFF) { ifr.ifr_name[IFNAMSIZ-1] = '