先设置变量:
NS=cali
VETH=v-cali
创建 netns 和 veth, 把 veth 的一端塞进 netns, 并设置 ip:
ip netns add $NS
ip l add $VETH type veth peer name $VETH-peer
ip l set $VETH-peer up
ip l set $VETH netns $NS
ip netns exec $NS ip l set $VETH up
ip netns exec $NS ip a add 10.2.0.1/32 dev $VETH
然后在宿主机直接路由 ip 到 veth:
ip r add 10.2.0.1/32 dev $VETH-peer
netns 里设置 default gw 到 veth:
ip netns exec $NS ip r add default dev $VETH
[root@bogon ~]# ping 10.2.0.1 PING 10.2.0.1 (10.2.0.1) 56(84) bytes of data. 64 bytes from 10.2.0.1: icmp_seq=1 ttl=64 time=0.107 ms 64 bytes from 10.2.0.1: icmp_seq=2 ttl=64 time=0.024 ms ^C --- 10.2.0.1 ping statistics --- 2 packets transmitted, 2 received, 0% packet loss, time 1021ms rtt min/avg/max/mdev = 0.024/0.065/0.107/0.042 ms [root@bogon ~]# route -n Kernel IP routing table Destination Gateway Genmask Flags Metric Ref Use Iface 0.0.0.0 10.10.16.254 0.0.0.0 UG 0 0 0 enahisic2i0 10.2.0.1 0.0.0.0 255.255.255.255 UH 0 0 0 v-cali-peer
[root@bogon ~]# ip netns exec cali ip a 1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 98: v-cali@if97: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 16:95:64:35:e3:17 brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet 10.2.0.1/32 scope global v-cali valid_lft forever preferred_lft forever inet6 fe80::1495:64ff:fe35:e317/64 scope link valid_lft forever preferred_lft forever [root@bogon ~]# ip netns exec cali tcpdump -i v-cali icmp -nnvv tcpdump: listening on v-cali, link-type EN10MB (Ethernet), capture size 262144 bytes 11:48:00.249026 IP (tos 0x0, ttl 64, id 43401, offset 0, flags [DF], proto ICMP (1), length 84) 10.10.16.81 > 10.2.0.1: ICMP echo request, id 46022, seq 1, length 64 11:48:00.249052 IP (tos 0x0, ttl 64, id 18221, offset 0, flags [none], proto ICMP (1), length 84) 10.2.0.1 > 10.10.16.81: ICMP echo reply, id 46022, seq 1, length 64 11:48:01.252474 IP (tos 0x0, ttl 64, id 43423, offset 0, flags [DF], proto ICMP (1), length 84) 10.10.16.81 > 10.2.0.1: ICMP echo request, id 46022, seq 2, length 64 11:48:01.252490 IP (tos 0x0, ttl 64, id 18254, offset 0, flags [none], proto ICMP (1), length 84) 10.2.0.1 > 10.10.16.81: ICMP echo reply, id 46022, seq 2, length 64
这时候可以从 host ping netns, 反过来从 netns 也能 ping 通宿主机的 ip (10.10.16.81), 但是访问外部地址 (如 8.8.8.8) 不可达; 后面抓包会发现是因为 arp 不知道 mac 地址, 需要加上 arp proxy
[root@bogon ~]# ip netns exec cali ping 10.10.16.81 PING 10.10.16.81 (10.10.16.81) 56(84) bytes of data. 64 bytes from 10.10.16.81: icmp_seq=1 ttl=64 time=0.067 ms 64 bytes from 10.10.16.81: icmp_seq=2 ttl=64 time=0.036 ms 64 bytes from 10.10.16.81: icmp_seq=3 ttl=64 time=0.033 ms 64 bytes from 10.10.16.81: icmp_seq=4 ttl=64 time=0.024 ms 64 bytes from 10.10.16.81: icmp_seq=5 ttl=64 time=0.027 ms ^C^C --- 10.10.16.81 ping statistics --- 5 packets transmitted, 5 received, 0% packet loss, time 4187ms rtt min/avg/max/mdev = 0.024/0.037/0.067/0.016 ms [root@bogon ~]# ip netns exec cali ping 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data. ^C --- 8.8.8.8 ping statistics --- 2 packets transmitted, 0 received, 100% packet loss, time 1047ms [root@bogon ~]#
添加snat,还是无法访问
[root@bogon ~]# iptables -t nat -A POSTROUTING -s 10.2.0.1/32 -j MASQUERADE [root@bogon ~]# ip netns exec cali ping 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data. ^C --- 8.8.8.8 ping statistics --- 2 packets transmitted, 0 received, 100% packet loss, time 1027ms [root@bogon ~]#
host上抓包
[root@bogon ~]# tcpdump -i v-cali-peer arp -nv tcpdump: listening on v-cali-peer, link-type EN10MB (Ethernet), capture size 262144 bytes 11:50:57.812451 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28 11:50:58.852454 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28 11:50:59.892511 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28 11:51:00.932453 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28 11:51:01.972453 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28 11:51:03.012519 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 8.8.8.8 tell 10.2.0.1, length 28
抓包发现是因为 arp 不知道 mac 地址, 加上 arp proxy
[root@bogon ~]# echo 1 > /proc/sys/net/ipv4/conf/$VETH-peer/proxy_arp [root@bogon ~]# sysctl -p net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 net.ipv4.ip_nonlocal_bind = 1 [root@bogon ~]#
[root@bogon ~]# ip netns exec cali ip n 8.8.8.8 dev v-cali lladdr 92:07:52:14:06:42 STALE 10.10.16.81 dev v-cali lladdr 92:07:52:14:06:42 STALE [root@bogon ~]#
可以访问通了
这下可以和 host 互 ping 了, google.com 也没问题了, 功能上没问题.
不过有个优化的问题, arp proxy 会有一些问题, 比如这里会导致 netns 里的 arp cache 无限扩张, 所有的 outbound ip 都会产生一条 arp entry.
[root@bogon ~]# ip netns exec cali ip n 8.8.8.8 dev v-cali lladdr 92:07:52:14:06:42 STALE 10.10.16.81 dev v-cali lladdr 92:07:52:14:06:42 STALE 114.114.114.114 dev v-cali lladdr 92:07:52:14:06:42 REACHABLE [root@bogon ~]#
为了解决这个问题, 我们用一个假的 ip 169.254.1.1 作为 link-local address, 绕一下:
先删掉原来的 default 路由, 加一条到 169.254.1.1 的 scope link 路由, 再把 default gw 指向 169.254.1.1:
[root@bogon ~]# ip netns exec $NS ip r del default dev $VETH [root@bogon ~]# ip netns exec $NS ip r add 169.254.1.1 dev $VETH scope link [root@bogon ~]# ip netns exec $NS ip r add default via 169.254.1.1 dev $VETH [root@bogon ~]# ip netns exec $NS ip r default via 169.254.1.1 dev v-cali 169.254.1.1 dev v-cali scope link [root@bogon ~]# ip netns exec $NS ip a 1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 98: v-cali@if97: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 16:95:64:35:e3:17 brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet 10.2.0.1/32 scope global v-cali valid_lft forever preferred_lft forever inet6 fe80::1495:64ff:fe35:e317/64 scope link valid_lft forever preferred_lft forever [root@bogon ~]#
[root@bogon ~]# ip netns exec cali ip n 8.8.8.8 dev v-cali lladdr 92:07:52:14:06:42 STALE 10.10.16.81 dev v-cali lladdr 92:07:52:14:06:42 STALE 114.114.114.114 dev v-cali lladdr 92:07:52:14:06:42 STALE [root@bogon ~]# ip netns exec cali ip n del 8.8.8.8 dev v-cali [root@bogon ~]# ip netns exec cali ip n del 10.10.16.81 dev v-cali [root@bogon ~]# ip netns exec cali ip n del 114.114.114.114 dev v-cali [root@bogon ~]# ip netns exec cali ip n [root@bogon ~]# ip netns exec cali ping 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data. 64 bytes from 8.8.8.8: icmp_seq=1 ttl=103 time=279 ms 64 bytes from 8.8.8.8: icmp_seq=2 ttl=103 time=11.1 ms ^C --- 8.8.8.8 ping statistics --- 2 packets transmitted, 2 received, 0% packet loss, time 1000ms rtt min/avg/max/mdev = 11.161/145.151/279.141/133.990 ms [root@bogon ~]# ip netns exec cali ping 114.114.114.114 PING 114.114.114.114 (114.114.114.114) 56(84) bytes of data. 64 bytes from 114.114.
[root@bogon ~]# ip netns exec cali ip n 10.10.16.81 dev v-cali lladdr 92:07:52:14:06:42 STALE 169.254.1.1 dev v-cali lladdr 92:07:52:14:06:42 REACHABLE [root@bogon ~]# ip netns exec cali ip a 1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 98: v-cali@if97: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 16:95:64:35:e3:17 brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet 10.2.0.1/32 scope global v-cali valid_lft forever preferred_lft forever inet6 fe80::1495:64ff:fe35:e317/64 scope link valid_lft forever preferred_lft forever [root@bogon ~]# ip a | grep cali 97: v-cali-peer@if98: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 [root@bogon ~]# ip a sh v-cali-peer 97: v-cali-peer@if98: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 92:07:52:14:06:42 brd ff:ff:ff:ff:ff:ff link-netnsid 5 inet6 fe80::9007:52ff:fe14:642/64 scope link valid_lft forever preferred_lft forever [root@bogon ~]#
arp请求报文
[root@bogon ~]# tcpdump -i v-cali-peer arp -nv tcpdump: listening on v-cali-peer, link-type EN10MB (Ethernet), capture size 262144 bytes 12:02:08.852461 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 10.2.0.1 tell 10.10.16.81, length 28 12:02:08.852487 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 169.254.1.1 tell 10.2.0.1, length 28 12:02:08.852499 ARP, Ethernet (len 6), IPv4 (len 4), Reply 169.254.1.1 is-at 92:07:52:14:06:42, length 28 12:02:08.852695 ARP, Ethernet (len 6), IPv4 (len 4), Reply 10.2.0.1 is-at 16:95:64:35:e3:17, length 28
[root@bogon ~]# iptables -t nat -A POSTROUTING -s 10.2.0.1/32 -j MASQUERADE [root@bogon ~]# ip netns exec cali ping 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data. 64 bytes from 8.8.8.8: icmp_seq=1 ttl=103 time=11.5 ms 64 bytes from 8.8.8.8: icmp_seq=2 ttl=103 time=11.0 ms 64 bytes from 8.8.8.8: icmp_seq=3 ttl=103 time=11.0 ms ^C --- 8.8.8.8 ping statistics --- 3 packets transmitted, 3 received, 0% packet loss, time 2002ms rtt min/avg/max/mdev = 11.082/11.229/11.513/0.234 ms [root@bogon ~]#
[root@bogon ~]# NS=cali [root@bogon ~]# VETH=v-cali [root@bogon ~]# ip netns add $NS [root@bogon ~]# ip l add $VETH type veth peer name $VETH-peer [root@bogon ~]# ip l set $VETH-peer up [root@bogon ~]# ip l set $VETH netns $NS [root@bogon ~]# ip netns exec $NS ip l set $VETH up [root@bogon ~]# ip netns exec $NS ip a add 10.2.0.1/32 dev $VETH [root@bogon ~]# ip r add 10.2.0.1/32 dev $VETH-peer [root@bogon ~]# ip netns exec $NS ip r add default dev $VETH [root@bogon ~]#
calico plugin源码解析
func Main(version string) {
	// ...
	err := flagSet.Parse(os.Args[1:])
	// ...
	// 注册 `ADD` 和 `DEL` 命令
	skel.PluginMain(cmdAdd, nil, cmdDel, cniSpecVersion.PluginSupports("0.1.0", "0.2.0", "0.3.0", "0.3.1"), "Calico CNI plugin "+version)
}
ADD
命令里,主要做了三个逻辑:
- 查询calico datastore里有没有WorkloadEndpoint对象和当前的pod名字匹配,没有匹配,则会创建新的WorkloadEndpoint对象,该对象内主要保存该pod在host network namespace内的网卡名字和pod ip地址,以及container network namespace的网卡名字等等信息,对象示例如下。
- 创建一个veth pair,并把其中一个网卡置于宿主机端网络命名空间,另一个置于容器端网络命名空间。在container network namespace内创建网卡如eth0,并通过调用calico-ipam获得的IP地址赋值给该eth0网卡;在host network namespace内创建网卡,网卡名格式为
"cali" + sha1(namespace.pod)[:11]
,并设置MAC地址"ee:ee:ee:ee:ee:ee"。
- 在容器端和宿主机端创建路由。在容器端,设置默认网关为
169.254.1.1
,该网关地址是在代码里写死的;在宿主机端,添加路由如 10.217.120.85 dev calid0bda9976d5 scope link
,其中10.217.120.85
是pod ip地址,calid0bda9976d5
是该pod在宿主机端的网卡,也就是veth pair在宿主机这端的virtual ethernet interface虚拟网络设备。