路由查找
与IPv4不同,IPv6的出口路由和入口路由都使用函数ip6_pol_route实现,区别在于传入的接口索引参数不同
/*
 * Input-path policy route lookup.
 *
 * Delegates to ip6_pol_route(), selecting on the flow's incoming
 * interface index (fl6->flowi6_iif).
 *
 * NOTE(review): six arguments (including skb) are passed here, while the
 * ip6_pol_route() definition shown further down in this file takes five
 * (no skb). The snippets look like they come from different kernel
 * versions -- confirm which signature is actually in scope.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *
ip6_pol_route_input(struct net *net, struct fib6_table *table,
		    struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
	int ifindex = fl6->flowi6_iif;

	return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
}

/*
 * Output-path policy route lookup.
 *
 * Same as ip6_pol_route_input() except that the outgoing interface index
 * (fl6->flowi6_oif) is handed to ip6_pol_route().
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *
ip6_pol_route_output(struct net *net, struct fib6_table *table,
		     struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
	int ifindex = fl6->flowi6_oif;

	return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
}
/*
 * ip6_pol_route - look up a route in @table for the flow @fl6.
 *
 * Author's note (translated): lookups triggered by the normal route-add
 * command go through ip6_pol_route_lookup -> fib6_lookup -> fib6_lookup_1;
 * other route lookups go through ip6_pol_route -> fib6_lookup ->
 * fib6_lookup_1, i.e. through this function.
 *
 * @net:   network namespace; supplies the forwarding setting and the
 *         ip6_null_entry sentinel route.
 * @table: FIB table to search; its tb6_lock is read-locked during the search.
 * @oif:   interface index to match; forced to 0 when the flow carries
 *         FLOWI_FLAG_SKIP_NH_OIF.
 * @fl6:   flow description (daddr/saddr/flags) driving the lookup.
 * @flags: lookup flags; only RT6_LOOKUP_F_IFACE is carried into 'strict'.
 *
 * Returns one of three kinds of entry, depending on what the search finds:
 *  - the selected route itself, when it is ip6_null_entry or has RTF_CACHE;
 *  - an RTF_CACHE clone placed on the uncached list (or ip6_null_entry if
 *    the clone could not be allocated), for the FLOWI_FLAG_KNOWN_NH case;
 *  - otherwise a per-CPU copy of the selected route.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* With forwarding disabled, first insist on reachable routes. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	// find leaf node
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* Remember the starting node so the search can restart from it. */
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* Nothing usable at this node: back-track and retry. */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	// Look for a cached route; if one is found, return it.
	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		/* NOTE(review): dst_use() presumably takes a reference and
		 * records the use time -- confirm against its definition.
		 */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 *
		 * (Translated note) Otherwise, check whether the flow has
		 * FLOWI_FLAG_KNOWN_NH set and the route has no next-hop
		 * gateway (RTF_GATEWAY clear). Looking up a route with an
		 * already-known next hop is uncommon. Because fl6->daddr holds
		 * the next-hop address rather than the destination address of
		 * the skb, the cache entry created here is not stored in the
		 * fib6 tree; it is added to the uncached_list instead.
		 */
		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		/* The parent route is no longer needed once the clone exists. */
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);
		return uncached_rt;
	} else {
		/* Get a percpu copy.
		 *
		 * (Translated note) Neither of the two cases above applied:
		 * use a per-CPU cached route entry. That entry is cached as
		 * part of this process and does not need to be added to the
		 * uncached_list.
		 */
		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			/* Pin rt across the unlocked window. */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		return pcpu_rt;
	}
}
对于使用ICMPv6的IPv6邻居发现、IGMP和MLD协议,利用icmp6_dst_alloc分配路由缓存项。这类报文仅限于本地网络,报文的下一跳地址和目的地址相同,因此这里不查询fib6表,而是直接分配缓存项;新分配的路由在fib树中没有缓存位置,所以将其添加到uncached_list链表。注意:下面给出的代码(应为较早的内核版本)实际是把路由项挂到icmp6_dst_gc_list链表,并调用fib6_force_start_gc启动回收,而不是加入uncached_list。
/* 对于使用ICMPv6的IPv6邻居发现、IGMP和MLD协议,利用icmp6_dst_alloc分配路由缓存项。 对于这类报文,仅限于本地网络,报文的下一跳地址和目的地址相同,这里不查询fib6表,直接分配缓存项, 导致新分配的路由在fib树中没有缓存位置,所以将其添加到uncached_list链表 如果放入uncache_list链表, 则需要开启timer 定时回收。 */ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; rt->dst.output = ip6_output; atomic_set(&rt->dst.__refcnt, 1); rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); spin_lock_bh(&icmp6_dst_lock); rt->dst.next = icmp6_dst_gc_list; icmp6_dst_gc_list = &rt->dst; spin_unlock_bh(&icmp6_dst_lock); fib6_force_start_gc(net); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; }
uncached路由缓存清除
当接口被注销或者down时,由函数rt6_uncached_list_flush_dev清除设备相关的uncached路由缓存
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { struct net_device *loopback_dev = net->loopback_dev; int cpu; if (dev == loopback_dev) return; //遍历所有的rt6_uncached_list中的路由缓存,将其中与操作设备相等的缓存项的设备换成黑洞设备blackhole_netdev, //并且将路由项的inet6_dev换成回环接口对应的inet6_dev。实际上并没有将路由缓存项从uncached_list链表中删除 for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt; spin_lock_bh(&ul->lock); list_for_each_entry(rt, &ul->head, rt6i_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(loopback_dev); in6_dev_put(rt_idev); } if (rt_dev == dev) { rt->dst.dev = loopback_dev;//将其中与操作设备相等的缓存项的设备换成黑洞设备blackhole_netdev/loopback_dev, dev_hold(rt->dst.dev); dev_put(rt_dev); } } spin_unlock_bh(&ul->lock); } }
PCPU路由缓存查找
如果路由查询结果中rt6_info成员rt6i_pcpu有值,表明缓存存在,直接返回其值。
/* It should be called with read_lock_bh(&tb6_lock) acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) { dst_hold(&pcpu_rt->dst); rt6_dst_from_metrics_check(pcpu_rt); } return pcpu_rt; }
函数ip6_rt_pcpu_alloc负责分配并初始化每处理器路由缓存(由下面的rt6_make_pcpu_route调用)。如果此时内核正在删除该路由信息,即函数fib6_drop_pcpu_from已将路由信息的fib6_destroying设置为1,此种情况下,应当释放每处理器路由缓存依据的路由信息。
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; //分配路由缓存,并进行初始化,设置RTF_PCPU标志 pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { struct net *net = dev_net(rt->dst.dev); dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } read_lock_bh(&table->tb6_lock); if (rt->rt6i_pcpu) { p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); if (prev) { /* If someone did it before us, return prev instead */ dst_destroy(&pcpu_rt->dst); pcpu_rt = prev; } } else { /* rt has been removed from the fib6 tree * before we have a chance to acquire the read_lock. * In this case, don't brother to create a pcpu rt * since rt is going away anyway. The next * dst_check() will trigger a re-lookup. */// 此处的逻辑要注意 直接释放生成的pcpurt dst_destroy(&pcpu_rt->dst); pcpu_rt = rt; } dst_hold(&pcpu_rt->dst); rt6_dst_from_metrics_check(pcpu_rt); read_unlock_bh(&table->tb6_lock); return pcpu_rt; }