• linux网络协议栈--路由流程分析


    转:http://blog.csdn.net/hsly_support/article/details/8797976

    来吧,路由
    路由是网络的核心,是linux网络协议栈的核心,我们找个入口进去看看

    还记得在笔记5-IP层的处理1中ip_rcv_finish走到过一个岔口
    ->ip_rcv_finish()
         ->ip_route_input()  查找路由信息
         ->if (iph->ihl > 5 && ip_rcv_options(skb)) 如果IP头部大于20字节,则表示IP头部包含IP选项,需要进行选项处理.
              goto drop;
         ->dst_input(skb);      dst_input实际上会调用skb->dst->input(skb).input函数会根据路由信息设置为合适的函数指针,
                                          如果是递交到本地的则为ip_local_deliver,若是转发则为ip_forward
    两条路径:
    1) ip_local_deliver
    2) ip_forward

    是什么导致路径不同呢,我们看一看ip_route_input() 干了啥

    int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                 u8 tos, struct net_device *dev)
         ->net = dev_net(dev);
         ->hash = rt_hash(daddr, saddr, iif, rt_genid(net));        计算hash值,注意hash因子    
         ->                                                                             既然hash值算出来了,我们就去找吧
         rcu_read_lock();
         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;      
              rth = rcu_dereference(rth->u.dst.rt_next)) {
              if (((rth->fl.fl4_dst ^ daddr) |                    异或 相同为0                       
                   (rth->fl.fl4_src ^ saddr) |
                   (rth->fl.iif ^ iif) |
                   rth->fl.oif |
                   (rth->fl.fl4_tos ^ tos)) == 0 &&
                  rth->fl.mark == skb->mark &&
                  net_eq(dev_net(rth->u.dst.dev), net) &&           判断路由和报文的struct net指针地址是否相同
                  !rt_is_expired(rth)) {                                        路由项是否过期 
                               找到了
                   dst_use(&rth->u.dst, jiffies);                            用当前jiffies更新该路由缓存项的最近使用时间
                   RT_CACHE_STAT_INC(in_hit);
                   rcu_read_unlock();
                   skb_dst_set(skb, &rth->u.dst);                         设置到skb中去
                   return 0;
              }
              RT_CACHE_STAT_INC(in_hlist_search);
         }
         rcu_read_unlock();

    static struct rt_hash_bucket      *rt_hash_table __read_mostly;

    struct rt_hash_bucket {
         struct rtable     *chain;
    };

    struct rtable
    {
         union
         {
              struct dst_entry     dst;
         } u;
         /* Cache lookup keys */
         struct flowi          fl;                     存放的是查找该路由缓存项所用的键值(不是哈希值),包括源地址、目的地址、TOS、输入/输出接口和mark
         struct in_device     *idev;
        
         int               rt_genid;
         unsigned          rt_flags;               一些结构性的标志,例如,RTF_UP表示这条路由可用
         __u16               rt_type;               表明了目标地址的类型,例如RTN_LOCAL,RTN_MULTICAST

         __be32               rt_dst;     /* Path destination     */              用来存放目标的IP地址
         __be32               rt_src;     /* Path source          */               路由路径的起点ip地址
         int               rt_iif;                         

         /* Info on neighbour */
         __be32               rt_gateway;                     网关信息

         /* Miscellaneous cached information */
         __be32               rt_spec_dst; /* RFC1122 specific destination */
         struct inet_peer     *peer; /* long-living peer info */
    };

    当然,如果在rt_hash_table(路由高速缓存)中查不到,则先判断目的地址是否为多播地址ipv4_is_multicast(daddr),是的话走多播处理 ->ip_route_input_mc
    否则再进入ip_route_input_slow(skb, daddr, saddr, tos, dev);
    我们主要分析ip_route_input_slow()
    static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                          u8 tos, struct net_device *dev)
    {
         struct fib_result res;
         struct in_device *in_dev = in_dev_get(dev);
         struct flowi fl = { .nl_u = { .ip4_u =
                              { .daddr = daddr,
                             .saddr = saddr,
                             .tos = tos,
                             .scope = RT_SCOPE_UNIVERSE,
                              } },
                       .mark = skb->mark,
                       .iif = dev->ifindex };                       初始化路由键值
         unsigned     flags = 0;
         u32          itag = 0;
         struct rtable * rth;
         unsigned     hash;
         __be32          spec_dst;
         int          err = -EINVAL;
         int          free_res = 0;
         struct net    * net = dev_net(dev);

         /* IP on this device is disabled. */

         if (!in_dev)
              goto out;

         /* Check for the most weird martians, which can be not detected
            by fib_lookup.
         */

         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||         源地址是否是多播(组播)、受限广播或环回地址,是则为非法源地址
             ipv4_is_loopback(saddr))
              goto martian_source;

         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
              goto brd_input;                    

         /* Accept zero addresses only to limited broadcast;
         * I even do not know to fix it or not. Waiting for complains :-)
         */
         if (ipv4_is_zeronet(saddr))               源地址是否是零网地址类型
              goto martian_source;

         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
             ipv4_is_loopback(daddr))
              goto martian_destination;

         /*
         *     Now we are ready to route packet.
         */
         if ((err = fib_lookup(net, &fl, &res)) != 0) {                 通过路由函数查找目标地址   结果记录在res中
              if (!IN_DEV_FORWARD(in_dev))                         如果设备不支持转发
                   goto e_hostunreach;
              goto no_route;
         }
         free_res = 1;                                          默认为释放查找结果

         RT_CACHE_STAT_INC(in_slow_tot);

         if (res.type == RTN_BROADCAST)            路由类型为广播   
              goto brd_input;

         if (res.type == RTN_LOCAL) {                  路由类型为本地类型
              int result;
              result = fib_validate_source(saddr, daddr, tos,                         检查源地址
                                  net->loopback_dev->ifindex,
                                  dev, &spec_dst, &itag, skb->mark);
              if (result < 0)
                   goto martian_source;                     源地址错误
              if (result)
                   flags |= RTCF_DIRECTSRC;         
              spec_dst = daddr;                              记录目标地址
              goto local_input;                                本地输入,跳转
         }

         if (!IN_DEV_FORWARD(in_dev))                       如果设备不支持转发
              goto e_hostunreach;
         if (res.type != RTN_UNICAST)                         如果目标地址不是单播类型
              goto martian_destination;                          目标地址错误,跳转

         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);         创建用于转发的路由表   
    done:
         in_dev_put(in_dev);                
         if (free_res)                                             如果需要释放
              fib_res_put(&res);
    out:     return err;

    brd_input:                            广播输入
         if (skb->protocol != htons(ETH_P_IP))            不是ip协议
              goto e_inval;

         if (ipv4_is_zeronet(saddr))
              spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
         else {
              err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,          检查源地址的有效性
                               &itag, skb->mark);
              if (err < 0)
                   goto martian_source;
              if (err)
                   flags |= RTCF_DIRECTSRC;        
         }
         flags |= RTCF_BROADCAST;             增加广播标志
         res.type = RTN_BROADCAST;               设置地址类型为广播类型
         RT_CACHE_STAT_INC(in_brd);

    local_input:                                            本地输入
         rth = dst_alloc(&ipv4_dst_ops);           创建路由表
         if (!rth)
              goto e_nobufs;

         rth->u.dst.output= ip_rt_bug;                       设置输出方向的函数
         rth->rt_genid = rt_genid(net);                         记录该net当前的rt_genid(计算rt_hash时也用它作因子),并不是在此产生随机值

         atomic_set(&rth->u.dst.__refcnt, 1);              
         rth->u.dst.flags= DST_HOST;                            设置路由标志
         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
              rth->u.dst.flags |= DST_NOPOLICY;
         rth->fl.fl4_dst     = daddr;                      记录目标地址
         rth->rt_dst     = daddr;                     
         rth->fl.fl4_tos     = tos;                       记录TOS
         rth->fl.mark    = skb->mark;                  记录报文的mark标记
         rth->fl.fl4_src     = saddr;                    记录源地址
         rth->rt_src     = saddr;
    #ifdef CONFIG_NET_CLS_ROUTE
         rth->u.dst.tclassid = itag;
    #endif
         rth->rt_iif     =
         rth->fl.iif     = dev->ifindex;                          记录网络设备id
         rth->u.dst.dev     = net->loopback_dev;         记录环回设备
         dev_hold(rth->u.dst.dev);
         rth->idev     = in_dev_get(rth->u.dst.dev);
         rth->rt_gateway     = daddr;                          记录网关地址
         rth->rt_spec_dst= spec_dst;                          记录指定目标地址
         rth->u.dst.input= ip_local_deliver;                   设置输入函数         ip_local_deliver在这里设置,(文章开头的疑问?)
         rth->rt_flags      = flags|RTCF_LOCAL;           增加本地路由标志
         if (res.type == RTN_UNREACHABLE) {             如果目标地址不可达
              rth->u.dst.input= ip_error;
              rth->u.dst.error= -err;
              rth->rt_flags      &= ~RTCF_LOCAL;
         }
         rth->rt_type     = res.type;                           设置地址类型
         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));          计算hash值
         err = rt_intern_hash(hash, rth, NULL, skb);        将路由表插入hash队列并记录到数据包中
         goto done;

    no_route:
         RT_CACHE_STAT_INC(in_no_route);
         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);   确定指定目标
         res.type = RTN_UNREACHABLE;                          设置不可达
         if (err == -ESRCH)
              err = -ENETUNREACH;
         goto local_input;

         /*
         *     Do not cache martian addresses: they should be logged (RFC1812)
         */
    martian_destination:                                    目标地址错
         RT_CACHE_STAT_INC(in_martian_dst);
    #ifdef CONFIG_IP_ROUTE_VERBOSE
         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
              printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s ",
                   &daddr, &saddr, dev->name);
    #endif

    e_hostunreach:                                           主机不可达错误
         err = -EHOSTUNREACH;
         goto done; 

    e_inval:                                                       无法识别
         err = -EINVAL;
         goto done;

    e_nobufs:                                                     空间不足
         err = -ENOBUFS;
         goto done;

    martian_source:                                           源地址错误
         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
         goto e_inval;
    }

    在上面函数的分析中可以看到,对于目标地址是转发情况的,调用ip_mkroute_input()函数创建转发路由表
    对于广播或者本地类型,直接分配路由表并初始化,并直接指定下一步处理函数为ip_local_deliver()

    static int ip_mkroute_input(struct sk_buff *skb,
                       struct fib_result *res,
                       const struct flowi *fl,
                       struct in_device *in_dev,
                       __be32 daddr, __be32 saddr, u32 tos)
    {
         struct rtable* rth = NULL;              
         int err;
         unsigned hash;

    #ifdef CONFIG_IP_ROUTE_MULTIPATH                   多路径选择
         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
              fib_select_multipath(fl, res); 
    #endif

         /* create a routing cache entry */
         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);        创建路由表
         if (err)
              return err;

         /* put it into the cache */
         hash = rt_hash(daddr, saddr, fl->iif,
                     rt_genid(dev_net(rth->u.dst.dev)));    计算hash值
         return rt_intern_hash(hash, rth, NULL, skb);    将路由表插入路由高速缓存队列中,并记录到数据包中
    }

    static int __mkroute_input(struct sk_buff *skb,
                      struct fib_result *res,
                      struct in_device *in_dev,
                      __be32 daddr, __be32 saddr, u32 tos,
                      struct rtable **result)
    {

         struct rtable *rth;
         int err;
         struct in_device *out_dev;
         unsigned flags = 0;
         __be32 spec_dst;
         u32 itag;

         /* get a working reference to the output device */
         out_dev = in_dev_get(FIB_RES_DEV(*res));               取出输出设备的配置结构
         if (out_dev == NULL) {             
              if (net_ratelimit())
                   printk(KERN_CRIT "Bug in ip_route_input"
                          "_slow(). Please, report ");
              return -EINVAL;
         }


         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
                          in_dev->dev, &spec_dst, &itag, skb->mark);        检查源地址
         if (err < 0) {
              ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                             saddr);

              err = -EINVAL;
              goto cleanup;
         }

         if (err)
              flags |= RTCF_DIRECTSRC;

         if (out_dev == in_dev && err &&
             (IN_DEV_SHARED_MEDIA(out_dev) ||
              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
              flags |= RTCF_DOREDIRECT;

         if (skb->protocol != htons(ETH_P_IP)) {               如果不是ip
              /* Not IP (i.e. ARP). Do not create route, if it is
              * invalid for proxy arp. DNAT routes are always valid.
              */
              if (out_dev == in_dev) {
                   err = -EINVAL;
                   goto cleanup;
              }
         }


         rth = dst_alloc(&ipv4_dst_ops);                申请路由项
         if (!rth) {
              err = -ENOBUFS;
              goto cleanup;
         }

         atomic_set(&rth->u.dst.__refcnt, 1);
         rth->u.dst.flags= DST_HOST;
         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
              rth->u.dst.flags |= DST_NOPOLICY;
         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
              rth->u.dst.flags |= DST_NOXFRM;
         rth->fl.fl4_dst     = daddr;
         rth->rt_dst     = daddr;
         rth->fl.fl4_tos     = tos;
         rth->fl.mark    = skb->mark;
         rth->fl.fl4_src     = saddr;
         rth->rt_src     = saddr;
         rth->rt_gateway     = daddr;
         rth->rt_iif      =
              rth->fl.iif     = in_dev->dev->ifindex;
         rth->u.dst.dev     = (out_dev)->dev;
         dev_hold(rth->u.dst.dev);
         rth->idev     = in_dev_get(rth->u.dst.dev);
         rth->fl.oif      = 0;
         rth->rt_spec_dst= spec_dst;

         rth->u.dst.input = ip_forward;                     设置输入函数         ip_forward在这里设置,(文章开头的疑问?)
         rth->u.dst.output = ip_output;                      设置输出函数
         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

         rt_set_nexthop(rth, res, itag);

         rth->rt_flags = flags;

         *result = rth;
         err = 0;
    cleanup:
         /* release the working reference to the output device */
         in_dev_put(out_dev);
         return err;
    }

    经过路由查找分析,我们看到ip层是转发还是本地上送,是由路由信息决定的。查找路由信息的过程是先从
    rt_hash_table中查找,如果其中查不到则通过fib_lookup进行查找,形成路由信息,并根据目标地址类型是本地还是转发设置不同的输入函数(ip_local_deliver或ip_forward),这样ip层的后续投递就有了两条路径。当然,新查到的路由信息还会被插入到rt_hash_table中,并更新skb的dst。
  • 相关阅读:
    视图、触发器、事物、存储过程、函数、流程控制
    pymysql
    单表查询与多表查询
    多线程学习(第三天)线程间通信
    多线程学习(第二天)Java内存模型
    多线程学习(第一天)java语言的线程
    springboot集成es7(基于high level client)
    elasticSearch(六)--全文搜索
    elasticSearch(五)--排序
    elasticSearch(四)--结构化查询
  • 原文地址:https://www.cnblogs.com/newjiang/p/7493686.html
Copyright © 2020-2023  润新知