• I40E网卡BUG引起内核异常重启问题分析


    问题描述

      线上服务器异常复位,产生了vmcore文件,发现是内核接收报文时,访问了空指针导致,堆栈信息如下:

       

     

     分析及定位

    1、分析vmcore文件

    根据core信息,访问异常的代码行:

    /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3483

     

      通过反汇编异常地址,得到如下信息:

     

         __netif_receive_skb_core反汇编代码如下所示,分析了调用过程,并确认异常寄存器存储的变量信息:

     1 crash> dis -l __netif_receive_skb_core
     2 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3460
     3 0xffffffff8152d150 <__netif_receive_skb_core>:  nopl   0x0(%rax,%rax,1) [FTRACE NOP]
     4 0xffffffff8152d155 <__netif_receive_skb_core+5>:        push   %rbp
     5 0xffffffff8152d156 <__netif_receive_skb_core+6>:        mov    %rsp,%rbp
     6 0xffffffff8152d159 <__netif_receive_skb_core+9>:        push   %r15
     7 0xffffffff8152d15b <__netif_receive_skb_core+11>:       push   %r14
     8 0xffffffff8152d15d <__netif_receive_skb_core+13>:       push   %r13
     9 0xffffffff8152d15f <__netif_receive_skb_core+15>:       mov    %esi,%r13d  ##第二个参数
    10 0xffffffff8152d162 <__netif_receive_skb_core+18>:       push   %r12
    11 0xffffffff8152d164 <__netif_receive_skb_core+20>:       push   %rbx
    12 0xffffffff8152d165 <__netif_receive_skb_core+21>:       sub    $0x28,%rsp
    13 0xffffffff8152d169 <__netif_receive_skb_core+25>:       mov    %rdi,-0x40(%rbp)  ## rdi 第一个参数, 即skb参数
    14 0xffffffff8152d16d <__netif_receive_skb_core+29>:       mov    %gs:0x28,%rax
    15 0xffffffff8152d176 <__netif_receive_skb_core+38>:       mov    %rax,-0x30(%rbp)
    16 0xffffffff8152d17a <__netif_receive_skb_core+42>:       xor    %eax,%eax
    17 0xffffffff8152d17c <__netif_receive_skb_core+44>:       jmpq   0xffffffff8152d650 <__netif_receive_skb_core+1280>
    18 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/jump_label.h: 16
    19 0xffffffff8152d181 <__netif_receive_skb_core+49>:       mov    -0x40(%rbp),%r12   ## 把skb值赋值给r12
    20 0xffffffff8152d185 <__netif_receive_skb_core+53>:       nopl   0x0(%rax,%rax,1)
    21 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/skbuff.h: 1860
    22 0xffffffff8152d18a <__netif_receive_skb_core+58>:       mov    0xe8(%r12),%rax
    23 0xffffffff8152d192 <__netif_receive_skb_core+66>:       sub    0xe0(%r12),%rax
    24 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3476  ## if (!skb_transport_header_was_set(skb))
    25 0xffffffff8152d19a <__netif_receive_skb_core+74>:       cmpw   $0xffff,0xc0(%r12)
    26 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3473    ## dev.c:3473 orig_dev = skb->dev;
    27 0xffffffff8152d1a4 <__netif_receive_skb_core+84>:       mov    0x20(%r12),%r14                     ## r14 is orig_dev
    28 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/skbuff.h: 1860
    29 0xffffffff8152d1a9 <__netif_receive_skb_core+89>:       mov    %ax,0xc2(%r12)
    30 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3476
    31 0xffffffff8152d1b2 <__netif_receive_skb_core+98>:       je     0xffffffff8152d740 <__netif_receive_skb_core+1520>
    32 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/skbuff.h: 1777
    33 0xffffffff8152d1b8 <__netif_receive_skb_core+104>:      sub    0xc4(%r12),%ax
    34 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3466
    35 0xffffffff8152d1c1 <__netif_receive_skb_core+113>:      mov    $0x1,%r15d
    36 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/skbuff.h: 1777
    37 0xffffffff8152d1c7 <__netif_receive_skb_core+119>:      mov    %ax,0x70(%r12)
    38 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3473
    39 0xffffffff8152d1cd <__netif_receive_skb_core+125>:      mov    %r14,%rax        ## r14即skb->dev
    40 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3483
    41 0xffffffff8152d1d0 <__netif_receive_skb_core+128>:      mov    0xc0(%rax),%eax  ## 取dev->ifindex赋值给 skb->skb_iif,此时dev值为NULL,访问异常
    42 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3485
    43 0xffffffff8152d1d6 <__netif_receive_skb_core+134>:      incl   %gs:0x15340
    44 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3483
    45 0xffffffff8152d1de <__netif_receive_skb_core+142>:      mov    %eax,0x98(%r12)
    46 /usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/net/core/dev.c: 3487
    47 0xffffffff8152d1e6 <__netif_receive_skb_core+150>:      movzwl 0x7e(%r12),%eax
    .......

      在vmcore文件中,RAX寄存器值为0,结合__netif_receive_skb_core反汇编代码,可以确认这个寄存器应该存储skb->dev地址,

    然后取出设备索引(dev->ifindex)赋值给skb->skb_iif。由于此时skb->dev为NULL,执行到这条取值指令时就会触发空指针访问,导致系统异常。

       

      下面就是分析为什么这个流程里skb->dev值是一个空指针。

    2、I40E网卡异常代码分析

      经过同事指点,发现了I40E网卡驱动(版本号2.0.30)一处BUG,位于i40e_clean_rx_irq函数中,即下面代码第53行带注释的位置(原文标红处):

      1 static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
      2 {
      3     unsigned int total_rx_bytes = 0, total_rx_packets = 0;
      4     struct sk_buff *skb = rx_ring->skb;
      5     u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
      6     bool failure = false;
      7 
      8     while (likely(total_rx_packets < (unsigned int)budget)) {
      9         union i40e_rx_desc *rx_desc;
     10         u16 vlan_tag;
     11         u8 rx_ptype;
     12         u64 qword;
     13 
     14         /* return some buffers to hardware, one at a time is too slow */
     15         if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
     16             failure = failure ||
     17                   i40e_alloc_rx_buffers(rx_ring, cleaned_count);
     18             cleaned_count = 0;
     19         }
     20 
     21         rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
     22 
     23         /* status_error_len will always be zero for unused descriptors
     24          * because it's cleared in cleanup, and overlaps with hdr_addr
     25          * which is always zero because packet split isn't used, if the
     26          * hardware wrote DD then it will be non-zero
     27          */
     28         if (!i40e_test_staterr(rx_desc,
     29                        BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
     30             break;
     31 
     32         /* This memory barrier is needed to keep us from reading
     33          * any other fields out of the rx_desc until we know the
     34          * DD bit is set.
     35          */
     36         dma_rmb();
     37         /* 如果被释放的skb没有赋值为空,则会继续被用来接收网卡中的数据,这个skb放入接收队列后,会与系统其它流程产生访问冲突 */
     38         skb = i40e_fetch_rx_buffer(rx_ring, rx_desc, skb);
     39         if (!skb)
     40             break;
     41 
     42         cleaned_count++;
     43 
     44         if (i40e_is_non_eop(rx_ring, rx_desc, skb))
     45             continue;
     46 
     47         /* ERR_MASK will only have valid bits if EOP set, and
     48          * what we are doing here is actually checking
     49          * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
     50          * the error field
     51          */
     52         if (unlikely(i40e_test_staterr(rx_desc, BIT(I40E_RXD_QW1_ERROR_SHIFT)))) {
     53             dev_kfree_skb_any(skb);  // 此处释放skb后没有赋值为NULL,循环会进入i40e_fetch_rx_buffer继续接收网卡里的其它数据,但是此时skb可能已经回收到系统了!
     54             continue;
     55         }
     56 
     57         if (i40e_cleanup_headers(rx_ring, skb)) {
     58             skb = NULL;
     59             continue;
     60         }
     61 
     62         /* probably a little skewed due to removing CRC */
     63         total_rx_bytes += skb->len;
     64 
     65         qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
     66         rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
     67                I40E_RXD_QW1_PTYPE_SHIFT;
     68 
     69         /* populate checksum, VLAN, and protocol */
     70         i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
     71 
     72 #ifdef I40E_FCOE
     73         if (unlikely(
     74             i40e_rx_is_fcoe(rx_ptype) &&
     75             !i40e_fcoe_handle_offload(rx_ring, rx_desc, skb))) {
     76             dev_kfree_skb_any(skb);
     77             continue;
     78         }
     79 #endif
     80         vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
     81                le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
     82 
     83         i40e_receive_skb(rx_ring, skb, vlan_tag);
     84         skb = NULL;
     85 
     86         /* update budget accounting */
     87         total_rx_packets++;
     88     }
     89 
     90     rx_ring->skb = skb;
     91 
     92     u64_stats_update_begin(&rx_ring->syncp);
     93     rx_ring->stats.packets += total_rx_packets;
     94     rx_ring->stats.bytes += total_rx_bytes;
     95     u64_stats_update_end(&rx_ring->syncp);
     96     rx_ring->q_vector->rx.total_packets += total_rx_packets;
     97     rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
     98 
     99     /* guarantee a trip back through this routine if there was a failure */
    100     return failure ? budget : (int)total_rx_packets;
    101 }

         从驱动收包流程看,当出现RXD_QW1错误时,要释放掉skb,然后继续去从rx_buffer接收数据。但是这个skb在释放后,

    没有赋值为NULL,会在下一个循环中调用i40e_fetch_rx_buffer函数,会继续使用这个已经释放的skb去接收报文。这个skb即将

    被系统回收,是一个非法的地址,当这个skb被放入接收队列处理时,也有可能被另外一个cpu分配并使用,skb里面的所有值

    都会被修改,包括dev,所以就有可能出现访问skb->dev异常的情况。

    3、分析skb内容:

      

        异常skb里面的queue_mapping、transport_header、network_header、mac_header都是有效值,而且queue_mapping是在

    dev_queue_xmit里面赋值的,不应该出现在接收流程里。所以skb被其他流程修改的可能性很大。

      而且查看地址为ffff88386438eb00的skb->dev信息,还是正常值,但是存储在orig_dev里面的值是异常的。

      

      根据dev信息,它属于eth7,这个skb应该是在cpu46上发送的,但是实际不在cpu46的发送队列上,而是在cpu45发送队列上。

      setting tx-22 xps_cpus ----> 4000,00000000 (cpu46)

    4、查看softnet_data的入向队列input_pkt_queue(收包backlog队列)

      p softnet_data:

      [45]: ffff88407dab5300   ##cpu45上的softnet_data地址

      [46]: ffff88407dad5300   ##cpu46上的softnet_data地址

    crash> struct softnet_data.input_pkt_queue ffff88407dab5300  

      input_pkt_queue = {     //入向skb队列(收包backlog队列)

        next = 0xffff880b8c0c8c00,   

        prev = 0xffff882dad78ed00,

        qlen = 284,

        lock = {

          {

            rlock = {

              raw_lock = {

                {

                  head_tail = 2055371394,

                  tickets = {

                    head = 31362,

                    tail = 31362

                  }

                }

              }

            }

          }

        }

      }

    查看 input_pkt_queue 列表内容,其在softnet_data内的偏移为0xC0

    crash> list ffff88407dab53C0

    ffff88407dab53c0

    ffff880b8c0c8c00

    ffff88091e8f9c00

    ffff8836118d7e00

    ffff8836118d7f00

    ffff88141b640a00

    ffff8836118d7100

    ffff8836118d7500

    ffff88386438f700

    ffff88386438eb00   ## 这个skb与异常进程上的skb一致

    ffff8806b1e4c900

    ffff8806b1e4cd00

    ffff8806b1e4ce00

    ffff8806b1e4d900

    ffff883f96b67500

    ffff883f96b66f00

    ffff8823b19ea700

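    综上所述,现场服务器异常复位的原因是:I40E网卡驱动在收包出错分支中释放skb后没有将指针置为NULL,后续循环继续使用这个已释放的skb(use-after-free),而该skb同时又被其它流程重新分配并修改,最终在__netif_receive_skb_core中访问到空的skb->dev,引起系统异常。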

    解决方案:

    修改网卡驱动i40e_clean_rx_irq函数,在出错分支的dev_kfree_skb_any(skb)之后增加skb = NULL,修正这个bug;或者升级到已修复该问题的更高版本I40E驱动。

        

  • 相关阅读:
    各地电信运营商插广告赚钱,北京联通也不甘落后
    也谈Server Limit DOS的解决方案
    Still Believe
    无奈小虫何
    好朋有也有类别
    无为而治
    青鸟随想
    落寞时分
    网站开发学习路线和资料
    C++实例 添加快捷键表
  • 原文地址:https://www.cnblogs.com/smith9527/p/11437647.html
Copyright © 2020-2023  润新知