• linux 3.10 tcp的accept测试


    net.ipv4.tcp_abort_on_overflow 为 0 

    有个兄弟跟我说accept的时候,如果故意不去accept,那么客户端connect的时候,一开始很快,后来就很慢:

    connect(3, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000554>
    connect(4, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000579>
    connect(5, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000199>
    connect(6, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000161>
    connect(7, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000546>
    connect(8, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000249>
    connect(9, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000545>
    connect(10, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000099>
    connect(11, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.002572>------------开始慢
    connect(12, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000476>
    connect(13, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.006768>
    connect(14, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000160>
    connect(15, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007360>
    connect(16, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000667>
    connect(17, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.006858>
    connect(18, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000394>
    connect(19, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007592>
    connect(20, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000396>
    connect(21, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007355>
    connect(22, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000095>
    connect(23, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007109>
    connect(24, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000250>
    ~

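    对应的抓包,看慢的那个是27102端口开始慢:它的syn在 11:08:38.730 发出后没有收到syn_ack,直到约3s后(11:08:41.732)才重发,这个间隔正好对应上面connect耗时的 3.00s,也就是客户端syn的初始重传超时(注意3.10内核的 TCP_TIMEOUT_INIT 是1s,这里的3s说明客户端使用的初始超时是3s)。另外,可以排除是网络问题。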

    11:08:38.730265 IP 10.229.200.12.27102 > 10.229.142.81.3333: Flags [S], seq 3869742768, win 14600, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0
    11:08:38.730484 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0
    11:08:38.730614 IP 10.229.200.12.27101 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0
    11:08:39.731399 IP 10.229.142.81.3333 > 10.229.200.12.27100: Flags [S.], seq 833457747, ack 289533139, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0
    11:08:39.731932 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0
    11:08:40.131387 IP 10.229.142.81.3333 > 10.229.200.12.27101: Flags [S.], seq 958793751, ack 483265294, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0
    11:08:40.131919 IP 10.229.200.12.27101 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0
    11:08:41.731426 IP 10.229.142.81.3333 > 10.229.200.12.27100: Flags [S.], seq 833457747, ack 289533139, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0
    11:08:41.731961 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0
    11:08:41.732424 IP 10.229.200.12.27102 > 10.229.142.81.3333: Flags [S], seq 3869742768, win 14600, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0

    看了一下代码,捋一捋记录如下:

    既然connect慢,我们就从客户端 connect 开始分析,

    /*
     * connect(2) system call entry point.
     *
     * Looks up the socket behind fd, copies the user-supplied address into
     * kernel space, runs the security hook and finally dispatches to the
     * socket's own connect operation.
     *
     * fd:        file descriptor of the socket to connect
     * uservaddr: user-space pointer to the destination address
     * addrlen:   length of that address in bytes
     *
     * Returns err: the value left by the first step that fails, otherwise
     * the return value of sock->ops->connect().
     */
    SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
            int, addrlen)
    {
        struct socket *sock;
        struct sockaddr_storage address;
        int err, fput_needed;
    
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
            goto out;
        err = move_addr_to_kernel(uservaddr, addrlen, &address);
        if (err < 0)
            goto out_put;
    
        err =
            security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
        if (err)
            goto out_put;
    
        /* Article note: for the TCP socket under test this ends up in tcp_v4_connect. */
        err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
                     sock->file->f_flags);
    out_put:
        /* Drop the file reference taken by sockfd_lookup_light(). */
        fput_light(sock->file, fput_needed);
    out:
        return err;
    }
    
    /*
     * Initiate an outgoing IPv4 TCP connection.
     *
     * Steps visible in this function:
     *   - validate addr_len and the address family;
     *   - pick the next hop (the source-route option's faddr when set) and
     *     look up a route, refusing multicast/broadcast routes;
     *   - fill in the source address if the socket has none yet;
     *   - reset inherited timestamp state when the destination changed;
     *   - move the socket to TCP_SYN_SENT and hash it (inet_hash_connect);
     *   - re-route with the final ports, choose the initial write_seq;
     *   - call tcp_connect().
     *
     * sk:       socket being connected
     * uaddr:    destination address, interpreted as struct sockaddr_in
     * addr_len: size of *uaddr in bytes
     *
     * Returns 0 on success or a negative errno (-EINVAL, -EAFNOSUPPORT,
     * -ENETUNREACH, or whatever the routing/hash/tcp_connect steps return).
     * On failure after the state change the socket is put back to TCP_CLOSE.
     */
    int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
    {
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
    
        if (addr_len < sizeof(struct sockaddr_in))
            return -EINVAL;
    
        if (usin->sin_family != AF_INET)
            return -EAFNOSUPPORT;
    
        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                             sock_owned_by_user(sk));
        /* With a source-route option the first hop is opt.faddr, not daddr. */
        if (inet_opt && inet_opt->opt.srr) {
            if (!daddr)
                return -EINVAL;
            nexthop = inet_opt->opt.faddr;
        }
    
        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                      IPPROTO_TCP,
                      orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            if (err == -ENETUNREACH)
                IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
            return err;
        }
    
        /* Connecting to a multicast or broadcast route is refused. */
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
            ip_rt_put(rt);
            return -ENETUNREACH;
        }
    
        if (!inet_opt || !inet_opt->opt.srr)
            daddr = fl4->daddr;
    
        /* Adopt the source address chosen by routing if none was bound. */
        if (!inet->inet_saddr)
            inet->inet_saddr = fl4->saddr;
        inet->inet_rcv_saddr = inet->inet_saddr;
    
        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
            /* Reset inherited state */
            tp->rx_opt.ts_recent       = 0;
            tp->rx_opt.ts_recent_stamp = 0;
            if (likely(!tp->repair))
                tp->write_seq       = 0;
        }
    
        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
            tcp_fetch_timewait_stamp(sk, &rt->dst);
    
        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;
    
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
            inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    
        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
    
        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
            goto failure;
    
        sk_set_txhash(sk);
    
        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                       inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            /* NULL so that the failure path's ip_rt_put() gets no error pointer. */
            rt = NULL;
            goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
    
        /* Pick an initial sequence number unless one is already set. */
        if (!tp->write_seq && likely(!tp->repair))
            tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                   inet->inet_daddr,
                                   inet->inet_sport,
                                   usin->sin_port);
    
        inet->inet_id = tp->write_seq ^ jiffies;
    
        /* Article note: builds the SYN, sends it and arms the wait timer. */
        err = tcp_connect(sk);
    
        rt = NULL;
        if (err)
            goto failure;
    
        return 0;
    
    failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
    }

    在tcp_connect中,构造syn的时候,接收窗口设置如下:

        if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
            /* RFC1323: The window in SYN & SYN/ACK segments
             * is never scaled.
             */
            th->window    = htons(min(tp->rcv_wnd, 65535U));

    发送syn没有看到任何速度限制的地方,接收窗口也很大,不会出现对端回复报文没有窗口的情况,所以要看服务器端为啥会慢了。

    服务器端:看下协议栈接收syn的流程:

    在linux的内核协议栈中,tcp接收syn报文的话,是由处于listen态的sock处理的,流程为 tcp_v4_rcv --> tcp_v4_do_rcv --> tcp_v4_hnd_req,由于是第一次收到syn,那么显然半链接

    队列中并没有这个req,tcp_v4_hnd_req 肯定会返回listen的sk。接下来就进入 tcp_rcv_state_process :

     tcp_rcv_state_process :

    
    
    /*
    *  This function implements the receiving procedure of RFC 793 for
    *  all states except ESTABLISHED and TIME_WAIT.
    *  It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
    *  address independent.
    */
    这段英文注释有一些问题,因为tcp_rcv_state_process也会处理establish状态的报文,有兴趣的人可以提交patch修改一下。
    //
    tcp收包处理,处理各个状态上socket的情况,比如//客户端主动建立连接时,发送SYN段后,连接的状态变为SYN_SENT。此时如果收到SYNACK段,处理函数为 tcp_rcv_state_process()。 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; int queued = 0; bool acceptable; u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; switch (sk->sk_state) {//分状态处理报文 case TCP_CLOSE: goto discard; case TCP_LISTEN: if (th->ack) return 1; if (th->rst) goto discard; if (th->syn) { if (th->fin) goto discard; if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)//listen态的接收处理,回调 tcp_v4_conn_request return 1;

    对于listen的socket来说,它主要的工作就是,处理syn,icsk->icsk_af_ops->conn_request 其实就是回调的 tcp_v4_conn_request。

    tcp_v4_conn_request简单封装了一下 tcp_conn_request:

    int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
    {
        /* Never answer to SYNs send to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
            goto drop;
    
        return tcp_conn_request(&tcp_request_sock_ops,
                    &tcp_request_sock_ipv4_ops, sk, skb);--------两个指针要关心,一个是 tcp_request_sock_ops,一个是 tcp_request_sock_ipv4_ops,两个名称容易混淆
    drop:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return 0;
    }

     tcp_conn_request处理流程如下:

    //被回调,同时就是tcp_v4_send_synack()向客户端发送了SYN+ACK报文,inet_csk_reqsk_queue_hash_add()将sk添加到了syn_table中,填充了该客户端相关的信息
    int tcp_conn_request(struct request_sock_ops *rsk_ops,
                 const struct tcp_request_sock_ops *af_ops,
                 struct sock *sk, struct sk_buff *skb)
    {。。。
        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if ((sysctl_tcp_syncookies == 2 ||
         inet_csk_reqsk_queue_is_full(sk)) && !isn) {---------------isn为0,这个是drop的第一个条件
            want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
            if (!want_cookie)
                goto drop;
        }


        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {-----------这个是drop的第二个条件
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
            goto drop;
        }
    。。。
    }
     
    cat /proc/sys/net/ipv4/tcp_syncookies
    1

    对于正常流程,肯定不用看,无非就是把syn包处理一下,

    然后创建req:

    inet_reqsk_alloc(rsk_ops);//新建一个req
     
    初始化req:tcp_request_sock_ipv4_ops->init_req

    回复synack:tcp_request_sock_ipv4_ops->send_synack  

    添加到队列中:tcp_request_sock_ipv4_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);//回调 inet_csk_reqsk_queue_hash_add ,将req添加到了listen sk的syn_table中,填充了该客户端相关的信息。这样,再次收到客户端的ACK报文时,就可以在syn_table中找到相应项了。(这里不描述收到的是ack的情况)
     
    异常处理中,有两项值得关注,先看 inet_csk_reqsk_queue_is_full 是否满足
    /*
     * Socket-level wrapper: passes the socket's icsk_accept_queue to
     * reqsk_queue_is_full() and returns that result unchanged.
     */
    static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
    {
        return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
    }
    
    /*
     * Returns non-zero once listen_opt->qlen has reached
     * (1 << max_qlen_log): the right shift discards the low max_qlen_log
     * bits, so the result stays 0 while qlen is below that power of two.
     */
    static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
    {
        return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
    }

    那么 queue->listen_opt->qlen 和 queue->listen_opt->max_qlen_log 分别是多少?

    int reqsk_queue_alloc(struct request_sock_queue *queue,
                  unsigned int nr_table_entries)
    {
        size_t lopt_size = sizeof(struct listen_sock);
        struct listen_sock *lopt;
    
        nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
        nr_table_entries = max_t(u32, nr_table_entries, 8);
        nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
        lopt_size += nr_table_entries * sizeof(struct request_sock *);
        if (lopt_size > PAGE_SIZE)
            lopt = vzalloc(lopt_size);
        else
            lopt = kzalloc(lopt_size, GFP_KERNEL);
        if (lopt == NULL)
            return -ENOMEM;
    
        for (lopt->max_qlen_log = 3;
             (1 << lopt->max_qlen_log) < nr_table_entries;
             lopt->max_qlen_log++);

    可以看出,max_qlen_log 最大是nr_table_entries以2为底的对数(可能涉及到+1),最小为3,

    nr_table_entries 的取值过程是:先把 listen 传入的值限制在 [8, sysctl_max_syn_backlog] 范围内,再把(该值+1)向上取整为2的整数次幂,

    我server端测试程序的backlog:

    ret = listen(sockfd, 5);

    net.ipv4.tcp_max_syn_backlog = 131072--------------也就是2的17次方

    net.core.somaxconn = 65535

    根据上面的算法:

        nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
    

     此时,nr_table_entries 为5,

            nr_table_entries = max_t(u32, nr_table_entries, 8);

    此时,nr_table_entries 为8,

          nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);

    此时,nr_table_entries 为16,

    所以max_qlen_log 为4,而 nr_table_entries 应该为 16。

    可以用crash验证下:

    crash> ps |grep -i server
      70539  247342   0  ffff882787303f40  IN   0.0    5188    420  tcp_server.o
    crash> files ffff882787303f40
    PID: 70539  TASK: ffff882787303f40  CPU: 0   COMMAND: "tcp_server.o"
    ROOT: /    CWD: /home/caq/code
     FD       FILE            DENTRY           INODE       TYPE PATH
      0 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR  /dev/pts/25
      1 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR  /dev/pts/25
      2 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR  /dev/pts/25
      3 ffff88260b3bcc00 ffff885730c6fec0 ffff8827dbc35cb0 SOCK TCP
    crash> struct file.private_data ffff88260b3bcc00
      private_data = 0xffff8827dbc35c80
    crash> struct socket 0xffff8827dbc35c80
    struct socket {
      state = SS_UNCONNECTED,
      type = 1,
      flags = 0,
      wq = 0xffff8827ce157000,
      file = 0xffff88260b3bcc00,
      sk = 0xffff8827dc992000,
      ops = 0xffffffff8176ffc0 <inet_stream_ops>
    }
    crash> struct inet_connection_sock.icsk_accept_queue 0xffff8827dc992000
      icsk_accept_queue = {
        rskq_accept_head = 0xffff88277f63a700,
        rskq_accept_tail = 0xffff88587f734a00,
        syn_wait_lock = {
          raw_lock = {
            lock = 4294967296,
            {
              read = 0,
              write = 1
            }
          }
        },
        rskq_defer_accept = 0 '00',
        listen_opt = 0xffff881fed0b9380,
        fastopenq = 0x0
      }
    crash> struct listen_sock 0xffff881fed0b9380
    struct listen_sock {
      max_qlen_log = 4 '04',------------------这个是4,和算法相符合
      synflood_warned = 0 '00',
      qlen = 6,
      qlen_young = 2,
      clock_hand = 0,
      hash_rnd = 3192674541,
      nr_table_entries = 16,--------------------这个是16,和算法相符合,这个就是连接还未成功的队列长度,即三次握手没有完成
      syn_table = 0xffff881fed0b9398
    }

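    qlen是一个动态的值,当它达到16(即 qlen >> max_qlen_log 不为0)时,inet_csk_reqsk_queue_is_full 成立,此时如果 tcp_syn_flood_action 返回的 want_cookie 为0,就会drop掉入向的syn;如果没有达到16,inet_csk_reqsk_queue_is_full 不满足,则继续往下判断。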

    [root@localhost code]# netstat -s |grep -i drop
        474 dropped because of missing route
        4893 SYNs to LISTEN sockets dropped
    [root@localhost code]# netstat -s |grep -i drop
        474 dropped because of missing route
        4898 SYNs to LISTEN sockets dropped
    [root@localhost code]# netstat -s |grep -i overflow
        5075 times the listen queue of a socket overflowed
    [root@localhost code]# netstat -s |grep -i overflow
        5080 times the listen queue of a socket overflowed

    由于overflow的也会记录在drop中,有必要同时看:

    [root@localhost code]# netstat -s |grep -E 'overflow|drop'
        474 dropped because of missing route
        5313 times the listen queue of a socket overflowed
        5313 SYNs to LISTEN sockets dropped
    [root@localhost code]# netstat -s |grep -E 'overflow|drop'
        474 dropped because of missing route
        5334 times the listen queue of a socket overflowed
        5334 SYNs to LISTEN sockets dropped

    原来drop的syn全部是来自于overflowed(两个计数同步增长),也就是命中了第二个条件,我们来看第二个条件:

    /*
     * True when sk_ack_backlog exceeds sk_max_ack_backlog. The comparison is
     * strict (>), so this only reports "full" once the backlog counter has
     * reached sk_max_ack_backlog + 1.
     */
    static inline bool sk_acceptq_is_full(const struct sock *sk)
    {
        return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
    }

    /* Returns the qlen_young counter stored in the queue's listen_opt. */
    static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
    {
        return queue->listen_opt->qlen_young;
    }
     
    /*
     * Socket-level wrapper: passes the socket's icsk_accept_queue to
     * reqsk_queue_len_young() and returns that result unchanged.
     */
    static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
    {
        return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
    }
    第二个条件如下:
        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
            goto drop;
        }
     

    先看 sk_acceptq_is_full,其实就是判断 sk_ack_backlog  和  sk_max_ack_backlog 的大小,sk_max_ack_backlog 就是listen传入的backlog(不超过 net.core.somaxconn),也就是5,是固定的。

    而 sk_ack_backlog  就是三次握手成功的链接个数。那么只要大于5,就会出现overflow,我们来看一下accept队列中是不是有6个呢?看如下打印:

    crash> list request_sock.dl_next 0xffff88277f63a700
    ffff88277f63a700
    ffff88260b3bca00
    ffff88260b355700
    ffff8821d58b7f00
    ffff8857a97d3a00
    ffff88587f734a00

     果然是只有6个,当然,这6个是不是一直不变,不是的,因为它要维护tcp的状态机,当close掉一个,又可以增加一个链路,

    除了这个条件,我们同时需要满足:inet_csk_reqsk_queue_young(sk) > 1这个条件,这个条件又是怎么来满足的呢?

    /*
     * Socket-level wrapper: passes the socket's icsk_accept_queue to
     * reqsk_queue_len_young() and returns that result unchanged.
     */
    static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
    {
        return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
    }
    
    /* Returns the qlen_young counter stored in the queue's listen_opt. */
    static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
    {
        return queue->listen_opt->qlen_young;
    }

    原来是要求半链接队列的 qlen_young >1,由于这个值是用来描述半链接队列中的新链接的个数的,所谓的新就是,处于半链接的req,发送的syn_ack之后,出现了超时,那么就不是新的,

    当服务器收到客户端对syn_ack的ack之后,会将半链接的req move到全链接的accept队列,此时会同时减少半链接的syn_table中的qlen和qlen_young。所以 qlen_young 大于1,则至少

    说明 有一些新的syn过来请求我们服务器,而我们服务器响应不是那么及时。此时对新来的syn来说,进行drop。连req都不会生成,也不会插入到半链接队列了。客户端发现syn超时后,

    自然会重传syn,此时connect被阻塞,表现出来就是慢。对于后面的链接开始出现connect慢这个现象来说,到这里已经理清楚了。

    当然不是所有的syn都会丢弃,毕竟young的判断是个瞬时值,所以会出现下面描述的情况:

    我们通过抓包发现,超过backlog这么多链路没有accept之后,客户端发起syn,服务器端还回复了syn_ack,如果是丢弃了syn,那么就不可能回复syn_ack,此时客户端并不知道自己的syn被丢弃了,而是傻傻地等,一般这个超时时间是 3s,并且是 exponential backoff 的,从这个角度说,就是connect慢了。也就回答了上面connect慢的问题。

    显然,要想让服务器端的三次握手

    无法完成,还有一个丢弃的地方,就是丢弃第三次握手的报文,也就是ack。这个丢弃发生时,连接已经进入 incomplete sockets queue,Client 只要发数据上来,服务端就会立即重传 SYN/ACK,所以不会直接产生延迟,只是会导致客户端establish的socket比服务器端看起来多。

    通过走查代码发现, tcp_v4_syn_recv_sock 也存在 sk_acceptq_is_full 的判断流程:

        if (sk_acceptq_is_full(sk))
            goto exit_overflow;
    
        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
            goto exit_nonewsk;
    。。。
    //异常返回在下
    exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    exit_nonewsk:
        dst_release(dst);
    exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return NULL;

    通过这段代码看出,同样也可以造成listenoverflow的统计打印和listendrop的打印,根据抓包来反推,是这个函数限制了链路。 tcp_check_req 会回调:

    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);//回调 tcp_v4_syn_recv_sock ,v6是 tcp_v6_syn_recv_sock

    另外说一下: 

    在accept队列的req,此时虽然用户没有调用accept来取走req,但是状态维护还是按照tcp的状态图来处理,比如在accept队列中的一个req中的sock,收到一个fin包,

    则会在 tcp_data_queue-->tcp_fin ,然后将其状态设置为close-wait。其实在 int tcp_v4_rcv(struct sk_buff *skb) 中,当收到skb,首先会

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);//根据四元组 查找sk,由于我们之前迁移到accept队列的时候,已经将我们的sk加入到了tcp_hashinfo这个结构的ehash中,那么我们在accept队列中的req中的sk也会被索引到,直接处理,而不关心这个sk 到底处于什么队列中,当然,当listen的进程退出时,这些req同样也会被释放,sk自然也会被释放。
    crash> struct request_sock.sk 0xffff88277f63a700
      sk = 0xffff882760be7800
    crash> sock.__sk_common 0xffff882760be7800
      __sk_common = {
        {
          skc_addrpair = 5876886395108779274,
          {
            skc_daddr = 231269642,
            skc_rcv_saddr = 1368319242
          }
        },
        {
          skc_hash = 1287152674,
          skc_u16hashes = {25634, 19640}
        },
        {
          skc_portpair = 218452600,
          {
            skc_dport = 21112,
            skc_num = 3333
          }
        },
        skc_family = 2,
        skc_state = 8 '',----------这个就是close_wait
    [root@localhost code]# ss -nt dst 10.229.200.13
    State       Recv-Q Send-Q                  Local Address:Port                                 Peer Address:Port
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:32437
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:32369
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:30802
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:32438
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:32431
    CLOSE-WAIT  1      0                       10.229.142.81:3333                                10.229.200.13:32439

    对于accept来说:

    accept() -> sys_accept4() -> inet_accept() -> inet_csk_accept()
    accept()实际要做的事件并不多,它的作用是返回一个已经建立连接的socket(即经过了三次握手),这个过程是异步的,accept()并不亲自去处理三次握手过程,而只是监听icsk_accept_queue队列,当有socket经过了三次握手,它就会被加到icsk_accept_queue中,所以accept要做的就是等待队列中插入socket,然后被唤醒并返回这个socket。而三次握手的过程完全是协议栈本身去完成的。换句话说,协议栈相当于写者,将socket写入队列,accept()相当于读者,将socket从队列读出。这个过程从listen就已开始,所以即使不调用accept(),客户仍可以和服务器建立连接,但由于没有处理,队列很快会被占满。
    协议栈向队列中加入socket的过程就是完成三次握手的过程,客户端通过向已知的listen fd发起连接请求,对于到来的每个连接,都会创建一个新的sock,当它经历了TCP_SYN_RECV -> TCP_ESTABLISHED后,就会被添加到icsk_accept_queue中,而监听的socket状态始终为TCP_LISTEN,保证连接的建立不会影响socket的接收。

    对于qlen_young来说,这个是一个瞬时值,

    crash> struct listen_sock 0xffff8856e7759380
    struct listen_sock {
      max_qlen_log = 4 '04',
      synflood_warned = 0 '00',
      qlen = 4,
      qlen_young = 0,
      clock_hand = 14,
      hash_rnd = 2613925628,
      nr_table_entries = 16,
      syn_table = 0xffff8856e7759398
    }
    crash> struct listen_sock 0xffff8856e7759380
    struct listen_sock {
      max_qlen_log = 4 '04',
      synflood_warned = 0 '00',
      qlen = 6,
      qlen_young = 2,
      clock_hand = 6,
      hash_rnd = 2613925628,
      nr_table_entries = 16,
      syn_table = 0xffff8856e7759398
    }

    由于客户端在接收到服务器端的syn_ack之后,就会将自己的链接状态改为establish,然后发送ack,所以会出现虽然很慢,但是从客户端来看,最终建联成功的链路数会超过listen的

    backlog的情况:

    ss -nt |grep -i 3333
    ESTAB      0      0             10.229.200.12:38826        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38839        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38825        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38840        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38360        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38852        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38829        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38313        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38835        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38834        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38828        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38851        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38860        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38861        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38827        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38857        10.229.142.81:3333
    ESTAB      0      0             10.229.200.12:38856        10.229.142.81:3333

     但是服务器端,在没有accept的情况下,始终只有6条链路,跟backlog强相关,backlog参数控制的是已经握手成功的还在accept queue的大小,如下图所示:(注意和前面客户端的

    打印不是同一次测试取的,所以端口对不上)

    服务器侧查看,在客户端端关闭之前:
    [root@localhost code]# ss -nt |grep 3333
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42123
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42126
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42125
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42124
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42122
    ESTAB      0      0      10.229.142.81:3333               10.229.200.12:42127
    
    服务器侧查看,客户端程序关闭之后:
    [root@localhost code]# ss -nt |grep 3333
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42123
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42126
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42125
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42124
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42122
    CLOSE-WAIT 1      0      10.229.142.81:3333               10.229.200.12:42127

    从代码看,客户端在syn_sent状态收到syn_ack之后,自然就迁移到了establish状态,所以客户端的establish的链路多,但是对服务器端来说,由于没有accept,所以不能往accept队列中迁移超过

    backlog个establish状态的链接,自然连接数就被限制住了,那么,处于半连接状态的sk,当收到ack之后,按道理迁移到establish状态,但是由于 tcp_v4_syn_recv_sock中丢弃了客户端的

    第三次握手的报文,所以就无法调用 inet_csk_reqsk_queue_add。

    所以两个地方限制了客户端的connect:

    1.服务器端通过丢弃握手的syn,来限制客户端的connect的速度,

    2.服务器端通过丢弃握手的ack,来限制服务器端的该listen的socket的establish状态的总量。限制的阈值由listen的backlog限定。

    水平有限,如果有错误,请帮忙提醒我。如果您觉得本文对您有帮助,可以点击下面的 推荐 支持一下我。版权所有,需要转发请带上本文源地址,博客一直在更新,欢迎 关注 。
  • 相关阅读:
    Word批量转PDF(内容转图片,防复制文字)
    Word批量转PDF或者图片(实现方式二)
    Word批量生成软件(实现方式三)
    合同批量生成软件/工具(实现方式三)
    Word批量打印软件/工具
    Word文件批量查找替换字符串
    Word批量生成软件(实现方式二)
    Word批量生成软件
    合同批量生成软件/工具(实现方式二)
    MySQL处理大量数据的效率问题
  • 原文地址:https://www.cnblogs.com/10087622blog/p/9797193.html
Copyright © 2020-2023  润新知