net.ipv4.tcp_abort_on_overflow 为 0
有个兄弟跟我说,服务器端listen之后如果故意不去accept,那么客户端connect的时候,一开始很快,后来就很慢:
connect(3, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000554> connect(4, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000579> connect(5, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000199> connect(6, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000161> connect(7, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000546> connect(8, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000249> connect(9, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000545> connect(10, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000099> connect(11, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.002572>------------开始慢 connect(12, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000476> connect(13, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.006768> connect(14, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000160> connect(15, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007360> connect(16, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000667> connect(17, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.006858> connect(18, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000394> connect(19, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007592> connect(20, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 
<0.000396> connect(21, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007355> connect(22, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000095> connect(23, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <3.007109> connect(24, {sa_family=AF_INET, sin_port=htons(3333), sin_addr=inet_addr("10.229.142.81")}, 16) = 0 <0.000250> ~
对应的抓包,看慢的那个是从27102端口开始慢的,它发syn建联的时候,出现了重发:抓包里该syn在11:08:38.730首发、11:08:41.732重发,间隔约3s(与connect耗时3.00x秒一致),也就是客户端syn的初始重传超时TCP_TIMEOUT_INIT(较新内核中为1s,较老内核中为3s),且可以排除网络问题。
11:08:38.730265 IP 10.229.200.12.27102 > 10.229.142.81.3333: Flags [S], seq 3869742768, win 14600, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0 11:08:38.730484 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0 11:08:38.730614 IP 10.229.200.12.27101 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0 11:08:39.731399 IP 10.229.142.81.3333 > 10.229.200.12.27100: Flags [S.], seq 833457747, ack 289533139, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0 11:08:39.731932 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0 11:08:40.131387 IP 10.229.142.81.3333 > 10.229.200.12.27101: Flags [S.], seq 958793751, ack 483265294, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0 11:08:40.131919 IP 10.229.200.12.27101 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0 11:08:41.731426 IP 10.229.142.81.3333 > 10.229.200.12.27100: Flags [S.], seq 833457747, ack 289533139, win 29200, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0 11:08:41.731961 IP 10.229.200.12.27100 > 10.229.142.81.3333: Flags [.], ack 1, win 29, length 0 11:08:41.732424 IP 10.229.200.12.27102 > 10.229.142.81.3333: Flags [S], seq 3869742768, win 14600, options [mss 1460,nop,nop,sackOK,nop,wscale 9], length 0
看了一下代码,捋一捋记录如下:
既然connect慢,我们就从客户端 connect 开始分析,
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = move_addr_to_kernel(uservaddr, addrlen, &address); if (err < 0) goto out_put; err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen); if (err) goto out_put; err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags);---------------调用 tcp_v4_connect out_put: fput_light(sock->file, fput_needed); out: return err; } /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; inet->inet_rcv_saddr = inet->inet_saddr; if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset 
inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port); inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk);----------构造syn,发送,并设置等待定时器 rt = NULL; if (err) goto failure; return 0; failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; return err; }
在tcp_connect中,构造syn的时候,接收窗口设置如下:
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(min(tp->rcv_wnd, 65535U));
发送syn没有看到任何速度限制的地方,接收窗口也很大,不会出现对端回复报文没有窗口的情况,所以要看服务器端为啥会慢了。
服务器端:看下协议栈接收syn的流程:
在linux的内核协议栈中,tcp接收syn报文的话,是由处于listen态的sock处理的,流程为 tcp_v4_rcv --> tcp_v4_do_rcv --> tcp_v4_hnd_req,由于是第一次收到syn,那么显然半连接
队列中并没有这个req,tcp_v4_hnd_req 肯定会返回listen的sk。接下来就进入 tcp_rcv_state_process :
tcp_rcv_state_process :
内核源码中该函数上方的英文注释(称它处理除 ESTABLISHED 和 TIME_WAIT 之外所有状态的报文)有一些问题,因为tcp_rcv_state_process也会处理ESTABLISHED状态的报文,有兴趣的人可以提交patch修改一下。
//tcp收包处理,处理各个状态上socket的情况,比如//客户端主动建立连接时,发送SYN段后,连接的状态变为SYN_SENT。此时如果收到SYNACK段,处理函数为 tcp_rcv_state_process()。 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; int queued = 0; bool acceptable; u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; switch (sk->sk_state) {//分状态处理报文 case TCP_CLOSE: goto discard; case TCP_LISTEN: if (th->ack) return 1; if (th->rst) goto discard; if (th->syn) { if (th->fin) goto discard; if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)//listen态的接收处理,回调 tcp_v4_conn_request return 1;
对于listen的socket来说,它主要的工作就是,处理syn,icsk->icsk_af_ops->conn_request 其实就是回调的 tcp_v4_conn_request。
tcp_v4_conn_request简单封装了一下 tcp_conn_request:
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&tcp_request_sock_ops, &tcp_request_sock_ipv4_ops, sk, skb);--------两个指针要关心,一个是 tcp_request_sock_ops,一个是 tcp_request_sock_ipv4_ops,两个名称容易混淆 drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; }
tcp_conn_request处理流程如下:
//被回调,同时就是tcp_v4_send_synack()向客户端发送了SYN+ACK报文,inet_csk_reqsk_queue_hash_add()将sk添加到了syn_table中,填充了该客户端相关的信息 int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb)
{。。。
cat /proc/sys/net/ipv4/tcp_syncookies 1
对于正常流程,肯定不用看,无非就是把syn包处理一下,
然后创建req:
回复synack:tcp_request_sock_ipv4_ops->send_synack
/*
 * Reports whether the request queue of the given socket is full.
 * Thin wrapper: applies reqsk_queue_is_full() to the socket's
 * icsk_accept_queue.
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
	return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}

/*
 * Returns qlen shifted right by max_qlen_log. For a non-negative qlen the
 * result is zero while qlen < (1 << max_qlen_log) and becomes non-zero
 * once qlen reaches that value, so callers can use it as a "full" flag.
 */
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
那么 queue->listen_opt->qlen 和 queue->listen_opt->max_qlen_log 分别是多少?
int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) { size_t lopt_size = sizeof(struct listen_sock); struct listen_sock *lopt; nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) return -ENOMEM; for (lopt->max_qlen_log = 3; (1 << lopt->max_qlen_log) < nr_table_entries; lopt->max_qlen_log++);
可以看出,max_qlen_log 是满足 (1 << max_qlen_log) >= nr_table_entries 的最小整数,也就是 nr_table_entries 以2为底的对数(nr_table_entries 此时已经是2的整数次幂),且最小为3,
nr_table_entries 则是先把 listen 传入的 backlog 限制在不超过 sysctl_max_syn_backlog 且不小于8的范围内,再加1后向上取整得到的一个2的整数次幂,
我server端测试程序的backlog:
ret = listen(sockfd, 5);
net.ipv4.tcp_max_syn_backlog = 131072--------------也就是2的17次方
net.core.somaxconn = 65535
根据上面的算法:
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
此时,nr_table_entries 为5,
nr_table_entries = max_t(u32, nr_table_entries, 8);
此时,nr_table_entries 为8,
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
此时,nr_table_entries 为16,
所以max_qlen_log 为4,而 nr_table_entries 应该为 16。
可以用crash验证下:
crash> ps |grep -i server 70539 247342 0 ffff882787303f40 IN 0.0 5188 420 tcp_server.o crash> files ffff882787303f40 PID: 70539 TASK: ffff882787303f40 CPU: 0 COMMAND: "tcp_server.o" ROOT: / CWD: /home/caq/code FD FILE DENTRY INODE TYPE PATH 0 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR /dev/pts/25 1 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR /dev/pts/25 2 ffff8857b9725a00 ffff8857bba40540 ffff885273c973a0 CHR /dev/pts/25 3 ffff88260b3bcc00 ffff885730c6fec0 ffff8827dbc35cb0 SOCK TCP crash> struct file.private_data ffff88260b3bcc00 private_data = 0xffff8827dbc35c80 crash> struct socket 0xffff8827dbc35c80 struct socket { state = SS_UNCONNECTED, type = 1, flags = 0, wq = 0xffff8827ce157000, file = 0xffff88260b3bcc00, sk = 0xffff8827dc992000, ops = 0xffffffff8176ffc0 <inet_stream_ops> } crash> struct inet_connection_sock.icsk_accept_queue 0xffff8827dc992000 icsk_accept_queue = { rskq_accept_head = 0xffff88277f63a700, rskq_accept_tail = 0xffff88587f734a00, syn_wait_lock = { raw_lock = { lock = 4294967296, { read = 0, write = 1 } } }, rskq_defer_accept = 0 '