The shutdown() system call closes a connection's read channel, its write channel, or both.
Closing the read channel discards any data already queued on the socket fd as well as data that arrives after shutdown() was called.
Closing the write channel is protocol-dependent; for TCP, all queued data is transmitted first, and then a FIN is sent.
However, to delete the socket and release the file descriptor we must use close(); close() is also invoked implicitly when the process exits.
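At the API level this looks as follows. A minimal runnable sketch using socketpair() so that no network peer is needed (TCP sockets behave the same way through this API):

```c
/* Minimal runnable sketch of the shutdown()/close() semantics above, using
 * socketpair() so no network peer is required. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
    int sv[2];
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
        perror("socketpair");
        return 1;
    }

    write(sv[0], "bye", 3);
    shutdown(sv[0], SHUT_WR);      /* write channel closed; queued data is still delivered */

    char buf[16];
    ssize_t n = read(sv[1], buf, sizeof(buf));
    printf("got %zd bytes\n", n);  /* 3: data queued before shutdown() is not lost */
    n = read(sv[1], buf, sizeof(buf));
    printf("then %zd (EOF)\n", n); /* 0: the peer sees end-of-stream */

    /* shutdown() never releases the descriptors; close() does. */
    close(sv[0]);
    close(sv[1]);
    return 0;
}
```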
```c
enum sock_shutdown_cmd {
    SHUT_RD,    /* close the read channel */
    SHUT_WR,    /* close the write channel */
    SHUT_RDWR,  /* close both */
};

int inet_shutdown(struct socket *sock, int how)
{
    struct sock *sk = sock->sk;
    int err = 0;

    /* This should really check to make sure
     * the socket is a TCP socket. (WHY AC...)
     */
    how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
              1->2 bit 2 snds.
              2->3 (the +1 turns "how" into a bitmask) */
    if ((how & ~SHUTDOWN_MASK) || !how)	/* MAXINT->0 */
        return -EINVAL;

    lock_sock(sk);
    if (sock->state == SS_CONNECTING) { /* connection setup not finished yet */
        if ((1 << sk->sk_state) &
            (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
            sock->state = SS_DISCONNECTING;
        else
            sock->state = SS_CONNECTED;
    }

    switch (sk->sk_state) {
    case TCP_CLOSE:
        err = -ENOTCONN;
        /* Hack to wake up other listeners, who can poll for
           POLLHUP, even on eg. unconnected UDP sockets -- RR */
    default:
        sk->sk_shutdown |= how;
        if (sk->sk_prot->shutdown)
            sk->sk_prot->shutdown(sk, how);
        break;

    /* Remaining two branches are temporary solution for missing
     * close() in multithreaded environment. It is _not_ a good idea,
     * but we have no choice until close() is repaired at VFS level.
     */
    case TCP_LISTEN: /* a listening socket cannot send, so anything other
                      * than RCV_SHUTDOWN is meaningless here */
        if (!(how & RCV_SHUTDOWN))
            break;
        /* Fall through */
    case TCP_SYN_SENT: /* for a TCP socket this points to tcp_disconnect */
        err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
        break;
    }

    /* Wake up anyone sleeping in poll. sk_state_change defaults to
     * sock_def_wakeup, which wakes any process blocked on the socket. */
    sk->sk_state_change(sk);
    release_sock(sk);
    return err;
}
```
对于 sk->sk_prot->shutdown 回调:udp 回调为NULL, TCP回调函数为 tcp_shutdown
对于sk->sk_prot->disconnect 回调:udp 回调为udp_disconnect(此处不会调用) , TCP回调函数为 tcp_disconnect
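For reference, these callbacks are wired up in the protocol operation tables. A simplified excerpt (most members elided; see net/ipv4/tcp_ipv4.c and net/ipv4/udp.c in the kernel tree):

```c
/* Simplified excerpt of the protocol operation tables, not a complete
 * definition. */
struct proto tcp_prot = {
    .name       = "TCP",
    .shutdown   = tcp_shutdown,
    .disconnect = tcp_disconnect,
    /* ... */
};

struct proto udp_prot = {
    .name       = "UDP",
    /* no .shutdown member is set, so it stays NULL and
     * inet_shutdown() skips the protocol-level callback */
    .disconnect = udp_disconnect,
    /* ... */
};
```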
Taking the TCP implementation as the example:
```c
/*
 * Shutdown the sending side of a connection. Much like close except
 * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
void tcp_shutdown(struct sock *sk, int how)
{
    /* We need to grab some memory, and put together a FIN,
     * and then put it into the queue to be sent.
     * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
     */
    /* no SEND_SHUTDOWN (i.e. "how" was not SHUT_WR/SHUT_RDWR):
     * no FIN needs to be sent */
    if (!(how & SEND_SHUTDOWN))
        return;

    /* If we've already sent a FIN, or it's a closed state, skip this. */
    if ((1 << sk->sk_state) &
        (TCPF_ESTABLISHED | TCPF_SYN_SENT |
         TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
        /* Clear out any half completed packets.  FIN if needed. */
        if (tcp_close_state(sk))   /* may a FIN be sent now? */
            tcp_send_fin(sk);      /* send the FIN */
    }
}
```
```c
/*
 * State processing on a close. This implements the state shift for
 * sending our FIN frame. Note that we only send a FIN for some
 * states. A shutdown() may have already sent the FIN, or we may be
 * closed.
 */
static const unsigned char new_state[16] = {
  /* current state:        new state:      action: */
  [0 /* (Invalid) */]   = TCP_CLOSE,
  [TCP_ESTABLISHED]     = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]        = TCP_CLOSE,
  [TCP_SYN_RECV]        = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]       = TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]       = TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]       = TCP_CLOSE,
  [TCP_CLOSE]           = TCP_CLOSE,
  [TCP_CLOSE_WAIT]      = TCP_LAST_ACK  | TCP_ACTION_FIN,
  [TCP_LAST_ACK]        = TCP_LAST_ACK,
  [TCP_LISTEN]          = TCP_CLOSE,
  [TCP_CLOSING]         = TCP_CLOSING,
  [TCP_NEW_SYN_RECV]    = TCP_CLOSE,  /* should not happen ! */
};

static int tcp_close_state(struct sock *sk)
{
    int next = (int)new_state[sk->sk_state]; /* next state plus the FIN action bit */
    int ns = next & TCP_STATE_MASK;          /* the next state itself */

    tcp_set_state(sk, ns);                   /* perform the transition */

    return next & TCP_ACTION_FIN;            /* non-zero: a FIN must be sent */
}
```
From the new_state[] table above:
Calling shutdown() in TCP_ESTABLISHED or TCP_SYN_RECV moves the connection to TCP_FIN_WAIT1; calling it in TCP_CLOSE_WAIT moves it to TCP_LAST_ACK. In both cases tcp_send_fin() is invoked to transmit the FIN:
```c
/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
void tcp_send_fin(struct sock *sk)
{
    struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    /* Optimization, tack on the FIN if we have one skb in write queue and
     * this skb was not yet sent, or we are under memory pressure.
     * Note: in the latter case, FIN packet will be sent after a timeout,
     * as TCP stack thinks it has already been transmitted.
     */
    /* If the queue still holds unsent data, or TCP is under memory
     * pressure, piggyback the FIN flag on the last queued segment. */
    if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
        TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; /* FIN rides on the data */
        TCP_SKB_CB(tskb)->end_seq++;   /* the FIN consumes one sequence number */
        tp->write_seq++;
        if (!tcp_send_head(sk)) {
            /* This means tskb was already sent.
             * Pretend we included the FIN on previous transmit.
             * We need to set tp->snd_nxt to the value it would have
             * if FIN had been sent. This is because retransmit path
             * does not change tp->snd_nxt.
             */
            tp->snd_nxt++;
            return;
        }
    } else { /* the queue is empty, so allocate a fresh skb */
        skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
        if (unlikely(!skb)) {
            if (tskb)
                goto coalesce;
            return;
        }
        skb_reserve(skb, MAX_TCP_HEADER);
        sk_forced_mem_schedule(sk, skb->truesize);
        /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
        tcp_init_nondata_skb(skb, tp->write_seq,
                             TCPHDR_ACK | TCPHDR_FIN); /* set the FIN|ACK flags */
        tcp_queue_skb(sk, skb);  /* append to the send queue */
    }
    /* Push everything out; Nagle is turned off, so all queued segments
     * (the last one carrying the FIN) are sent now. */
    __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
```
So when the connection is closed with data still in the send buffer, TCP keeps transmitting that data to the peer, and the last of those segments carries the FIN flag.
```c
/* These states need RST on ABORT according to RFC793 */
static inline bool tcp_need_reset(int state)
{
    return (1 << state) &
           (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
            TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
```
tcp_disconnect() does not wait for any reply from the peer: it tears down the local connection state immediately and sends no FIN, though it may send an RST instead. An RST is sent when the state machine is in one of ESTABLISHED, CLOSE_WAIT, FIN_WAIT1, FIN_WAIT2 or SYN_RECV (tcp_need_reset; the shutdown path never satisfies this, since inet_shutdown() only calls tcp_disconnect() from TCP_LISTEN or TCP_SYN_SENT), or when the state is CLOSING or LAST_ACK and the send queue still holds unsent data.

```c
int tcp_disconnect(struct sock *sk, int flags)
{
    struct inet_sock *inet = inet_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    int err = 0;
    int old_state = sk->sk_state;

    if (old_state != TCP_CLOSE)
        tcp_set_state(sk, TCP_CLOSE); /* unhash, unbind, jump to TCP_CLOSE */

    /* ABORT function of RFC793 */
    if (old_state == TCP_LISTEN) {
        inet_csk_listen_stop(sk);
    } else if (unlikely(tp->repair)) {
        sk->sk_err = ECONNABORTED;
    } else if (tcp_need_reset(old_state) ||
               (tp->snd_nxt != tp->write_seq && /* unsent data remains */
                (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
        /* The last check adjusts for discrepancy of Linux wrt. RFC
         * states
         */
        tcp_send_active_reset(sk, gfp_any()); /* send the RST */
        sk->sk_err = ECONNRESET;
    } else if (old_state == TCP_SYN_SENT)
        sk->sk_err = ECONNRESET;

    tcp_clear_xmit_timers(sk);  /* stop retransmit, delack, ... timers */
    __skb_queue_purge(&sk->sk_receive_queue);  /* drop the receive queue */
    tcp_write_queue_purge(sk);  /* drop the send queue */
    tcp_fastopen_active_disable_ofo_check(sk);
    skb_rbtree_purge(&tp->out_of_order_queue); /* drop the out-of-order queue */

    inet->inet_dport = 0;

    if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
        inet_reset_saddr(sk);

    sk->sk_shutdown = 0;
    sock_reset_flag(sk, SOCK_DONE);
    tp->srtt_us = 0;
    tp->write_seq += tp->max_window + 2;
    if (tp->write_seq == 0)
        tp->write_seq = 1;
    icsk->icsk_backoff = 0;
    tp->snd_cwnd = 2;
    icsk->icsk_probes_out = 0;
    tp->packets_out = 0;
    tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
    tp->snd_cwnd_cnt = 0;
    tp->window_clamp = 0;
    tcp_set_ca_state(sk, TCP_CA_Open);
    tcp_clear_retrans(tp);
    inet_csk_delack_init(sk);
    /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
     * issue in __tcp_select_window()
     */
    icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
    tcp_init_send_head(sk);
    memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
    __sk_dst_reset(sk);
    dst_release(sk->sk_rx_dst);
    sk->sk_rx_dst = NULL;
    tcp_saved_syn_free(tp);

    /* Clean up fastopen related fields */
    tcp_free_fastopen_req(tp);
    inet->defer_connect = 0;

    WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

    sk->sk_error_report(sk);
    return err;
}
```
If the application wants to close both the read and write directions, it can call the shutdown() system call directly (with SHUT_RDWR).
Alternatively, the application can skip shutdown() entirely: a single close() call both closes the connection (read and write directions at once) and releases the socket's resources.
TCP is a full-duplex protocol, so the two directions of a connection must be closed independently. When one end has finished sending, its application calls shutdown() or close(), which sends a FIN terminating that direction. When the other end receives the FIN, it must inform its application that the peer has stopped sending. Once notified, that application can in turn call shutdown() or close() to terminate its own sending direction; only then is the TCP connection truly closed.
Whichever side receives a FIN must ACK it and report the event to its application as end of the connection's data stream.
Receiving a FIN only means the peer will send no more data; the local end may still transmit. This is TCP's half-close. The side that closes first, i.e. sends the first FIN, performs the active close; the side that receives the FIN performs the passive close.
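This is the classic half-close pattern in user space: send everything, shut down the write direction, then keep reading until the peer's FIN shows up as EOF. A sketch; half_close_exchange() is a hypothetical helper, and fd is assumed to be an already-connected TCP socket:

```c
/* Half-close pattern: send the last of our data, shutdown the write side,
 * then keep reading until the peer closes its direction. */
#include <unistd.h>
#include <sys/socket.h>

void half_close_exchange(int fd, const char *req, size_t len)
{
    char buf[4096];
    ssize_t n;

    write(fd, req, len);    /* last of our data */
    shutdown(fd, SHUT_WR);  /* FIN: "no more data from me" */

    /* The read direction is still open: consume the reply until the
     * peer's FIN is seen as EOF (read() returning 0). */
    while ((n = read(fd, buf, sizeof(buf))) > 0)
        /* process buf[0..n) */;

    close(fd);              /* now release the descriptor */
}
```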
Besides the standard four-way close handshake, TCP also supports a fast way to tear down a connection: the closing side sends an RST and then releases its local resources without waiting for any reply from the peer; the side receiving the RST must jump straight to TCP_CLOSE, without sending anything in response.
- Paths that send an RST via tcp_send_active_reset():
  - In tcp_close(): if skbs with unread data remain in the socket's sk_receive_queue, an RST is sent to the peer.
  - In tcp_out_of_resources(): when there are too many orphan sockets, or TCP memory is under severe pressure, an RST is sent to the peer.
    - tcp_out_of_resources() is invoked from the write-timeout handler (tcp_write_timeout) and from the zero-window probe timer (tcp_probe_timer).
    - When a listening socket is closed, it and the child sockets it spawned become orphan sockets via sock_orphan(), i.e. sock_set_flag(sk, SOCK_DEAD).
  - In tcp_disconnect(): if the socket is in TCPF_ESTABLISHED, TCPF_CLOSE_WAIT, TCPF_FIN_WAIT1, TCPF_FIN_WAIT2 or TCPF_SYN_RECV, an RST is sent to the peer; likewise if the socket is in TCPF_CLOSING or TCPF_LAST_ACK with tp->snd_nxt != tp->write_seq (the socket still has unsent data).
    - In tcp_close(): if the current state is LISTEN, inet_csk_listen_stop() is called, which internally invokes tcp_disconnect() for the pending child sockets.
    - In tcp_close(): if SO_LINGER is enabled with a linger time of zero, tcp_disconnect() is called via sk->sk_prot->disconnect(sk, 0): all received data is dropped and the connection is aborted immediately, i.e. an RST is sent and the receive queues are purged (see the sketch after this list).
- Paths that send an RST via tcp_v4_send_reset():
  - TCP receive, in tcp_v4_rcv(): if __inet_lookup_skb() cannot find the socket a segment is addressed to, an RST is sent (a segment that fails the checksum, by contrast, is simply dropped without any reply).
  - TCP has received a SYN, replied with SYN-ACK, and is waiting for the handshake's final ACK: in tcp_v4_do_rcv → tcp_v4_hnd_req → tcp_check_req, if the segment carries an RST, or a SYN with an invalid sequence number, an RST is sent.
  - After the final handshake ACK has been received and the child socket created: in tcp_v4_do_rcv → tcp_child_process → tcp_rcv_state_process → tcp_ack, if the awaited final ACK has a bad sequence number (before(ack, prior_snd_una)), an RST is sent.
  - In ESTABLISHED state: in tcp_v4_do_rcv → tcp_rcv_established → tcp_validate_incoming, if a SYN appears inside the current receive window (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)), an RST is sent.
  - During state transitions, in tcp_rcv_state_process:
    - if the socket is in LISTEN state and the segment carries an ACK, an RST is sent;
    - if the socket is in FIN_WAIT_1 or FIN_WAIT_2, the receive side has been shut down, and the segment carries new data, an RST is sent.
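The zero-linger abort path above is easy to trigger from user space. A minimal sketch, assuming fd is a connected TCP socket: with SO_LINGER enabled and l_linger set to 0, close() goes through the disconnect path and emits an RST instead of a FIN.

```c
/* Sketch of the zero-linger abortive close described above. */
#include <sys/socket.h>
#include <unistd.h>

void abortive_close(int fd)
{
    struct linger lg = {
        .l_onoff  = 1,  /* enable SO_LINGER */
        .l_linger = 0,  /* zero timeout: abort with RST on close() */
    };
    setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
    close(fd);          /* sends RST, discards queued data, skips TIME_WAIT */
}
```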
The close() system call
The call sequence for close():
```c
/*
 * The peer socket should always be NULL (or else). When we call this
 * function we are destroying the object and from then on nobody
 * should refer to it.
 */
int inet_release(struct socket *sock)
{
    struct sock *sk = sock->sk;

    if (sk) {
        long timeout;

        /* Applications forget to leave groups before exiting */
        ip_mc_drop_socket(sk);

        /* If linger is set, we don't return until the close
         * is complete.  Otherwise we return immediately. The
         * actually closing is done the same either way.
         *
         * If the close is due to the process exiting, we never
         * linger..
         */
        timeout = 0;
        if (sock_flag(sk, SOCK_LINGER) &&
            !(current->flags & PF_EXITING))
            /* if inet_release() runs because the process is exiting,
             * the connection is closed immediately (timeout stays 0);
             * otherwise honour the SO_LINGER setting */
            timeout = sk->sk_lingertime;
        sock->sk = NULL;
        sk->sk_prot->close(sk, timeout);
    }
    return 0;
}
```
When the actively closing end uses close() to fully shut down the connection, the flow is: the local end sends a FIN, the peer ACKs it, and the local end enters FIN_WAIT_2. Only once the peer sends its own FIN does the local end move on to TIME_WAIT. To guard against a peer that never sends that FIN, a FIN_WAIT_2 timer is started on entry to FIN_WAIT_2; if the connection lingers past its limit, it is moved to CLOSE. This applies to full closes via close(); a half-close via shutdown() does not start the FIN_WAIT_2 timer.

close() terminates a connection only indirectly: it decrements the descriptor's reference count, and the connection is actually closed only when the count reaches zero. shutdown(), by contrast, acts on the connection itself regardless of the reference count, and can terminate a single direction. Note: (1) if several processes share a socket, each close() decrements the count by one, and only when all of them have called close() (count zero) is the socket released; (2) in a multi-process program, once one process calls shutdown(sfd, SHUT_RDWR) the other processes can no longer communicate over that socket, whereas close(sfd) in one process does not affect the others.
```c
void tcp_close(struct sock *sk, long timeout)
{
    struct sk_buff *skb;
    int data_was_unread = 0;
    int state;

    lock_sock(sk);
    sk->sk_shutdown = SHUTDOWN_MASK;

    if (sk->sk_state == TCP_LISTEN) {
        tcp_set_state(sk, TCP_CLOSE);

        /* Special case. */
        inet_csk_listen_stop(sk); /* tear down the LISTEN-state socket */

        goto adjudge_to_death;
    }

    /* We need to flush the recv. buffs.  We do this only on the
     * descriptor close, not protocol-sourced closes, because the
     * reader process may not have drained the data yet!
     *
     * Free every segment left in the receive queue, counting how much
     * data is discarded, then reclaim the memory.
     */
    while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
        u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            len--;
        data_was_unread += len;
        __kfree_skb(skb);
    }

    sk_mem_reclaim(sk);

    /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
    if (sk->sk_state == TCP_CLOSE)
        goto adjudge_to_death;

    /* As outlined in RFC 2525, section 2.17, we send a RST here because
     * data was lost. To witness the awful effects of the old behavior of
     * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
     * GET in an FTP client, suspend the process, wait for the client to
     * advertise a zero window, then kill -9 the FTP client, wheee...
     * Note: timeout is always zero in such a case.
     */
    if (unlikely(tcp_sk(sk)->repair)) {
        sk->sk_prot->disconnect(sk, 0);
    } else if (data_was_unread) { /* unread data remained */
        /* Unread data was tossed, zap the connection: send an RST,
         * not a FIN, since a FIN would claim everything went fine. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
        tcp_set_state(sk, TCP_CLOSE);
        tcp_send_active_reset(sk, sk->sk_allocation);
    } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
        /* Check zero linger _after_ checking for unread data:
         * SO_LINGER with a zero timeout aborts via disconnect. */
        sk->sk_prot->disconnect(sk, 0);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    } else if (tcp_close_state(sk)) {
        /* We FIN if the application ate all the data before
         * zapping the connection.
         *
         * Remaining cases: SO_LINGER disabled, or enabled with a
         * non-zero timeout. Per the new_state[] transition table,
         * send a FIN if the action bit says so.
         */

        /* RED-PEN. Formally speaking, we have broken TCP state
         * machine. State transitions:
         *
         * TCP_ESTABLISHED -> TCP_FIN_WAIT1
         * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
         * TCP_CLOSE_WAIT -> TCP_LAST_ACK
         *
         * are legal only when FIN has been sent (i.e. in window),
         * rather than queued out of window. Purists blame.
         *
         * F.e. "RFC state" is ESTABLISHED,
         * if Linux state is FIN-WAIT-1, but FIN is still not sent.
         *
         * The visible declinations are that sometimes
         * we enter time-wait state, when it is not required really
         * (harmless), do not send active resets, when they are
         * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
         * they look as CLOSING or LAST_ACK for Linux)
         * Probably, I missed some more holelets.
         *                                              --ANK
         * XXX (TFO) - To start off we don't support SYN+ACK+FIN
         * in a single packet! (May consider it later but will
         * probably need API support or TCP_CORK SYN-ACK until
         * data is written and socket is closed.)
         */
        tcp_send_fin(sk);
    }

    /* Having sent the RST or FIN, wait here until the connection leaves
     * the draining states FIN_WAIT_1 / CLOSING / LAST_ACK (all data sent
     * and acknowledged) or until the lingertime expires; only then do we
     * go on to tear down the socket and free its resources. */
    sk_stream_wait_close(sk, timeout);

adjudge_to_death:
    state = sk->sk_state;
    sock_hold(sk);
    sock_orphan(sk); /* detach the socket from the process and its wait
                      * queue and mark it SOCK_DEAD: it is now an orphan */

    /* It is the last release_sock in its life. It will remove backlog. */
    release_sock(sk);

    /* Now socket is owned by kernel and we acquire BH lock
       to finish close. No need to check for user refs.
     */
    local_bh_disable();
    bh_lock_sock(sk);
    WARN_ON(sock_owned_by_user(sk));

    /* account the new orphan in the global counter;
     * see tcp_prot: .orphan_count = &tcp_orphan_count */
    percpu_counter_inc(sk->sk_prot->orphan_count);

    /* Have we already been destroyed by a softirq or backlog? */
    if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
        goto out;

    /* This is a (useful) BSD violating of the RFC. There is a
     * problem with TCP as specified in that the other end could
     * keep a socket open forever with no application left this end.
     * We use a 1 minute timeout (about the same as BSD) then kill
     * our end. If they send after that then tough - BUT: long enough
     * that we won't make the old 4*rto = almost no time - whoops
     * reset mistake.
     *
     * Nope, it was not mistake. It is really desired behaviour
     * f.e. on http servers, when such sockets are useless, but
     * consume significant resources. Let's do it with special
     * linger2 option.                                  --ANK
     */

    /* FIN_WAIT_2 -> CLOSE transition. The FIN_WAIT_2 timer is armed in
     * two nearly identical places (FIN_WAIT_1 -> FIN_WAIT_2, and here on
     * close while already in FIN_WAIT_2), so only this one is shown.
     * If linger2 < 0, the connection goes straight to CLOSE and an RST
     * is sent. If linger2 >= 0, the resulting timeout is compared with
     * TCP_TIMEWAIT_LEN: if larger, the FIN_WAIT_2 timer is armed for the
     * difference; otherwise the socket is handed over immediately to the
     * TIME_WAIT infrastructure as a time-wait control block whose
     * substate is FIN_WAIT2, which takes care of it from then on. */
    if (sk->sk_state == TCP_FIN_WAIT2) { /* our FIN was ACKed; waiting for the peer's FIN */
        struct tcp_sock *tp = tcp_sk(sk);
        if (tp->linger2 < 0) { /* no waiting: straight to CLOSE, plus an RST */
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                            LINUX_MIB_TCPABORTONLINGER);
        } else {
            const int tmo = tcp_fin_time(sk); /* how long to stay in FIN_WAIT_2,
                                               * from tcp_fin_timeout and the RTT */

            if (tmo > TCP_TIMEWAIT_LEN) {
                inet_csk_reset_keepalive_timer(sk,
                                tmo - TCP_TIMEWAIT_LEN);
            } else {
                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                goto out;
            }
        }
    }

    /* Not in CLOSE yet: check the orphan count and memory usage.
     * An orphan socket is one no longer attached to any file descriptor.
     * When an application close()s a connection, the sock may live on
     * until the protocol winds it down; it is useless to the application,
     * so the kernel keeps the orphan count low (short-lived HTTP requests
     * are a typical source). The limit and the current count can be
     * inspected via /proc/sys/net/ipv4/tcp_max_orphans. If there are too
     * many orphans, or the send-queue length and overall TCP memory
     * exceed their hard limits, close the socket immediately and send an
     * RST to the peer. */
    if (sk->sk_state != TCP_CLOSE) {
        sk_mem_reclaim(sk);
        if (tcp_check_oom(sk, 0)) {
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                            LINUX_MIB_TCPABORTONMEMORY);
        }
    }

    if (sk->sk_state == TCP_CLOSE) {
        struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
        /* We could get here with a non-NULL req if the socket is
         * aborted (e.g., closed with unread data) before 3WHS
         * finishes.
         */
        if (req)
            reqsk_fastopen_remove(sk, req, false);
        inet_csk_destroy_sock(sk);
    }
    /* Otherwise, socket is reprieved until protocol close. */

out:
    bh_unlock_sock(sk);
    local_bh_enable();
    sock_put(sk);
}
```
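The linger2 value tested in the FIN_WAIT_2 branch above can be set per socket with the TCP_LINGER2 option (its default comes from the net.ipv4.tcp_fin_timeout sysctl). A small sketch, assuming fd is a valid TCP socket:

```c
/* Sketch: tuning how long an orphaned connection may stay in FIN_WAIT_2. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

void tune_fin_wait2(int fd)
{
    int val = -1;   /* negative linger2: skip FIN_WAIT_2 and send an RST */
    setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &val, sizeof(val));

    val = 10;       /* or: keep the orphan in FIN_WAIT_2 for at most 10s */
    setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &val, sizeof(val));
}
```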
Differences between shutdown() and close():
close(sockfd) terminates a connection only indirectly: it decrements the descriptor's reference count and closes the connection only when that count reaches zero. In a multi-process or multi-threaded program, close() therefore only guarantees that the connection is closed for that particular process or thread. After client_fd = accept() followed by fork() (with the request handled in the child), the parent can close() its copy of the connection while the child keeps using it. Alternatively, shutdown() can be called on the socket: int shutdown(int sockfd, int how) acts on the connection itself regardless of the reference count, and can stop traffic in one direction only. For example, you can shut down the write side of a socket while continuing to receive on it until all data has been read.
-----> shutdown() tears down the connection itself: subsequent reads may see EOF, and subsequent writes may raise a SIGPIPE signal.
-----> close() only closes the calling process's socket fd; the connection stays open, and other processes holding the same socket can still read and write it. This is exactly the reference-count behavior: multiple processes share one kernel address space, and the connection lives until the last reference is dropped.
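A runnable sketch of this difference, using socketpair() in place of a TCP connection (the reference-count semantics are the same): close() in the parent leaves the child's copy fully usable, while shutdown() ends the conversation for every process sharing it.

```c
/* close() drops one process's reference; shutdown() kills the connection. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>

int main(void)
{
    int sv[2];
    socketpair(AF_UNIX, SOCK_STREAM, 0, sv);

    if (fork() == 0) {              /* child: inherits both descriptors */
        close(sv[1]);
        ssize_t n = write(sv[0], "hi", 2); /* works even after the parent
                                            * close()s its copy of sv[0] */
        printf("child write: %zd\n", n);
        _exit(0);
    }

    close(sv[0]);                   /* refcount drops 2 -> 1; connection lives on */

    char buf[8];
    ssize_t n = read(sv[1], buf, sizeof(buf));
    printf("parent read: %zd\n", n);          /* 2: the child's write got through */

    shutdown(sv[1], SHUT_RDWR);     /* by contrast, this ends the connection
                                     * regardless of remaining references */
    wait(NULL);
    return 0;
}
```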
If the file descriptor has been closed by its process but the corresponding kernel struct sock still exists, that sock is called an orphan sock. For example, when a process calls close() while the socket's send buffer still holds unsent data, the struct sock cannot be freed yet: it can only be released once all buffered data has been transmitted and the TCP state machine has run to completion (or an RST has arrived from the peer). Until then, the sock is an orphan sock; too many of them would pin a lot of memory, so their number is capped. As the code above shows, once the orphan count exceeds the limit, the current socket's state is forced to TCP_CLOSE, and (at line 2220) the global orphan reference is dropped via sock_put, so the socket is not allowed to become an orphan sock.