tcp syn-synack-ack: the server receives the final ACK

How the TCP server handles the ACK that completes the three-way handshake.

tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_v4_hnd_req() + tcp_child_process()
tcp_v4_hnd_req() -> tcp_check_req() -> tcp_v4_syn_recv_sock()
tcp_child_process() -> tcp_rcv_state_process()

1. In the ESTABLISHED state, the segment is handled by tcp_rcv_established().
2. In the LISTEN state, the sock is listening; this is the passive-open receive path, which handles both SYN and ACK segments.
3. In any state other than ESTABLISHED or TIME_WAIT, tcp_rcv_state_process() does the work.
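As a rough illustration of this dispatch, here is a tiny standalone sketch (a hypothetical simplification; the real tcp_v4_do_rcv() works on struct sock and struct sk_buff and covers more cases):

#include <stdio.h>

/* Hypothetical, simplified mirror of the receive-path dispatch. */
enum state { ESTABLISHED, LISTEN, TIME_WAIT, OTHER };

static const char *dispatch(enum state st)
{
	if (st == ESTABLISHED)
		return "tcp_rcv_established()";		/* fast path */
	if (st == TIME_WAIT)
		return "tcp_timewait_state_process()";	/* handled separately */
	/* LISTEN and every remaining state end up here */
	return "tcp_rcv_state_process()";
}

int main(void)
{
	printf("%s\n", dispatch(LISTEN));	/* passive open: SYN/ACK handling */
	printf("%s\n", dispatch(OTHER));
	return 0;
}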

When a SYN segment arrives, the server allocates and initializes a connection request block (request_sock).

It then builds and sends the SYN-ACK, links the request block into the half-open queue, and starts the SYN-ACK retransmission timer. Once the final ACK arrives later, the three-way handshake can complete.
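The SYN-ACK retransmission timer backs off exponentially, which is visible in the expires computation inside tcp_check_req() below. A standalone sketch of just that arithmetic (assuming the usual defaults of TCP_TIMEOUT_INIT = 1 s and TCP_RTO_MAX = 120 s; the kernel actually works in jiffies):

#include <stdio.h>

/* Assumed defaults; the kernel defines these in jiffies (include/net/tcp.h). */
#define TCP_TIMEOUT_INIT  1	/* seconds */
#define TCP_RTO_MAX	120	/* seconds */

int main(void)
{
	/* Mirrors: expires = jiffies + min(TCP_TIMEOUT_INIT << num_timeout, TCP_RTO_MAX) */
	for (int num_timeout = 0; num_timeout <= 7; num_timeout++) {
		int timeout = TCP_TIMEOUT_INIT << num_timeout;
		if (timeout > TCP_RTO_MAX)
			timeout = TCP_RTO_MAX;
		printf("retransmit #%d: next SYN-ACK after %3d s\n",
		       num_timeout, timeout);
	}
	return 0;	/* prints 1, 2, 4, ..., capped at 120 s */
}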

In the second step of the handshake, after sending the SYN-ACK the server puts the request into the TCP_NEW_SYN_RECV state and inserts it into the ehash table.
When the last ACK of the handshake arrives, the kernel finds the TCP_NEW_SYN_RECV req, creates a new sock in the TCP_SYN_RECV state, moves it to TCP_ESTABLISHED, places it on the accept queue, and notifies select/epoll.

/*
 * After the server receives a SYN, tcp_conn_request() handles the
 * connection request. It calls inet_reqsk_alloc() to allocate the request
 * control block, whose ireq_state is initialized to TCP_NEW_SYN_RECV.
 */
/*
 * On the protocol stack's receive path, a SYN segment goes through:
 * tcp_v4_rcv
 *   ->__inet_lookup_skb()      // find the TCP_LISTEN sk in the listen hash
 *   ->tcp_v4_do_rcv()
 *     ->tcp_v4_cookie_check()  // syncookie check; a plain SYN carries no ACK, so this
 *                              // is skipped. If a syncookie validates, a new sock is created.
 *     ->tcp_rcv_state_process()
 *       ->tcp_v4_conn_request()
 * With syncookies, the server keeps no state at all.
 * With Fast Open, a new sock is created in the TCP_SYN_RECV state, queued for accept,
 * the data portion is placed on the receive queue, and a retransmission timer is armed.
 * For an ordinary SYN, the request_sock is set to TCP_NEW_SYN_RECV, inserted into the
 * ehash table, and a req_timer is armed to retransmit the SYN-ACK.
 */
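For comparison, this is roughly how a userspace server opts into the Fast Open path mentioned above (a sketch; the qlen value caps the number of pending TFO requests, and port 8080 is an arbitrary example):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	int qlen = 16;	/* max outstanding TFO requests (SYNs carrying data) */

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* arbitrary example port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");
	/* Enable Fast Open on the listener: a SYN with a valid cookie gets a
	 * child sock in TCP_SYN_RECV immediately, data and all. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("setsockopt(TCP_FASTOPEN)");
	listen(fd, 128);
	close(fd);
	return 0;
}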

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	/* ... header validation and th/iph setup elided from this excerpt ... */

lookup:
	/* Look up the transport control block: first in ehash, then in the listen hash */
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	/* TIME_WAIT is handled separately */
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;
	/*
	 * In the second step of the handshake, after sending the SYN-ACK the
	 * server enters TCP_NEW_SYN_RECV and inserts the req into ehash.
	 * When the last ACK arrives, the TCP_NEW_SYN_RECV req is found, a new
	 * sock is created in TCP_SYN_RECV, later moved to TCP_ESTABLISHED,
	 * placed on the accept queue, and select/epoll is notified.
	 */
	/* TCP_NEW_SYN_RECV handling */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		/* get the listener sock */
		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* remove the request from the queue */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;	/* redo the lookup from the skb */
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		/* Handle the third-step ACK; on success a new sock in the
		 * TCP_SYN_RECV state is created and queued for accept. */
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* no new sock was created; keep processing on the listener */
			reqsk_put(req);
			/* otherwise, tcp_child_process() calls tcp_rcv_state_process()
			 * on the TCP_SYN_RECV child sock, which enters TCP_ESTABLISHED */
		} else if (tcp_child_process(sk, nsk, skb)) {
			/* nsk has already been inserted into ehash */
			tcp_v4_send_reset(nsk, skb);	/* on failure, send a RST */
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	/* ... */
}

Analysis of tcp_check_req

/*
* Process an incoming packet for SYN_RECV sockets represented as a
* request_sock. Normally sk is the listener socket but for TFO it
* points to the child socket.
*
* XXX (TFO) - The current impl contains a special check for ack
* validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
*
* We don't need to initialize tmp_opt.sack_ok as we don't use the results
*/
/*
 * Processing of a received TCP segment:
 * 1. Parse the TCP options in the segment.
 * 2. Validate the sequence number.
 * 3. If it is a SYN segment, process it again as a (retransmitted) SYN.
 * 4. Check that the ACK's acknowledgment number is valid; if not, return without processing.
 * 5. Check that the segment's sequence number is valid; if not, drop the segment.
 * 6. If it is a RST segment or a new SYN segment, send a RST back to the client.
 * 7. Once the checks pass, create the corresponding "child" transport control block.
 * 8. Insert the request into the completed-connection queue, where it waits for the
 *    process's accept() call.
 *
 * @sk: the listening sock handling the server side of the connection
 * @skb: the received TCP segment
 * @req: the request_sock of the client's pending connection
 */
// If the segment is the ACK, a new "child" struct sock is created in tcp_v4_syn_recv_sock().
// We only get here for a retransmitted SYN from the client or the third-step ACK of the handshake.
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;
	bool own_req;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr) >> 2)) {	/* options are present */
		tcp_parse_options(skb, &tmp_opt, 0, NULL);	/* parse them */

		if (tmp_opt.saw_tstamp) {	/* timestamp option present */
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store true stamp, but it is not required,
			 * it can be estimated (approximately)
			 * from another data.
			 */
			/* estimate when req->ts_recent was stored (in seconds) */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ) << req->num_timeout);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);	/* PAWS: is the timestamp wrapped/stale? */
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {	/* retransmitted SYN, not rejected by PAWS */
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe SYN-RECV state. All the description
		 *  is wrong, we cannot believe to it and should
		 *  rely only on common sense and implementation
		 *  experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * they will be thrown away too.
		 *
		 * Reset timer after retransmitting SYNACK, similar to
		 * the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&
		    !inet_rtx_syn_ack(sk, req)) {
			unsigned long expires = jiffies;

			/* re-arm the request sock's timeout */
			expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
				       TCP_RTO_MAX);
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however, it does not work only
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes.  So
	   does sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK.  Otherwise, we create an established connection.  Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless, and rare.  Possibility is about the
	   same as us discovering intelligent life on another planet tomorrow.

	   But generally, we should (RFC lies!) to accept ACK
	   from SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before attempt to create socket.

	   In other words: suppose a malicious attacker sends a SYN to both A
	   and B whose source/destination IP and port match both ends. A then
	   believes the SYN came from B, and B believes it came from A:
	       A: receives SYN, seq=7
	       B: receives SYN, seq=7
	   A and B each reply with a SYN|ACK and, by an extremely unlikely
	   coincidence, both picked initial sequence number 7:
	       A: sends SYN|ACK, seq=7, ack_seq=8
	       B: sends SYN|ACK, seq=7, ack_seq=8
	   Each side then receives the other's SYN|ACK and treats it as a bare ACK:
	       A: receives "ACK" from B, seq=7, ack_seq=8
	       B: receives "ACK" from A, seq=7, ack_seq=8
	   The ack_seq is correct but the seq is wrong (it should be 8), so the
	   sequence check below fails and each side answers with an ACK:
	       A: sends ACK, seq=8, ack_seq=8
	       B: sends ACK, seq=8, ack_seq=8
	   When A and B receive these ACKs, both establish the connection and
	   wait to talk to each other. Since neither side actually initiated
	   the connection, no data is ever exchanged; the connection lingers
	   until the application closes it. The scenario is both very rare and
	   mostly harmless (it wastes one socket on each server), and it cannot
	   be fixed without changing the protocol (a SYN|ACK must be treated as
	   an ACK, and a segment with an invalid sequence number must be
	   answered with an ACK), so Linux TCP does not try to solve it.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;	/* bad ack number: return the listening socket; tcp_v4_do_rcv() will send a RST */

	/* Also, it would be not so bad idea to check rcv_tsecr, which
	 * is essentially ACK extension and too early or too late values
	 * should cause reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	/* PAWS-rejected or out-of-window segments are ACKed (unless RST) and dropped here */
	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST) &&	/* no RST flag */
		    !tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time))
			/* tcp_v4_reqsk_send_ack() can send an ACK while only a request_sock exists */
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	/* timestamp option seen and the segment does not start beyond what we expect next */
	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		req->ts_recent = tmp_opt.rcv_tsval;	/* record the peer's timestamp */

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* the segment's sequence number matches the original SYN;
		 * it is partially out of window and must carry data, or it
		 * would have been dropped by the window check above */
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))	/* silently drop non-ACK segments */
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop a bare ACK:
	 * the retransmission count has not reached the defer_accept limit
	 * and the ACK carries no data.
	 */
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	/*
	 * At this point the segment is a valid third-handshake ACK, so
	 * icsk_af_ops->syn_recv_sock (tcp_v4_syn_recv_sock()) is called to
	 * create the corresponding "child" sock; it also removes the req
	 * sock from ehash.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 req, &own_req);
	if (!child)
		goto listen_overflow;

	sock_rps_save_rxhash(child, skb);	/* sk->sk_rxhash = skb->hash */
	tcp_synack_rtt_meas(child, req);	/* update rtt_min, srtt, rto */
	return inet_csk_complete_hashdance(sk, child, req, own_req);	/* insert into the accept queue */

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO We try not to reset
		 * the local connection unless it's really necessary to
		 * avoid becoming vulnerable to outside attack aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk);
	}
	if (!fastopen) {
		inet_csk_reqsk_queue_drop(sk, req);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);
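The defer-accept branch in tcp_check_req() above is what drops the bare third-step ACK until data arrives. From userspace the behaviour is requested with TCP_DEFER_ACCEPT; a minimal sketch (the 5-second value is arbitrary):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int secs = 5;	/* keep retransmitting SYN-ACKs for ~5 s while waiting for data */

	/* The kernel converts the seconds into a retransmission-round count
	 * (rskq_defer_accept); tcp_check_req() above drops bare ACKs until
	 * either data arrives or that count is exceeded. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &secs, sizeof(secs)) < 0)
		perror("setsockopt(TCP_DEFER_ACCEPT)");
	close(fd);
	return 0;
}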

struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
					 struct request_sock *req, bool own_req)
{
	if (own_req) {
		/* drop the req sock from ehash if it is still there;
		 * normally inet_ehash_nolisten() has already removed it */
		inet_csk_reqsk_queue_drop(sk, req);
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		if (inet_csk_reqsk_queue_add(sk, req, child))	/* add to the listener's accept queue */
			return child;
	}
	/* Too bad, another child took ownership of the request, undo. */
	bh_unlock_sock(child);
	sock_put(child);
	return NULL;
}
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
                      struct request_sock *req,
                      struct sock *child)
{
    struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

    spin_lock(&queue->rskq_lock);
    if (unlikely(sk->sk_state != TCP_LISTEN)) {
        inet_child_forget(sk, req, child);
        child = NULL;
    } else {
        req->sk = child;
        req->dl_next = NULL;
        if (queue->rskq_accept_head == NULL)
            queue->rskq_accept_head = req;
        else
            queue->rskq_accept_tail->dl_next = req;
        queue->rskq_accept_tail = req;
        sk_acceptq_added(sk);
    }
    spin_unlock(&queue->rskq_lock);
    return child;
}
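The accept queue is just a singly linked FIFO threaded through dl_next with head and tail pointers. A userspace simulation of the rskq_accept_head/rskq_accept_tail manipulation above (hypothetical stand-in types, no locking):

#include <stdio.h>

/* Hypothetical stand-ins for struct request_sock and the queue head. */
struct req {
	int id;
	struct req *dl_next;
};

struct req_queue {
	struct req *head;	/* rskq_accept_head */
	struct req *tail;	/* rskq_accept_tail */
};

/* Mirrors the list append in inet_csk_reqsk_queue_add(). */
static void queue_add(struct req_queue *q, struct req *r)
{
	r->dl_next = NULL;
	if (q->head == NULL)
		q->head = r;
	else
		q->tail->dl_next = r;
	q->tail = r;
}

/* Mirrors what accept() does: pop from the head. */
static struct req *queue_pop(struct req_queue *q)
{
	struct req *r = q->head;

	if (r) {
		q->head = r->dl_next;
		if (!q->head)
			q->tail = NULL;
	}
	return r;
}

int main(void)
{
	struct req_queue q = { NULL, NULL };
	struct req a = { 1, NULL }, b = { 2, NULL };

	queue_add(&q, &a);
	queue_add(&q, &b);
	printf("accept -> req %d\n", queue_pop(&q)->id);	/* req 1: FIFO order */
	printf("accept -> req %d\n", queue_pop(&q)->id);	/* req 2 */
	return 0;
}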

/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	/* create a sock initialized from req; it starts in the TCP_SYN_RECV state */
	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;	/* take over the IP options */
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	/* set up the congestion control algorithm; enter TCP_CA_OPEN */
	tcp_ca_openreq_child(newsk, dst);
	/* compute the MSS from the PMTU and receive window into tp->mss_cache */
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif
	/* newsk is also added to the bind hash bucket */
	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	/* remove the req sock from ehash, then insert newsk.
	 * For reference, the helpers involved:
	 *
	 *	bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
	 *	{
	 *		bool ok = inet_ehash_insert(sk, osk);
	 *
	 *		if (ok) {
	 *			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	 *		} else {
	 *			percpu_counter_inc(sk->sk_prot->orphan_count);
	 *			sk->sk_state = TCP_CLOSE;
	 *			sock_set_flag(sk, SOCK_DEAD);
	 *			inet_csk_destroy_sock(sk);
	 *		}
	 *		return ok;
	 *	}
	 *
	 *	// insert a socket into ehash, and eventually remove another one
	 *	// (the other one can be a SYN_RECV or TIMEWAIT sock)
	 *	bool inet_ehash_insert(struct sock *sk, struct sock *osk)
	 *	{
	 *		// ---> tcp_hashinfo; note that tcp_hashinfo.ehash holds not only
	 *		// established TCP sockets but sockets in every state except LISTEN
	 *		struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	 *		struct hlist_nulls_head *list;
	 *		struct inet_ehash_bucket *head;
	 *		spinlock_t *lock;
	 *		bool ret = true;
	 *
	 *		WARN_ON_ONCE(!sk_unhashed(sk));
	 *
	 *		sk->sk_hash = sk_ehashfn(sk);
	 *		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	 *		list = &head->chain;
	 *		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	 *
	 *		spin_lock(lock);
	 *		if (osk) {
	 *			WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
	 *			ret = sk_nulls_del_node_init_rcu(osk);
	 *		}
	 *		if (ret)
	 *			__sk_nulls_add_node_rcu(sk, list);
	 *		spin_unlock(lock);
	 *		return ret;
	 *	}
	 */

	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
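When sk_acceptq_is_full() fires, the exit_overflow path above bumps LINUX_MIB_LISTENOVERFLOWS and the child is never created; depending on the tcp_abort_on_overflow sysctl, the final ACK is then either silently ignored (so the client retransmits) or answered with a RST. A sketch that provokes the situation (arbitrary loopback port, no error handling):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = htons(8081);	/* arbitrary example port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	/* Tiny backlog and no accept(): after a couple of completed handshakes
	 * the accept queue is full and further final ACKs hit the
	 * listen_overflow path (visible as TcpExtListenOverflows in nstat).
	 * With net.ipv4.tcp_abort_on_overflow=1 the kernel sends a RST instead
	 * of silently dropping and letting the client retransmit. */
	listen(fd, 1);
	pause();	/* keep the listener alive while clients connect */
	return 0;
}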

__inet_inherit_port() adds the new sock to the bind bucket of the port the listener is bound to. As a consequence, as long as newsk has not been released (for example while it sits in TIME_WAIT), the listening address and port cannot be bound again (unless port reuse is enabled).

Why does TCP bind the newly created child sock to the listening address and port? It keeps the current bind valid, but if the current bind is dropped (say the socket is closed) and then quickly re-bound, the bind fails. Why is it designed this way?

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	/* the local port; it was assigned in inet_csk_clone_lock() */
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
				       table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
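The bind-bucket inheritance above is observable from userspace: after the listener is closed, re-binding the same address/port fails with EADDRINUSE while an inherited child (for example one in TIME_WAIT) still occupies the bucket, unless SO_REUSEADDR was set before bind(). A sketch (port 8082 is an arbitrary example):

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in addr;

	/* Must be set *before* bind() to relax the bind-conflict check
	 * against sockets still hashed on the port (e.g. in TIME_WAIT). */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
		perror("setsockopt(SO_REUSEADDR)");

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8082);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");	/* EADDRINUSE without SO_REUSEADDR */
	close(fd);
	return 0;
}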

After tcp_check_req() returns, tcp_v4_do_rcv() calls tcp_child_process() to continue:

/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where after __inet_lookup_established() fails but before the listener
 * locked is obtained, other packets cause the same connection to
 * be created.
 *
 * tcp_child_process() calls tcp_rcv_state_process() to handle the
 * TCP_SYN_RECV child sock, which then enters TCP_ESTABLISHED.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	tcp_segs_in(tcp_sk(child), skb);
	if (!sock_owned_by_user(child)) {	/* the child sock is not locked by a user process */
		ret = tcp_rcv_state_process(child, skb);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)	/* the state changed during processing */
			/* sock_def_readable() sends a readable-event notification to
			 * the listening socket, telling it accept() can now proceed */
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		/* A user process holds the sock, so queue the skb on the backlog;
		 * it is processed in process context when the process releases the
		 * lock in release_sock(), i.e. when its system call ends. */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}

/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	struct request_sock *req;
	int queued = 0;
	bool acceptable;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		/* the server receives a SYN */
		/*
		 * In the LISTEN state only SYN segments are processed. An ACK
		 * means no connection has begun yet, so return 1; the caller of
		 * tcp_rcv_state_process() will then send a RST to the peer.
		 * A received RST segment is simply dropped.
		 */
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			if (th->fin)
				goto discard;
			/*
			 * SYN segments are handled by the conn_request callback
			 * (tcp_v4_conn_request for IPv4); icsk_af_ops is initialized
			 * when the socket is created, see tcp_v4_init_sock().
			 * On the first handshake step (SYN), tcp_v4_conn_request()
			 * creates the request_sock.
			 */
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)	/* ipv4_specific->tcp_v4_conn_request */
				return 1;

			consume_skb(skb);
			return 0;
		}
		goto discard;

	case TCP_SYN_SENT:	/* the client receives SYN+ACK */
		/*
		 * For a sock in TCP_SYN_SENT, tcp_rcv_synsent_state_process() does the work:
		 * - parse the TCP options to learn what the server supports (SACK, TFO,
		 *   window scaling, MSS, timestamps, ...)
		 * - if the segment carries an ACK, run tcp_ack(); with Fast Open this
		 *   may acknowledge data sent with the SYN
		 * - call tcp_finish_connect(): TCP_SYN_SENT -> TCP_ESTABLISHED
		 * - save the Fast Open cookie if one is included
		 * - decide between an immediate ACK and a delayed ACK
		 * - if the segment carries only SYN and no ACK, this is a simultaneous
		 *   connect: TCP_SYN_SENT -> TCP_SYN_RECV, and a SYN-ACK is sent
		 */
		tp->rx_opt.saw_tstamp = 0;
		queued = tcp_rcv_synsent_state_process(sk, skb, th);
		if (queued >= 0)
			return queued;

		/* Do step6 onward by hand. */
		tcp_urg(sk, skb, th);
		__kfree_skb(skb);
		tcp_data_snd_check(sk);
		return 0;
	}

	tp->rx_opt.saw_tstamp = 0;
	req = tp->fastopen_rsk;
	if (req) {
		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
			     sk->sk_state != TCP_FIN_WAIT1);

		if (!tcp_check_req(sk, skb, req, true))
			goto discard;
	}

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	if (!tcp_validate_incoming(sk, skb, th, 0))
		return 0;

	/*
	 * Process the ACK flag; a nonzero return from tcp_ack() means the ACK
	 * was handled successfully, i.e. this is a normal third-handshake segment.
	 */
	/* step 5: check the ACK field */
	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
				      FLAG_UPDATE_TS_RECENT) > 0;
	/*
	 * In the ACK handling below: if the connection is in FIN_WAIT_1 and all
	 * sent data has been acknowledged, it moves to FIN_WAIT_2. If there is
	 * no need to wait in that state (linger2 < 0), or an out-of-window data
	 * segment arrived, the connection is closed immediately. Otherwise the
	 * remaining wait time is compared with the TIME_WAIT length: if it
	 * exceeds TCP_TIMEWAIT_LEN, a FIN_WAIT_2 timer is armed; otherwise the
	 * sock is handed over to the TIME_WAIT machinery (its sub-state remains
	 * FIN_WAIT_2), which arms the TIME_WAIT timer.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_RECV:	/* the initial state of the new sock created when the handshake completes */
		if (!acceptable)
			return 1;

		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		/* We get here from tcp_child_process(), called from tcp_v4_do_rcv();
		 * before that, tcp_check_req() created the new struct sock.
		 * Once we leave TCP_SYN_RECV, we no longer need req
		 * so release it.
		 */
		if (req) {
			tp->total_retrans = req->num_retrans;
			reqsk_fastopen_remove(sk, req, false);	/* release the fastopen req */
		} else {
			/* Make sure socket is routed, for correct metrics. */
			/* inet_sk_rebuild_header()/inet6_sk_rebuild_header():
			 * recompute the route from the ACK's information */
			icsk->icsk_af_ops->rebuild_header(sk);
			tcp_init_congestion_control(sk);	/* initialize the congestion control algorithm */

			tcp_mtup_init(sk);	/* initialize MTU probing */
			tp->copied_seq = tp->rcv_nxt;
			tcp_init_buffer_space(sk);	/* initialize receive and send buffer space */
		}
		smp_mb();
		tcp_set_state(sk, TCP_ESTABLISHED);	/* TCP_SYN_RECV -> TCP_ESTABLISHED */
		sk->sk_state_change(sk);	/* sock_def_wakeup(): wake up epoll */

		/*
		 * sock_init_data() installs the default callbacks:
		 *   sk->sk_state_change  = sock_def_wakeup;
		 *   sk->sk_data_ready    = sock_def_readable;
		 *   sk->sk_write_space   = sock_def_write_space;
		 *   sk->sk_error_report  = sock_def_error_report;
		 *   sk->sk_destruct      = sock_def_destruct;
		 * sk_state_change() -> sock_def_wakeup() -> ep_poll_callback(): the sock
		 * is put on epoll's ready list and a blocked epoll_wait() is woken.
		 * epoll then calls ep_send_events -> ep_scan_ready_list ->
		 * ep_send_events_proc -> ep_item_poll -> tcp_poll.
		 */
		/* Note, that this wakeup is only for marginal crossed SYN case.
		 * Passively open sockets are not waked up, because
		 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
		 */
		/* Signal the processes that will send data through this socket
		 * that it is now ready for transmission. */
		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		/* Initialize the remaining fields; if the timestamp option is
		 * present and the smoothed RTT is still zero, the retransmission
		 * timeout must be computed, etc. */
		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			/* Re-arm the timer because data may have been sent out.
			 * This is similar to the regular data transmission case
			 * when new data has just been ack'ed.
			 *
			 * (TFO) - we could try to be more aggressive and
			 * retransmitting any data sooner based on when they
			 * are sent out.
			 */
			tcp_rearm_rto(sk);
		} else
			tcp_init_metrics(sk);	/* set up the route for the socket, initialize metrics */

		tcp_update_pacing_rate(sk);
		/* Prevent spurious tcp_cwnd_restart() on first data packet:
		 * update the time of the most recent transmission */
		tp->lsndtime = tcp_time_stamp;

		tcp_initialize_rcv_mss(sk);
		tcp_fast_path_on(tp);	/* compute the header-prediction flags */
		break;

	case TCP_FIN_WAIT1: {
		struct dst_entry *dst;
		int tmo;

		/* If we enter the TCP_FIN_WAIT1 state and we are a
		 * Fast Open socket and this is the first acceptable
		 * ACK we have received, this would have acknowledged
		 * our SYNACK so stop the SYNACK timer.
		 */
		if (req) {
			/* Return RST if ack_seq is invalid.
			 * Note that RFC793 only says to generate a
			 * DUPACK for it but for TCP Fast Open it seems
			 * better to treat this case like TCP_SYN_RECV
			 * above.
			 */
			if (!acceptable)
				return 1;
			/* We no longer need the request sock. */
			reqsk_fastopen_remove(sk, req, false);
			tcp_rearm_rto(sk);
		}
		/* some sent data is still unacknowledged */
		if (tp->snd_una != tp->write_seq)
			break;

		tcp_set_state(sk, TCP_FIN_WAIT2);	/* enter FIN_WAIT_2 */
		sk->sk_shutdown |= SEND_SHUTDOWN;	/* shut down the send side */

		dst = __sk_dst_get(sk);
		if (dst)	/* confirm the cached route */
			dst_confirm(dst);

		if (!sock_flag(sk, SOCK_DEAD)) {
			/* Wake up lingering close() */
			/* the socket is not DEAD; its state changed, wake waiting processes */
			sk->sk_state_change(sk);
			break;
		}

		/* linger2 < 0: no need to wait in FIN_WAIT_2 */
		if (tp->linger2 < 0 ||
		    /* a segment (data or FIN) beyond the expected sequence arrived */
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
			tcp_done(sk);	/* close the connection */
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			return 1;
		}

		tmo = tcp_fin_time(sk);	/* get the FIN_WAIT_2 wait time */
		if (tmo > TCP_TIMEWAIT_LEN) {
			/* > TCP_TIMEWAIT_LEN: arm the FIN_WAIT_2 timer for the excess */
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
		} else if (th->fin || sock_owned_by_user(sk)) {
			/* Bad case. We could lose such FIN otherwise.
			 * It is not a big problem, but it looks confusing
			 * and not so rare event. We still can lose it now,
			 * if it spins in bh_lock_sock(), but it is really
			 * marginal case.
			 */
			/* a FIN arrived, or the sock is locked by a user process:
			 * arm the FIN_WAIT_2 timer */
			inet_csk_reset_keepalive_timer(sk, tmo);
		} else {
			/* normal wait time < TCP_TIMEWAIT_LEN: hand over to
			 * TIME_WAIT (with sub-state FIN_WAIT2) */
			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
			goto discard;
		}
		break;
	}

	case TCP_CLOSING:
		if (tp->snd_una == tp->write_seq) {
			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
			goto discard;
		}
		break;

	case TCP_LAST_ACK:
		if (tp->snd_una == tp->write_seq) {
			tcp_update_metrics(sk);
			tcp_done(sk);
			goto discard;
		}
		break;
	}

	/* step 6: check the URG bit */
	tcp_urg(sk, skb, th);

	/*
	 * A sock leaves FIN_WAIT_2 at one of these trigger points:
	 * (1) a segment arrives before the FIN_WAIT_2 timer expires;
	 * (2) the FIN_WAIT_2 timer expires;
	 * (3) a segment arrives before the TIME_WAIT timer expires;
	 * (4) the TIME_WAIT timer expires.
	 */
	/* step 7: process the segment text */
	switch (sk->sk_state) {
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		/* a segment arrived before the FIN_WAIT_2 timer expired;
		 * if it carries a FIN, the sock goes straight to TIME_WAIT */
		/* RFC 793 says to queue data in these states,
		 * RFC 1122 says we MUST send a reset.
		 * BSD 4.4 also does reset.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				tcp_reset(sk);
				return 1;
			}
		}
		/* Fall through */
	case TCP_ESTABLISHED:
		/* process the data portion if there is one, e.g. when the
		 * client connected under TCP_DEFER_ACCEPT */
		tcp_data_queue(sk, skb);
		queued = 1;
		break;
	}

	/* tcp_data could move socket to TIME-WAIT */
	if (sk->sk_state != TCP_CLOSE) {
		tcp_data_snd_check(sk);	/* give pending data a chance to go out: tcp_push_pending_frames() */
		tcp_ack_snd_check(sk);	/* check for a delayed ACK that should be sent now */
	}

	if (!queued) {
discard:
		tcp_drop(sk, skb);
	}
	return 0;
}

epoll

After the sock enters TCP_ESTABLISHED, sk_state_change() notifies epoll and the application can accept().

sk_state_change() -> sock_def_wakeup() -> ep_poll_callback(): the sock is added to epoll's ready list and a blocked epoll_wait() is woken.
epoll then calls ep_send_events -> ep_scan_ready_list -> ep_send_events_proc -> ep_item_poll -> tcp_poll.
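On the application side, this wakeup chain ends with epoll_wait() reporting the listening fd as readable, at which point accept() pops connections off the queue filled by inet_csk_reqsk_queue_add(). A minimal level-triggered sketch (arbitrary port, error handling omitted):

#include <netinet/in.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	int epfd = epoll_create1(0);
	struct sockaddr_in addr;
	struct epoll_event ev, events[16];

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8083);	/* arbitrary example port */
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 128);

	ev.events = EPOLLIN;	/* readable == connections waiting in the accept queue */
	ev.data.fd = lfd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

	for (;;) {
		/* Blocks until the listener's sk_data_ready() (sock_def_readable)
		 * runs ep_poll_callback() and moves it to the ready list. */
		int n = epoll_wait(epfd, events, 16, -1);

		for (int i = 0; i < n; i++) {
			if (events[i].data.fd == lfd) {
				int cfd = accept(lfd, NULL, NULL);	/* pops rskq_accept_head */

				if (cfd >= 0)
					close(cfd);
			}
		}
	}
	return 0;
}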