TCP SYN/SYN-ACK/ACK handshake: server-side SYN reception, an analysis of tcp_v4_do_rcv

Receive-path (tcp_v4_do_rcv) analysis:

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * TCP processing uses three queues: receive_queue, backlog_queue and
 * prequeue. When a packet reaches the TCP stack the sk spinlock is held;
 * then we check whether some process context currently operates on the sk,
 * via sock_owned_by_user(): if sk_lock.owned is set, a process holds the
 * sk; if it is 0, processing of the segment can continue in the current
 * softirq context.
 */
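For context, this is roughly how the caller tcp_v4_rcv() makes that decision (a condensed sketch from kernels of this era; statistics and error paths omitted):

bh_lock_sock_nested(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
    /* no process owns the sock: try the prequeue, else process
     * the segment right here in softirq context */
    if (!tcp_prequeue(sk, skb))
        ret = tcp_v4_do_rcv(sk, skb);
} else if (unlikely(sk_add_backlog(sk, skb,
                                   sk->sk_rcvbuf + sk->sk_sndbuf))) {
    /* a process owns the sock: queue to the backlog, which
     * release_sock() drains later in process context */
    bh_unlock_sock(sk);
    goto discard_and_relse;
}
bh_unlock_sock(sk);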
/*
 * After the TCP transport layer has received a segment, done basic
 * validation and determined the transport control block that should
 * handle it, tcp_v4_do_rcv() is called to do the actual processing,
 * unless the socket is in FIN_WAIT_2 or TIME_WAIT state.
 *
 * On the receive path of the stack, a SYN segment travels as follows:
 *
 * tcp_v4_rcv
 *   ->__inet_lookup_skb()      // find the matching TCP_LISTEN sk in the listen hash
 *   ->tcp_v4_do_rcv()
 *     ->tcp_v4_cookie_check()  // syncookie check; a SYN carries no ACK, so it is
 *                              // skipped here. When a cookie validates (on the
 *                              // later ACK), a new sock is created.
 *     ->tcp_rcv_state_process()
 *       ->tcp_v4_conn_request()
 *
 * With syncookies the server keeps no state at all.
 * With fastopen, a new sock enters TCP_SYN_RECV state and is put on the
 * accept queue, the data portion is placed on the receive queue, and the
 * retransmission timer is armed.
 * For an ordinary SYN, a request_sock is created in TCP_NEW_SYN_RECV state
 * and inserted into the ehash table, and the req_timer is armed to
 * retransmit the SYN+ACK.
 */
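The cookie check mentioned above only does real work for non-SYN segments, i.e. for the third-step ACK; in kernels of this era tcp_v4_cookie_check is essentially:

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
    const struct tcphdr *th = tcp_hdr(skb);

    /* only non-SYN segments can answer a cookie */
    if (!th->syn)
        sk = cookie_v4_check(sk, skb);
#endif
    return sk;
}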
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
        return 0;
    }

    if (tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) { /* server side: this is either the first-step
                                       * SYN or the third-step ACK of the handshake */
        struct sock *nsk = tcp_v4_cookie_check(sk, skb);
        /* syncookie check; a SYN carries no ACK, so it is skipped for SYNs.
         * If the cookie validates, a new sock has been created. */
        if (!nsk)
            goto discard;
        /* For a first-step SYN, nsk is the "parent" (listening) sk;
         * for the third-step ACK, nsk is the new "child" sk. */
        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            sk_mark_napi_id(nsk, skb);
            if (tcp_child_process(sk, nsk, skb)) { /* still ends up calling
                                                    * tcp_rcv_state_process,
                                                    * sketched after this function */
                rsk = nsk;
                goto reset;
            }
            return 0; /* third step of the handshake: we are done here */
        } /* first-step SYN: fall through to the processing below */
    } else
        sock_rps_save_rxhash(sk, skb);
    /* Reaching here means either the client received a SYN+ACK,
     * or the server received a SYN. */
    if (tcp_rcv_state_process(sk, skb)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
    TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
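tcp_child_process(), used above for the third-step ACK, applies the same sock_owned_by_user() test to the new child sock before calling tcp_rcv_state_process() on it. A condensed sketch (paraphrased from kernels of this era; tcp_segs_in accounting omitted):

int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
    int ret = 0;
    int state = child->sk_state;

    if (!sock_owned_by_user(child)) {
        ret = tcp_rcv_state_process(child, skb);
        /* wake up the listener once the child leaves TCP_SYN_RECV */
        if (state == TCP_SYN_RECV && child->sk_state != state)
            parent->sk_data_ready(parent);
    } else {
        /* a user process owns the child: defer to its backlog */
        __sk_add_backlog(child, skb);
    }

    bh_unlock_sock(child);
    sock_put(child);
    return ret;
}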

tcp_rcv_state_process

/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcphdr *th = tcp_hdr(skb);
    struct request_sock *req;
    int queued = 0;
    bool acceptable;

    switch (sk->sk_state) {
    case TCP_CLOSE:
        goto discard;

    case TCP_LISTEN:
        /* The server received a SYN. */
        /*
         * In the LISTEN state only SYN segments are processed. For an ACK
         * segment no connection establishment is under way yet, so return 1,
         * which makes the caller of tcp_rcv_state_process() send an RST to
         * the peer. An RST segment is simply dropped.
         */
        if (th->ack)
            return 1;

        if (th->rst)
            goto discard;

        if (th->syn) {
            if (th->fin)
                goto discard;
            /*
             * The SYN is handled by the conn_request hook (tcp_v4_conn_request
             * for TCP/IPv4); icsk_af_ops is initialized when the socket is
             * created, see tcp_v4_init_sock().
             *
             * On the first-step SYN, tcp_v4_conn_request creates the
             * connection-request control block, a request_sock.
             */
            if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) /* ipv4_specific->tcp_v4_conn_request */
                return 1;

            consume_skb(skb);
            return 0;
        }
        goto discard;

    case TCP_SYN_SENT: /* the client received the SYN+ACK */
        /*
         * A sock in TCP_SYN_SENT state is handled by
         * tcp_rcv_synsent_state_process (see the condensed sketch after
         * this function), which:
         * - parses the TCP options to learn what the server supports,
         *   e.g. SACK, TFO, wscale, MSS, timestamps;
         * - if the segment carries an ACK, runs tcp_ack(); with fastopen
         *   this may acknowledge previously sent data;
         * - calls tcp_finish_connect(): TCP_SYN_SENT -> TCP_ESTABLISHED;
         * - saves the fastopen cookie if one is included;
         * - decides between an immediate and a delayed ACK;
         * - if the segment carries only a SYN and no ACK, the two ends
         *   called connect() simultaneously: TCP_SYN_SENT -> TCP_SYN_RECV,
         *   and a SYN+ACK is sent.
         */
        tp->rx_opt.saw_tstamp = 0;
        queued = tcp_rcv_synsent_state_process(sk, skb, th);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }

    tp->rx_opt.saw_tstamp = 0;
    req = tp->fastopen_rsk;
    if (req) {
        WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                     sk->sk_state != TCP_FIN_WAIT1);

        if (!tcp_check_req(sk, skb, req, true))
            goto discard;
    }

    if (!th->ack && !th->rst && !th->syn)
        goto discard;

    if (!tcp_validate_incoming(sk, skb, th, 0))
        return 0;

    /*
     * Process the ACK flag. tcp_ack() returning non-zero means the ACK was
     * handled successfully, i.e. this is a normal third-step handshake
     * segment.
     */
    /* step 5: check the ACK field */
    acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
                                  FLAG_UPDATE_TS_RECENT) > 0;
    /*
     * In the ACK-processing step of tcp_rcv_state_process: if the connection
     * is in FIN_WAIT_1 and all sent data has been acknowledged, it moves to
     * FIN_WAIT_2. If there is no need to linger there (linger2 < 0), or an
     * out-of-order data segment arrived, the connection is closed right away.
     * Otherwise compare the remaining wait time with the TIME_WAIT interval:
     * if it exceeds TCP_TIMEWAIT_LEN, arm the FIN_WAIT_2 timer; else hand the
     * socket over to TIME_WAIT handling immediately (its substate stays
     * FIN_WAIT_2), which arms the TIME_WAIT timer.
     */
    switch (sk->sk_state) {
    case TCP_SYN_RECV:
        if (!acceptable)
            return 1;

        if (!tp->srtt_us)
            tcp_synack_rtt_meas(sk, req);

        /* We get here from tcp_child_process(), called from tcp_v4_do_rcv();
         * before that, tcp_check_req() created the new (child) struct sock.
         *
         * Once we leave TCP_SYN_RECV, we no longer need req
         * so release it.
         */
        if (req) {
            tp->total_retrans = req->num_retrans;
            reqsk_fastopen_remove(sk, req, false);
        } else {
            /* Make sure socket is routed, for correct metrics. */
            icsk->icsk_af_ops->rebuild_header(sk);
            tcp_init_congestion_control(sk);

            tcp_mtup_init(sk);
            tp->copied_seq = tp->rcv_nxt;
            tcp_init_buffer_space(sk);
        }
        smp_mb();
        tcp_set_state(sk, TCP_ESTABLISHED); /* TCP_SYN_RECV -> TCP_ESTABLISHED */
        sk->sk_state_change(sk); /* sock_def_wakeup: wakes up epoll */

        /*
         * This marks the "child" transport control block ESTABLISHED and
         * signals processes that want to send on this socket that it is
         * now writable.
         *
         * sock_init_data() installs the default callbacks:
         *     sk->sk_state_change = sock_def_wakeup;
         *     sk->sk_data_ready   = sock_def_readable;
         *     sk->sk_write_space  = sock_def_write_space;
         *     sk->sk_error_report = sock_def_error_report;
         *     sk->sk_destruct     = sock_def_destruct;
         *
         * sk_state_change() -> sock_def_wakeup() -> ep_poll_callback() puts
         * the socket on the epoll ready list and wakes up a blocked epoll;
         * epoll then calls ep_send_events -> ep_scan_ready_list ->
         * ep_send_events_proc -> ep_item_poll -> tcp_poll.
         */
        /* Note, that this wakeup is only for marginal crossed SYN case.
         * Passively open sockets are not waked up, because
         * sk->sk_sleep == NULL and sk->sk_socket == NULL.
         */
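        /* For reference, the default sock_def_wakeup() callback installed
         * above is roughly (condensed from kernels of this era):
         *
         *     static void sock_def_wakeup(struct sock *sk)
         *     {
         *         struct socket_wq *wq;
         *
         *         rcu_read_lock();
         *         wq = rcu_dereference(sk->sk_wq);
         *         if (skwq_has_sleeper(wq))
         *             wake_up_interruptible_all(&wq->wait);
         *         rcu_read_unlock();
         *     }
         */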

        if (sk->sk_socket)
            sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
        /*
         * Initialize the remaining fields of the transport control block.
         * If the timestamp option is present while the smoothed RTT is
         * zero, the retransmission timeout must be computed, etc.
         */
        tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
        tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
        tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

        if (tp->rx_opt.tstamp_ok)
            tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

        if (req) {
            /* Re-arm the timer because data may have been sent out.
             * This is similar to the regular data transmission case
             * when new data has just been ack'ed.
             *
             * (TFO) - we could try to be more aggressive and
             * retransmitting any data sooner based on when they
             * are sent out.
             */
            tcp_rearm_rto(sk);
        } else
            tcp_init_metrics(sk);

        /* The route and the congestion control module were set up above;
         * likewise the path-MTU related fields. */
        tcp_update_pacing_rate(sk);
        /* Update the time of the most recently sent data packet. */
        /* Prevent spurious tcp_cwnd_restart() on first data packet */
        tp->lsndtime = tcp_time_stamp;

        tcp_initialize_rcv_mss(sk);
        /* Compute the flags used for TCP header prediction. */
        tcp_fast_path_on(tp);
        break;

    case TCP_FIN_WAIT1: {
        struct dst_entry *dst;
        int tmo;

        /* If we enter the TCP_FIN_WAIT1 state and we are a
         * Fast Open socket and this is the first acceptable
         * ACK we have received, this would have acknowledged
         * our SYNACK so stop the SYNACK timer.
         */
        if (req) {
            /* Return RST if ack_seq is invalid.
             * Note that RFC793 only says to generate a
             * DUPACK for it but for TCP Fast Open it seems
             * better to treat this case like TCP_SYN_RECV
             * above.
             */
            if (!acceptable)
                return 1;
            /* We no longer need the request sock. */
            reqsk_fastopen_remove(sk, req, false);
            tcp_rearm_rto(sk);
        }
        /* not all sent data has been acknowledged yet */
        if (tp->snd_una != tp->write_seq)
            break;

        tcp_set_state(sk, TCP_FIN_WAIT2); /* enter FIN_WAIT_2 */
        sk->sk_shutdown |= SEND_SHUTDOWN; /* shut down the send side */

        dst = __sk_dst_get(sk);
        if (dst) /* confirm the cached route */
            dst_confirm(dst);

        if (!sock_flag(sk, SOCK_DEAD)) {
            /* Socket is not DEAD: the state changed, wake up waiters */
            /* Wake up lingering close() */
            sk->sk_state_change(sk);
            break;
        }
        if (tp->linger2 < 0 || /* linger2 < 0: no need to wait in FIN_WAIT_2 */
            /* or a data/FIN segment past the expected sequence arrived */
            (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
            tcp_done(sk); /* close the connection */
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
            return 1;
        }

        tmo = tcp_fin_time(sk); /* the FIN_WAIT_2 wait time */
        if (tmo > TCP_TIMEWAIT_LEN) { /* > TCP_TIMEWAIT_LEN: arm the FIN_WAIT_2 timer */
            inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
        } else if (th->fin || sock_owned_by_user(sk)) {
            /* Bad case. We could lose such FIN otherwise.
             * It is not a big problem, but it looks confusing
             * and not so rare event. We still can lose it now,
             * if it spins in bh_lock_sock(), but it is really
             * marginal case.
             */
            /* the segment carries a FIN, or a user process holds the sock:
             * arm the FIN_WAIT_2 timer */
            inet_csk_reset_keepalive_timer(sk, tmo);
        } else { /* wait time <= TCP_TIMEWAIT_LEN: hand over to TIME_WAIT */
            tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
            goto discard;
        }
        break;
    }

    case TCP_CLOSING:
        if (tp->snd_una == tp->write_seq) {
            tcp_time_wait(sk, TCP_TIME_WAIT, 0);
            goto discard;
        }
        break;

    case TCP_LAST_ACK:
        if (tp->snd_una == tp->write_seq) {
            tcp_update_metrics(sk);
            tcp_done(sk);
            goto discard;
        }
        break;
    }

    /* step 6: check the URG bit */
    tcp_urg(sk, skb, th);

    /*
     * Leaving FIN_WAIT_2 is triggered at one of the following points:
     * (1) a data segment arrives before the FIN_WAIT_2 timer expires;
     * (2) the FIN_WAIT_2 timer expires;
     * (3) a data segment arrives before the TIME_WAIT timer expires;
     * (4) the TIME_WAIT timer expires.
     */
    /* step 7: process the segment text */
    switch (sk->sk_state) {
    case TCP_CLOSE_WAIT:
    case TCP_CLOSING:
    case TCP_LAST_ACK:
        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
            break;
    case TCP_FIN_WAIT1:
    case TCP_FIN_WAIT2:
        /* A data segment arrived while the FIN_WAIT_2 timer is running;
         * if it carries a FIN, the socket goes straight to TIME_WAIT. */
        /* RFC 793 says to queue data in these states,
         * RFC 1122 says we MUST send a reset.
         * BSD 4.4 also does reset.
         */
        if (sk->sk_shutdown & RCV_SHUTDOWN) {
            if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                tcp_reset(sk);
                return 1;
            }
        }
        /* Fall through */
    case TCP_ESTABLISHED:
        tcp_data_queue(sk, skb);
        queued = 1;
        break;
    }

    /* tcp_data could move socket to TIME-WAIT */
    if (sk->sk_state != TCP_CLOSE) {
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
    }

    if (!queued) {
discard:
        tcp_drop(sk, skb);
    }
    return 0;
}
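As promised above, a heavily condensed control-flow sketch of tcp_rcv_synsent_state_process(), the client-side counterpart used in the TCP_SYN_SENT case (paraphrased, not the literal kernel source; validation and option bookkeeping omitted):

static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                         const struct tcphdr *th)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_fastopen_cookie foc = { .len = -1 };

    tcp_parse_options(skb, &tp->rx_opt, 0, &foc); /* SACK/wscale/MSS/tstamp */

    if (th->ack) {
        /* ... ack_seq and PAWS validation, RST handling ... */
        tcp_ack(sk, skb, FLAG_SLOWPATH);  /* may ack TFO data */
        tcp_finish_connect(sk, skb);      /* TCP_SYN_SENT -> TCP_ESTABLISHED */
        /* ... save the fastopen cookie, pick immediate vs delayed ACK ... */
        return -1;  /* tells the caller to do "step 6 onward by hand" */
    }

    if (th->syn) {
        /* crossed SYNs: simultaneous connect */
        tcp_set_state(sk, TCP_SYN_RECV);  /* TCP_SYN_SENT -> TCP_SYN_RECV */
        tcp_send_synack(sk);              /* answer with a SYN+ACK */
        goto discard;
    }

    /* neither ACK nor SYN */
discard:
    tcp_drop(sk, skb);
    return 0;
}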

tcp_v4_conn_request
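Note that the code walked through below is the protocol-independent tcp_conn_request(); tcp_v4_conn_request() itself is a thin wrapper (kernels of this era) that rejects broadcast/multicast SYNs and passes in the IPv4 operation tables:

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    /* Never answer to SYNs sent to broadcast or multicast */
    if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        goto drop;

    return tcp_conn_request(&tcp_request_sock_ops,
                            &tcp_request_sock_ipv4_ops, sk, skb);
drop:
    tcp_listendrop(sk);
    return 0;
}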

/*
 * When the server receives the SYN it only creates a connection-request
 * control block, a request_sock. The full struct sock is created only once
 * the third-step ACK arrives.
 */
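The request_sock starts life in the TCP_NEW_SYN_RECV state; inet_reqsk_alloc(), condensed from kernels of this era, shows that initialization:

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                                      struct sock *sk_listener,
                                      bool attach_listener)
{
    struct request_sock *req = reqsk_alloc(ops, sk_listener,
                                           attach_listener);

    if (req) {
        struct inet_request_sock *ireq = inet_rsk(req);

        ireq->opt = NULL;
        atomic64_set(&ireq->ir_cookie, 0);
        ireq->ireq_state = TCP_NEW_SYN_RECV;  /* mini-socket state */
        write_pnet(&ireq->ireq_net, sock_net(sk_listener));
        ireq->ireq_family = sk_listener->sk_family;
    }
    return req;
}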
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb)
{
    struct tcp_fastopen_cookie foc = { .len = -1 };
    __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
    struct tcp_options_received tmp_opt;
    struct tcp_sock *tp = tcp_sk(sk);
    struct net *net = sock_net(sk);
    struct sock *fastopen_sk = NULL;
    struct dst_entry *dst = NULL;
    struct request_sock *req;
    bool want_cookie = false;
    struct flowi fl;
    /* With syncookies enabled, the third-step ACK is validated against the
     * earlier SYN+ACK by cookie_v4_check(), reached via tcp_v4_cookie_check()
     * (called tcp_v4_hnd_req in older kernels); see cookie_v4_check for the
     * details of that check. */

    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if ((net->ipv4.sysctl_tcp_syncookies == 2 || /* =2: always generate syncookies */
         inet_csk_reqsk_queue_is_full(sk)) && !isn) { /* or the SYN queue is full, and we are not coming from timewait */
        want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
        /* true if sysctl_tcp_syncookies > 0; also warns once for this socket */
        if (!want_cookie) /* queue full but syncookies not in use: drop */
            goto drop;
    }

    /* Accept backlog is full. If we have already queued enough
     * of warm entries in syn queue, drop request. It is better than
     * clogging syn queue with openreqs with exponentially increasing
     * timeout.
     */
    /*
     * If the accept queue is full and the SYN queue holds more than one
     * request whose SYN+ACK has never been retransmitted, drop this
     * request. More than one young request means several handshakes may
     * complete soon, and the resulting connections must go to the accept
     * queue, which is already full. If there is no free slot in the accept
     * queue when the final ACK of a handshake arrives, that ACK is ignored
     * and connection establishment is delayed, so it is better to drop some
     * new requests now and free resources for the handshakes already in
     * progress. Note that this check does not consider whether the SYN
     * queue itself is full: even with SYN cookies enabled there is no
     * guarantee that a connection can be established.
     */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
        /* the accept queue is full while the SYN queue still holds
         * connections that could be accepted: drop */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
        goto drop;
    }

    /*
     * We can accept and process the connection request: allocate a
     * request_sock to hold the request information, and initialize the
     * operations used to send ACK/RST segments during connection
     * establishment, so these interfaces can be called conveniently later.
     */
    req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
    /* rsk_ops == tcp_request_sock_ops; the req enters TCP_NEW_SYN_RECV */
    if (!req)
        goto drop;

    tcp_rsk(req)->af_specific = af_ops; /* af_ops == tcp_request_sock_ipv4_ops */
    /*
     * Clear the TCP options, then initialize mss_clamp and user_mss.
     */
    tcp_clear_options(&tmp_opt);
    tmp_opt.mss_clamp = af_ops->mss_clamp; /* TCP_MSS_DEFAULT = 536 */
    tmp_opt.user_mss = tp->rx_opt.user_mss; /* set on the listening sock, or inherited from timewait */

    /*
     * Parse the TCP options in the SYN segment. With syncookies enabled
     * fastopen need not be considered: syncookies do not allow TCP
     * extensions.
     */
    tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

    if (want_cookie && !tmp_opt.saw_tstamp) /* syncookies without a timestamp */
        tcp_clear_options(&tmp_opt); /* clear wscale, sack_ok etc.: there is nowhere to store them */

    /*
     * Record whether the timestamp option is enabled for this connection.
     */
    tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
    /*
     * Initialize the request block from the options and sequence number of
     * the received SYN.
     */
    tcp_openreq_init(req, &tmp_opt, skb, sk);

    /* Note: tcp_v6_init_req() might override ir_iif for link locals */
    inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

    /*
     * Initialize the TCP-level request information, including destination
     * and source addresses; tcp_v4_save_options copies the IP options from
     * the IP layer's private control block into the request block, covering
     * MSS, window scale factor, explicit congestion notification, etc.
     */
    af_ops->init_req(req, sk, skb); /* tcp_v4_init_req, which calls tcp_v4_save_options */

    if (security_inet_conn_request(sk, skb, req))
        goto drop_and_free;

    if (!want_cookie && !isn) { /* no syncookie needed, and not recycled from a timewait sock */
        /* VJ's idea. We save last timestamp seen
         * from the destination in peer table, when entering
         * state TIME-WAIT, and check against it before
         * accepting new connection request.
         *
         * If "isn" is not zero, this request hit alive
         * timewait bucket, so that all the necessary checks
         * are made in the function processing timewait state.
         */
        /*
         * On entering TIME_WAIT the peer's last timestamp was saved in the
         * peer table; PAWS is checked against it before a new connection
         * request is accepted.
         */
        if (tcp_death_row.sysctl_tw_recycle) {
            bool strict;

            dst = af_ops->route_req(sk, &fl, req, &strict); /* tcp_v4_route_req */

            /* With fast timewait recycling (tcp_tw_recycle) enabled this can
             * go wrong and connections may fail to establish. It is the
             * TCP-timestamp PAWS problem: the server receives the SYN but
             * never answers with a SYN+ACK. The workaround is for the peer
             * not to send the timestamp option, together with disabling
             * tcp_timestamps; see tcp_v4_conn_request and
             * http://blog.chinaunix.net/uid-736168-id-376061.html
             */
            if (dst && strict &&
                !tcp_peer_is_proven(req, dst, true,
                                    tmp_opt.saw_tstamp)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                goto drop_and_release;
                /*
                 * The SYN is rejected here when all of the following hold:
                 * 1. the TCP options carry a timestamp;
                 * 2. tcp_tw_recycle is set;
                 * 3. an identical flow exists in the routing table (with
                 *    xfrm enabled, the default, the ports are compared too);
                 * 4. the packet's source address matches that flow;
                 * 5. a saved peer entry (cached per-destination connection
                 *    statistics) can be found via the routing table and the
                 *    source address;
                 * 6. the SYN arrives less than 60 seconds after the peer's
                 *    last recorded timestamp;
                 * 7. the peer's recorded timestamp is newer than the
                 *    timestamp carried by the incoming SYN.
                 *
                 * Conditions 1 and 2 are under the server's control; the
                 * others are easily met in practice. Example: if clients sit
                 * behind NAT and the server has tcp_tw_recycle enabled with
                 * timestamps on, then after a first connection is opened and
                 * closed (leaving a TIME_WAIT socket), a SYN arriving within
                 * 60 seconds from another client behind the same NAT (same
                 * source address, and same port if xfrm is on) with an older
                 * timestamp is treated as anomalous: the kernel drops it and
                 * sends an RST.
                 *
                 * Since most clients are behind NAT, the advice is to keep
                 * tw_recycle off, or to disable timestamps on the server
                 * (/proc/sys/net/ipv4/tcp_timestamps).
                 */
}
}
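        /* Practical note (not in the original source): when this PAWS drop
         * bites clients behind NAT, the usual remedies on the server are
         *     echo 0 > /proc/sys/net/ipv4/tcp_tw_recycle
         * or
         *     echo 0 > /proc/sys/net/ipv4/tcp_timestamps
         * tcp_tw_recycle was removed from the kernel entirely in 4.12.
         */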
        /* Kill the following clause, if you dislike this way. */
        /* If neither tcp_tw_recycle nor syncookies is enabled, the last
         * quarter of SYN slots is reserved for peers with a proven past
         * connection; under a SYN flood without syncookies, other segments
         * are dropped. */
        else if (!net->ipv4.sysctl_tcp_syncookies &&
                 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                  (sysctl_max_syn_backlog >> 2)) &&
                 !tcp_peer_is_proven(req, dst, false,
                                     tmp_opt.saw_tstamp)) {
            /* drop if there is no tcp metric or past connection info */
            /* Without syncookies last quarter of
             * backlog is filled with destinations,
             * proven to be alive.
             * It means that we continue to communicate
             * to destinations, already remembered
             * to the moment of synflood.
             */
            pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                        rsk_ops->family);
            goto drop_and_release;
        }

        isn = af_ops->init_seq(skb);
        /* tcp_v4_init_sequence: derive the ISN from the 4-tuple, a random
         * secret and the current high-resolution time */
    }
    if (!dst) {
        dst = af_ops->route_req(sk, &fl, req, NULL); /* tcp_v4_route_req */
        if (!dst)
            goto drop_and_free;
    }

    tcp_ecn_create_request(req, skb, sk, dst);

    if (want_cookie) {
        /*
         * With syncookies enabled, warn (at most once every 60 seconds)
         * about a possible SYN flood, and derive the server's initial
         * sequence number by hashing and encrypting the client IP and
         * port, server IP and port, and the client's initial sequence
         * number. The third-step ACK is later checked against this isn.
         */
        isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
        /* cookie_v4_init_sequence generates the syncookie, used as the
         * starting sequence number of the SYN+ACK */
        req->cookie_ts = tmp_opt.tstamp_ok;
        if (!tmp_opt.tstamp_ok)
            inet_rsk(req)->ecn_ok = 0;
    }

    tcp_rsk(req)->snt_isn = isn;
    tcp_rsk(req)->txhash = net_tx_rndhash();
    tcp_openreq_init_rwin(req, sk, dst); /* set the initial rwnd */
    if (!want_cookie) {
        tcp_reqsk_record_syn(sk, req, skb); /* save the SYN if TCP_SAVE_SYN is set */
        fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
        /* after validation, create the fastopen sock and put the data
         * portion on its receive queue */
    }
    if (fastopen_sk) { /* fastopen sock validated and created: enters TCP_SYN_RECV */
        af_ops->send_synack(fastopen_sk, dst, &fl, req,
                            &foc, TCP_SYNACK_FASTOPEN); /* tcp_v4_send_synack */
        /* Add the child socket directly into the accept queue */
        inet_csk_reqsk_queue_add(sk, req, fastopen_sk); /* queue it for accept */
        sk->sk_data_ready(sk);
        bh_unlock_sock(fastopen_sk);
        sock_put(fastopen_sk);
    } else {
        tcp_rsk(req)->tfo_listener = false;
        if (!want_cookie)
            inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); /* insert into ehash and arm the timer */
        af_ops->send_synack(sk, dst, &fl, req, &foc,
                            !want_cookie ? TCP_SYNACK_NORMAL :
                                           TCP_SYNACK_COOKIE); /* tcp_v4_send_synack */
        if (want_cookie) {
            reqsk_free(req); /* with syncookies the req can be freed immediately */
            return 0;
        }
    }
    reqsk_put(req);
    return 0;

drop_and_release:
    dst_release(dst);
drop_and_free:
    reqsk_free(req);
drop:
    tcp_listendrop(sk);
    return 0;
}
EXPORT_SYMBOL(tcp_conn_request);

Finally, tcp_conn_request() puts the req into the ehash and arms the rsk_timer.
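A condensed sketch of that step, inet_csk_reqsk_queue_hash_add() and its helper, paraphrased from kernels of this era:

static void reqsk_queue_hash_req(struct request_sock *req,
                                 unsigned long timeout)
{
    req->num_retrans = 0;
    req->num_timeout = 0;
    req->sk = NULL;

    /* arm rsk_timer: reqsk_timer_handler retransmits the SYN+ACK with
     * backoff and eventually drops the request */
    setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
                       (unsigned long)req);
    mod_timer(&req->rsk_timer, jiffies + timeout);

    /* make the mini-socket visible to ehash lookups */
    inet_ehash_insert(req_to_sk(req), NULL);
    /* before letting lookups find us, make sure all req fields
     * are committed to memory and refcnt initialized.
     */
    smp_wmb();
    atomic_set(&req->rsk_refcnt, 2 + 1);
}

void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout)
{
    reqsk_queue_hash_req(req, timeout);
    inet_csk_reqsk_queue_added(sk); /* bump the listener's SYN-queue length */
}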