socket connect tcp_v4_connect
阅读原文时间:2023年07月09日阅读:4

tcp_v4_connect

/* This will initiate an outgoing connection.
tcp_v4_connect函数初始化一个对外的连接请求,创建一个SYN包并发送出去,
把套接字的状态从CLOSE切换到SYN_SENT,初始化TCP部分选项数据包序列号、
窗口大小、MSS、套接字传送超时等*/
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;

if (addr\_len < sizeof(struct sockaddr\_in))  
    return -EINVAL;

if (usin->sin\_family != AF\_INET)  
    return -EAFNOSUPPORT;  
//是否设置源路由选项

nexthop = daddr = usin->sin\_addr.s\_addr;  
inet\_opt = rcu\_dereference\_protected(inet->inet\_opt,  
                     sock\_owned\_by\_user(sk));  
if (inet\_opt && inet\_opt->opt.srr) {  
    if (!daddr)  
        return -EINVAL;  
    nexthop = inet\_opt->opt.faddr;  
}  

/*
根据目的ip、目的端口、网络设备接口调用ip_route_connect选路由,
路由结构保存到rt->rt_dst中,实际调用的函数是ip_route_output_flow,
如果是广播地址、组地址就返回

*/
orig_sport = inet->inet_sport;
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk, true);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}

if (rt->rt\_flags & (RTCF\_MULTICAST | RTCF\_BROADCAST)) {  
    ip\_rt\_put(rt);  
    return -ENETUNREACH;  
}

if (!inet\_opt || !inet\_opt->opt.srr)  
    daddr = fl4->daddr;

if (!inet->inet\_saddr)  
    inet->inet\_saddr = fl4->saddr;  
inet->inet\_rcv\_saddr = inet->inet\_saddr;

if (tp->rx\_opt.ts\_recent\_stamp && inet->inet\_daddr != daddr) {  
    /\* Reset inherited state \*/  
    tp->rx\_opt.ts\_recent       = 0;  
    tp->rx\_opt.ts\_recent\_stamp = 0;  
    if (likely(!tp->repair))  
        tp->write\_seq       = 0;  
}  
////获取套接字最近使用的时间

if (tcp\_death\_row.sysctl\_tw\_recycle &&  
    !tp->rx\_opt.ts\_recent\_stamp && fl4->daddr == daddr)  
    tcp\_fetch\_timewait\_stamp(sk, &rt->dst);

inet->inet\_dport = usin->sin\_port;  
inet->inet\_daddr = daddr;

inet\_csk(sk)->icsk\_ext\_hdr\_len = 0;  
if (inet\_opt)  
    inet\_csk(sk)->icsk\_ext\_hdr\_len = inet\_opt->opt.optlen;

tp->rx\_opt.mss\_clamp = TCP\_MSS\_DEFAULT;

/\* Socket identity is still unknown (sport may be zero).  
 \* However we set state to SYN-SENT and not releasing socket  
 \* lock select source port, enter ourselves into the hash tables and  
 \* complete initialization after this.  
 调用tcp\_set\_state设置套接字状态为TCP\_SYN\_SENT,本把套接字sk加入到连接管理哈希链表中,  
 为连接分配一个临时端口  
 \*/  
tcp\_set\_state(sk, TCP\_SYN\_SENT);  
//将套接字sk放入TCP连接管理哈希链表中 同时 Bind a port  
//绑定IP地址和端口,并将socket加入到连接表中  
err = inet\_hash\_connect(&tcp\_death\_row, sk);  
if (err)  
    goto failure;

rt = ip\_route\_newports(fl4, rt, orig\_sport, orig\_dport,  
               inet->inet\_sport, inet->inet\_dport, sk);  
if (IS\_ERR(rt)) {  
    err = PTR\_ERR(rt);  
    rt = NULL;  
    goto failure;  
}  
/\* OK, now commit destination to socket.  \*/  
sk->sk\_gso\_type = SKB\_GSO\_TCPV4;  
sk\_setup\_caps(sk, &rt->dst);

if (!tp->write\_seq && likely(!tp->repair))  
    tp->write\_seq = secure\_tcp\_sequence\_number(inet->inet\_saddr,  
                           inet->inet\_daddr,  
                           inet->inet\_sport,  
                           usin->sin\_port);

inet->inet\_id = tp->write\_seq ^ jiffies;  

/*
初始化第一个序列号,调用tcp_connect函数完成建立连接,
包括发送SYN,tcp_connect将创建号的SYN数据段加入到套接字发送队列,
最后调用tcp_transmit_skb数据包发送到IP层。

*/
if (likely(!tp->repair))
err = tcp_connect(sk);
else
err = tcp_repair_connect(sk);

rt = NULL;  
if (err)  
    goto failure;

return 0;

failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}

/*
 * inet_hash_connect - bind a port for a connect operation and hash the socket.
 * @death_row: TIME-WAIT death row passed through to the worker
 * @sk:        socket being connected
 *
 * Thin wrapper around __inet_hash_connect(): supplies the port offset from
 * inet_sk_port_offset(sk), __inet_check_established as the uniqueness check
 * and __inet_hash_nolisten as the hashing callback.
 *
 * Returns the result of __inet_hash_connect() unchanged.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	const int rc = __inet_hash_connect(death_row, sk,
					   inet_sk_port_offset(sk),
					   __inet_check_established,
					   __inet_hash_nolisten);

	return rc;
}

/*
 * __inet_hash_connect - choose/bind a local port for connect() and hash @sk.
 * @death_row:         TIME-WAIT death row; supplies the hash info
 * @sk:                socket being connected
 * @port_offset:       offset mixed into the starting point of the port scan
 * @check_established: callback that checks whether (sk, port) would clash
 *                     with an existing connection; 0 means "unique"
 * @hash:              callback that inserts @sk into the connection hash
 *
 * If the socket has no local port (inet_num == 0), scans the local port
 * range for a usable port, binds the socket to it and hashes it.  If a port
 * is already bound, either hashes the socket directly (when it is the only
 * owner of its bind bucket) or defers to @check_established.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if no port in the range is usable,
 * or the result of @check_established on the already-bound path.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	/* Number of inet_twsk_put() calls owed on a recycled TIME-WAIT sock. */
	int twrefcnt = 1;

	if (!snum) {	/* no local port bound yet */
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 *
			 * A socket may have been bound to a port either by
			 * the bind() system call or by this function during
			 * an earlier connect().
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
								port, &tw))
						goto ok;
					goto next_port;
				}
			}

			/* No bucket exists for this port: it is unused. */
			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled.
		 * Attach the socket to the owner list of the port's bucket,
		 * i.e. associate the socket with the port.
		 */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			/* Not in the connection hash yet: record the source
			 * port and insert the socket.
			 */
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	/* Bind bucket already recorded on the socket (read only, not added). */
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);

	/*
	 * When the condition below is false the else branch re-checks
	 * availability, so a successful bind() does not by itself guarantee
	 * that the port can be used for this connection.
	 */
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* sk is the only socket bound to the port: no conflict check. */
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		/* Drop only the lock; bottom halves stay disabled until out:. */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

创建一个套接字,设置SO_REUSEADDR选项,建立连接后立即关闭,关闭后立即又重复同样的过程,发现在第二次调用connect()的时候返回EADDRNOTAVAIL错误
可以看到返回EADDRNOTAVAIL错误的有两种情况:
   1、在TIME_WAIT传输控制块中找到匹配的端口,并且twsk_unique()返回false时
   2、在除TIME_WAIT和LISTEN状态外的传输控制块中存在匹配的端口。
  第二种情况很容易理解:只要状态为FIN_WAIT_1、ESTABLISHED等的传输控制块使用的端口和要查找的匹配,就会返回EADDRNOTAVAIL错误。
第一种情况还要取决于twsk_unique()的返回值(返回true时可以复用该TIME_WAIT端口,返回false时才返回错误)

__inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:
1、如果没有选取端口则选定一个;

2、将socket与端口绑定;

3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。

  另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突
而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”的冲突。
__inet_hash_connect检查冲突的函数是__inet_check_established:

/*
 * __inet_check_established - check that (sk, lport) is a unique connection
 * and, if so, insert @sk into the established hash.
 * @death_row: TIME-WAIT death row; supplies the hash info
 * @sk:        socket being connected
 * @lport:     candidate local port (host byte order)
 * @twp:       if non-NULL, receives the matching TIME-WAIT socket that is
 *             being recycled (or NULL); if NULL, such a socket is
 *             descheduled and released here
 *
 * Called with local bh disabled.
 *
 * Returns 0 when the identity is unique (the socket has been hashed), or
 * -EADDRNOTAVAIL when an established socket matches, or a TIME-WAIT socket
 * matches and twsk_unique() refuses to recycle it.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	/* The local receive address is used as daddr and the peer address
	 * as saddr for the lookup below.
	 */
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	/* Bucket of the connection table this identity hashes to. */
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first.
	 * The TIME-WAIT chain is scanned before the established chain; a
	 * non-recyclable match in either one makes the identity unusable.
	 */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		/* Hand the recycled TIME-WAIT socket to the caller. */
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
 在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:

static void __inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;

if (sk->sk\_state != TCP\_LISTEN) {  
    \_\_inet\_hash\_nolisten(sk, NULL);  
    return;  
}

WARN\_ON(!sk\_unhashed(sk));  
ilb = &hashinfo->listening\_hash\[inet\_sk\_listen\_hashfn(sk)\];

spin\_lock(&ilb->lock);  
\_\_sk\_nulls\_add\_node\_rcu(sk, &ilb->head);  
sock\_prot\_inuse\_add(sock\_net(sk), sk->sk\_prot, 1);  
spin\_unlock(&ilb->lock);  

}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
spinlock_t *lock;
struct inet_ehash_bucket *head;
int twrefcnt = 0;

WARN\_ON(!sk\_unhashed(sk));

sk->sk\_hash = inet\_sk\_ehashfn(sk);  
head = inet\_ehash\_bucket(hashinfo, sk->sk\_hash);  
list = &head->chain;  
lock = inet\_ehash\_lockp(hashinfo, sk->sk\_hash);

spin\_lock(lock);  
\_\_sk\_nulls\_add\_node\_rcu(sk, list);  
if (tw) {  
    WARN\_ON(sk->sk\_hash != tw->tw\_hash);  
    twrefcnt = inet\_twsk\_unhash(tw);  
}  
spin\_unlock(lock);  
sock\_prot\_inuse\_add(sock\_net(sk), sk->sk\_prot, 1);  
return twrefcnt;  

}

/*
 * inet_ehash_bucket - locate the ehash bucket for a hash value.
 * @hashinfo: hash tables to look in
 * @hash:     hash value; masked with hashinfo->ehash_mask to get the index
 *
 * Returns a pointer to the selected bucket inside hashinfo->ehash.
 */
static inline struct inet_ehash_bucket *inet_ehash_bucket(
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
{
	const unsigned int slot = hash & hashinfo->ehash_mask;

	return hashinfo->ehash + slot;
}
/*
 * Summary of the functions above: a socket in TCP_LISTEN is added by
 * __inet_hash() to sk->sk_prot->h.hashinfo->listening_hash, while a
 * connecting (client) socket is added to sk->sk_prot->h.hashinfo->ehash.
 * NOTE(review): the original notes state that for both TCPv4 and TCPv6
 * sk->sk_prot->h.hashinfo points to tcp_hashinfo - not visible here, confirm.
 */