tcp_v4_connect
/* This will initiate an outgoing connection.
tcp_v4_connect函数初始化一个对外的连接请求,创建一个SYN包并发送出去,
把套接字的状态从CLOSE切换到SYN_SENT,初始化TCP部分选项数据包序列号、
窗口大小、MSS、套接字传送超时等*/
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
if (addr\_len < sizeof(struct sockaddr\_in))
return -EINVAL;
if (usin->sin\_family != AF\_INET)
return -EAFNOSUPPORT;
//是否设置源路由选项
nexthop = daddr = usin->sin\_addr.s\_addr;
inet\_opt = rcu\_dereference\_protected(inet->inet\_opt,
sock\_owned\_by\_user(sk));
if (inet\_opt && inet\_opt->opt.srr) {
if (!daddr)
return -EINVAL;
nexthop = inet\_opt->opt.faddr;
}
/*
根据目的ip、目的端口、网络设备接口调用ip_route_connect选路由,
路由结构保存到rt->rt_dst中,实际调用的函数是ip_route_output_flow,
如果是广播地址、组地址就返回
*/
orig_sport = inet->inet_sport;
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk, true);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
if (rt->rt\_flags & (RTCF\_MULTICAST | RTCF\_BROADCAST)) {
ip\_rt\_put(rt);
return -ENETUNREACH;
}
if (!inet\_opt || !inet\_opt->opt.srr)
daddr = fl4->daddr;
if (!inet->inet\_saddr)
inet->inet\_saddr = fl4->saddr;
inet->inet\_rcv\_saddr = inet->inet\_saddr;
if (tp->rx\_opt.ts\_recent\_stamp && inet->inet\_daddr != daddr) {
/\* Reset inherited state \*/
tp->rx\_opt.ts\_recent = 0;
tp->rx\_opt.ts\_recent\_stamp = 0;
if (likely(!tp->repair))
tp->write\_seq = 0;
}
////获取套接字最近使用的时间
if (tcp\_death\_row.sysctl\_tw\_recycle &&
!tp->rx\_opt.ts\_recent\_stamp && fl4->daddr == daddr)
tcp\_fetch\_timewait\_stamp(sk, &rt->dst);
inet->inet\_dport = usin->sin\_port;
inet->inet\_daddr = daddr;
inet\_csk(sk)->icsk\_ext\_hdr\_len = 0;
if (inet\_opt)
inet\_csk(sk)->icsk\_ext\_hdr\_len = inet\_opt->opt.optlen;
tp->rx\_opt.mss\_clamp = TCP\_MSS\_DEFAULT;
/\* Socket identity is still unknown (sport may be zero).
\* However we set state to SYN-SENT and not releasing socket
\* lock select source port, enter ourselves into the hash tables and
\* complete initialization after this.
调用tcp\_set\_state设置套接字状态为TCP\_SYN\_SENT,本把套接字sk加入到连接管理哈希链表中,
为连接分配一个临时端口
\*/
tcp\_set\_state(sk, TCP\_SYN\_SENT);
//将套接字sk放入TCP连接管理哈希链表中 同时 Bind a port
//绑定IP地址和端口,并将socket加入到连接表中
err = inet\_hash\_connect(&tcp\_death\_row, sk);
if (err)
goto failure;
rt = ip\_route\_newports(fl4, rt, orig\_sport, orig\_dport,
inet->inet\_sport, inet->inet\_dport, sk);
if (IS\_ERR(rt)) {
err = PTR\_ERR(rt);
rt = NULL;
goto failure;
}
/\* OK, now commit destination to socket. \*/
sk->sk\_gso\_type = SKB\_GSO\_TCPV4;
sk\_setup\_caps(sk, &rt->dst);
if (!tp->write\_seq && likely(!tp->repair))
tp->write\_seq = secure\_tcp\_sequence\_number(inet->inet\_saddr,
inet->inet\_daddr,
inet->inet\_sport,
usin->sin\_port);
inet->inet\_id = tp->write\_seq ^ jiffies;
/*
初始化第一个序列号,调用tcp_connect函数完成建立连接,
包括发送SYN,tcp_connect将创建号的SYN数据段加入到套接字发送队列,
最后调用tcp_transmit_skb数据包发送到IP层。
*/
if (likely(!tp->repair))
err = tcp_connect(sk);
else
err = tcp_repair_connect(sk);
rt = NULL;
if (err)
goto failure;
return 0;
failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}
/*
 * Bind a port for a connect operation and hash it.
 *
 * Convenience wrapper around __inet_hash_connect() that supplies the
 * per-socket port offset from inet_sk_port_offset() together with
 * __inet_check_established as the uniqueness check and
 * __inet_hash_nolisten as the insertion callback.
 *
 * Returns whatever __inet_hash_connect() returns.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	int ret;

	ret = __inet_hash_connect(death_row, sk,
				  inet_sk_port_offset(sk),
				  __inet_check_established,
				  __inet_hash_nolisten);
	return ret;
}
/*
 * __inet_hash_connect - choose/verify the local port for connect() and hash sk
 * @death_row:         timewait death row; also supplies the hash tables
 * @sk:                socket being connected
 * @port_offset:       per-socket offset mixed into the port search start
 * @check_established: callback deciding whether (sk, port) is unique among
 *                     existing connections; returns 0 when it is
 * @hash:              callback that inserts sk into the connection table and
 *                     returns the number of timewait references to drop
 *
 * If the socket has no local port yet (inet_num == 0), walk the local port
 * range looking for a usable port, bind the socket to it and hash it.
 * Otherwise the socket already owns a bind bucket: hash it directly when it
 * is the only owner, or fall back to @check_established.
 *
 * Returns 0 on success, -EADDRNOTAVAIL when no port could be found, or the
 * result of @check_established for an already-bound socket.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {	/* no local port bound yet */
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 *
			 * Buckets created by this function are marked
			 * fastreuse = -1 (see below).  Buckets with
			 * fastreuse >= 0 are skipped; presumably those were
			 * created through bind() - confirm against
			 * inet_csk_get_port(), which is not shown here.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
								port, &tw))
						goto ok;
					goto next_port;
				}
			}

			/* No bucket exists for this port yet: create one. */
			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* Advance the shared hint so later searches start elsewhere. */
		hint += i;

		/* Head lock still held and bh's disabled.
		 * Attach the socket to the bucket for the chosen port.
		 */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			/* Not in the connection table yet: record the source
			 * port and let the hash callback insert the socket.
			 */
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			/* Drop every timewait reference accumulated above. */
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	/* Bind bucket the socket is already attached to. */
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	/*
	 * If sk is the first owner of the bucket and has no successor it is
	 * the only socket on this port, so no conflict check is needed.
	 * Otherwise the established check below may still reject the port,
	 * i.e. an already-bound port is not guaranteed to be usable for this
	 * connection.
	 */
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		/* Bottom halves stay disabled until local_bh_enable() below. */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
创建一个套接字,设置SO_REUSEADDR选项,建立连接后立即关闭,关闭后立即又重复同样的过程,发现在第二次调用connect()的时候返回EADDRNOTAVAIL错误
可以看到返回EADDRNOTAVAIL错误的有两种情况:
1、在TIME_WAIT传输控制块中找到匹配的端口,并且twsk_unique()返回false(即该TIME_WAIT套接字不可复用)时
2、在除TIME_WAIT和LISTEN状态外的传输块中存在匹配的端口。
第二种情况很容易理解:只要状态为FIN_WAIT_1、ESTABLISHED等的传输控制块使用的端口与要查找的端口匹配,就会返回EADDRNOTAVAIL错误。
第一种情况还要取决于twsk_unique()的返回值
__inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:
1、如果没有选取端口则选定一个;
2、将socket与端口绑定;
3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。
另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突
而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”的冲突。
__inet_hash_connect检查冲突的函数是__inet_check_established:
/* called with local bh disabled */
/*
 * __inet_check_established - check that (sk, lport) names a unique connection
 * @death_row: timewait death row; supplies the hash tables
 * @sk:        socket being connected
 * @lport:     candidate local port, host byte order
 * @twp:       optional out-parameter receiving the matching TIME-WAIT
 *             socket that is being replaced (may be NULL)
 *
 * Searches the hash bucket for the 4-tuple, first in the TIME-WAIT chain and
 * then in the established chain.  A TIME-WAIT match is acceptable only if
 * twsk_unique() says so; any match in the established chain is a conflict.
 * When the tuple is unique the socket's port fields and hash are recorded and
 * the socket is inserted into the established chain.
 *
 * Returns 0 if the tuple is unique, -EADDRNOTAVAIL otherwise.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	/*
	 * Note the swap: "daddr" holds our local address and "saddr" the
	 * peer's address - presumably so the key matches what a lookup for
	 * an incoming packet would use; confirm against INET_MATCH.
	 */
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	/* Bucket of the connection table this tuple hashes to. */
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first.
	 * A match here is only tolerated when twsk_unique() accepts it.
	 */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
				  saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	/* No TIME-WAIT match: nothing to recycle below. */
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		/* Caller takes over the TIME-WAIT socket. */
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:
static void __inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
if (sk->sk\_state != TCP\_LISTEN) {
\_\_inet\_hash\_nolisten(sk, NULL);
return;
}
WARN\_ON(!sk\_unhashed(sk));
ilb = &hashinfo->listening\_hash\[inet\_sk\_listen\_hashfn(sk)\];
spin\_lock(&ilb->lock);
\_\_sk\_nulls\_add\_node\_rcu(sk, &ilb->head);
sock\_prot\_inuse\_add(sock\_net(sk), sk->sk\_prot, 1);
spin\_unlock(&ilb->lock);
}
int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
spinlock_t *lock;
struct inet_ehash_bucket *head;
int twrefcnt = 0;
WARN\_ON(!sk\_unhashed(sk));
sk->sk\_hash = inet\_sk\_ehashfn(sk);
head = inet\_ehash\_bucket(hashinfo, sk->sk\_hash);
list = &head->chain;
lock = inet\_ehash\_lockp(hashinfo, sk->sk\_hash);
spin\_lock(lock);
\_\_sk\_nulls\_add\_node\_rcu(sk, list);
if (tw) {
WARN\_ON(sk->sk\_hash != tw->tw\_hash);
twrefcnt = inet\_twsk\_unhash(tw);
}
spin\_unlock(lock);
sock\_prot\_inuse\_add(sock\_net(sk), sk->sk\_prot, 1);
return twrefcnt;
}
/*
 * Map a hash value to its bucket in hashinfo->ehash by masking the hash
 * with hashinfo->ehash_mask.
 */
static inline struct inet_ehash_bucket *inet_ehash_bucket(
		struct inet_hashinfo *hashinfo,
		unsigned int hash)
{
	return hashinfo->ehash + (hash & hashinfo->ehash_mask);
}
/*
 * Summary: a listening (server-side) socket is placed in
 * sk->sk_prot->h.hashinfo->listening_hash by __inet_hash(), while a
 * connecting (client-side) socket is placed in
 * sk->sk_prot->h.hashinfo->ehash by __inet_hash_nolisten().
 * According to the original author's note, for both TCPv4 and TCPv6
 * sk->sk_prot->h.hashinfo points to tcp_hashinfo (not verifiable from the
 * code shown here).
 */
手机扫一扫
移动阅读更方便
你可能感兴趣的文章