转自:http://blog.chinaunix.net/uid-20662820-id-3371081.html
关于TCP connect 返回错误99,可以能大家都会遇到,这里就分析一下这个错误的真正含义:
基于内核2.6.32
应用层调用connect,对应的系统调用的套接口实现是inet_stream_connect,对应tcp协议对应的传输层接口是tcp_v4_connect, 这里就从这个函数作为入口,该函数定义在net/ipv4/tcp_ipv4.c文件中
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
__be32 daddr, nexthop;
int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
}
tmp = ip_route_connect(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk, 1);
if (tmp < 0) {
if (tmp == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return tmp;
}
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr;
if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
/*
* VJ's idea. We save last timestamp seen from
* the destination in peer table, when entering state
* TIME-WAIT * and initialize rx_opt.ts_recent from it,
* when trying new connection.
*/
if (peer != NULL &&
peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
inet->dport = usin->sin_port;
inet->daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
err = ip_route_newports(&rt, IPPROTO_TCP,
inet->sport, inet->dport, sk);
if (err)
goto failure;
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port);
inet->id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
rt = NULL;
if (err)
goto failure;
return 0;
failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}
这里进入第84行代码的函数inet_hash_connect 查找一个本地可用端口与服务器建立连接
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
__inet_check_established, __inet_hash_nolisten);
}
该函数实际上调用_inet_hash_connect
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **),
void (*hash)(struct sock *sk))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
const unsigned short snum = inet_sk(sk)->num;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
if (!snum) {
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (ib_net(tb) == net && tb->port == port) {
if (tb->fastreuse >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
port, &tw))
goto ok;
goto next_port;
}
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
}
tb->fastreuse = -1;
goto ok;
next_port:
spin_unlock(&head->lock);
}
local_bh_enable();
return -EADDRNOTAVAIL;
ok:
hint += i;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
hash(sk);
}
spin_unlock(&head->lock);
if (tw) {
inet_twsk_deschedule(tw, death_row);
inet_twsk_put(tw);
}
ret = 0;
goto out;
}
head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
hash(sk);
spin_unlock_bh(&head->lock);
return 0;
} else {
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
ret = check_established(death_row, sk, snum, NULL);
out:
local_bh_enable();
return ret;
}
}
注意第61行代码,这里返回了EADDRNOTAVAIL错误。
这里分析一下这个函数实现:
调用inet_get_local_port_range(&low, &high) 获取可用的端口列表,这个值就是/proc/sys/net/ipv4/ip_local_port_range 中的值。
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
*low = sysctl_local_ports.range[0];
*high = sysctl_local_ports.range[1];
} while (read_seqretry(&sysctl_local_ports.lock, seq));
}
然后内核在这个范围内选择一个可用的端口作为本地端口去connect服务器,如果没有可用的端口可用,比如这个范围内的端口都处于如下状态中的一种:
1. bind使用的端口
2. 端口处于非TIME_WAIT状态
3. 端口处于TIME_WAIT状态,但是没有启用tcp_tw_reuse
那么就会返回EADDRNOTAVAIL错误。
一般情况下,出现这个错误应该是代码设计的问题,如果确定代码没有问题,那么根据上面的原则,可用使用如下方法解决问题:
1. 增大可选端口的范围,修改/proc/sys/net/ipv4/ip_local_port_range的值。
2. 开启tcp_tw_reuse,允许使用TIME_WAIT状态的端口。