接收入口

tcp_v4_rcv

    |--> tcp_v4_do_rcv

               |-> tcp_rcv_state_process

                         |-> tcp_rcv_synsent_state_process

1. 状态为ESTABLISHED时,用tcp_rcv_established()接收处理。
2. 状态为LISTEN时,说明这个sock处于监听状态,用于被动打开的接收处理,包括SYN和ACK。
3. 当状态不为ESTABLISHED或TIME_WAIT时,用tcp_rcv_state_process()处理。

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
/*
 * Dispatch one received TCP segment to the handler that matches the
 * socket state:
 *   - TCP_ESTABLISHED: tcp_rcv_established() (fast path);
 *   - TCP_LISTEN: tcp_v4_hnd_req() first, then tcp_child_process() when
 *     it returned a socket other than sk;
 *   - every other case that reaches the end: tcp_rcv_state_process().
 *
 * Returns 0 on every path. When a handler returns non-zero, a reset is
 * sent with tcp_v4_send_reset() and the skb is freed here.
 *
 * Author's note (caller behaviour, not visible in this function): once
 * the TCP layer has done basic validation of a segment and found the
 * transport control block (sock) that should handle it, this function is
 * called for the actual processing unless the socket is in FIN_WAIT_2 or
 * TIME_WAIT.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;   /* socket on whose behalf a reset is sent */
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 
        sock_rps_save_rxhash(sk, skb->rxhash);
        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
            /* Non-zero result: answer with a reset for this socket. */
            rsk = sk;
            goto reset;
        }
        TCP_CHECK_TIMER(sk);
        return 0;
    }

    /*
     * Not ESTABLISHED: reject segments shorter than their own TCP header
     * or for which tcp_checksum_complete() returns non-zero.
     */
    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) { /* Server side. Per the author, the segment is the first-step SYN or the third-step ACK of the three-way handshake. */
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) { /* nsk differs from the listening ('parent') sk: per the author this is the 'child' sk of a third-step ACK. For a first-step SYN nsk is the 'parent' sk itself and this branch is skipped. */
            if (tcp_child_process(sk, nsk, skb)) { /* per the author, this also ends up calling tcp_rcv_state_process() */
                rsk = nsk;
                goto reset;
            }
            return 0; /* third handshake step: finished here */
        } /* first-step SYN: continue with the code below */
    } else
        sock_rps_save_rxhash(sk, skb->rxhash);

    /*
     * Reached for every state other than ESTABLISHED, and for LISTEN when
     * tcp_v4_hnd_req() returned sk itself. The author's examples: a client
     * receiving SYN+ACK, or a server (listening socket) receiving a SYN.
     */
    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
    /* falls through into discard: the skb is freed after the reset */
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    /* Count the segment in TCP_MIB_INERRS, then drop it. */
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}

当客户端调用connect()之后,sock进入TCP_SYN_SENT状态,并插入到ehash中;如果是阻塞socket,则connect()会等待握手完成。
本文考虑收到服务端SYNACK的过程,也就是客户端握手的第二阶段。

发送SYN段后,连接的状态变为SYN_SENT。此时如果收到SYNACK段,处理函数为tcp_rcv_state_process()。

对于协议栈的接收路径,

  • tcp_v4_rcv
    • ->__inet_lookup_skb() //在ehash中找到TCP_SYN_SENT状态的sk
    • ->!sock_owned_by_user() //connect()即使阻塞也不占有锁
      • ->!tcp_prequeue() //对于synack,不会排入prequeue队列
      • ->tcp_v4_do_rcv()
        • ->tcp_rcv_state_process() //进入TCP_SYN_SENT状态处理逻辑
          • -> tcp_rcv_synsent_state_process

整体代码先折叠(注意:下面的tcp_rcv_state_process()只有sk和skb两个参数,而上文tcp_v4_do_rcv()中调用它时传了四个参数,说明两段代码取自不同的内核版本)

/*
 * State-machine receive handler: processes one incoming segment
 * according to sk->sk_state.
 *
 * sk:  the socket receiving the segment.
 * skb: the received segment; on the paths through "discard" it is handed
 *      to tcp_drop(), on the LISTEN/SYN path to consume_skb().
 *
 * Returns 0 when the segment has been dealt with here, and 1 when the
 * segment is not acceptable; the tcp_v4_do_rcv() listing in this article
 * answers a non-zero return with a reset. In TCP_SYN_SENT a non-negative
 * result of tcp_rcv_synsent_state_process() is returned unchanged.
 *
 * NOTE(review): this version takes (sk, skb) only, whereas the
 * tcp_v4_do_rcv() listing above calls it with four arguments, so the two
 * listings appear to come from different kernel versions.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcphdr *th = tcp_hdr(skb);
    struct request_sock *req;
    int queued = 0;     /* set to 1 once tcp_data_queue() has taken the skb */
    bool acceptable;    /* true when tcp_ack() returned > 0 */

    switch (sk->sk_state) {
    case TCP_CLOSE:
        goto discard;

    case TCP_LISTEN:
        /* Server side: handling an incoming SYN. */
        /*
         * In the LISTEN state only SYN segments are processed. For an
         * ACK segment no connection is being established yet, so 1 is
         * returned; the caller of tcp_rcv_state_process() then sends a
         * RST to the peer. A RST segment is simply dropped.
         */
        if (th->ack)
            return 1;

        if (th->rst)
            goto discard;

        if (th->syn) {
            if (th->fin)
                goto discard;
            /*
             * The SYN is handled through the conn_request hook. Per the
             * author this is tcp_v4_conn_request() for IPv4 TCP, and
             * icsk_af_ops is initialised when the socket is created
             * (the author refers to tcp_v4_init_sock()).
             */
             /* Per the author: for the first-step SYN of the handshake,
                tcp_v4_conn_request() creates the connection request
                block (request_sock).
                */
            if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)/* author: ipv4_specific -> tcp_v4_conn_request */
                return 1;

            consume_skb(skb);
            return 0;
        }
        goto discard;

    case TCP_SYN_SENT:/* client side: per the author, a SYN+ACK arrives here */
    /*
     * A sock in TCP_SYN_SENT is handled by tcp_rcv_synsent_state_process().
     * Author's summary of that function (it is not shown in this listing):
     *  - parse the TCP options to learn what the server supports, e.g.
     *    SACK, TFO, wscale, MSS, timestamps;
     *  - if the segment carries an ACK, run tcp_ack(); Fast Open may have
     *    acknowledged data sent earlier;
     *  - call tcp_finish_connect(): TCP_SYN_SENT -> TCP_ESTABLISHED;
     *  - save the Fast Open cookie if one is present;
     *  - decide between an immediate and a delayed ACK;
     *  - if the segment has SYN but no ACK, this is a simultaneous
     *    connect: TCP_SYN_SENT -> TCP_SYN_RECV, and a SYN-ACK is sent.
     */
        tp->rx_opt.saw_tstamp = 0;
        queued = tcp_rcv_synsent_state_process(sk, skb, th);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }

    tp->rx_opt.saw_tstamp = 0;
    req = tp->fastopen_rsk;
    if (req) {
        /* A pending Fast Open request is only expected in these two states. */
        WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
            sk->sk_state != TCP_FIN_WAIT1);

        if (!tcp_check_req(sk, skb, req, true))
            goto discard;
    }

    /* Segments with none of ACK, RST or SYN set are dropped. */
    if (!th->ack && !th->rst && !th->syn)
        goto discard;

    /* If tcp_validate_incoming() rejects the segment, return 0 (no reset requested). */
    if (!tcp_validate_incoming(sk, skb, th, 0))
        return 0;
/*
 * Process the ACK flag: a tcp_ack() result > 0 marks the ACK as
 * acceptable (per the author, a normal third-handshake segment).
 */
    /* step 5: check the ACK field */
    acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
                      FLAG_UPDATE_TS_RECENT) > 0;
/*
 * ACK handling for FIN_WAIT_1 (see that case below): once all sent data
 * has been acknowledged the socket moves to FIN_WAIT_2. If no wait in
 * that state is wanted (linger2 < 0), or the segment carries data beyond
 * rcv_nxt, the connection is closed at once. Otherwise the wait time is
 * compared with TCP_TIMEWAIT_LEN: if it is larger, the FIN_WAIT_2 timer
 * is armed; if not, the socket is handed over to TIME_WAIT handling with
 * substate FIN_WAIT_2, which (per the author) arms the TIME_WAIT timer.
 */
    switch (sk->sk_state) {
    case TCP_SYN_RECV:/* author: initial state of the new connection when the handshake completes */
        if (!acceptable)
            return 1;

        if (!tp->srtt_us)
            tcp_synack_rtt_meas(sk, req);
        /* Per the author, this point is reached from tcp_child_process()
         * in tcp_v4_do_rcv(); before tcp_child_process() a new struct sock
         * is created through tcp_check_req().
         *
         * Once we leave TCP_SYN_RECV, we no longer need req
         * so release it.
         */
        if (req) {
            tp->total_retrans = req->num_retrans;
            reqsk_fastopen_remove(sk, req, false);    /* release the Fast Open req */
        } else {
            /* Make sure socket is routed, for correct metrics. */
            icsk->icsk_af_ops->rebuild_header(sk);
            tcp_init_congestion_control(sk);

            tcp_mtup_init(sk);
            tp->copied_seq = tp->rcv_nxt;
            tcp_init_buffer_space(sk);
        }
        smp_mb();
        tcp_set_state(sk, TCP_ESTABLISHED);/* TCP_SYN_RECV -> TCP_ESTABLISHED */
        sk->sk_state_change(sk);/* author: sock_def_wakeup(), wakes up epoll */
/*
 * Author's note: sock_init_data() contains
 *   sk->sk_state_change    =    sock_def_wakeup;
 *   sk->sk_data_ready      =    sock_def_readable;
 *   sk->sk_write_space     =    sock_def_write_space;
 *   sk->sk_error_report    =    sock_def_error_report;
 *   sk->sk_destruct        =    sock_def_destruct;
 */
/* Author: epoll then calls ep_send_events->ep_scan_ready_list->ep_send_events_proc->ep_item_poll->tcp_poll */
 /*
  * (Author's note on the tcp_set_state() call above: the "child"
  * transport control block is set to ESTABLISHED.)
  */
        /* Note, that this wakeup is only for marginal crossed SYN case.
         * Passively open sockets are not waked up, because
         * sk->sk_sleep == NULL and sk->sk_socket == NULL.
         */
         /*
          * Signal the processes that will send data through this socket
          * that it can now be written to.
          * Author: sk_state_change()->sock_def_wakeup()->ep_poll_callback()
          * adds the socket to epoll's ready list and wakes the blocked
          * epoll; epoll then calls
          * ep_send_events->ep_scan_ready_list->ep_send_events_proc->ep_item_poll->tcp_poll
          */

        if (sk->sk_socket)
            sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
     /*
      * Initialise send-side fields from this ACK: snd_una, the send
      * window (scaled by snd_wscale) and the window-update state. If
      * tstamp_ok is set, advmss is reduced by TCPOLEN_TSTAMP_ALIGNED.
      * NOTE(review): the author's original note also mentioned
      * computing the retransmission timeout when the smoothed RTT is
      * zero; this listing does not do that here.
      */
        tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
        tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
        tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

        if (tp->rx_opt.tstamp_ok)
            tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

        if (req) {
            /* Re-arm the timer because data may have been sent out.
             * This is similar to the regular data transmission case
             * when new data has just been ack'ed.
             *
             * (TFO) - we could try to be more aggressive and
             * retransmitting any data sooner based on when they
             * are sent out.
             */
            tcp_rearm_rto(sk);
        } else
            tcp_init_metrics(sk);
/*
 * NOTE(review): the author's next two notes seem to describe the
 * rebuild_header() / tcp_init_congestion_control() / tcp_mtup_init()
 * calls made earlier in this case rather than the code that follows:
 *  - set up the route for this socket and initialise the congestion
 *    control module;
 *  - initialise the members related to path MTU.
 */
        tcp_update_pacing_rate(sk);
/*
 * Update the time of the most recent transmission.
 */
        /* Prevent spurious tcp_cwnd_restart() on first data packet */
        tp->lsndtime = tcp_time_stamp;

        tcp_initialize_rcv_mss(sk);
        /*
         * Author: compute the flags used for TCP header prediction.
         */
        tcp_fast_path_on(tp);
        break;

    case TCP_FIN_WAIT1: {
        struct dst_entry *dst;
        int tmo;    /* value returned by tcp_fin_time() */

        /* If we enter the TCP_FIN_WAIT1 state and we are a
         * Fast Open socket and this is the first acceptable
         * ACK we have received, this would have acknowledged
         * our SYNACK so stop the SYNACK timer.
         */
        if (req) {
            /* Return RST if ack_seq is invalid.
             * Note that RFC793 only says to generate a
             * DUPACK for it but for TCP Fast Open it seems
             * better to treat this case like TCP_SYN_RECV
             * above.
             */
            if (!acceptable)
                return 1;
            /* We no longer need the request sock. */
            reqsk_fastopen_remove(sk, req, false);
            tcp_rearm_rto(sk);
        }        /* sent data not yet fully acknowledged: stay in FIN_WAIT_1 */
        if (tp->snd_una != tp->write_seq)
            break;

        tcp_set_state(sk, TCP_FIN_WAIT2); /* enter FIN_WAIT_2 */
        sk->sk_shutdown |= SEND_SHUTDOWN;/* shut down the sending side */

        dst = __sk_dst_get(sk);
        if (dst)/* confirm the cached route */
            dst_confirm(dst);

        if (!sock_flag(sk, SOCK_DEAD)) {
            /* Wake up lingering close() */
            sk->sk_state_change(sk); /* socket is not DEAD: the state changed, wake up waiters */
            break;
        }
 /* linger2 < 0: no waiting in FIN_WAIT_2 */
        if (tp->linger2 < 0 || /* or the segment carries data (data, fin) beyond the expected sequence number */
            (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
            tcp_done(sk);/* close the connection */
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
            return 1;
        }

        tmo = tcp_fin_time(sk); /* FIN_WAIT_2 wait time */
        if (tmo > TCP_TIMEWAIT_LEN) {  /* longer than TCP_TIMEWAIT_LEN: arm the FIN_WAIT_2 (keepalive) timer for the excess */
            inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
        } else if (th->fin || sock_owned_by_user(sk)) {
            /* Bad case. We could lose such FIN otherwise.
             * It is not a big problem, but it looks confusing
             * and not so rare event. We still can lose it now,
             * if it spins in bh_lock_sock(), but it is really
             * marginal case.
             */ /* FIN present, or socket locked by a user process: arm the FIN_WAIT_2 timer */
            inet_csk_reset_keepalive_timer(sk, tmo);
        } else { /* wait time does not exceed TCP_TIMEWAIT_LEN: hand over to TIME_WAIT handling */
            tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
            goto discard;
        }
        break;
    }

    case TCP_CLOSING:
        /* All sent data acknowledged: move to TIME_WAIT. */
        if (tp->snd_una == tp->write_seq) {
            tcp_time_wait(sk, TCP_TIME_WAIT, 0);
            goto discard;
        }
        break;

    case TCP_LAST_ACK:
        /* All sent data acknowledged: the connection is finished. */
        if (tp->snd_una == tp->write_seq) {
            tcp_update_metrics(sk);
            tcp_done(sk);
            goto discard;
        }
        break;
    }

    /* step 6: check the URG bit */
    tcp_urg(sk, skb, th);
/*
 * Author's note: what happens next in FIN_WAIT_2 is driven by one of
 * these triggers:
 * (1) a segment arrives before the FIN_WAIT_2 timer expires;
 * (2) the FIN_WAIT_2 timer expires;
 * (3) a segment arrives before the TIME_WAIT timer expires;
 * (4) the TIME_WAIT timer expires.
 */
    /* step 7: process the segment text */
    switch (sk->sk_state) {
    case TCP_CLOSE_WAIT:
    case TCP_CLOSING:
    case TCP_LAST_ACK:
        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
            break;
        /* Fall through: seq is before rcv_nxt */
    case TCP_FIN_WAIT1:
    case TCP_FIN_WAIT2:/* author: a segment arrived before the FIN_WAIT_2 timer expired; if it carries FIN the socket goes straight to TIME_WAIT */
        /* RFC 793 says to queue data in these states,
         * RFC 1122 says we MUST send a reset.
         * BSD 4.4 also does reset.
         */
        if (sk->sk_shutdown & RCV_SHUTDOWN) {
            if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                tcp_reset(sk);
                return 1;
            }
        }
        /* Fall through */
    case TCP_ESTABLISHED:
        tcp_data_queue(sk, skb); /* process the payload, if any; author's example: the client set defer-accept */
        queued = 1;
        break;
    }

    /* tcp_data could move socket to TIME-WAIT */
    if (sk->sk_state != TCP_CLOSE) {
        tcp_data_snd_check(sk);/* give pending data a chance to be sent (author: tcp_push_pending_frame) */
        tcp_ack_snd_check(sk);/* author: check for a deferred ACK and decide whether to send it now */
    }

    if (!queued) {
discard:
        tcp_drop(sk, skb);
    }
    return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
View Code

相关文章:

  • 2021-07-13
  • 2022-01-19
  • 2022-01-19
  • 2022-12-23
  • 2021-09-07
  • 2021-06-01
猜你喜欢
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
  • 2022-01-20
  • 2022-12-23
  • 2022-12-23
  • 2021-12-30
相关资源
相似解决方案