目录
背景:TCP 连接建立流程
在 TCP 三次握手中:
- 客户端调用 connect() → 发送 SYN → 套接字状态变为 TCP_SYN_SENT。
- 服务端收到 SYN → 回复 SYN+ACK → 套接字状态变为 TCP_NEW_SYN_RECV。
- 客户端收到 SYN+ACK → 发送 ACK → 连接建立成功(状态变为 TCP_ESTABLISHED)。在收到 SYN+ACK 之前,客户端一直处于 TCP_SYN_SENT 状态;此时若收到任何 TCP 报文(包括合法或非法的),最终都由 tcp_rcv_synsent_state_process() 来处理。
- 服务端收到 ACK → 套接字状态变为 TCP_ESTABLISHED。
本文接下来将分析客户端收到 SYN+ACK 的场景。
关键数据结构
- sk->sk_state:当前 TCP 状态(此处为 TCP_SYN_SENT)
- tp->snd_wl1: 上次更新窗口时skb的序列号
- tp->snd_wnd: 滑动窗口的大小,即当前本端被对端允许发送的最大数据量
核心逻辑详解
- tcp_v4_rcv[1]收到数据以后,先根据源、目的 ip 与 port 在tcp_hashinfo表中查找对应的sk。之前发送connect请求时已经将sk放入tcp_hashinfo。由于三次握手还未正式完成,sock_owned_by_user[2]会返回false,流程进入到tcp_v4_do_rcv函数中。
inttcp_v4_rcv(struct sk_buff *skb){structnet *net = dev_net(skb->dev);structsk_buff *skb_to_free;int sdif = inet_sdif(skb);int dif = inet_iif(skb);conststructiphdr *iph;conststructtcphdr *th;bool refcounted;structsock *sk;int ret; ………… th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb);lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted);if (!sk)goto no_tcp_socket; ………… th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; …………if (!sock_owned_by_user(sk)) { skb_to_free = sk->sk_rx_skb_cache; sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); } else {if (tcp_add_backlog(sk, skb))goto discard_and_relse; skb_to_free = NULL; } bh_unlock_sock(sk);if (skb_to_free) __kfree_skb(skb_to_free);put_and_return:if (refcounted) sock_put(sk);return ret; …………}
- 此场景中 sk->sk_state == TCP_SYN_SENT,tcp_v4_do_rcv的核心作用是根据套接字状态调用tcp_rcv_state_process,并最终调用tcp_rcv_synsent_state_process进行处理。
inttcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb){structsock *rsk; …………if (tcp_rcv_state_process(sk, skb)) { rsk = sk;goto reset; }return0; …………}
inttcp_rcv_state_process(struct sock *sk, struct sk_buff *skb){structtcp_sock *tp = tcp_sk(sk);structinet_connection_sock *icsk = inet_csk(sk);conststructtcphdr *th = tcp_hdr(skb);structrequest_sock *req;int queued = 0;bool acceptable;switch (sk->sk_state) { …………case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th);if (queued >= 0)return queued;/* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); tcp_data_snd_check(sk);return0; } ………… }
tcp_rcv_synsent_state_process函数主要处理syn ack的响应。它首先对ack序列号进行范围检查,接着判断本次syn ack是否非法,然后调用tcp_ack确认syn ack消息,设置sk->sk_state为TCP_ESTABLISHED,通过sk->sk_state_change来唤醒等待连接成功的客户端应用程序,最后调用tcp_send_ack给tcp服务器端回复ack。其中最重要的处理函数是tcp_ack。
staticinttcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,const struct tcphdr *th){structinet_connection_sock *icsk = inet_csk(sk);structtcp_sock *tp = tcp_sk(sk);structtcp_fastopen_cookiefoc = { .len = -1 };int saved_clamp = tp->rx_opt.mss_clamp;bool fastopen_fail; tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset;if (th->ack) {/* rfc793: * "If the state is SYN-SENT then * first check the ACK bit * If the ACK bit is set * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" */if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {/* Previous FIN/ACK or RST/ACK might be ignored. */if (icsk->icsk_retransmits == 0) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_MIN, TCP_RTO_MAX);goto reset_and_undo; }if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);goto reset_and_undo; }/* Now ACK is acceptable. * * "If the RST bit is set * If the ACK was acceptable then signal the user "error: * connection reset", drop the segment, enter CLOSED state, * delete TCB, and return." */if (th->rst) { tcp_reset(sk);goto discard; }/* rfc793: * "fifth, if neither of the SYN or RST bits is set then * drop the segment and return." * * See note below! * --ANK(990513) */if (!th->syn)goto discard_and_undo;/* rfc793: * "If the SYN bit is on ... * are acceptable then ... * (our SYN has been ACKed), change the connection * state to ESTABLISHED..." */ tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH);/* Ok.. it's good. Set up sequence numbers and * move to established. 
*/ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;/* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ tp->snd_wnd = ntohs(th->window);if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp, 65535U); }if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1; tp->tcp_header_len =sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; tcp_store_ts_recent(tp); } else { tp->tcp_header_len = sizeof(struct tcphdr); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk);/* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); smp_mb(); tcp_finish_connect(sk, skb); fastopen_fail = (tp->syn_fastopen || tp->syn_data) && tcp_rcv_fastopen_synack(sk, skb, &foc);if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); }if (fastopen_fail)return-1;if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || inet_csk_in_pingpong_mode(sk)) {/* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * * It may be deleted, but with this feature tcpdumps * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);discard: tcp_drop(sk, skb);return0; } else { tcp_send_ack(sk); }return-1; } …………}
- tcp_ack函数是整个tcp协议的核心处理函数,它主要功能有:一、调用tcp_ack_update_window更新tp->snd_wnd与tp->snd_una。二、如果ack序列号没有乱序,则将确认的skb从重传列表中删除。三、如果ack是乱序,则调用tcp_sacktag_write_queue对乱序报文进行处理,主要是对乱序确认的skb设置相应标志,进行相应的统计。四、对丢失报文进行拥塞处理,比如快速重传与快速恢复。
staticinttcp_ack(struct sock *sk, const struct sk_buff *skb, int flag){structinet_connection_sock *icsk = inet_csk(sk);structtcp_sock *tp = tcp_sk(sk);structtcp_sacktag_statesack_state;structrate_samplers = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una;bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq;int num_dupack = 0;int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost;int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs;/* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node);/* If the ack is older than previous acks * then we can probably ignore it. */if (before(ack, prior_snd_una)) {/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */if (before(ack, prior_snd_una - tp->max_window)) {if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk, skb);return-1; }goto old_ack; }/* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). */if (after(ack, tp->snd_nxt))return-1;if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0;#if IS_ENABLED(CONFIG_TLS_DEVICE)if (static_branch_unlikely(&clean_acked_data_enabled.key))if (icsk->icsk_clean_acked) icsk->icsk_clean_acked(sk, ack);#endif } prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp);/* ts_recent update must be made after we are sure that the packet * is in window. 
*/if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED) { ………… } else { u32 ack_ev_flags = CA_ACK_SLOWPATH;if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA;else NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state);if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; ack_ev_flags |= CA_ACK_ECE; }if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; tcp_in_ack_event(sk, ack_ev_flags); }/* This is a deviation from RFC3168 since it states that: * "When the TCP data sender is ready to set the CWR bit after reducing * the congestion window, it SHOULD set the CWR bit only on the first * new data packet that it transmits." * We accept CWR on pure ACKs to be more robust * with widely-deployed TCP implementations that do this. */ tcp_ecn_accept_cwr(sk, skb);/* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_jiffies32;if (!prior_packets)goto no_queue;/* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); tcp_rack_update_reo_wnd(sk, &rs);if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag);/* If needed, reset TLP/RTO timer; RACK may later override this. 
*/if (flag & FLAG_SET_XMIT_TIMER) tcp_set_xmit_timer(sk);if (tcp_ack_is_dubious(sk, flag)) {if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { num_dupack = 1;/* Consider if pure acks were aggregated in tcp_add_backlog() */if (!(flag & FLAG_DATA)) num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); } tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); }if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit);return1; …………}
参考资料
[1] linux内核版本: linux5.7.8。
[2] 注1: 用于判断sock是否被用户态占用,其主要目的是为解决用户态与内核态对sock的竞争问题。