本文主要分析在TCP擁塞狀態機的實現中，函數tcp_fastretrans_alert()的實現，並對一些相關函數也做了介紹。
變量介紹
這些變量都在include/linux/tcp.h中聲明，在net/ipv4/tcp.c中被賦初值。
u32 packets_out; /* Packets that have been sent out but not yet acknowledged. */
u32 sacked_out;
/* Packets, which arrived to receiver out of order and hence not ACKed.
* With SACK this number is simply amount of SACKed data. Even without
* SACKs it is easy to give pretty reliable estimate of this number, counting
* duplicate ACKs.
* The text above is the kernel's own description of sacked_out. It is best
* read as two cases, depending on whether the SACK option is enabled:
* with SACK enabled, the value is the number of out-of-order packets that
* have been SACKed;
* without SACK, the value is the number of duplicate ACKs seen. See
* tcp_add_reno_sack(), which increments sacked_out for each dupack.
*/
u32 fackets_out;/* Per the article: the sum of SACKed and lost packets,
* fackets_out = lost_out + sacked_out.
* NOTE(review): confirm this against the kernel's FACK accounting.
*/
tcp_fastretrans_alert()函數被調用條件
(1) 每一個到來的ACK，其狀態不是Open.
(2) ACK不是普通ack，即是：
- SACK，
- Duplicate ACK，
- ECE ECN
tcp_fastretrans_alert()函數實現細節
@kernel version 3.12/net/ipv4/tcp_input.c
/* Drive the congestion-control state machine for one incoming ACK.
 *
 * The function sanitises the SACK/FACK counters, checks for SACK reneging,
 * handles state exit once snd_una has reached high_seq, processes the
 * current state (Recovery, Loss, or the remaining states), and finally
 * marks lost segments, reduces cwnd and retransmits.
 *
 * sk:            socket whose state is being updated.
 * pkts_acked:    forwarded to tcp_try_undo_partial() in Recovery.
 * prior_sacked:  sacked_out value before this ACK; used together with
 * prior_packets: packets_out value before this ACK, to compute
 *                newly_acked_sacked.
 * is_dupack:     true when the ACK is a duplicate ACK.
 * flag:          FLAG_* bits describing the ACK (FLAG_ECE,
 *                FLAG_DATA_SACKED, FLAG_SND_UNA_ADVANCED are tested here).
 */
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
int prior_sacked, int prior_packets, bool is_dupack, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* is_dupack marks a duplicate ACK; FLAG_DATA_SACKED means the SACK
 * information covered new data.
 */
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
int newly_acked_sacked = 0;
int fast_rexmit = 0;
/* If packets_out is 0 but sacked_out is not, force sacked_out back to 0. */
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
/* If sacked_out is 0, fackets_out must be 0 as well. */
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required.
* Clearing prior_ssthresh forbids a later cwnd undo, so the reduction sticks.
*/
if (flag & FLAG_ECE)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs.
* Per the article, a reneging SACK means the ack_seq of the newly received
* ACK points into an already recorded SACK block, i.e. the recorded SACK
* information does not reflect the receiver's real state.
*/
if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Check consistency of the current state.
* The packets counted as left out must stay within packets_out.
* NOTE(review): the exact bound is enforced inside tcp_verify_left_out().
*/
tcp_verify_left_out(tp);
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed.
* In TCP_CA_Open there should be no retransmitted packets outstanding.
*/
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0; /* Clear the retransmit timestamp. */
/* snd_una >= high_seq: the connection may now leave its current state
 * and head back towards Open.
 */
} else if (!before(tp->snd_una, tp->high_seq)) {
/* The different state values describe different network conditions
 * (the article refers to a separate blog post for the details).
 */
switch (icsk->icsk_ca_state) {
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
/* snd_una > high_seq: finish the cwnd reduction and return to Open. */
if (tp->snd_una != tp->high_seq) {
inet_csk(sk)->icsk_retrans_ops->end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
if (tcp_is_reno(tp)) /* SACK is not in use */
tcp_reset_reno_sack(tp); /* per the article: resets sacked_out to 0 */
if (tcp_try_undo_recovery(sk)) /* try to undo the recovery */
return;
/* End fast retransmit: finish the cwnd reduction. */
inet_csk(sk)->icsk_retrans_ops->end_cwnd_reduction(sk);
break;
}
}
/* Handling of ACKs that are not ordinary ACKs. */
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
/* FLAG_SND_UNA_ADVANCED means snd_una moved forward. */
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
/* Without SACK, a duplicate ACK is accounted in sacked_out. */
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else
/* tcp_try_undo_partial() is described elsewhere in the article. */
do_lost = tcp_try_undo_partial(sk, pkts_acked);
/* Work out how much was newly ACKed or SACKed by this ACK. */
newly_acked_sacked = prior_packets - tp->packets_out +
tp->sacked_out - prior_sacked;
break;
/* Handling after a retransmission timeout. */
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp); /* per the article: resets sacked_out to 0 */
if (is_dupack)
tcp_add_reno_sack(sk);
}
/* Work out how much was newly ACKed or SACKed by this ACK. */
newly_acked_sacked = prior_packets - tp->packets_out +
tp->sacked_out - prior_sacked;
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag, newly_acked_sacked);
return;
}
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);/* plain retransmit, without the backoff mechanism */
return;
}
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, (flag & FLAG_ECE)); /* enter the Recovery state */
fast_rexmit = 1;/* flag that this is a fast retransmit */
}
/* Tag segments as lost. */
if (do_lost || (tcp_is_fack(tp) && tcp_head_timeout(sk))) {
/* Update the scoreboard: mark lost and timed-out segments. */
tcp_update_scoreboard(sk, fast_rexmit);
}
/* Reduce cwnd. */
inet_csk(sk)->icsk_retrans_ops->cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
/* Retransmit the segments that carry the lost mark. */
tcp_xmit_retransmit_queue(sk);
}
tcp_add_reno_sack()函數
/* SACK emulation for connections without SACK: record one more duplicate
 * ACK in sacked_out, then run the reordering check and the left-out
 * consistency check.
 */
static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* One more dupack seen. */
	tp->sacked_out += 1;

	tcp_check_reno_reordering(sk, 0);
	tcp_verify_left_out(tp);
}
tcp_check_reno_reordering()函數
/* Reno reordering detection.
 *
 * If tcp_limit_reno_sacked() had to clamp sacked_out, more dupacks arrived
 * than the segment counts allow for without reordering; treat that as
 * reordering and update the reordering metric with packets_out + addend.
 * (The only other explanation would be a bug in the receiver's TCP.)
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Nothing to do when sacked_out was already within bounds. */
	if (!tcp_limit_reno_sacked(tp))
		return;

	tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
tcp_limit_reno_sacked()函數
/* Clamp sacked_out so that sacked_out plus the number of holes never
 * exceeds packets_out. The hole count is lost_out, but at least 1 and at
 * most packets_out.
 *
 * Returns true when sacked_out had to be lowered, false when it was
 * already within bounds.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	/* At least one hole is always assumed. */
	u32 gap = tp->lost_out ? tp->lost_out : 1U;

	/* ... and never more holes than packets in flight. */
	if (gap > tp->packets_out)
		gap = tp->packets_out;

	if ((tp->sacked_out + gap) <= tp->packets_out)
		return false;

	tp->sacked_out = tp->packets_out - gap;
	return true;
}
tcp_update_scoreboard()函數
/* Record newly detected losses on the scoreboard.
 *
 * The marking strategy depends on the loss-detection mode:
 *  - Reno (no SACK): mark only the head segment.
 *  - FACK: mark fackets_out - reordering segments, but at least one.
 *  - plain SACK: mark up to sacked_out - reordering SACKed segments; if
 *    that difference is negative, mark the head only on a fast retransmit.
 * Afterwards tcp_timeout_skbs() is always run.
 */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_reno(tp)) {
		tcp_mark_head_lost(sk, 1, 1);
	} else if (tcp_is_fack(tp)) {
		int excess = tp->fackets_out - tp->reordering;

		/* Always mark at least one segment. */
		tcp_mark_head_lost(sk, excess > 0 ? excess : 1, 0);
	} else {
		int above = tp->sacked_out - tp->reordering;

		if (above < 0) {
			if (fast_rexmit)
				tcp_mark_head_lost(sk, 1, 1);
		} else {
			tcp_mark_head_lost(sk, above, 0);
		}
	}

	tcp_timeout_skbs(sk);
}
tcp_mark_head_lost()函數
/* Detect loss in event "A" above by marking head of queue up as lost.
 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
 * has at least tp->reordering SACKed segments above it; "packets" refers to
 * the maximum SACKed segments to pass before reaching this limit.
 * high_seq：可以標記為lost的段序號的最大值。
 * mark_head：為1表示只需要標志發送隊列的第一個段。
 */
/* Walk the write queue from its head (or from the cached lost_skb_hint)
 * and mark segments as lost.
 *
 * sk:        socket whose write queue is scanned.
 * packets:   counting limit; once cnt exceeds it the walk stops, possibly
 *            after fragmenting the current skb.
 * mark_head: non-zero to mark at most one segment; if a hint exists that
 *            is not the queue head, the function returns without marking.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt, oldcnt;
int err;
unsigned int mss;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
/* The number of lost packets cannot exceed the number sent out. */
WARN_ON(packets > tp->packets_out);
/* A hint is cached, so some segments have been processed before. */
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;/* resume the walk from the hinted segment */
cnt = tp->lost_cnt_hint;/* count accumulated up to the hint */
/* Head already handled? */
/* A hint exists but it is not the first skb of the write queue: return. */
if (mark_head && skb != tcp_write_queue_head(sk))
return;
} else {
skb = tcp_write_queue_head(sk);/* start from the first skb of the write queue */
cnt = 0;/* nothing counted yet */
}
tcp_for_write_queue_from(skb, sk) {/* walk the queue starting at skb */
if (skb == tcp_send_head(sk))
break;/* reached the send head (snd_nxt per the article): stop */
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;/* remember how many were counted so far */
/* loss_high is the upper bound for marking: a segment whose end_seq lies
 * beyond it is not marked.
 */
if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
oldcnt = cnt;
if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);/* count this skb (FACK, Reno, or already SACKed) */
/* Decides when to stop the walk. */
if (cnt > packets) {
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
/* Split the skb so only the first (packets - oldcnt) segments are marked. */
mss = skb_shinfo(skb)->gso_size;
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
if (err < 0)
break;
cnt = packets;
}
tcp_skb_mark_lost(tp, skb);
if (mark_head)/* only one segment had to be marked, so we are done */
break;
}
tcp_verify_left_out(tp);
}
tcp_skb_mark_lost()函數
/* Mark one skb as lost unless it already carries TCPCB_LOST or
 * TCPCB_SACKED_ACKED. Marking updates the retransmit hint, adds the skb's
 * packet count to lost_out and sets the TCPCB_LOST bit.
 */
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	/* Already lost or already SACKed: nothing to account. */
	if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_SACKED_ACKED))
		return;

	tcp_verify_retransmit_hint(tp, skb);
	tp->lost_out += tcp_skb_pcount(skb);
	TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}