
Implementation of the TCP Congestion State Machine (Part 2)

March 10, 2014

Summary: This article analyzes, for the implementation of the TCP congestion state machine, the detailed process of entering, processing, and exiting each congestion state.

Kernel version: 2.6.37

Author: zhangskd @ csdn

 

Exiting Each State

 

Part E of tcp_fastretrans_alert()

Exit condition for every state: tp->snd_una >= tp->high_seq
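
For reference, here is the exit-check block (part E) of tcp_fastretrans_alert(), lightly abridged from the 2.6.37 source (tp and icsk are locals of that function); the five cases are discussed one by one below:

/* E. Check state exit conditions. State can be terminated
 *    when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
    WARN_ON(tp->retrans_out != 0);
    tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
    switch (icsk->icsk_ca_state) {
    case TCP_CA_Loss:
        icsk->icsk_retransmits = 0;
        if (tcp_try_undo_recovery(sk))
            return;
        break;

    case TCP_CA_CWR:
        /* CWR is to be held something *above* high_seq
         * is ACKed for CWR bit to reach receiver. */
        if (tp->snd_una != tp->high_seq) {
            tcp_complete_cwr(sk);
            tcp_set_ca_state(sk, TCP_CA_Open);
        }
        break;

    case TCP_CA_Disorder:
        tcp_try_undo_dsack(sk);
        if (!tp->undo_marker ||
            /* For SACK case do not Open to allow to undo
             * catching for all duplicate ACKs. */
            tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
            tp->undo_marker = 0;
            tcp_set_ca_state(sk, TCP_CA_Open);
        }
        break;

    case TCP_CA_Recovery:
        if (tcp_is_reno(tp))
            tcp_reset_reno_sack(tp);
        if (tcp_try_undo_recovery(sk))
            return;
        tcp_complete_cwr(sk);
        break;
    }
}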

 

(1) Open

Since Open is the normal state, there is nothing to exit from; it simply stays as it is.

 

(2) Loss

icsk->icsk_retransmits = 0; /* reset the RTO retransmission counter */

tcp_try_undo_recovery(sk);

Check whether an undo is needed; whether or not the undo succeeds, we return to the Open state. A sketch of tcp_try_undo_recovery() follows.
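
For reference, tcp_try_undo_recovery() and the tcp_may_undo() helper it relies on, abridged from the 2.6.37 source (the function is shared by the Loss and Recovery exits):

static int tcp_try_undo_recovery(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (tcp_may_undo(tp)) {
        int mib_idx;

        /* Happy end! We did not retransmit anything
         * or our original transmission succeeded.
         */
        tcp_undo_cwr(sk, 1); /* revert cwnd and ssthresh */
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
            mib_idx = LINUX_MIB_TCPLOSSUNDO;
        else
            mib_idx = LINUX_MIB_TCPFULLUNDO;

        NET_INC_STATS_BH(sock_net(sk), mib_idx);
        tp->undo_marker = 0;
    }
    if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
        /* Hold old state until something *above* high_seq
         * is ACKed. For Reno it is MUST to prevent false
         * fast retransmits (RFC2582). SACK TCP is safe. */
        tcp_moderate_cwnd(tp);
        return 1;
    }
    tcp_set_ca_state(sk, TCP_CA_Open);
    return 0;
}

/* Undo is possible when the retransmissions are provably unnecessary */
static inline int tcp_may_undo(struct tcp_sock *tp)
{
    return tp->undo_marker &&
           (!tp->undo_retrans || tcp_packet_delayed(tp));
}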

 

(3) CWR

If a sequence number greater than high_seq is ACKed, it indicates that the CWR indication has reached the peer TCP;

tcp_complete_cwr() is then called to bring cwnd down to the ssthresh value.

In tcp_complete_cwr(sk):

tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);

 

(4) Disorder

If SACK is enabled, tcp_try_undo_dsack(sk) is left to handle it (see the sketch below).

Otherwise, tp->undo_marker = 0;
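
For reference, tcp_try_undo_dsack() in 2.6.37 is roughly:

/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (tp->undo_marker && !tp->undo_retrans) {
        /* every retransmission was reported as a duplicate: undo */
        tcp_undo_cwr(sk, 1);
        tp->undo_marker = 0;
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
    }
}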

 

(5) Recovery

tcp_try_undo_recovery(sk);

In tcp_complete_cwr(sk):

tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);

 

/* Called when the CWR or Recovery state ends; reduces cwnd */ 

static inline void tcp_complete_cwr(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
    tp->snd_cwnd_stamp = tcp_time_stamp;
    tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}

 

Recovery State Processing

 

Part F of tcp_fastretrans_alert()

(1) Receiving a dupack

If the incoming ACK does not advance snd_una (i.e. it is a duplicate ACK) and SACK is not in use, then:

    sacked_out++, increasing the count of sacked packets.

    Check for reordering; if reordering is detected:

        correct sacked_out

        disable FACK (aside: this is actually redundant; without SACK, where would FACK come from?)

        update tp->reordering

/* Emulate SACKs for SACKless connection: account for a new dupack.*/
static void tcp_add_reno_sack(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    tp->sacked_out++; /* increase the count of sacked packets */
    tcp_check_reno_reordering(sk, 0); /* check for reordering */
    tcp_verify_left_out(tp);
}
 
/* If we receive more dupacks than we expected counting segments in 
 * assumption of absent reordering, interpret this as reordering.
 * The only other possible reason would be a bug in the receiver's TCP.
 * tcp_limit_reno_sacked() is the function that decides whether reordering occurred.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
    struct tcp_sock *tp = tcp_sk(sk);
    if (tcp_limit_reno_sacked(tp)) /* check whether sacked_out is excessive */
        /* if so, reordering occurred: update the reordering info */
        tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
 
/* Limit sacked_out so that sum with lost_out isn't ever larger than packets_out.
 * Returns zero if sacked_out adjustment wasn't necessary.
 * Checks whether sacked_out is too large; if so it is clamped, and 1 is
 * returned to indicate that reordering has occurred.
 * Q: How do we decide that reordering occurred?
 * A: A dupack may be caused by a loss, but also by reordering. If
 *    sacked_out + lost_out > packets_out, then sacked_out is inflated: dupacks
 *    caused by reordering were wrongly counted as segments SACKed by the peer.
 */
static int tcp_limit_reno_sacked(struct tcp_sock *tp)
{
    u32 holes;
    holes = max(tp->lost_out, 1U);
    holes = min(holes, tp->packets_out);
    if ((tp->sacked_out + holes) > tp->packets_out) {
        tp->sacked_out = tp->packets_out - holes;
        return 1;
    }
    return 0;
}
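
The helpers above use tcp_left_out() and tcp_verify_left_out(); for reference, their 2.6.37 definitions are roughly:

/* packets that have left the network but are not yet cumulatively ACKed */
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
    return tp->sacked_out + tp->lost_out;
}

/* left_out may never exceed packets_out */
#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)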

Updating the reordering information:

static void tcp_update_reordering(struct sock *sk, const int metric,
                                       const int ts)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (metric > tp->reordering) {
        int mib_idx;
        /* update reordering, capped at TCP_MAX_REORDERING */
        tp->reordering = min(TCP_MAX_REORDERING, metric);
        
        if (ts)
            mib_idx = LINUX_MIB_TCPTSREORDER;
        else if (tcp_is_reno(tp))
            mib_idx = LINUX_MIB_TCPRENOREORDER;
        else if (tcp_is_fack(tp))
            mib_idx = LINUX_MIB_TCPFACKREORDER;
        else 
            mib_idx = LINUX_MIB_TCPSACKREORDER;

        NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
        printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
                   tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
                   tp->reordering, tp->fackets_out, tp->sacked_out,
                   tp->undo_marker ? tp->undo_retrans : 0);
#endif
        tcp_disable_fack(tp); /* reordering occurred; using FACK now would be too aggressive */
    }
}
/* Packet counting of FACK is based on in-order assumptions, therefore
 * TCP disables it when reordering is detected.
 */

static void tcp_disable_fack(struct tcp_sock *tp)
{
    /* RFC3517 uses different metric in lost marker => reset on change */
    if (tcp_is_fack(tp))
        tp->lost_skb_hint = NULL;
    tp->rx_opt.sack_ok &= ~2; /* clear the FACK option */
}

(2) Receiving a partial ACK

do_lost = tcp_try_undo_partial(sk, pkts_acked);

In general do_lost will be true, unless an undo is needed.

For details, see the earlier post 《TCP拥塞窗口调整撤销剖析》.
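
For convenience, the body of tcp_try_undo_partial() in 2.6.37 is roughly:

/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, int acked)
{
    struct tcp_sock *tp = tcp_sk(sk);
    /* Partial ACK arrived. Force Hoe's retransmit. */
    int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);

    if (tcp_may_undo(tp)) {
        /* Plain luck! Hole was filled by a delayed
         * packet, rather than by a retransmit.
         */
        if (tp->retrans_out == 0)
            tp->retrans_stamp = 0;

        tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
        tcp_undo_cwr(sk, 0);
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);

        /* So... Do not make Hoe's retransmit yet.
         * If the first packet was delayed, the rest
         * ones are most probably delayed as well.
         */
        failed = 0;
    }
    return failed;
}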

 

(3) Leaving state F and marking lost segments

After (1) or (2) completes, we leave state F.

If there are lost packets, or the first packet in the send queue has timed out, tcp_update_scoreboard() is called to update the scoreboard:

it tags the lost segments with TCPCB_LOST and increases lost_out.

 

Checking whether the first packet in the send queue has timed out:

/* check whether the first packet in the send queue has timed out */
static inline int tcp_head_timeout(const struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    return tp->packets_out && 
                tcp_skb_timeout(sk, tcp_write_queue_head(sk));
}

/* check whether a given packet in the send queue has timed out */
static inline int tcp_skb_timeout(const struct sock *sk,
                 const struct sk_buff *skb)
{
    return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}

 

The scoreboard is updated to determine which segments are lost. "Scoreboard" refers to the sacked field in struct tcp_skb_cb, which stores each packet's state information.

(1) Without SACK: each dupack or partial ACK can mark only one packet as lost.

(2) With FACK: each dupack or partial ACK leads to one of two cases:

      If lost = fackets_out - reordering <= 0, reordering cannot be ruled out, but FACK is deliberately aggressive, so one packet is still marked as lost.

      If lost > 0, there has certainly been loss, and lost packets can be marked as lost in one go.

(3) With SACK but without FACK:

      If sacked_upto = sacked_out - reordering < 0, reordering cannot be ruled out, so no packet is marked as lost unless the fast-retransmit flag fast_rexmit is set, in which case one packet is marked.

      If sacked_upto >= 0, there has certainly been loss, and sacked_upto packets can be marked as lost in one go.

By default the kernel uses (2).

 

/* Account newly detected lost packet(s) */

static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
    struct tcp_sock *tp = tcp_sk(sk);
    if (tcp_is_reno(tp)) {
        /* mark only the head packet as lost; reno marks one packet at a time */
        tcp_mark_head_lost(sk, 1, 1);

    } else if (tcp_is_fack(tp)) {
        /* still allow for reordering: for the part that may be due to reordering, mark one packet at a time */
        int lost = tp->fackets_out - tp->reordering;
        if (lost <= 0)
            lost = 1;

        /* with FACK, multiple packets can be marked as lost */
        tcp_mark_head_lost(sk, lost, 0);

    } else {
        int sacked_upto = tp->sacked_out - tp->reordering;
        if (sacked_upto >= 0)
            tcp_mark_head_lost(sk, sacked_upto, 0);

        else if (fast_rexmit)
            tcp_mark_head_lost(sk, 1, 1);
    }

    /* check the send queue for timed-out packets and mark them as lost */
    tcp_timeout_skbs(sk);
}

Checking which packets in the send queue have timed out and marking them as lost:

static void tcp_timeout_skbs(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;

    if (!tcp_is_fack(tp) || !tcp_head_timeout(sk))
        return;

    skb = tp->scoreboard_skb_hint;

    if (tp->scoreboard_skb_hint == NULL)
        skb = tcp_write_queue_head(sk);

    tcp_for_write_queue_from(skb, sk) {
        if (skb == tcp_send_head(sk)) /* stop when we reach snd_nxt */
            break;

        if (!tcp_skb_timeout(sk, skb)) /* stop at the first packet that has not timed out */
            break;

        tcp_skb_mark_lost(tp, skb); /* mark as LOST and increase lost_out */
    }

    tp->scoreboard_skb_hint = skb;
    tcp_verify_left_out(tp);
}
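
tcp_skb_mark_lost(), used above, is defined roughly as follows in 2.6.37:

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
    if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_SACKED_ACKED))) {
        tcp_verify_retransmit_hint(tp, skb);

        tp->lost_out += tcp_skb_pcount(skb);
        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    }
}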

(4) Reducing snd_cwnd

The congestion window is reduced by one segment for every other acknowledged segment, i.e. every two ACKs decrease cwnd by 1, until the congestion window equals the slow-start threshold.

/* Decrease cwnd each second ack. */
static void tcp_cwnd_down(struct sock *sk, int flag)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int decr = tp->snd_cwnd_cnt + 1;

    if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
        (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
        tp->snd_cwnd_cnt = decr & 1; /* 0=>1, 1=>0 */

        decr >>= 1; /* same as the previous snd_cwnd_cnt: 0 or 1 */

        /* reduce cwnd */
        if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
            tp->snd_cwnd -= decr;

        /* Note: the intent here is not fully explained; it appears to clamp
         * cwnd to what is actually in flight plus one, preventing bursts. */
        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
        tp->snd_cwnd_stamp = tcp_time_stamp;
    }
    }
}

/* Lower bound on congestion window is slow start threshold
 * unless congestion avoidance choice decides to override it.
 */
static inline u32 tcp_cwnd_min(const struct sock *sk)
{
    const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
    return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
}

 

(5) Retransmitting the segments marked as lost

/* This gets called after a retransmit timeout, and the initially retransmitted data is 
 * acknowledged. It tries to continue resending the rest of the retransmit queue, until 
 * either we've sent it all or the congestion window limit is reached. If doing SACK, 
 * the first ACK which comes back for a timeout based retransmit packet might feed us 
 * FACK information again. If so, we use it to avoid unnecessary retransmissions.
 */

void tcp_xmit_retransmit_queue(struct sock *sk); /* body omitted */

This function decides which packets get sent; it is fairly involved and will be analyzed separately in a later post.

 

(6) When to enter the Recovery state

tcp_time_to_recover() is an important function: it decides when to enter the Recovery state.

/* This function decides, when we should leave Disordered state and enter Recovery
 * phase, reducing congestion window.
 * I.e., it decides when to leave the Disorder state and enter the Recovery state.
 */

static int tcp_time_to_recover(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    __u32 packets_out;

    /* Do not perform any recovery during F-RTO algorithm.
     * This means the Recovery state must not interrupt the Loss state.
     */
    if (tp->frto_counter)
        return 0;

    /* Trick#1: The loss is proven. 
     * If segments were lost in transit, we can enter the Recovery state.
     */
    if (tp->lost_out)
        return 1;
 
    /* Not-A-Trick#2: Classic rule...
     * If the number of duplicate ACKs exceeds the reordering threshold,
     * some packet was probably lost, so we can enter the Recovery state.
     */
    if (tcp_dupack_heuristics(tp) > tp->reordering)
        return 1;
 
    /* Trick#3: when we use RFC2988 timer restart, fast
     * retransmit can be triggered by timeout of queue head.
     * If the first packet in the send queue has timed out, enter Recovery.
     */
    if (tcp_is_fack(tp) && tcp_head_timeout(sk))
        return 1;

    /* Trick#4: It is still not OK... But will it be useful to delay recovery more?
     * If we cannot transmit (because of the application or the receive window)
     * yet have received many duplicate ACKs, there is no point in waiting:
     * assume loss and enter the Recovery state right away.
     */
    packets_out = tp->packets_out;
    if (packets_out <= tp->reordering &&
        tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
        !tcp_may_send_now(sk)) {
        /* We have nothing to send. This connection is limited
         * either by receiver window or by application.
         */
        return 1;
    }

    /* If a thin stream is detected, retransmit after first received
     * dupack. Employ only if SACK is supported in order to avoid 
     * possible corner-case series of spurious retransmissions
     * Use only if there are no unsent data.
     */
    if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
        tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
        tcp_is_sack(tp) && !tcp_send_head(sk))
        return 1;

    return 0; /* not yet time to recover */
}
/* Heuristics to calculate number of duplicate ACKs. There's no 
 * dupACKs counter when SACK is enabled (without SACK, sacked_out
 * is used for that purpose).
 * Instead, with FACK TCP uses fackets_out that includes both SACKed
 * segments up to the highest received SACK block so far and holes in
 * between them.
 *
 * With reordering, holes may still be in flight, so RFC3517 recovery uses
 * pure sacked_out (total number of SACKed segments) even though it
 * violates the RFC that uses duplicate ACKs; often these are equal, but
 * when e.g. out-of-window ACKs or packet duplication occurs, they differ.
 * Since neither occurs due to loss, TCP should really ignore them.
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
    return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}


/* Determines whether this is a thin stream (which may suffer from increased
 * latency). Used to trigger latency-reducing mechanisms.
 */
static inline unsigned int tcp_stream_is_thin(struct tcp_sock *tp)
{
    return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
}

#define TCP_INFINITE_SSTHRESH 0x7fffffff

static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
{
    return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
}

This function examines various parameters (such as the number of packets lost) for the TCP connection to decide

whether it is the right time to move to the Recovery state. It is time to recover when TCP heuristics suggest a

strong possibility of packet loss in the network; the following checks are made.

In short, once we are sure of loss, or loss is very likely, we can enter the Recovery state and start repairing it.

 

The conditions for entering the Recovery state are:

(1) Some packets are lost (lost_out is non-zero): loss has been detected.

(2) SACK is an acknowledgement for out-of-order packets. If the number of packets SACKed is greater than the

      reordering metric of the network, then loss is assumed to have happened.

      The FACKed count, or the number of duplicate ACKs received, exceeds the reordering threshold: loss is very likely.

(3) If the first packet waiting to be ACKed (the head of the write queue) has waited for a time equivalent to the

      retransmission timeout, the packet is assumed to have been lost.

(4) If the following three conditions are all true, the TCP sender is in a state where no more data can be transmitted

      and the number of packets SACKed is big enough to assume that the rest of the packets were lost in the network:

      A: The packets in flight are fewer than the reordering metric.

      B: More than half of the packets in flight have been SACKed by the receiver, or the number of packets SACKed is more

           than the fast-retransmit threshold. (The fast-retransmit threshold is the number of dupacks the sender awaits before

           fast retransmitting.)

      C: The sender cannot send any more packets because either it is bound by the sliding window or the application

           has not delivered any more data to it in anticipation of ACKs for the data already provided.

      That is, we have received many duplicate ACKs, so some segments have very likely been lost. If the receive window

      or the application prevents us from sending anyway, there is no point waiting, and we enter Recovery directly.

(5) When the flow is detected to be very thin (packets_out < 4), and all of the following hold:

      A: tp->thin_dupack == 1 /* Fast retransmit on first dupack */

           or sysctl_tcp_thin_dupack is 1, allowing a retransmission upon the first duplicate ACK;

      B: SACK is enabled, and the FACKed or SACKed count is greater than 1;

      C: there is no unsent data, i.e. tcp_send_head(sk) == NULL.

      This is a special case, used only when the flow is very thin.

 

(7) Setup upon entering Recovery

Save the data needed for undo:

tp->prior_ssthresh = tcp_current_ssthresh(sk); /* save the old threshold (in 2.6.37, skipped when the ACK carries ECE) */

tp->undo_marker = tp->snd_una; /* tracking retrans started here. */

tp->undo_retrans = tp->retrans_out; /* Retransmitted packets out */

Record the exit point:

tp->high_seq = tp->snd_nxt;

Reset variables:

tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);

tp->bytes_acked = 0;

tp->snd_cwnd_cnt = 0;

Enter the Recovery state:

tcp_set_ca_state(sk, TCP_CA_Recovery);
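
Put together, the corresponding block in tcp_fastretrans_alert(), abridged from 2.6.37 (mib_idx, icsk, and flag are locals or parameters of that function), reads roughly:

/* Otherwise enter Recovery state */
if (tcp_is_reno(tp))
    mib_idx = LINUX_MIB_TCPRENORECOVERY;
else
    mib_idx = LINUX_MIB_TCPSACKRECOVERY;
NET_INC_STATS_BH(sock_net(sk), mib_idx);

tp->high_seq = tp->snd_nxt;      /* record the exit point */
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;

if (icsk->icsk_ca_state < TCP_CA_CWR) {
    if (!(flag & FLAG_ECE))
        tp->prior_ssthresh = tcp_current_ssthresh(sk);
    tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    TCP_ECN_queue_cwr(tp);
}

tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);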

 

Loss State Processing

 

Part F of tcp_fastretrans_alert()

(1) Receiving a partial ACK

icsk->icsk_retransmits = 0; /* reset the RTO retransmission counter */

If reno is in use (no SACK), tp->sacked_out is reset to zero, as shown below.
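
The reset is done by tcp_reset_reno_sack(), which in 2.6.37 is simply:

/* Unmark all "SACKed" packets emulated for a SACKless connection */
static void tcp_reset_reno_sack(struct tcp_sock *tp)
{
    tp->sacked_out = 0;
    tcp_verify_left_out(tp);
}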

 

(2) Attempting an undo

tcp_try_undo_loss() is called. When timestamps show that a retransmission was unnecessary:

    remove the Loss mark from every segment on the scoreboard, so that new data is sent instead of retransmissions;

    call tcp_undo_cwr() to revert the adjustments to the congestion window and threshold.

Otherwise:

    tcp_moderate_cwnd() moderates the congestion window to prevent a burst of retransmissions;

    tcp_xmit_retransmit_queue() keeps retransmitting the lost segments.
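
A sketch of tcp_try_undo_loss(), abridged from the 2.6.37 source:

static int tcp_try_undo_loss(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (tcp_may_undo(tp)) {
        struct sk_buff *skb;

        /* clear the Loss mark of every segment on the scoreboard */
        tcp_for_write_queue(skb, sk) {
            if (skb == tcp_send_head(sk))
                break;
            TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
        }

        tcp_clear_all_retrans_hints(tp);

        tp->lost_out = 0;
        tcp_undo_cwr(sk, 1); /* revert cwnd and ssthresh */
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
        inet_csk(sk)->icsk_retransmits = 0;
        tp->undo_marker = 0;
        if (tcp_is_sack(tp))
            tcp_set_ca_state(sk, TCP_CA_Open);
        return 1;
    }
    return 0;
}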

 

Other State Processing

 

Part F of tcp_fastretrans_alert()

If tcp_time_to_recover(sk) returns false, i.e. we cannot enter the Recovery state, then the CWR, Disorder, or Open state is processed instead.

static void tcp_try_to_open(struct sock *sk, int flag)
{
    struct tcp_sock *tp = tcp_sk(sk);

    tcp_verify_left_out(tp);

    if (!tp->frto_counter && !tcp_any_retrans_done(sk))
        tp->retrans_stamp = 0; /* reset, since no undo is needed */

    /* decide whether to enter the CWR state */
    if (flag & FLAG_ECE)
        tcp_enter_cwr(sk, 1);
 
    if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { /* did not enter CWR */
        tcp_try_keep_open(sk); /* try to keep the Open state */
        tcp_moderate_cwnd(tp);

    } else { /* we are now in the CWR state */
        tcp_cwnd_down(sk, flag); /* decrease cwnd every second ACK */
    }
}

static void tcp_try_keep_open(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int state = TCP_CA_Open;
    
    /* decide whether to enter the Disorder state */
    if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
        state = TCP_CA_Disorder;

    if (inet_csk(sk)->icsk_ca_state != state) {
        tcp_set_ca_state(sk, state);
        tp->high_seq = tp->snd_nxt;
    }
}

 

(1) The CWR state

Q: When do we enter the CWR state?

A: When an incoming ACK carries the ECE flag, indicating that the receiver is asking the sender to perform explicit congestion control.

     @tcp_try_to_open():

     if (flag & FLAG_ECE)

         tcp_enter_cwr(sk, 1);

    The analysis of tcp_enter_cwr() can be found in the earlier post 《TCP拥塞状态变迁》; a sketch follows the list below.

    It mainly does the following:

    1. Resets the slow-start threshold.

    2. Clears the marks needed for undo, so undo is not allowed.

    3. Records the current highest sequence number (high_seq = snd_nxt), used to decide when to exit.

    4. Queues the CWR flag, to notify the receiver that the sender has reacted.

    5. Sets the state to TCP_CA_CWR.
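
As a reminder, tcp_enter_cwr() in 2.6.37 looks roughly like this:

void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
    struct tcp_sock *tp = tcp_sk(sk);

    tp->prior_ssthresh = 0;
    tp->bytes_acked = 0;
    if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
        tp->undo_marker = 0; /* undo is not allowed */
        if (set_ssthresh)
            tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
        tp->snd_cwnd = min(tp->snd_cwnd,
                           tcp_packets_in_flight(tp) + 1U);
        tp->snd_cwnd_cnt = 0;
        tp->high_seq = tp->snd_nxt; /* record the exit point */
        tp->snd_cwnd_stamp = tcp_time_stamp;
        TCP_ECN_queue_cwr(tp); /* tell the receiver we have reacted */

        tcp_set_ca_state(sk, TCP_CA_CWR);
    }
}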

 

Q: What action is taken while in CWR?

A: The congestion window is reduced by one segment for every other acknowledged segment, i.e. every two ACKs decrease cwnd by 1, until the congestion window equals the slow-start threshold.

     This is done by calling tcp_cwnd_down().

 

(2) The Disorder state

Q: When do we enter the Disorder state?

A: When SACKed packets or retransmitted packets are detected, we enter the Disorder state.

    Of course, by this point we have already ruled out entering the Loss or Recovery state.

    Condition: any of sacked_out, lost_out, retrans_out, or undo_marker is non-zero.

 

Q: What action is taken while in Disorder?

A: 1. The CA state is set to TCP_CA_Disorder.

     2. The current highest sequence number is recorded (high_seq = snd_nxt), used to decide when to exit.

     3. The congestion window is moderated, to prevent a burst of transmission (see the sketch below).

In the Disorder state TCP is still unsure whether the loss is genuine. After ACKs carrying SACK information, a

"clearing" ACK may arrive that indubitably acknowledges many packets in one go. Such a clearing ACK could cause a

packet burst in the network; to avoid this, the cwnd is reduced so that no more than max_burst (usually 3)

packets can be sent at once.
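
The moderation is done by tcp_moderate_cwnd(); in 2.6.37 it is roughly as follows (tcp_max_burst() returns tp->reordering, which defaults to 3):

/* Moderate cwnd so that a clearing ACK cannot trigger a burst
 * larger than in-flight data plus max_burst segments.
 */
static void tcp_moderate_cwnd(struct tcp_sock *tp)
{
    tp->snd_cwnd = min(tp->snd_cwnd,
                       tcp_packets_in_flight(tp) + tcp_max_burst(tp));
    tp->snd_cwnd_stamp = tcp_time_stamp;
}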

 

(3) The Open state

Since Open is the normal state and the end goal of all this processing, no extra handling is needed.

 
