One document matched: draft-zimmermann-tcpm-reordering-reaction-02.xml


<?xml version="1.0" encoding="US-ASCII"?>
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
    <!-- Normative References -->
    <!ENTITY rfc0793 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0793.xml">
    <!ENTITY rfc2018 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2018.xml">
    <!ENTITY rfc2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
    <!ENTITY rfc3042 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3042.xml">
    <!ENTITY rfc4653 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4653.xml">
    <!ENTITY rfc5681 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5681.xml">
    <!ENTITY rfc6582 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6582.xml">
    <!ENTITY rfc6675 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6675.xml">
    <!ENTITY rfc6928 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6928.xml">
    <!ENTITY rfc6937 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6937.xml">

    <!-- Informative References -->
    <!ENTITY rfc0896 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0896.xml">
    <!ENTITY rfc1122 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1122.xml">
    <!ENTITY rfc2861 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2861.xml">
    <!ENTITY rfc2960 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2960.xml">
    <!ENTITY rfc3522 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3522.xml">
    <!ENTITY rfc3708 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3708.xml">
    <!ENTITY rfc4015 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4015.xml">
    <!ENTITY rfc5682 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5682.xml">
    <!ENTITY rfc5827 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5827.xml">

    <!ENTITY blanton-tcp-reordering SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.blanton-tcp-reordering.xml">

    <!-- declare nbsp and friends -->
    <!ENTITY nbsp "&#160;">
    <!ENTITY nbhy "&#8209;">
    <!ENTITY wj "&#8288;">
]>

<?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs),
     please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds
     might want to use. (Here they are set differently than their defaults in
     xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="3"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e., [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->

<rfc ipr="trust200902" category="exp" obsoletes="4653"
docName="draft-zimmermann-tcpm-reordering-reaction-02">
<!-- category values: std, bcp, info, exp, and historic
     ipr values: full3667, noModification3667, noDerivatives3667
     you can add the attributes updates="NNNN" and obsoletes="NNNN"
     they will automatically be output with "(if approved)" -->

    <!-- FRONT MATTER -->
    <front>
        <title abbrev="TCP-aNCR"> Making TCP Adaptively Robust to
        Non-Congestion Events</title>

        <author initials="A" surname="Zimmermann"
        fullname="Alexander Zimmermann">
            <organization>NetApp, Inc.</organization>
            <address>
                <postal>
                    <street>Sonnenallee 1</street>
                    <city>Kirchheim</city>
                    <code>85551</code>
                    <country>Germany</country>
                </postal>
                <phone>+49 89 900594712</phone>
                <email>alexander.zimmermann@netapp.com</email>
            </address>
        </author>

        <author initials="L" surname="Schulte"
        fullname="Lennart Schulte">
            <organization>Aalto University</organization>
            <address>
                <postal>
                    <street>Otakaari 5 A</street>
                    <city>Espoo</city>
                    <code>02150</code>
                    <country>Finland</country>
                </postal>
                <phone>+358 50 4355233</phone>
                <email>lennart.schulte@aalto.fi</email>
            </address>
        </author>

        <author initials="C" surname="Wolff"
        fullname="Carsten Wolff">
            <organization>credativ GmbH</organization>
            <address>
                <postal>
                    <street>Hohenzollernstrasse 133</street>
                    <city>Moenchengladbach</city>
                    <code>41061</code>
                    <country>Germany</country>
                </postal>
                <phone>+49 2161 4643 182</phone>
                <email>carsten.wolff@credativ.de</email>
            </address>
        </author>

        <author initials="A" surname="Hannemann"
        fullname="Arnd Hannemann">
            <organization>credativ GmbH</organization>
            <address>
                <postal>
                    <street>Hohenzollernstrasse 133</street>
                    <city>Moenchengladbach</city>
                    <code>41061</code>
                    <country>Germany</country>
                </postal>
                <phone>+49 2161 4643 134</phone>
                <email>arnd.hannemann@credativ.de</email>
            </address>
        </author>

        <date month="November" year="2014"/>

        <!-- Meta-data Declarations -->
        <area>Transport</area>

        <workgroup>TCP Maintenance and Minor Extensions (TCPM) WG</workgroup>

        <keyword>Transmission Control Protocol (TCP), Reordering,
        TCP-NCR</keyword>

        <abstract>
            <t>This document specifies an adaptive Non-Congestion Robustness
            (aNCR) mechanism for TCP. In the absence of explicit congestion
            notification from the network, TCP uses only packet loss as an
            indication of congestion. One of the signals TCP uses to determine
            loss is the arrival of three duplicate acknowledgments. However,
            this heuristic is not always correct, notably in the case when
            paths reorder packets. This results in degraded performance.</t>

            <t>TCP-aNCR is designed to mitigate this performance degradation by
            adaptively increasing the number of duplicate acknowledgments
            required to trigger loss recovery, based on the current state of
            the connection, in an effort to better disambiguate true segment
            loss from segment reordering. This document specifies the changes
            to TCP and TCP-NCR (on which this specification is built) and
            discusses the costs and benefits of these modifications.</t>
        </abstract>

    </front>

    <!-- MAIN MATTER -->
    <middle>

        <!-- ***** Section: Introduction ***** -->
        <section anchor="intro" title="Introduction">
            <t>One strength of the Transmission Control Protocol (TCP)
            <xref target="RFC0793"/> lies in its ability to adjust its sending
            rate according to the perceived congestion in the network <xref
                target="RFC5681"/>. In the absence of explicit notification of
            congestion from the network, TCP uses segment loss as an indication
            of congestion (i.e., assuming queue overflow). A TCP receiver sends
            cumulative acknowledgments (ACKs) indicating the next sequence
            number expected from the sender for arriving segments <xref
                target="RFC0793"/>. When segments arrive out of order,
            duplicate ACKs are generated. As specified in <xref
                target="RFC5681"/>, a TCP sender uses the arrival of three
            duplicate ACKs as an indication of segment loss. The TCP sender
            retransmits the segment assumed lost and reduces the sending rate,
            based on the assumption that the loss was caused by resource
            contention on the path. The TCP sender does not assume loss on the
            first or second duplicate ACK, but waits for three duplicate ACKs
            to account for minor packet reordering. However, the use of this
            constant threshold of duplicate ACKs leads to performance
            degradation if the extent of the packet reordering in the network
            increases <xref target="RFC4653"/>.</t>

            <t>Whenever interoperability with the TCP congestion control and
            loss recovery standard <xref target="RFC5681"/> is a prerequisite,
            increasing the duplicate acknowledgment threshold (DupThresh) is
            the method of choice to a priori prevent any negative impact - in
            particular, a spurious Fast Retransmit and Fast Recovery phase -
            that packet reordering has on TCP. However, this procedure also
            delays a Fast Retransmit by increasing the DupThresh, and therefore
            has costs and risks, too.  According to <xref target="ZKFP03"/>,
            these are: (1) a delayed response to congestion in the network, (2)
            a potential expiration of the retransmission timer, and (3) a
            significant increase in the end-to-end delay for lost segments.</t>

            <t>In the current TCP standard, congestion control and loss
            recovery are tightly coupled: when the oldest outstanding segment
            is declared lost, a retransmission is triggered, and the sending
            rate is reduced on the assumption that the loss is due to resource
            contention <xref target="RFC5681"/>. Therefore, any change to
            DupThresh causes not only a change to the loss recovery, but also
            to the congestion control response.  TCP-NCR <xref
                target="RFC4653"/> addresses this problem by defining two
            extensions to TCP's Limited Transmit <xref target="RFC3042"/>
            scheme: Careful and Aggressive Extended Limited Transmit.</t> <!--
            FIXME: Arnd clarifies what NCR wants -->

            <t>The first variant of the two, Careful Limited Transmit, sends
            one previously unsent segment in response to duplicate
            acknowledgments for every two segments that are known to have left
            the network. This effectively halves the sending rate, since normal
            TCP operation sends one new segment for every segment that has left
            the network.  Further, the halving starts immediately and is not
            delayed until a retransmission is triggered. In the case of packet
            reordering (i.e., not segment loss), TCP-NCR restores the
            congestion control state to its previous state after the event.</t>

            <t>The second variant, Aggressive Limited Transmit, transmits one
            previously unsent data segment in response to duplicate
            acknowledgments for every segment known to have left the network.
            With this variant, while waiting to disambiguate the loss from a
            reordering event, ACK-clocked transmission continues at roughly the
            same rate as before the event started. Retransmission and the
            sending rate reduction happen per <xref target="RFC5681"/> <xref
                target="RFC6675"/>, albeit after a delay caused by the
            increased DupThresh. Although this approach delays legitimate rate
            reductions (possibly slightly, and temporarily aggravating overall
            congestion on the network), the scheme has the advantage of not
            reducing the transmission rate in the face of packet
            reordering.</t>

            <t>A basic requirement for preventing an avoidable expiration of
            the retransmission timer is to generally ensure that an increased
            DupThresh can potentially be reached in time so that Fast
            Retransmit is triggered and Fast Recovery is completed before the
            RTO expires.  Simply increasing DupThresh before retransmitting a
            segment can make TCP brittle to packet or ACK loss, since such loss
            reduces the number of duplicate ACKs that will arrive at the sender
            from the receiver. For instance, if cwnd is 10 segments and one
            segment is lost, a DupThresh of 10 will never be met, because
            duplicate ACKs corresponding to at most 9 segments will arrive at
            the sender. To mitigate this issue, the TCP-NCR <xref
                target="RFC4653"/> modification makes two fundamental changes
            to the way <xref target="RFC5681"/> <xref target="RFC6675"/>
            currently operates.</t>

            <t>First, as mentioned above, TCP-NCR <xref target="RFC4653"/>
            extends TCP's Limited Transmit <xref target="RFC3042"/> scheme to
            allow for the sending of new data segments while the TCP sender
            stays in the 'disorder' state and disambiguates loss from
            reordering. This new data serves to increase the likelihood that
            enough duplicate ACKs arrive at the sender to trigger loss
            recovery, if it is appropriate. Second, DupThresh is increased from
            the current fixed value of three <xref target="RFC5681"/> to a value
            indicating that approximately a congestion window's worth of data
            has left the network. Since cwnd represents the amount of data a
            TCP sender can transmit in one round-trip time (RTT), this
            corresponds to approximately the largest amount of time a TCP
            sender can wait before the costly retransmission timeout may be
            triggered.</t>

            <t>Of vital importance is that TCP-NCR <xref target="RFC4653"/>
            does not hold DupThresh constant, but dynamically adjusts it on each
            SACK to the current amount of outstanding data, which depends not
            only on the congestion window, but also on the receiver's
            advertised window. Thus, it is guaranteed that the outstanding data
            generates a sufficient number of duplicate ACKs for reaching
            DupThresh and a transition to the 'recovery' state. This is
            important in cases where there is no new data available to send.</t>

            <t>Regarding the problem of packet reordering, TCP-NCR's
            <xref target="RFC4653"/> decision of waiting to receive notice that
            cwnd bytes have left the network before deciding whether the root
            cause is loss or reordering is essentially a trade-off between
            making the best decision regarding the cause of the duplicate ACKs
            and responsiveness, and represents a good compromise between
            avoiding spurious Fast Retransmits and avoiding unnecessary RTOs.
            On the other hand, if there is no visible packet reordering on the
            network path - which today is the rule and not the exception - or
            the delay caused by the reordering is very low, delaying Fast
            Retransmit is unnecessary in the case of congestion, and data is
            delivered to the application up to one RTT later. Especially for
            delay-sensitive applications, such as a terminal session over SSH,
            this is generally undesirable. By dynamically adapting DupThresh
            not only to the amount of outstanding data but also to the
            perceived packet reordering on the network path, this issue can be
            offset. This is the key idea behind the TCP-aNCR algorithm.</t>

            <t>This document specifies a set of TCP modifications to provide an
            adaptive Non-Congestion Robustness (aNCR) mechanism for TCP. The
            TCP-aNCR modifications lend themselves to incremental deployment.
            Only the TCP implementation on the sender side requires
            modification. The changes themselves are modest. TCP-aNCR is built
            on top of the TCP Selective Acknowledgments Option <xref
                target="RFC2018"/> and the SACK-based loss recovery scheme
            given in <xref target="RFC6675"/> and represents an enhancement of
            the original TCP-NCR mechanism <xref target="RFC4653"/>. Currently,
            TCP-aNCR is an independent approach to making TCP more robust to
            packet reordering. It is not yet clear whether, in upcoming versions
            of this draft, TCP-aNCR will obsolete TCP-NCR.</t>

            <t>It should be noted that the TCP-aNCR algorithm in this
            document could be easily adapted to the Stream Control Transmission
            Protocol (SCTP) <xref target="RFC2960"/>, since SCTP uses
            congestion control algorithms similar to TCP (and thus has the same
            reordering robustness issues).</t>

            <t>The remainder of this document is organized as follows.
            <xref target="idea"/> provides a high-level description of the
            TCP-aNCR mechanism. <xref target="requirements"/> defines
            TCP-aNCR's requirements for an appropriate detection and
            quantification algorithm. <xref target="algo"/> specifies the
            TCP-aNCR algorithm and <xref target="details"/> discusses each step
            of the algorithm in detail. <xref target="discussion"/> provides a
            discussion of several design decisions behind TCP-aNCR.  <xref
                target="interoperability"/> discusses interoperability issues
            related to introducing TCP-aNCR. Finally, related work is presented
            in <xref target="related"/> and security concerns in <xref
                target="security"/>.</t>
        </section>

        <!-- Subsection: Terminology -->
        <section anchor="terminology" title="Terminology">
            <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
            NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL"
            in this document are to be interpreted as described in
            <xref target="RFC2119"/>.</t>

            <t>The reader is expected to be familiar with the TCP state
            variables described in <xref target="RFC0793"/> (SND.NXT), <xref
                target="RFC5681"/> (cwnd, rwnd, ssthresh, FlightSize, IW),
            <xref target="RFC6675"/> (pipe, DupThresh, SACK scoreboard), and
            <xref target="RFC6582"/> (recover).  Further, the term 'acceptable
            acknowledgment' is used as defined in <xref target="RFC0793"/>.
            That is, an ACK that increases the connection's cumulative ACK
            point by acknowledging previously unacknowledged data. The term
            'duplicate acknowledgment' is used as defined in <xref
                target="RFC6675"/>, which is different from the definition of
            duplicate acknowledgment in <xref target="RFC5681"/>.</t>

            <t>This specification defines the four TCP sender
            states 'open', 'disorder', 'recovery', and 'loss' as follows. As
            long as no duplicate ACK is received and no segment is considered
            lost, the TCP sender is in the 'open' state. Upon the reception of
            the first consecutive duplicate ACK, TCP will enter the 'disorder'
            state.  After receiving DupThresh duplicate ACKs, the TCP sender
            switches to the 'recovery' state and executes standard loss
            recovery procedures like Fast Retransmit and Fast Recovery <xref
                target="RFC5681"/>. Upon a retransmission timeout, the TCP
            sender enters the 'loss' state. The 'recovery' state can only be
            reached by a transition from the 'disorder' state; the 'loss' state
            can be reached from any other state.</t>

            <t>The following specification depends on the standard TCP
            congestion control and loss recovery algorithms and the SACK-based
            loss recovery scheme given in <xref target="RFC5681"/> and
            <xref target="RFC6675"/>, respectively. The algorithm presents an
            enhancement of TCP-NCR <xref target="RFC4653"/>. The reader is
            assumed to be familiar with the algorithms specified in these
            documents.</t>
        </section>

        <!-- Section: Basic Concept -->
        <section anchor="idea" title="Basic Concept">
            <t>The general idea behind the TCP-aNCR algorithm is to extend the
            TCP-NCR algorithm <xref target="RFC4653"/>, so that - based on an
            appropriate packet reordering detection and quantification
            algorithm (see <xref target="requirements"/>) - TCP congestion
            control and loss recovery <xref target="RFC5681"/> is adaptively
            adjusted to the actual perceived packet reordering on the network
            path.</t>

            <t>TCP-NCR <xref target="RFC4653" /> increases DupThresh from the
            current fixed value of three duplicate ACKs <xref
                target="RFC5681"/> to a value indicating that approximately a
            congestion window of data has left the network. Since cwnd
            represents the amount of
            data a TCP sender can transmit in one RTT, the choice to trigger a
            retransmission only after a cwnd's worth of data is known to have
            left the network represents roughly the largest amount of time a
            TCP sender can wait before the RTO may be triggered. The approach
            chosen in TCP-aNCR is to take TCP-NCR's DupThresh as an upper bound
            for an adjustment of the DupThresh that is adaptive to the actual
            packet reordering on the network path.</t>

            <t>Using TCP-NCR's DupThresh as an upper bound decouples the
            avoidance of spurious Fast Retransmits from the avoidance of
            unnecessary retransmission timeouts. Therefore, the adaptive
            adjustment of the DupThresh to current perceived packet reordering
            can be conducted without taking any retransmission timeout
            avoidance strategy into account. This independence allows TCP-aNCR
            to quickly respond to perceived packet reordering by setting its
            DupThresh so that it always corresponds to the minimum of the
            maximum possible (TCP-NCR's DupThresh) and the maximum measured
            reordering extent since the last RTO. The reordering extent used by
            TCP-aNCR is by itself not a static absolute reordering extent, but
            a relative reordering extent (see <xref
                target="requirements"/>).</t>
        </section>

        <!-- Section: Detection Algorithms -->
        <section anchor="requirements"
        title="Appropriate Detection and Quantification Algorithms">
            <t>If the TCP-aNCR algorithm is implemented at the TCP sender, it
            MUST be implemented together with an appropriate packet reordering
            detection and quantification algorithm that is specified in a
            standards track or experimental RFC.</t>

            <t>Designers of reordering detection algorithms who want their
            algorithms to work together with the TCP-aNCR algorithm SHOULD
            reuse the variable 'ReorExtR' (relative reordering extent) with the
            semantics and defined values specified in <xref
                target="I-D.zimmermann-tcpm-reordering-detection"/>.  A
            'ReorExtR' given by the detection algorithm holds a value ranging
            from 0 to 1 that expresses the newly measured reordering sample as a
            fraction of the data in flight. TCP-aNCR then saves this new
            fraction if it is greater than the current value.</t>
        </section>

        <!-- Section: The TCP-aNCR Algorithm -->
        <section anchor="algo" title="The TCP-aNCR Algorithm">

            <t>When both the Nagle algorithm <xref target="RFC0896"/>
            <xref target="RFC1122"/> and the TCP Selective Acknowledgment Option
            <xref target="RFC2018"/> are enabled for a connection, a TCP sender
            MAY employ the following TCP-aNCR algorithm to dynamically adapt
            TCP's congestion control and loss recovery <xref target="RFC5681"/>
            to the currently perceived packet reordering on the network
            path.</t>

            <t>Without the Nagle algorithm, there is no straightforward way to
            accurately calculate the number of outstanding segments in the
            network (and, therefore, no good way to derive an appropriate
            DupThresh) without adding state to the TCP sender. A TCP connection
            that does not use the Nagle algorithm SHOULD NOT use TCP-aNCR.  The
            adaptation of TCP-aNCR to an implementation that carefully tracks
            the sequence numbers transmitted in each segment is considered
            future work.</t>

            <t>A necessary prerequisite for TCP-aNCR's adaptability is that a
            TCP sender has enabled an appropriate detection and quantification
            algorithm that complies with the requirements defined in <xref
                target="requirements"/>. If such an algorithm is either
            non-existent or not used, the behavior of TCP-aNCR is completely
            analogous to the TCP-NCR algorithm as defined in <xref
                target="RFC4653"/>. If a TCP sender does implement TCP-aNCR,
            the implementation MUST follow the various specifications provided
            in Sections <xref target="algo:3way" format="counter"/> to <xref
                target="algo:rto" format="counter"/>.</t>

            <!-- Subsection: Connection Establishment -->
            <section anchor="algo:3way"
            title="Initialization during Connection Establishment">
                <t>After the completion of the TCP connection establishment, the
                following state constants and variables MUST be initialized in
                the TCP transmission control block for the given TCP
                connection:</t>

                <t>
                    <list style="format (C.%d)" counter="c:3way">
                        <t>Depending on which variant of Extended Limited
                        Transmit should be executed, the constant LT_F MUST be
                        initialized as follows. For Careful Extended Limited
                        Transmit:
                            <list style="empty">
                                <t>LT_F = 2/3</t>
                            </list>
                        For Aggressive Extended Limited Transmit:
                            <list style="empty">
                                <t>LT_F = 1/2</t>
                            </list>
                        This constant reflects the fraction of outstanding data
                        (including data sent during Extended Limited Transmit)
                        that must be SACKed before a retransmission is at the
                        latest triggered.</t>

                        <t>If TCP-aNCR should adaptively adjust the DupThresh to
                        the current perceived packet reordering on the network
                        path, then the variable 'ReorExtR', which stores the
                        maximum relative reordering extent, MUST be initialized
                        as:
                            <list style="empty">
                                <t>ReorExtR = 0</t>
                            </list>
                        Otherwise, the dynamic adaptation of TCP-aNCR SHOULD
                        be disabled by setting
                            <list style="empty">
                                <t>ReorExtR = -1</t>
                            </list>
                        A relative reordering extent of 0 results in the
                        standard DupThresh of three duplicate ACKs, as defined
                        in <xref target="RFC5681"/>. A fixed relative
                        reordering extent of -1 results in the TCP-NCR behavior
                        from <xref target="RFC4653"/>.</t>
                    </list>
                </t>
            </section>

            <!-- Subsection: Initializing ELT -->
            <section anchor="algo:init"
            title="Initializing Extended Limited Transmit">
                <t>If the SACK scoreboard is empty upon the receipt of a
                duplicate ACK (i.e., the TCP sender has received no SACK
                information from the receiver), a TCP sender MUST enter
                Extended Limited Transmit by initializing the following five
                state variables in the TCP Transmission Control Block:</t>

                <t>
                    <list style="format (I.%d)" counter="c:init">
                        <t>The TCP sender MUST save the current outstanding
                        data:
                            <list style="empty">
                                <t>FlightSizePrev = FlightSize</t>
                            </list>
                        </t>

                        <t>The TCP sender MUST save the highest sequence
                        number transmitted so far:
                            <list style="empty">
                                <t>recover = SND.NXT - 1</t>
                            </list>
                        Note: The state variable 'recover' from <xref
                            target="RFC6582"/> can be reused, since NewReno TCP
                        uses 'recover' at the initialization of a loss recovery
                        procedure, whereas TCP-aNCR uses 'recover' *before*
                        loss recovery.</t>

                        <t>The TCP sender MUST initialize the variable 'skipped'
                        that tracks the number of segments for which an ACK
                        does not trigger a transmission during Careful Limited
                        Transmit:
                            <list style="empty">
                                <t>skipped = 0</t>
                            </list>
                        During Aggressive Limited Transmit, 'skipped' is not
                        used.</t>

                        <t>The TCP sender MUST set DupThresh based on the
                        current FlightSize:
                            <list style="empty">
                                <t>DupThresh = max (LT_F * (FlightSize /
                                SMSS), 3)</t>
                            </list>
                        The lower bound of DupThresh = 3 is kept from
                        <xref target="RFC5681"/> <xref target="RFC6675"/>.</t>

                        <t>If (ReorExtR != -1) holds, then the TCP sender MUST
                        set DupThresh based on the relative reordering extent
                        'ReorExtR':
							<list style="hanging" hangIndent="8">
                                <?rfc subcompact="yes" ?>
								<t hangText="   DupThresh ="/>
								<t>max (min (DupThresh,</t>
								<t hangText="                ">
									ReorExtR * (FlightSizePrev / SMSS)), 3)</t>
                                <?rfc subcompact="no" ?>
                            </list>
                        </t>
                    </list>
                </t>

                <t>In addition to the above steps, the incoming ACK MUST be
                processed with the (E) series of steps in <xref
                    target="algo:elt"/>.</t>
            </section>

            <!-- Subsection: Executing ELT -->
            <section anchor="algo:elt" title="Executing Extended Limited Transmit">
                <t>On each ACK that a) arrives after TCP-aNCR has entered the
                Extended Limited Transmit phase (as outlined in <xref
                    target="algo:init"/>) *and* b) carries new SACK
                information, *and* c) does *not* advance the cumulative ACK
                point, the TCP sender MUST use the following procedure.</t>

                <t>
                    <list style="format (E.%d)" counter="c:elt">
                        <t>The TCP sender MUST update the SACK scoreboard and
                        use the SetPipe() procedure from <xref
                            target="RFC6675"/> to set the 'pipe' variable
                        (which represents the number of bytes still considered
                        "in the network"). Note: the current value of DupThresh
                        MUST be used by SetPipe() to produce an accurate
                        assessment of the amount of data still considered in
                        the network.</t>

                        <t>The TCP sender MUST initialize the variable 'burst'
                        that tracks the number of segments that can at most be
                        sent per ACK to the size of the Initial Window (IW)
                        <xref target="RFC5681"/>:
                            <list style="empty">
                                <t>burst = IW</t>
                            </list>
                        </t>

                        <t>If a) (cwnd - pipe - skipped >= 1 * SMSS) holds,
                        *and* b) the receive window (rwnd) allows to send SMSS
                        bytes of previously unsent data, *and* c) there are
                        SMSS bytes of previously unsent data available for
                        transmission, then the TCP sender MUST transmit one
                        segment of SMSS bytes. Otherwise, the TCP sender MUST
                        skip to step (E.7).</t>

                        <t>The TCP sender MUST increment 'pipe' by SMSS bytes
                        and MUST decrement 'burst' by SMSS bytes to reflect the
                        newly transmitted segment:
                            <list style="empty">
                                <t>pipe = pipe + SMSS</t>
                                <?rfc subcompact="yes" ?>
                                <t>burst = burst - SMSS</t>
                                <?rfc subcompact="no" ?>
                            </list>
                        </t>

                        <t>If Careful Limited Transmit is used, 'skipped' MUST
                        be incremented by SMSS bytes to ensure that the next
                        SMSS bytes of SACKed data processed do not trigger a
                        Limited Transmit transmission.
                            <list style="empty">
                                <t>skipped = skipped + SMSS</t>
                            </list>
                        </t>

                        <t>If (burst > 0) holds, the TCP sender MUST return
                        to step (E.3) to ensure that as many bytes as
                        appropriate are transmitted. Otherwise, if more than IW
                        bytes were SACKed by a single ACK, the TCP sender MUST
                        skip to step (E.7). The additional amount of data
                        becomes available again by the next received duplicate
                        ACK and the re-execution of SetPipe().</t>

                        <t>The TCP sender MUST save the maximum amount of data
                        that is considered to have been in the network during
                        the last RTT:
                            <list style="empty">
                                <t>pipe_max = max (pipe, pipe_max)</t>
                            </list>
                        </t>

                        <t>The TCP sender MUST set DupThresh based on the
                        current FlightSize:
                            <list style="empty">
                                <t>DupThresh = max (LT_F * (FlightSize /
                                SMSS), 3)</t>
                            </list>
                        The lower bound of DupThresh = 3 is kept from
                        <xref target="RFC5681"/> <xref target="RFC6675"/>.</t>

                        <t>If (ReorExtR != -1) holds, then the TCP sender MUST
                        set DupThresh based on the relative reordering extent
                        'ReorExtR':
							<list style="hanging" hangIndent="8">
                                <?rfc subcompact="yes" ?>
								<t hangText="   DupThresh ="/>
								<t>max (min (DupThresh,</t>
								<t hangText="                ">
									ReorExtR * (FlightSizePrev / SMSS)), 3)</t>
                                <?rfc subcompact="no" ?>
                            </list>
                        </t>
                    </list>
                </t>
            </section>

            <!-- Subsection: Terminating ELT -->
            <section anchor="algo:term"
            title="Terminating Extended Limited Transmit">
                <t>On the receipt of a duplicate ACK that a) arrives
                after TCP-aNCR has entered the Extended Limited Transmit phase
                (as outlined in <xref target="algo:init"/>) *and* b) advances
                the cumulative ACK point, the TCP sender MUST use the following
                procedure.</t>

                <t>The arrival of an acceptable ACK that advances the cumulative
                ACK point while in Extended Limited Transmit, but before loss
                recovery is triggered, signals that a series of duplicate ACKs
                was caused by reordering and not congestion. Therefore,
                Extended Limited Transmit will be either terminated or
                re-entered.</t>

                <t>
                    <list style="format (T.%d)" counter="c:term">
                        <t>If the received ACK extends not only the cumulative
                        ACK point, but *also* carries new SACK information
                        (i.e., the ACK is both an acceptable ACK and a
                        duplicate ACK), the TCP sender MUST restart Extended
                        Limited Transmit and MUST go to step (T.2). Otherwise,
                        the TCP sender MUST terminate it and MUST skip to step
                        (T.3).</t>

                        <t>If the Cumulative Acknowledgment field of the
                        received ACK covers more than 'recover' (i.e., SEG.ACK
                        > recover), Extended Limited Transmit has
                        transmitted one cwnd worth of data without any losses
                        and the TCP sender MUST update the following state
                        variables by
                             <list style="empty">
                                <t>FlightSizePrev = pipe_max</t>
                                <?rfc subcompact="yes" ?>
                                <t>pipe_max = 0</t>
                                <?rfc subcompact="no" ?>
                            </list>
                        and MUST go to step (I.2) to re-start Extended Limited
                        Transmit. Otherwise if (SEG.ACK &lt;= recover) holds,
                        the TCP sender MUST go to step (I.3). This ensures that
                        in the event of a loss the cwnd reduction is based on a
                        current value of FlightSizePrev.</t>
                    </list>
                </t>

                <t>The following steps are executed only if the received ACK
                does *not* carry SACK information. Extended Limited Transmit
                will be terminated.</t>

                <t>
                    <list style="format (T.%d)" counter="c:term">
                        <t>A TCP sender MUST set ssthresh to:
                            <list style="empty">
                                <t>ssthresh = max (cwnd, ssthresh)</t>
                            </list>
                        This step provides TCP-aNCR with a sense of "history".
                        If the next step (T.4) reduces the congestion window,
                        this step ensures that TCP-aNCR will slow-start back to
                        the operating point that was in effect before Extended
                        Limited Transmit.</t>

                        <t>A TCP sender MUST reset cwnd to:
                            <list style="empty">
                                <t>cwnd = FlightSize + SMSS</t>
                            </list>
                        This step ensures that cwnd is not significantly larger
                        than the amount of data outstanding, a situation that
                        would cause a line rate burst.</t>

                        <t>A TCP is now permitted to transmit previously unsent
                        data as allowed by cwnd, FlightSize, application data
                        availability, and the receiver's advertised window.</t>
                   </list>
                </t>
            </section>

            <!-- Subsection: Entering Loss Recovery -->
            <section anchor="algo:recovery" title="Entering Loss Recovery">
                <t>The receipt of an ACK that results in the oldest
				outstanding segment being deemed lost via the algorithms in <xref
					target="RFC6675"/> terminates Extended Limited Transmit and
				initializes the loss recovery according to <xref
					target="RFC6675"/>. One slight change to either <xref
					target="RFC6675"/>, or, if Proportional Rate Reduction
				(PRR) algorithm is used, to <xref target="RFC6937"/> MUST be
				made,
				however.</t>

                <t>
                    <list style="hanging" hangIndent="7">
						<t hangText="(Ret)">If the PRR algorithm is used to
						calculate how many bytes should be sent in response to
						each ACK, the initialization of 'RecoverFS' in Section 3
						of <xref target="RFC6937"/> MUST be changed to:
							<list style="empty">
                               <t>RecoverFS = FlightSizePrev</t>
                            </list>
						Otherwise, if the standard Fast Recovery algorithm is
						used, step (4.2) of <xref target="RFC6675"/> MUST be
						changed in Section 5 to:
                            <list style="empty">
                                <t>ssthresh = cwnd = (FlightSizePrev / 2)</t>
                            </list>
						This change ensures that the congestion control
						modifications are made with respect to the amount of
						data in the network before FlightSize was increased by
						Extended Limited Transmit.</t>
                    </list>
                </t>

                <t>Once the algorithm in <xref target="RFC6675"/> takes over
                from Extended Limited Transmit, the DupThresh value MUST be
                held constant until the loss recovery phase terminates.</t>
            </section>

            <!-- Subsection: Reordering Extent -->
            <section anchor="algo:reorext" title="Reordering Extent">
                <t>Whenever the additional detection and quantification
                algorithm (see <xref target="requirements"/>) detects and
                quantifies a new reordering event, the TCP sender MUST update
                the state variable 'ReorExtR'.</t>

                <t>
                    <list style="hanging" hangIndent="7">
                        <t hangText="(Ext)">Let 'ReorExtR_New' be the newly
                            determined relative reordering extent:
                            <list style="empty">
                                <t>ReorExtR = min (max (ReorExtR, ReorExtR_New),
                                1)</t>
                            </list>
                        </t>
                    </list>
                </t>
            </section>

            <!-- Subsection: Retransmission Timeout -->
            <section anchor="algo:rto" title="Retransmission Timeout">
                <t>The expiration of the retransmission timer SHOULD be
                interpreted as an indication of a path characteristics change,
                and the TCP sender SHOULD reset DupThresh to the default value
                of three.</t>

                <t>
                    <list style="hanging" hangIndent="7">
                        <t hangText="(RTO)">If an RTO occurs and
                        (ReorExtR != -1) (i.e. TCP-aNCR is used and not TCP-NCR),
                        then a TCP sender SHOULD reset 'ReorExtR':
                            <list style="empty">
                                <t>ReorExtR = 0</t>
                            </list>
                        </t>
                    </list>
                </t>
            </section>
        </section>

        <!-- Section: Protocol Steps in Detail -->
        <section anchor="details" title="Protocol Steps in Detail">
            <t>Upon the receipt of the first duplicate ACK in the 'open' state
            (the SACK scoreboard is empty), the TCP sender starts to execute
            TCP-aNCR by entering the 'disorder' state and the initialization of
            Extended Limited Transmit. First, the TCP sender saves the current
            amount of outstanding data as well as the highest sequence
            number transmitted so far (SND.NXT - 1) (steps (I.1) and (I.2)). In
            addition, if the TCP connection uses the careful variant of
            Extended Limited Transmit (step (C.1)), the 'skipped'
            variable, which tracks the number of segments for which an ACK does
            not trigger a transmission during Careful Limited Transmit, is
            initialized with zero (step (I.3)). The last step during the
            initialization is the determination of DupThresh. Depending on
            whether TCP-aNCR has been configured during the connection
            establishment to adaptively adjust to the currently perceived packet
            reordering on the path (step (C.2)), DupThresh is either determined
            exclusively based on the current FlightSize (as TCP-NCR
            <xref target="RFC4653"/> does) or, in addition, also based on the
            relative reordering extent (steps (I.4) and (I.5)).</t>

            <t>Depending on which variant of Extended Limited Transmit should
            be executed, the constant LT_F must be set accordingly (step
            (C.1)).  This constant reflects the fraction of outstanding data
            (including data sent during Extended Limited Transmit) that must be
            SACKed before a retransmission is triggered at the latest (which is
            the case when a DupThresh that is based on relative reordering
            extent is larger than TCP-NCR's DupThresh). Since Aggressive
            Limited Transmit sends a new segment for every segment known to
            have left the network, a total of approximately cwnd segments will
            be sent, and therefore ideally a total of approximately 2*cwnd
            segments will be outstanding when a retransmission is finally
            triggered.  DupThresh is then set to LT_F = 1/2 of 2*cwnd (or about
            1 RTT's worth of data) (see step (I.4)). The factor is different
            for Careful Limited Transmit, because the sender only transmits one
            new segment for every two segments that are SACKed and therefore
            will ideally have a maximum total of 1.5*cwnd segments
            outstanding when the retransmission is triggered. Hence, the
            required threshold is LT_F=2/3 of 1.5*cwnd to delay the
            retransmission by roughly 1 RTT.</t>

            <t>For each duplicate ACK received in the 'disorder' state, which
            is not an acceptable ACK, i.e., it carries new SACK information,
            but does not advance the cumulative ACK point, Extended Limited
            Transmit is executed. First, the SACK scoreboard is updated and,
            based on the current value of DupThresh, the amount of outstanding
            data is determined (step (E.1)). Furthermore, the state variable
            'burst' that indicates the number of segments that can be sent at
            most for each received ACK is initialized to the size of the
            initial window <xref target="RFC6928"/> (step (E.2)). If more than
            IW bytes were
            SACKed by a single ACK, the additional amount of data becomes
            available again by the next received duplicate ACK and the
            re-execution of SetPipe() (step (E.1)).</t>

            <t>Next, if new data is available for transmission and both the
            congestion window and the receiver window allow to send SMSS bytes
            of previously unsent data, a segment of SMSS bytes is sent (step
            (E.3)). Subsequently, the corresponding state variables 'pipe',
            'burst' and - optionally - 'skipped' are updated (steps (E.4) and
            (E.5)). If, due to the current size of the congestion and receiver
            windows (step (E.3)) or due to the current value of 'burst' (step
            (E.6)), no further segment may be sent, the processing of the ACK
            is terminated. Provided that the amount of data that is currently
            considered to be in the network is greater than the previously
            stored one, this new value is stored for later use (step (E.7)).
            Finally, to take into account the new data sent, DupThresh is
            updated (steps (E.8) and (E.9)).</t>

            <t>The arrival of an acceptable ACK in the 'disorder' state that
            advances the cumulative ACK point during Extended Limited Transmit
            signals that a series of duplicate ACKs was caused by reordering
            and not congestion. Therefore, the receipt of an acceptable ACK
            that does not carry any SACK information terminates Extended
            Limited Transmit (step (T.1)). The slow start threshold is set to
            the maximum of its current value and the current value of cwnd
            (step (T.3)). Cwnd itself is set to the current value of FlightSize
            plus one segment (step (T.4)).  As a result, the congestion window
            is not significantly larger than the current amount of outstanding
            data, so that a burst of data is effectively prevented. If new data
            is available for transmission and both the new values of cwnd and
            rwnd allow to send SMSS bytes of previously unsent data, a segment
            is sent (step (T.5)).</t>

            <t>On the other hand, if the received ACK acknowledges new data not
            only cumulatively but also selectively - the ACK carries new
            SACK information - Extended Limited Transmit is not
            terminated but re-entered (step (T.1)). If the Cumulative
            Acknowledgment field of the received ACK covers more than
            'recover', one cwnd worth of data has been transmitted during
            Extended Limited Transmit without any packet loss. Therefore,
            FlightSizePrev, the amount of outstanding data saved at the
            beginning of Extended Limited Transmit (step (I.1)), is considered
            outdated (step (T.2)). This step ensures that in the event of
            packet loss, the reduction of the cwnd is based on an up-to-date
            value, which reflects the number of bytes outstanding in the
            network (see <xref target="discussion"/>). Finally, regardless of
            whether or not 'recover' is covered, Extended Limited Transmit is
            re-entered.</t>

            <t>The second case that leads to a termination of Extended Limited
			Transmit is the receipt of an ACK that signals via the algorithm in
			<xref target="RFC6675"/> that the oldest outstanding segment is
			considered lost. If either DupThresh or more duplicate ACKs are
			received, or the oldest outstanding segment is deemed lost via the
			function IsLost() of <xref target="RFC6675"/>, Extended Limited
			Transmit is terminated and SACK-based loss recovery is entered
			<xref target="RFC6675"/>. Once the algorithm in <xref
				target="RFC6675"/> takes over from Extended Limited Transmit,
			the DupThresh value MUST be held constant until loss recovery is
			terminated. The process of loss recovery itself is not changed by
			TCP-aNCR. The only exception is a slight change to either <xref
				target="RFC6675">RFC 6675</xref> or <xref target="RFC6937">RFC
			6937</xref>, depending on whether the PRR algorithm or the
			traditional Fast Recovery algorithm is used during loss recovery.
			This change ensures that the adjustment made by the congestion
			control - the cwnd reduction - is made with respect to the initial
			amount of outstanding data while Extended Limited Transmit is
			executed (step (Ret)). The use of FlightSize at this point would no
			longer be valid since the amount of outstanding data may double by
			executing Extended Limited Transmit.</t>
        </section>

        <!-- Section: Discussion of TCP-aNCR -->
        <section anchor="discussion" title="Discussion of TCP-aNCR">
            <t>The specification of TCP-aNCR represents an incremental update of
            <xref target="RFC4653">RFC 4653</xref>. All changes made by
            TCP-aNCR can be divided into two categories. On one hand, they
            implement TCP-aNCR's ability to dynamically adapt TCP congestion
            control and loss recovery <xref target="RFC5681"/> to the currently
            perceived packet reordering on the network path. These include the
            use of a variable DupThresh and the use of a relative reordering
            extent. On the other hand, there are changes that basically correct
            weaknesses of the original TCP-NCR algorithm and which are
            independent of TCP-aNCR's adaptability. These include packet
            reordering during slow start, the prevention of bursts, and the
            persistent receipt of SACKs.</t>

            <!-- Subsection: Variable DUPTRESH -->
            <section anchor="discuss:dupthresh"
            title="Variable Duplicate Acknowledgment Threshold">
                <t>The central point of the TCP-aNCR algorithm is the usage of a
                DupThresh that is adaptable to the perceived packet reordering
                on the network path. Based on the actual amount of outstanding
                data, TCP-NCR's DupThresh represents roughly the largest amount
                of time a Fast Retransmit can safely be delayed before a costly
                retransmission timeout may be triggered. Therefore, to avoid an
                RTO, TCP-aNCR's reordering-aware DupThresh is bounded above by
                the one calculated in TCP-NCR (steps (I.5) and (E.9)).  This
                decouples the avoidance of spurious Fast Retransmits from the
                avoidance of RTOs. It allows TCP-aNCR to react fast and
                efficiently to packet reordering. The DupThresh always
                corresponds to the minimum of the largest possible and largest
                detected reordering. With constant packet reordering in terms
                of the rate and delay, TCP-aNCR gives a DupThresh based on the
                relative reordering extent with an optimal delay for every
                bandwidth-delay-product. If TCP-aNCR should not adaptively
                adjust the DupThresh to the current perceived packet reordering
                on the network path (because for example an appropriate
                detection and quantification algorithm is not implemented), the
                dynamic adaptation of TCP-aNCR can be disabled, so that
                TCP-aNCR behaves like TCP-NCR <xref target="RFC4653"/>.</t>
            </section>

            <!-- Subsection: Relative Reordering Extent -->
            <section anchor="discuss:extent"
            title="Relative Reordering Extent">
                <t>Whenever a new reordering event is detected and presented to
                TCP-aNCR in the form of a relative reordering extent 'ReorExtR',
                TCP-aNCR saves and uses the new 'ReorExtR' if it is larger than
                the old one (step (Ext)). The upper bound of 1 assures that no
                excessively large value is used. A 'ReorExtR' larger than one
                means that more than FlightSize bytes would have been received
                out-of-order before the reordered segment is received. The
                delay caused by the reordering is thus longer than the RTT of
                the TCP connection. Since the RTT is roughly the time a Fast
                Retransmit can safely be delayed before the retransmission has
                to be sent to avoid an RTO, a maximum 'ReorExtR' of one seems
                to be a
                suitable value.</t>

                <t>The expiration of the retransmission timer is interpreted
                by TCP-aNCR as an indication of a change in path
                characteristics, hence, the saved 'ReorExtR' is assumed to be
                outdated and will be invalidated (step (RTO)). As a
                consequence, the relative reordering extent 'ReorExtR' increases
                monotonically between two successive retransmission timeouts
                and corresponds to the maximum measured reordering extent since
                the last RTO. Other approaches would be an
                exponentially-weighted moving average (EWMA) or a histogram of
                the last n reordering extents. The main drawback of an EWMA is
                however that on average half of the detected reordering events
                would be larger than the saved reordering extent. Thus, only
                half of the spurious retransmits could be avoided. Applying a
                histogram could largely avoid the disadvantages of an EWMA;
                however, it would result in an unacceptable increase in memory
                usage.</t>

                <t>In combination with the invalidation after an RTO, the
                advantage of using a maximum is the low complexity as well as its
                fast convergence to the actual maximum reordering on the
                network path. As a result, the negative impact that packet
                reordering has on TCP's congestion control and loss recovery
                can be avoided. A disadvantage of using a maximum is that if
                the delay caused by the reordering decreases over the lifetime
                of the TCP connection, a Fast Retransmit is delayed
                unnecessarily long. Nevertheless, since the negative impact
                reordering has
                on TCP's congestion control and loss recovery is more
                substantial than the disadvantage of a longer delay, a decrease
                of the ReorExtR between RTOs is considered inappropriate.</t>
            </section>

            <!-- Subsection: Reordering during Slow Start -->
            <section anchor="discuss:slow-start"
            title="Reordering during Slow Start">
                <t>The arrival of an acceptable ACK during Extended Limited
                Transmit signals that previously received duplicate ACKs are
                the result of packet reordering and not congestion, so that
                Extended Limited Transmit is completed accordingly. Upon the
                termination of Extended Limited Transmit, and especially when
                using the Careful variant, TCP-NCR (as well as TCP-aNCR) may be
                in a situation where the entire cwnd is not being utilized.
                Therefore, to mitigate a potential burst of segments, in step
                (T.2) TCP-NCR sets the slow start threshold to the FlightSize
                that was saved at the beginning of Extended Limited Transmit
                <xref target="RFC4653"/>. This step should ensure that TCP-NCR
                slow starts back to the operating point in use before Extended
                Limited Transmit.</t>

                <t>Unfortunately, the assignment in step (T.2) is only correct
                if the TCP sender already was in congestion avoidance at the
                time Extended Limited Transmit was entered. Otherwise, if the
                TCP sender was instead in slow start, the value of ssthresh is
                greater than the saved FlightSize so that slow start
                prematurely concludes. This behavior can leave much of the
                network resources idle, and a long time may be needed in order to
                use the full capacity. To mitigate this issue, TCP-aNCR sets
                the slow start threshold to the maximum of its current value
                and the current cwnd (step (T.3)). This continues slow start
                after a reordering event happening during slow start.</t>
            </section>

            <!-- Subsection: Preventing Bursts -->
            <section anchor="discuss:bursts" title="Preventing Bursts">
                <t>In cases where a new single SACK covers more than one segment
                - this can happen either due to packet loss or packet
                reordering on the ACK path - TCP-NCR <xref target="RFC4653"/>
                sends an undesirable burst of data. TCP-aNCR solves this
                problem by limiting the burst size - the maximum amount of data
                that can be sent in response to a single SACK - to the Initial
                Window
                <xref target="RFC5681"/> while executing Extended Limited
                Transmit (steps (E.2), (E.4), and (E.6)). Since IW represents
                the amount of data that a TCP sender is able to send into the
                network safely without knowing its characteristics, it is a
                reasonable value for the burst size, too. If more than IW bytes
                were SACKed by a single ACK, the additional amount of data
                becomes available again by the next received duplicate ACK.
                Thus, the transmission of new segments is spread over the next
                received ACKs, so that micro bursts - a characteristic of
                packet reordering in the reverse path - are largely
                compensated.</t>

                <t>Another situation that causes undesired bursts of
                segments with TCP-NCR is the receipt of an acceptable ACK
                during Careful Extended Limited Transmit. If multiple segments
                from a single window of data are delayed by packet reordering,
                typically the first acceptable ACK after entering the
                'disorder' state acknowledges data not only cumulatively but
                also selectively. Hence, Extended Limited Transmit is not
                terminated but re-started. If the segments are delayed by the
                reordering for almost one RTT, then the amount of outstanding
                data in the network ('pipe') is approximately half the amount
                of data saved at the beginning of Extended Limited Transmit
                (FlightSizePrev). If the sequence numbers of the delayed
                segments are close to each other in the sequence number space,
                the acceptable ACK acknowledges only a small amount of data, so
                that FlightSize is still large. As a result, TCP-NCR sets the
                cwnd to FlightSizePrev in step (T.1). Since 'pipe' is only half
                of FlightSizePrev due to Careful Extended Limited Transmit,
                TCP-NCR sends a burst of almost half a cwnd worth of data in
                the subsequent step (T.3).</t>

                <t>Note: Even in the case the sequence numbers of the delayed
                segments are not close to each other in the sequence number
                space and cwnd is set in step (T.1) to FlightSize + SMSS, a
                burst of data will emerge due to re-entering Extended Limited
                Transmit, because TCP-NCR sets 'skipped' to zero in step (I.2)
                and uses FlightSizePrev in step (E.2).</t>

                <t>TCP-aNCR prevents such a burst by making a clear
                differentiation between terminating Extended Limited Transmit
                and restarting Extended Limited Transmit (step (T.1)). Only the
                first case causes the congestion window to be set to the
                current FlightSize plus one segment. In the latter case, when
                re-entering Extended Limited Transmit, the congestion window is
                not adjusted and the original step (T.1) of the TCP-NCR
                specification is omitted. The transmission of new data is then
                only performed after re-entering Extended Limited Transmit in
                step (E.2) of the TCP-aNCR specification, where the actual
                burst mitigation takes place.</t>
            </section>

            <!-- Subsection: Persistent receiving of SACKs -->
            <section anchor="discuss:persistentSACK"
            title="Persistent receiving of Selective Acknowledgments">
                <t>In some inconvenient cases it could happen that a TCP sender
                persistently receives SACK information due to reordering on the
                network path, e.g., if segments are delayed frequently and/or
                for a long time by packet reordering. With TCP-NCR, the persistent
                reception of SACKs causes Extended Limited Transmit to be
                entered with the first received duplicate ACK but never to be
                terminated if no packet loss occurs - for every received ACK,
                TCP-NCR either follows steps (E.1) to (E.6) or steps (T.1) to
                (T.4). In particular, TCP-NCR a) executes step (T.4) for every
                acceptable ACK and b) never executes step (I.1) again. Hence, the
                amount of outstanding data saved at the beginning of Extended
                Limited Transmit, FlightSizePrev, is never updated.</t>

                <t>An emerging problem in this context is that during Extended
                Limited Transmit TCP-NCR determines the transmission of new
                segments in step (E.2) solely on the basis of FlightSizePrev,
                so that an interim increase of the cwnd is not considered
                (according to <xref target="RFC5681"/>, the congestion window
                is increased for every received acceptable ACK that advances
                the cumulative ACK point, no matter if it carries SACK
                information or not). As a result, TCP-NCR can only very slowly
                determine the available capacity of the communication path.</t>

                <t>TCP-aNCR addresses this problem by limiting the amount of
                data that is allowed to be sent into the network during
                Extended Limited Transmit not on the basis of FlightSizePrev,
                but on the size of the congestion window. The equation in step
                E.3 of the TCP-aNCR specification is therefore equal to the one
                used in <xref target="RFC6675"/> (except for the 'skipped'
                variable). If an acceptable ACK is received during the
                execution of Extended Limited Transmit, re-entering Extended
                Limited Transmit makes any increase in cwnd immediately
                available. Hence, even in the case when persistently receiving
                SACKs, the available capacity of the communication path can be
                determined quickly.</t>

                <t>Another problem resulting from persistently receiving SACKs,
                and which is related to the increase in cwnd in response to
                received acceptable ACKs, is the reduction of cwnd due to a
                packet loss. When a packet is considered lost, the congestion
                control adjustment is done with respect to the amount of
                outstanding data at the beginning of Extended Limited Transmit,
                FlightSizePrev (step (Ret)). As in the previous case, an
                increase in cwnd is again not taken into account. A simple
                solution to the problem would be to perform the window
                reduction not on the basis of FlightSizePrev but analogous to
                step (E.2) based on the current size of cwnd.</t>

                <t>A problem with this solution is that cwnd can potentially be
                increased, although the TCP connection is limited by the
                application and not by cwnd. Although <xref target="RFC2861"/>
                specifies that an increase of cwnd is only applicable if cwnd
                is fully utilized, this behavior is not specified by any
                standards track document. But even this conservative increase
                behavior is not conservative enough. If, from a single window
                of data, some segments are delayed and others are lost, cwnd
                would first be increased in response to each received
                acceptable ACK and subsequently be reduced due to the lost
                segments, which would no longer result in a halving of the
                cwnd.</t>

                <t>The solution proposed by TCP-aNCR reuses the state variable
                'recover' from <xref target="RFC6582"/> and adapts the approach
                taken by NewReno TCP and SACK TCP to detect, with help of the
                state variable, the end of one loss recovery phase properly,
                allowing to recover multiple losses from a single window of
                data efficiently. Therefore, when entering the 'disorder' state
                and starting Extended Limited Transmit, TCP-aNCR saves the
                highest sequence number sent so far in 'recover'. If a received
                acceptable ACK covers more than 'recover', one cwnd's worth of
                data has been transmitted during Extended Limited Transmit
                without any packet loss. Hence, FlightSizePrev can be updated
                by 'pipe_max', which reflects the maximum amount of data that is
                considered to have been in the network during the last RTT.
                This update takes an interim increase in cwnd into account, so
                that in case of packet loss, the reduction in cwnd can be based
                on the current value of FlightSizePrev.</t>
            </section>
        </section>

        <!-- Interoperability Issues -->
        <section anchor="interoperability" title="Interoperability Issues">
            <t>TCP-aNCR requires that both the TCP
            Selective Acknowledgment Option <xref target="RFC2018"/> and
            a SACK-based loss recovery scheme compatible with the one given in
            <xref target="RFC6675"/> are used by the TCP sender.
            Hence, compatibility with both specifications is REQUIRED.</t>

            <!-- Subsection: Early Retransmit -->
            <section anchor="interoperability:EarlyReXmit"
            title="Early Retransmit">
                <t>The specification of TCP-aNCR in this document and the Early
                Retransmit algorithm specified in <xref target="RFC5827"/>
                define orthogonal methods to modify DupThresh.  Early
                Retransmit allows the TCP sender to reduce the number of
                duplicate ACKs required to trigger a Fast Retransmit below the
                standard DupThresh of three, if FlightSize is less than 4*SMSS
                and no new segment can be sent. In contrast, TCP-aNCR allows,
                starting from the minimum of three duplicate ACKs, to increase
                the DupThresh beyond the standard of three duplicate ACKs to
                make TCP more robust to packet reordering, if the amount of
                outstanding data is sufficient to reach the increased DupThresh
                to trigger Fast Retransmit and Fast Recovery.</t>
            </section>

            <!-- Subsection: Congestion Window Validation -->
            <section anchor="interoperability:CWV"
            title="Congestion Window Validation">
                <t>The increase of the congestion window during
                application-limited periods can lead to an invalidation of the
                congestion window, in that it no longer reflects current
                information about the state of the network, if the congestion
                window might never have been fully utilized during the last
                RTT. According to <xref target="RFC2861"/>, the congestion
                window should, first, only be increased during slow-start or
                congestion avoidance if the cwnd has been fully utilized by the
                TCP sender and, second, gradually be reduced during each RTT in
                which the cwnd was not fully used.</t>

                <t>A problem that arises in this context is that during Careful
                Extended Limited Transmit, cwnd is not fully utilized due to
                the variable 'skipped' (see step (E.3)), so that - strictly
                following <xref target="RFC2861"/> - the congestion window
                should not be increased upon the receipt of an acceptable ACK.
                A trivial solution of this problem is to include the variable
                'skipped' in the calculation of <xref target="RFC2861"/> to
                determine whether the congestion window is fully utilized or
                not.</t>
            </section>

            <!-- Subsection: Reactive Response to Packet Reordering -->
            <section anchor="interoperability:Undo"
            title="Reactive Response to Packet Reordering">
                <t>As a proactive scheme with the aim to a priori prevent the
                negative impact that packet reordering has on TCP, TCP-aNCR can
                conceptually be combined with any reactive response to packet
                reordering, which attempts to mitigate the negative effects of
                reordering a posteriori. This is because the modifications of
                TCP-aNCR to the standard TCP congestion control and loss
                recovery <xref target="RFC6675"/> are implemented in the
                'disorder' state and are performed by the TCP sender before it
                enters loss recovery, while reactive responses to packet
                reordering operate generally after entering loss recovery, by
                undoing the unnecessary changes to the congestion control
                state.</t>

                <t>If unnecessary changes to the congestion control state
                are undone after loss recovery, which is typically the case if
                a spurious Fast Retransmit is detected based on the DSACK
                option <xref target="RFC3708"/><xref target="RFC4015"/>, since
                the first ACK carrying a DSACK option usually arrives at a TCP
                sender only after loss recovery has already terminated, it
                might happen that the restoring of the original value of the
                congestion window is done at a time at which the TCP sender is
                already back in the 'disorder' state and executing
                Extended Limited Transmit. While this is basically compatible
                with the TCP-aNCR specification - the undo simply represents an
                increase of the congestion window - some care must be
                taken that the combination of the algorithms does not lead to
                unwanted behavior.</t>
            </section>

            <!-- Subsection: Buffer Auto-Tuning -->
            <section anchor="interoperability:Auto-Tuning"
            title="Buffer Auto-Tuning">
                <t>Although all modifications of the TCP-aNCR algorithm are
                implemented in the TCP sender, the receiver also potentially
                has a part to play. If some segments from a single window of
                data are delayed by the packet reordering in the network, all
                segments that are received out of order have to be queued in
                the receive buffer until the holes in sequence number space
                have been closed and the data can be delivered to the receiving
                application. In the worst case, which occurs if the TCP sender
                uses Aggressive Limited Transmit and the reordering delay is
                close to the RTT, TCP-aNCR increases the receiver's buffering
                requirement by up to an extra cwnd. Therefore, to maximize the
                benefits from TCP-aNCR, receivers should advertise a large
                window - ideally by using buffer auto-tuning algorithms - to
                absorb the extra out-of-order data. In the case that the
                additional buffer requirements are not met, the use of the
                above algorithm takes into account the reduced advertised
                window - with a corresponding loss in robustness to packet
                reordering.</t>
            </section>
        </section>

        <!-- Section: Related Work -->
        <section anchor="related" title="Related Work">
            <t>Over the past few years, several solutions have been proposed to
            improve the performance of TCP in the face of packet reordering.
            These schemes generally fall into one of two categories (with some
            overlap): mechanisms that try to prevent spurious retransmits from
            happening (proactive schemes) and mechanisms that try to detect
            spurious retransmits and undo the needless congestion control state
            changes that have been taken (reactive schemes).</t>

            <t><xref target="I-D.blanton-tcp-reordering"/>, <xref
                target="ZKFP03"/> and <xref target="LM05"/> attempt to prevent
            packet reordering from triggering spurious retransmits by using
            various algorithms to approximate the DupThresh required to
            disambiguate loss and reordering over a given network path at a
            given time. This basic principle is also used in TCP-aNCR.
            While <xref target="I-D.blanton-tcp-reordering"/> describes four
            basic approaches on how to increase the DupThresh and discusses
            pros and cons of these approaches, <xref target="ZKFP03"/> presents
            a relatively complex algorithm that saves the reordering extents in
            a histogram and calculates the DupThresh in a way that a certain
            percentage of samples is smaller than the DupThresh. <xref
                target="LM05"/> uses an EWMA for the same purpose. By design,
            neither algorithm prevents all spurious retransmissions.</t>

            <t>In contrast to the above-mentioned algorithms, Linux <xref
                target="Linux"/> implements a proactive scheme by setting the
            DupThresh to the highest detected reordering and resetting it only
            upon an RTO. To avoid a costly retransmission timeout due to the
            increased DupThresh, Linux first implements an extension of the
            Limited Transmit algorithm, second limits the DupThresh to an upper
            bound of 127 duplicate ACKs, and third prematurely enters loss
            recovery if too few segments are in flight to reach the DupThresh
            and no additional segments can be sent. The last change in
            particular is commendable since, besides TCP-NCR, none of the
            algorithms described in this section mentions a similar
            concern.</t>

            <t><xref target="BHLLO06"/> and <xref target="BSRV04"/> present
            proactive schemes based on timers by which the DupThresh is ignored
            altogether. After the timer has expired, TCP initiates loss
            recovery. In <xref target="BSRV04"/> this timer has a length of one
            RTT and is started when the first duplicate ACK is received,
            whereas the approach taken in <xref target="BHLLO06"/> solely relies
            on timers to detect packet loss without taking into account any
            other congestion signals such as duplicate ACKs. It assigns a
            timestamp to each segment sent and retransmits the segment if the
            corresponding timer fires.</t>

            <t>TCP-NCR <xref target="RFC4653"/> tries to prevent spurious
            retransmits similar to <xref target="I-D.blanton-tcp-reordering"/>
            or <xref target="ZKFP03"/> as it delays a retransmission to
            disambiguate loss and reordering. However, TCP-NCR takes a
            simplified approach by simply delaying a retransmission by an amount
            based on the current cwnd (in comparison to standard TCP), while
            the other schemes use relatively complex algorithms in an attempt
            to derive a more precise value for DupThresh that depends on the
            current patterns of packet reordering. Many of the features offered
            by TCP-NCR have been taken into account while designing
            TCP-aNCR.</t>

            <t>Besides the proactive schemes, several other schemes have been
            developed to detect and mitigate needless retransmissions after the
            fact. The Eifel detection algorithm <xref target="RFC3522"/>, the
            detection based on DSACKs <xref target="RFC3708"/>, and the F-RTO
            scheme <xref target="RFC5682"/> represent approaches to detect
            spurious retransmissions, while the Eifel response algorithm <xref
                target="RFC4015"/>, <xref
                target="I-D.blanton-tcp-reordering"/>, and Linux <xref
                target="Linux"/> present or implement algorithms to
            mitigate the changes these events made to the congestion control
            state. As discussed in <xref target="interoperability:Undo"/>
            TCP-aNCR could be used in conjunction with these algorithms, with
            TCP-aNCR attempting to prevent spurious retransmits and some other
            scheme kicking in if the prevention failed.</t>
        </section>

        <!-- Section: IANA Considerations -->
        <section anchor="iana" title="IANA Considerations">
            <t>This memo includes no request to IANA.</t>
        </section>

        <!-- Section: Security Considerations -->
        <section anchor="security" title="Security Considerations">
            <t>By taking dedicated actions so that the packet reordering
            perceived in the network is either underestimated or
            overestimated by the relative and absolute reordering extent,
            an attacker or misbehaving TCP receiver has, with regard to TCP's
            congestion control, two options to bias a TCP-aNCR sender. An
            underestimation of the present packet reordering in the network
            occurs if, for example, a misbehaving TCP receiver already
            acknowledges segments while they are actually still in flight,
            causing holes in the sequence number space of the SACK scoreboard
            to be closed prematurely. With regard to TCP-aNCR, the result of
            an underestimated packet reordering is a DupThresh that is too
            small, resulting in a premature execution of loss recovery. In the
            context of TCP's congestion control, the effects of such attacks
            are limited since the lower bound of TCP-aNCR's DupThresh is the
            default value of three duplicate ACKs <xref target="RFC5681"/>, so
            that in the worst case TCP-aNCR behaves like TCP SACK <xref
                target="RFC6675"/>.</t>

            <t>In contrast to an underestimation, an overestimation of the
            packet reordering in the network occurs if, for example, a
            misbehaving TCP receiver continues to send SACKs for subsequent
            segments before it sends an acceptable ACK for the delayed segment
            that has actually already been received, so that the hole in the
            sequence number space of the SACK scoreboard is closed later. In
            the context of TCP-aNCR, the result of such an overestimation is a
            DupThresh that is too large, so that in the case of a packet loss
            TCP's loss recovery is executed later than necessary. Similar to
            the previous case, the effects of delayed entry into the loss
            recovery are limited because, on the one hand, TCP-NCR's DupThresh
            is used as an upper bound for TCP-aNCR's variable DupThresh so
            that the entrance to the loss recovery and the adaptation of the
            congestion window may be delayed by at most one RTT. On the other
            hand, such a limited delay of the congestion control adjustment
            has, even in the worst case, only a limited impact on the
            performance of a TCP connection and has generally been regarded as
            safe for use on the Internet <xref
                target="BBFS01"/>.</t>
        </section>

        <!-- Section: Acknowledgments --> <section anchor="acks"
            title="Acknowledgments">
            <t>The authors would like to thank Daniel Slot for his TCP-NCR
            implementation in Linux. We also thank the flowgrind <xref
                target="Flowgrind"/> authors and contributors for their
            performance measurement tool, which gives us a powerful means to
            analyze TCP's congestion control and loss recovery behavior in
            detail.</t>
        </section>

    </middle>

    <!-- BACK MATTER -->
    <back>

        <!-- Normative References -->
        <references title="Normative References">
            &rfc0793;
            &rfc2018;
            &rfc2119;
            &rfc3042;
            &rfc4653;
            &rfc5681;
            &rfc6582;
            &rfc6675;
            &rfc6928;
            &rfc6937;

            <reference anchor="I-D.zimmermann-tcpm-reordering-detection">
                <front>
                    <title>Detection and Quantification of Packet Reordering
                    with TCP</title>
                    <author surname="Zimmermann" initials="A"
                        fullname="Alexander Zimmermann">
                    </author>
                    <author surname="Schulte" initials="L"
                        fullname="Lennart Schulte">
                    </author>
                    <author surname="Wolff" initials="C"
                        fullname="Carsten Wolff">
                    </author>
                    <author surname="Hannemann" initials="A"
                        fullname="Arnd Hannemann">
                    </author>
                    <date month="November" day="28" year="2013"/>
                </front>
                <seriesInfo name="draft-zimmermann-tcpm-reordering-detection-01 (work in" value="progress)"/>
                <format type="TXT"
                    target="http://www.ietf.org/internet-drafts/draft-zimmermann-tcpm-reordering-detection-01.txt"/>
            </reference>
       </references>

        <!-- Informative References -->
        <references title="Informative References">
            &rfc0896;
            &rfc1122;
            &rfc2861;
            &rfc2960;
            &rfc3522;
            &rfc3708;
            &rfc4015;
            &rfc5682;
            &rfc5827;

            &blanton-tcp-reordering;

            <reference anchor="BSRV04">
                <front>
                    <title>TCP-DCR: A Novel Protocol for Tolerating
                        Wireless Channel Errors</title>
                    <author surname="Bhandarkar" initials="S"
                        fullname="Sumitha Bhandarkar">
                    </author>
                    <author surname="Sadry" initials="N. E"
                        fullname="Nauzad Erach Sadry">
                    </author>
                    <author surname="Reddy" initials="A. L. N"
                        fullname="A. L. Narasimha Reddy">
                    </author>
                    <author surname="Vaidya" initials="N"
                        fullname="Nitin H. Vaidya">
                    </author>
                    <date year="2005" month="September"/>
                </front>
                <seriesInfo name="IEEE Transactions on Mobile Computing"
                    value="vol. 4, no. 5, pp. 517-529"/>
            </reference>

            <reference anchor="ZKFP03">
                <front>
                    <title>RR-TCP: A Reordering-Robust TCP with
                        DSACK</title>
                    <author surname="Zhang" initials="M"
                        fullname="Ming Zhang">
                    </author>
                    <author surname="Karp" initials="B"
                        fullname="Brad Karp">
                    </author>
                    <author surname="Floyd" initials="S"
                        fullname="Sally Floyd">
                    </author>
                    <author surname="Peterson" initials="L"
                        fullname="Larry Peterson">
                    </author>
                    <date year="2003" month="November"/>
                </front>
                <seriesInfo name="Proceedings of the 11th IEEE
                    International Conference on Network Protocols
                    (ICNP'03)" value="pp. 95-106"/>
            </reference>

            <reference anchor="LM05">
                <front>
                    <title>Enhancing TCP Performance to Persistent Packet
                        Reordering</title>
                    <author surname="Leung" initials="C"
                        fullname="Ka-Cheong Leung">
                    </author>
                    <author surname="Ma" initials="C"
                        fullname="Changming Ma">
                    </author>
                    <date year="2005" month="September"/>
                </front>
                <seriesInfo name="KICS Journal of Communications and
                    Networks" value="vol. 7, no. 3, pp. 385-393"/>
            </reference>

            <reference anchor="BHLLO06">
                <front>
                    <title>A New TCP for Persistent Packet Reordering</title>
                    <author surname="Bohacek" initials="S"
                        fullname="Stephan Bohacek">
                    </author>
                    <author surname="Hespanha" initials="J"
                        fullname="Joao P. Hespanha">
                    </author>
                    <author surname="Lee" initials="J"
                        fullname="Junsoo Lee">
                    </author>
                    <author surname="Lim" initials="C"
                        fullname="Chansook Lim">
                    </author>
                    <author surname="Obraczka" initials="K"
                        fullname="Katia Obraczka">
                    </author>
                    <date year="2006" month="April"/>
                </front>
                <seriesInfo name="IEEE/ACM Transactions on Networking"
                    value="vol. 14, no. 2, pp. 369-382"/>
            </reference>

            <reference anchor="Linux" target="http://www.kernel.org">
                <front>
                    <title>The Linux Project</title>
                    <author />
                    <date />
                </front>
            </reference>

            <reference anchor="BBFS01">
                <front>
                    <title>Dynamic Behavior of Slowly Responsive Congestion
                        Control Algorithms</title>
                    <author surname="Bansal" initials="D"
                        fullname="Deepak Bansal">
                    </author>
                    <author surname="Balakrishnan" initials="H"
                        fullname="Hari Balakrishnan">
                    </author>
                    <author surname="Floyd" initials="S"
                        fullname="Sally Floyd">
                    </author>
                    <author surname="Shenker" initials="S"
                        fullname="Scott Shenker">
                    </author>
                    <date year="2001" month="September"/>
                </front>
                <seriesInfo name="Proceedings of the Conference on Applications,
                    Technologies, Architectures, and Protocols for Computer
                    Communication (SIGCOMM'01)" value="pp. 263-274"/>
            </reference>

            <reference anchor="Flowgrind"
                target="http://www.flowgrind.net">
                <front>
                    <title>Flowgrind Home Page</title>
                    <author />
                    <date />
                </front>
            </reference>
        </references>

        <!-- Section: Changes from previous versions of the draft -->
        <section anchor="changes" title="Changes from previous versions of the draft">
            <t>This appendix should be removed by the RFC Editor before
            publishing this document as an RFC.</t>

			<section anchor="changes_02"
			title="Changes from draft-zimmermann-tcpm-reordering-reaction-01">
				<t>
					<list style="symbols">
						<t>Specify interaction between TCP-aNCR and PRR.</t>
						<t>Fix typo in DupThresh calculation (steps I.5 and
						E.9).</t>
                    </list>
                </t>
            </section>

            <section anchor="changes_01"
            title="Changes from draft-zimmermann-tcpm-reordering-reaction-00">
                <t>
                    <list style="symbols">
                        <t>Improved the wording throughout the document.</t>
                        <t>Replaced and updated some references.</t>
                    </list>
                </t>
            </section>
        </section>
    </back>
</rfc>

PAFTECH AB 2003-20262026-04-24 04:39:57