One document matched: draft-zimmermann-tcp-lcd-02.xml


<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
     which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
     There has to be one entity for each item to be referenced.
     An alternate method (rfc include) is described in the references. -->
<!ENTITY rfc0791 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0791.xml">
<!ENTITY rfc0792 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0792.xml">
<!ENTITY rfc0793 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0793.xml">
<!ENTITY rfc0826 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0826.xml">
<!ENTITY rfc1122 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1122.xml">
<!ENTITY rfc1323bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-tcpm-1323bis.xml">
<!ENTITY rfc1812 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1812.xml">
<!ENTITY rfc2581bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-tcpm-rfc2581bis.xml">
<!ENTITY rfc2914 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2914.xml">
<!ENTITY rfc2988 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2988.xml">
<!ENTITY rfc3522 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3522.xml">
<!ENTITY rfc3819 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3819.xml">
<!ENTITY rfc4138bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-tcpm-rfc4138bis.xml">
<!ENTITY rfc4443 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4443.xml">
<!ENTITY rfc4884 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4884.xml">
<!ENTITY retransmit-now SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.eggert-tcpm-tcp-retransmit-now.xml">
<!ENTITY tcp-rlci SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.schuetz-tcpm-tcp-rlci.xml">
<!ENTITY linkup SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.dawkins-trigtran-linkup.xml">
<!ENTITY rfc2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY rfc2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
]>

<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs),
     please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds
     might want to use. (Here they are set differently than their defaults in
     xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="3"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->

<rfc category="exp" docName="draft-zimmermann-tcp-lcd-02" ipr="trust200902">
<!-- category values: std, bcp, info, exp, and historic
     ipr values: full3667, noModification3667, noDerivatives3667
     you can add the attributes updates="NNNN" and obsoletes="NNNN"
     they will automatically be output with "(if approved)" -->

    <!-- ***** FRONT MATTER ***** -->
    <front>

        <!-- The abbreviated title is used in the page header - it is only
             necessary if the full title is longer than 39 characters -->
        <title abbrev="Make TCP more Robust to LCDs">
        Make TCP more Robust to Long Connectivity Disruptions</title>

        <!-- add 'role="editor"' below for the editors if appropriate -->
        <author initials="A.Z."
                surname="Zimmermann"
                fullname="Alexander Zimmermann">
            <organization>RWTH Aachen University</organization>
            <address>
                <postal>
                    <street>Ahornstrasse 55</street>
                    <city>Aachen</city>
                    <region></region>
                    <code>52074</code>
                    <country>Germany</country>
                </postal>
                <phone>+49 241 80 21422</phone>
                <email>zimmermann@cs.rwth-aachen.de</email>
                <!-- uri and facsimile elements may also be added -->
            </address>
        </author>

        <author initials="A.H."
                surname="Hannemann"
                fullname="Arnd Hannemann">
            <organization>RWTH Aachen University</organization>
            <address>
                <postal>
                    <street>Ahornstrasse 55</street>
                    <city>Aachen</city>
                    <region></region>
                    <code>52074</code>
                    <country>Germany</country>
                </postal>
                <phone>+49 241 80 21423</phone>
                <email>hannemann@nets.rwth-aachen.de</email>
                <!-- uri and facsimile elements may also be added -->
            </address>
        </author>

        <date year="2009" />
        <!-- If the month and year are both specified and are the current ones,
             xml2rfc will fill in the current day for you. If only the current
             year is specified, xml2rfc will fill in the current day and month
             for you. If the year is not the current one, it is necessary to
             specify at least a month (xml2rfc assumes day="1" if not specified
             for the purpose of calculating the expiry date). With drafts it is
             normally sufficient to specify just the year. -->

        <!-- Meta-data Declarations -->

        <area>General</area>

        <workgroup>Internet Engineering Task Force</workgroup>
        <!-- WG name at the upperleft corner of the doc, IETF is fine for
             individual submissions. If this element is not present, the default
             is "Network Working Group", which is used by the RFC Editor as a
             nod to the history of the IETF. -->

        <keyword>Transmission Control Protocol (TCP),
        Internet Control Message Protocol (ICMP), Long Connectivity
        Disruptions</keyword>
        <!-- Keywords will be incorporated into HTML output
             files in a meta tag but they have no effect on text or nroff
             output. If you submit your draft to the RFC Editor, the
             keywords will be used for the search engine. -->

        <abstract>
            <t>Disruptions in end-to-end path connectivity which last longer
            than one retransmission timeout cause suboptimal TCP performance.
            The reason for the performance degradation is that TCP interprets
            segment loss induced by connectivity disruptions as a sign of
            congestion, resulting in repeated backoffs of the retransmission
            timer. This leads in turn to a deferred detection of the
            re-establishment of the connection since TCP waits until the next
            retransmission timeout occurs before attempting the
            retransmission.</t>

            <t>This document describes how standard ICMP messages can be
            exploited to disambiguate true congestion loss from non-congestion
            loss caused by long connectivity disruptions. Moreover, a revert
            strategy of the retransmission timer is specified that enables a
            more prompt detection of whether the connectivity to a previously
            disconnected peer node has been restored or not. The specified
            algorithm is a TCP sender-only modification that effectively
            improves TCP performance in presence of connectivity disruptions.
            </t>
    </abstract>

    </front>

    <!--  ***** MAIN MATTER ***** -->
    <middle>

        <!-- ***** Section: Terminology ***** -->
        <section anchor="terminology" title="Terminology">
            <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL",
            "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
            "OPTIONAL" in this document are to be interpreted as described in
            <xref target="RFC2119" />.</t>

            <t>As defined in <xref target='RFC0793' />, the term "acceptable
            acknowledgment (ACK)" refers to a TCP segment that acknowledges
            previously unacknowledged data. The Transmission Control Protocol
            (TCP) sender state variable "SND.UNA" and the current segment
            variable "SEG.SEQ" are used as defined in <xref target='RFC0793' />.
            SND.UNA holds the segment sequence number of earliest segment that
            has not been acknowledged by the TCP receiver (the oldest
            outstanding segment). SEG.SEQ is the segment sequence number of a
            given segment.</t>

            <t>We use both the term "retransmission timer" and the term
            "retransmission timeout (RTO)" as defined in
            <xref target='RFC2988' />.</t>
        </section>

        <!-- ***** Section: Introduction ***** -->
        <section anchor="intro" title="Introduction">
            <t>Connectivity disruptions can occur in many different situations.
            The frequency of the connectivity disruptions depends thereby on
            the property of the end-to-end path between the communicating
            hosts. While connectivity disruptions can occur in traditional
            wired networks too, e.g., simply due to an unplugged network cable,
            the likelihood of occurrence is significantly higher in wireless
            (multi-hop) networks. Especially, end-host mobility, network
            topology changes and wireless interferences are crucial factors. In
            the case of the Transmission Control Protocol (TCP)
            <xref target='RFC0793' />, the performance of the connection can
            exhibit a significant reduction compared to a permanently connected
            path <xref target='SESB05' />. This is because TCP, which was
            originally designed to operate in fixed and wired networks,
            generally assumes that the end-to-end path connectivity is
            relatively stable over the connection's lifetime.</t>

            <t>According to Schuetz et al.
            <xref target='I-D.schuetz-tcpm-tcp-rlci' /> connectivity disruptions
            can be classified into two groups: "short" and "long" connectivity
            disruptions. A connectivity disruption is short if connectivity
            returns before the retransmission timer fires for the first
            time. In this case, TCP recovers lost data segments through Fast
            Retransmit and lost acknowledgments (ACK) through successfully
            delivered later ACKs. Connectivity disruptions are declared as
            "long" for a given TCP connection, if the retransmission timer fires
            at least once before connectivity returns. Whether or not path
            characteristics like the round trip time (RTT) or the available
            bandwidth have changed when the connectivity returns after a
            disruption is another important aspect for TCP's retransmission
            scheme <xref target='I-D.schuetz-tcpm-tcp-rlci' />.</t>

            <t>This document will focus on TCP's behavior in the face of long
            connectivity disruptions in the time "before" connectivity is
            restored. In particular this memo does not describe any additional
            modification to detect if the path characteristics remain unchanged
            in order to improve TCP's behavior "after" connectivity is restored.
            Therefore, TCP's congestion control mechanisms
            <xref target='I-D.ietf-tcpm-rfc2581bis' /> will be unchanged.</t>

            <t>When a long connectivity disruption occurs on a TCP connection,
            the TCP sender stops receiving acknowledgments. After the
            retransmission timer expires, the TCP sender enters the
            timeout-based loss recovery and declares the oldest outstanding
            segment (SND.UNA) as lost. Since TCP tightly couples reliability
            and congestion control, the retransmission of SND.UNA is triggered
            together with the reduction of sending rate, which is based on the
            assumption that loss is an indication of congestion
            <xref target='I-D.ietf-tcpm-rfc2581bis' />. As long as the
            connectivity disruption persists, TCP will repeat the procedure
            until the oldest outstanding segment is successfully acknowledged,
            or the connection times out. TCP implementations that follow the
            recommended retransmission timeout (RTO) management of RFC 2988
            <xref target='RFC2988' /> double the RTO after each
            retransmission attempt. However, the RTO growth may be bounded by
            an upper limit, the maximum RTO, which is at least 60s, but may be
            longer: Linux for example uses 120s. If the connectivity is
            restored between two retransmission attempts, TCP still has to wait
            until the retransmission timer expires before resuming transmission,
            since it simply does not have any means to know when the connectivity is
            re-established. Therefore, depending on when connectivity becomes
            available again, this can waste up to the maximum RTO of possible
            transmission time.</t>

            <t>This retransmission behavior is not efficient, especially in
            scenarios or networks like wireless (multi-hop) networks where
            connectivity disruptions are frequent. In the ideal case, TCP would
            attempt a retransmission as soon as connectivity to its peer is
            re-established. This document describes how the standard Internet
            Control Message Protocol (ICMP) can be exploited to identify
            non-congestion loss caused by connectivity disruptions. A revert
            strategy of the retransmission timer is specified that enables, due
            to higher-frequency retransmissions, a prompt detection of whether
            connectivity to a previously disconnected peer node has been
            restored. The specified scheme is a TCP sender-only modification,
            i.e., neither intermediate routers nor the TCP receiver have to be
            modified. Furthermore, in the case the network allows, i.e., no
            congestion is present, the proposed algorithm approaches the ideal
            behavior.</t>
        </section>

        <!-- ***** Section: Connectivity Disruption Indication ***** -->
        <section anchor="cdi" title="Connectivity Disruption Indication">
            <t>As long as the queue of an intermediate router experiencing a link
            outage is deep enough, i.e., it can buffer all incoming packets, a
            connectivity disruption will only cause variation in delay which is
            handled well by contemporary TCP implementations with the help of
            Eifel <xref target='RFC3522' /> or forward RTO (F-RTO)
            <xref target='I-D.ietf-tcpm-rfc4138bis' />. However, if the link
            outage lasts too long, the router experiencing the link outage is
            forced to drop packets and finally to discard the corresponding route.
            Means to detect such link outages comprise reacting on failed
            address resolution protocol (ARP) <xref target='RFC0826' />
            queries, unsuccessful link sensing, and the like. However, this is
            solely in the responsibility of the respective router.

            <list style="empty">
                <t>Note: The focus of this memo is on introducing a method by
                which ICMP messages may be exploited to improve TCP's
                performance; how different physical and link layer mechanisms
                underneath the network layer may trigger ICMP destination
                unreachable messages is out of scope of this memo.</t>
            </list>
            </t>

            <t>The removal of the route usually goes along with a notification
            to the corresponding TCP sender about the dropped packets via ICMP
            destination unreachable messages of code 0 (net unreachable) or
            code 1 (host unreachable) <xref target='RFC1812' />. Therefore,
            since ICMP destination unreachable messages of these codes provide
            evidence that packets were dropped due to a link outage, they can
            be used by a TCP as an indication for a connectivity
            disruption.</t>

            <t>Note that there are also other ICMP destination unreachable
            messages with different codes. Some of them are candidates for
            connectivity disruption indications too, but need further
            investigation. Examples are ICMP destination unreachable messages
            with code 5 (source route failed), code 11 (net unreachable for
            TOS), or code 12 (host unreachable for TOS)
            <xref target='RFC1812' />. On the other hand, codes that flag hard
            errors are of no use for the proposed scheme, since TCP should
            abort the connection when those are received
            <xref target='RFC1122' />. In the following, the term "ICMP
            unreachable message" is used as synonym for ICMP destination
            unreachable messages of code 0 or code 1.</t>

            <t>The accurate interpretation of ICMP unreachable messages as a
            connectivity disruption indication is complicated by the following
            two peculiarities of ICMP messages. Firstly, they do not necessarily
            operate on the same timescale as the packets, i.e., in the given
            case TCP segments, which elicited them. When a router drops a
            packet due to a missing route it will not necessarily send an ICMP
            unreachable message immediately, but may instead queue it for later
            delivery. Secondly, ICMP messages are subject to rate limiting,
            e.g., when a router drops a whole window of data due to a link
            outage, it will hardly send as many ICMP unreachable messages as it
            dropped TCP segments. Depending on the load of the router it may
            even send no ICMP unreachable messages at all. Both peculiarities
            originate from <xref target='RFC1812' />.</t>

            <t>Fortunately, according to <xref target='RFC0792' /> ICMP
            unreachable messages are obliged to contain in their body the
            Internet Protocol (IP) header <xref target='RFC0791' /> of the
            datagram eliciting the ICMP unreachable messages plus the first 64
            bits of the payload of that datagram. Hence, in case of TCP both
            port numbers and the sequence number are included. This allows the
            originating TCP to identify the connection which an ICMP
            unreachable message is reporting an error about. Moreover, it
            allows the originating TCP to identify which segment of the
            respective connection triggered the ICMP unreachable message,
            provided that there are not several segments in flight with the
            same sequence number. This may very well be the case when TCP is
            recovering lost segments (see <xref target='alg_discuss' />).</t>

            <t>A connectivity disruption indication in form of an ICMP
            unreachable message associated with a presumably lost TCP segment
            provides strong evidence that the segment was not dropped due to
            congestion but instead was successfully delivered to the temporary
            end-point of the employed path, i.e., the reporting router. It
            therefore did not witness any congestion at least on that very part
            of the path which was traveled by both, the TCP segment eliciting
            the ICMP unreachable message as well as the ICMP unreachable
            message itself.</t>
        </section>

        <!-- ***** Section: Connectivity Disruption Reaction ***** -->
        <section anchor="cdr" title="Connectivity Disruption Reaction">
            <t>In <xref target='alg_idea' /> the basic idea of the algorithm is
            given. The complete algorithm is specified in <xref target='alg' />.
            In <xref target='alg_discuss' /> the algorithm is discussed in
            detail.</t>

            <!-- ***** Subsection: The Idea ***** -->
            <section anchor="alg_idea" title="Basic Idea">
                <t>The goal of the algorithm is the prompt detection when the
                connectivity to a previously disconnected peer node has been
                restored after a long connectivity disruption while retaining
                appropriate behavior in case of congestion. The proposed
                algorithm exploits standard ICMP unreachable messages to
                increase the TCP's retransmission frequency during
                timeout-based loss recovery by undoing one retransmission
                timer backoff whenever an ICMP unreachable message reports on
                a presumably lost retransmission.</t>

                <t>This approach has the advantage of appropriately reducing the
                probing rate in case of congestion. If either the
                (re-)transmission itself, or the corresponding ICMP message is
                dropped, the conventional backoff is performed and not undone,
                effectively halving the probing rate.</t>
            </section>

            <!-- ***** Subsection: The Algorithm ***** -->
            <section anchor="alg" title="The Algorithm">
                <t>A TCP sender using RFC 2988 <xref target='RFC2988' /> to
                compute TCP's retransmission timer MAY employ the following
                scheme to avoid over-conservative backoffs of the retransmission
                timer in case of long connectivity disruptions. If a TCP
                sender does implement the scheme, the following steps MUST be
                taken, but only upon initiation of a timeout-based loss
                recovery, i.e., upon the first timeout of the oldest outstanding
                segment (SND.UNA). The algorithm MUST NOT be re-initiated after
                a timeout-based loss recovery has already been started but not
                completed. In particular, it must not be re-initiated upon
                subsequent timeouts for the same segment.</t>

                <t>A TCP sender that does not employ RFC 2988
                <xref target='RFC2988' /> to compute TCP's retransmission timer
                SHOULD NOT use the scheme. We envision that the scheme could be
                easily adapted to other algorithms than RFC 2988. However,
                we leave this as future work.</t>

                <t>The scheme specified in this document uses the "Backoff_cnt"
                variable, whose initial value is zero. The variable is used to
                count the number of performed retransmission timer backoffs
                during one timeout-based loss recovery. Moreover, the "RTO_base"
                variable is used to recover the previous RTO in case the
                retransmission timer backoff was unnecessary. The variable is
                initialized with the RTO upon initiation of timeout-based
                loss recovery.</t>

                <t>
                    <list style='format (%d)' counter="cnt">
                        <t>Before the variable RTO gets updated when timeout-based
                        loss recovery is initiated, set the variable "Backoff_cnt"
                        and the variable "RTO_base" as follows:
                            <list style='empty'>
                                <t>Backoff_cnt := 0;</t>
                                <?rfc subcompact='yes' ?>
                                <t>RTO_base := RTO.</t>
                                <?rfc subcompact='no' ?>
                            </list>
                        Proceed to step (R).</t>
                    </list>

                    <list style='hanging' hangIndent='5'>
                        <t hangText="(R)">This is a placeholder for the behavior
                        that a standard TCP must execute at this point in case
                        the retransmission timer is expired. In particular if
                        RFC 2988 <xref target='RFC2988' /> is used, steps
                        (5.4) - (5.6) of that algorithm go here. Proceed to
                        step (2).</t>
                    </list>

                    <list style='format (%d)' counter="cnt">

                        <t>If the retransmission timer was backed off in the
                        previous step (R), then increment the variable
                        "Backoff_cnt" by one to account for the new backoff
                            <list style='empty'>
                                <t>Backoff_cnt := Backoff_cnt + 1.</t>
                            </list>
                        </t>

                        <t>Wait either
                            <list style='empty'>
                                <t>for the expiration of the retransmission
                                timer. When the retransmission timer expires,
                                proceed to step (R);</t>

                                <t>or for the arrival of an acceptable ACK. When
                                an acceptable ACK arrives, proceed to step (A);
                                </t>

                                <t>or for the arrival of an ICMP unreachable
                                message. When the ICMP unreachable message
                                ICMP_DU arrives, proceed to step (4).</t>
                           </list>
                        </t>
                    </list>

                    <list style='format (%d)' counter="cnt">
                        <t>If "Backoff_cnt > 0", i.e., an undoing of the last
                        retransmission timer backoff is allowed, then
                            <list style='empty'>
                                <t>proceed to step (5);</t>
                            </list>
                        else
                            <list style='empty'>
                                <t>proceed to step (3).</t>
                            </list>
                        </t>

                        <t>Extract the TCP segment header included in the ICMP
                        destination unreachable message ICMP_DU
                            <list style='empty'>
                                <t>SEG := Extract(ICMP_DU).</t>
                            </list>
                        </t>

                        <t>If "SEG.SEQ == SND.UNA", i.e., the ICMP unreachable
                        message ICMP_DU reports on the oldest outstanding
                        segment, then undo the last retransmission timer
                        backoff
                            <list style='empty'>
                                <t>Backoff_cnt := Backoff_cnt - 1;</t>
                                <?rfc subcompact='yes' ?>
                                <t>RTO := RTO_base * 2^(Backoff_cnt).</t>
                                <?rfc subcompact='no' ?>
                            </list>
                        </t>

                        <t>If the retransmission timer expires due to the undoing
                        in the previous step (6), then
                            <list style='empty'>
                                <t>proceed to step (R);</t>
                            </list>
                        else
                            <list style='empty'>
                                <t>proceed to step (3).</t>
                            </list>
                        </t>
                    </list>

                    <list style='hanging' hangIndent='5'>
                        <t hangText="(A)">This is a placeholder for the standard
                        TCP behavior that must be executed at this point in the
                        case an acceptable ACK has arrived. No further
                        processing.</t>
                    </list>
                </t>

                <t>When a TCP in steady-state detects a segment loss using the
                retransmission timer it enters the timeout-based loss recovery
                and initiates the algorithm (step 1). It adjusts the slow start
                threshold (ssthresh), sets the congestion window (CWND) to one
                segment, backs off the retransmission timer and retransmits
                the first unacknowledged segment (step R)
                <xref target='I-D.ietf-tcpm-rfc2581bis' />
                <xref target='RFC2988' />.</t>

                <t>In case the retransmission timer expires again (step 3a) a
                TCP will repeat the retransmission of the first unacknowledged
                segment and back off the retransmission timer once more (step R).
                If a maximum value is placed on the RTO (rule 2.5 in
                <xref target='RFC2988' />) and that maximum value is already
                reached, the TCP will not back off the retransmission timer in this
                step and thus "Backoff_cnt" MUST NOT be incremented. However,
                the "last step" to reach this maximum RTO is still considered as
                a backoff in the scope of this algorithm and "Backoff_cnt" MUST
                be incremented, even if the RTO is not strictly doubled.</t>

                <t>If the first received packet after the retransmission(s) is
                an acceptable ACK (step 3b), a TCP will proceed as normal, i.e.,
                slow start the connection and terminate the algorithm (step A).
                Later ICMP unreachable messages from the just terminated
                timeout-based loss recovery are of no use and therefore ignored
                since the ACK clock is already restarting due to the successful
                retransmission.</t>

                <t>On the other hand, if the first received packet after the
                retransmission(s) is an ICMP unreachable message (step 3c), a
                TCP SHOULD if allowed (step 4) undo one backoff for each ICMP
                unreachable message reporting an error on a retransmission. To
                decide if an ICMP unreachable message reports on a
                retransmission, the sequence number therein is exploited
                (step 5, step 6). The undo is done by re-calculating the
                RTO with the previously reduced "Backoff_cnt". This calculation
                explicitly matches the exponential backoff specified in
                <xref target='RFC2988' /> (rule 5.5).</t>

                <t>Upon receipt of an ICMP unreachable message which
                legitimately undoes one backoff there is the possibility that
                this newly started retransmission timer has expired already
                (step 7). Then, a TCP SHOULD retransmit immediately, i.e., an
                ICMP message clocked retransmission. In case the newly started
                retransmission timer has not expired yet, TCP MUST wait
                accordingly.</t>
            </section>

            <!-- ***** Subsection: Discussion ***** -->
            <section anchor="alg_discuss" title="Discussion">
                <t>It is important to note that the proposed algorithm only
                reacts to connectivity disruption indications in form of ICMP
                destination unreachable messages during the phase of RTO
                induced loss recovery. That is, TCP's behavior is not altered
                when no ICMP unreachable messages are received, or
                the retransmission timer of the TCP sender did not yet expire
                since the last successfully received ACK. Thereby the algorithm
                is by definition only triggered in the case of long
                connectivity disruptions.</t>

                <t>Only such ICMP unreachable messages which are reporting on
                the sequence number of the retransmission (SND.UNA) are
                evaluated by the proposed algorithm. All other ICMP unreachable
                messages are ignored. If an ICMP unreachable message arrives
                for a retransmission it provides evidence that neither the
                retransmission nor the corresponding ICMP unreachable message
                itself did experience any congestion. In other words, it has
                been proved that the retransmission was not lost due to
                congestion, but due to a connectivity disruption instead.</t>

                <t>One could argue, that if an ICMP unreachable
                message arrives for an RTO induced retransmission, the RTO
                should be reset, and the next retransmission sent out
                immediately similar to what is done when an ACK arrives after
                an RTO induced recovery phase. This would allow for a much
                higher probing frequency based on the round trip time of the
                router where the connectivity is disrupted. However, we
                consider our proposed scheme a good trade off between
                conservative behavior and a fast detection of connectivity
                re-establishment.</t>

                <t>Of course there is an ambiguity on which (re-)transmission
                an ICMP unreachable message reports. However, for our purposes
                it is not considered to be a problem, because the assumption that
                such an ICMP message provides evidence that one link loss was
                wrongly considered as a congestion loss, still holds. There is
                also the option to make use of the timestamps option to obtain
                a more strict mapping between segments and ICMP messages (see
                <xref target='alg_save' />).</t>

                <t>Besides the ambiguity if the first unacknowledged sequence
                number refers to the original transmission or to any of the
                retransmissions, there is another source of ambiguity about the
                sequence numbers contained in the ICMP unreachable messages.
                For high bandwidth paths like modern gigabit links the sequence
                space may wrap rather quickly, thereby allowing the possibility
                that a late ICMP unreachable message reporting on an old error
                may coincidentally fit as input in the scheme explained above.
                As a result, the scheme would wrongly undo one backoff. Chances
                for this to happen are minuscule, since a particular ICMP
                message would need to contain the exact sequence number of
                SND.UNA, while at the same time TCP is coincidentally in
                timeout-based loss recovery. Moreover, as the scheme is
                tailored most conservatively no threat to the network from this
                issue may arise.</t>

                <t>Finally, the scheme explicitly does not call for a
                differentiation of ICMP unreachable messages originating from
                different routers, as the evidence of no congestion still holds
                even if the reporting router changed.</t>

                <t>Another exploitation of ICMP unreachable messages in the
                context of TCP congestion control might seem appropriate in
                case the ICMP unreachable message is received while TCP is in
                steady-state and the message refers to a segment from within
                the current window of data. As the RTT up to the router which
                generates the ICMP unreachable message is likely to be
                substantially shorter than the overall RTT to the destination,
                the ICMP unreachable message may very well reach the
                originating TCP while it is transmitting the current window of
                data. In case the remaining window is large, it might seem
                appropriate to refrain from transmitting the remaining window
                as there is timely evidence that it will only trigger further
                ICMP unreachable messages at the very router. Although this
                might seem appropriate from a wastage perspective, it may be
                counterproductive from a security perspective since ICMP
                messages are easy to spoof, thereby allowing an easy attack on
                the TCP by simply forging such ICMP messages.</t>

                <t>An additional consideration is the following: in the presence
                of multi-path routing even the receipt of a legitimate ICMP
                unreachable message cannot be exploited accurately because
                there is the option that only one of the multiple paths to the
                destination is suffering from a connectivity disruption which
                causes ICMP unreachable messages to be sent. Then however,
                there is the possibility that the path along which the
                connectivity disruption occurred contributed considerably to
                the overall bandwidth, such that a congestion response is very
                well reasonable. However, this is not necessarily the case.
                Therefore, a TCP has no means except for its inherent
                congestion control to decide on this matter. All in all, it
                seems that for a connection in steady-state, i.e., not in RTO
                induced recovery, reacting on ICMP unreachable messages in
                regard to congestion control is not appropriate. For the case
                of RTO-based retransmissions, however, there is a reasonable
                congestion response, which is skipping further backoffs of the
                retransmission timer because there is no congestion
                indication - as described above.</t>
            </section>

            <!-- ***** Subsection: Protecting Against Misbehaving Routers ***** -->
            <section anchor="alg_save" title="Protecting Against Misbehaving
                Routers (the Safe Variant)">
                <t>Given that the TCP Timestamps option
                <xref target='I-D.ietf-tcpm-1323bis' /> is enabled for a
                connection, a TCP sender MAY use the following algorithm to
                protect against misbehaving routers.</t>
            </section>
        </section>

        <!-- ***** Section: Related Work ***** -->
        <section anchor="related_work" title="Related Work">
            <t>In literature there are several methods that address TCP's
            problems in the presence of connectivity disruptions. Some of them
            try to improve TCP's performance by modifying lower layers. For
            example <xref target='SM03'/> introduces a "smart link layer" that
            buffers one segment for each ongoing connection and replays these
            segments on connectivity re-establishment. This approach has a
            serious drawback: previously stateless intermediate routers have
            to be modified in order to inspect TCP headers, to track the
            end-to-end connection and to provide additional buffer space. These
            lead all in all to an additional need of memory and processing
            power.</t>

            <t>On the other hand, stateless link layer schemes, as proposed in
            <xref target='RFC3819'/>, which unconditionally buffer some small
            number of packets may have another problem: if a packet is buffered
            longer than the maximum segment lifetime (MSL) of 2 min
            <xref target='RFC0793' />, i.e., the disconnection lasts longer than
            MSL, TCP's assumption that such segments will never be received
            will no longer be true, violating TCP's semantics
            <xref target='I-D.eggert-tcpm-tcp-retransmit-now' />.</t>

            <t>Other approaches like TCP-F <xref target='CRVP01' /> or the
            Explicit Link Failure Notification (ELFN) <xref target='HV02' />
            inform the TCP sender about a disrupted path by special messages
            generated from intermediate routers. In case of a link failure they
            stop sending segments and freeze TCP's retransmission timers. TCP-F
            stays in this state and remains silent until either a "route
            establishment notification" is received or an internal timer
            expires. In contrast, ELFN periodically probes the network to detect
            connectivity re-establishment. Both proposals rely on changes to
            intermediate routers, whereas the scheme proposed in this document
            is a sender-only modification. Moreover, ELFN also does not consider
            congestion and may impose serious additional load on the network,
            depending on the probe interval.</t>

            <t>The authors of ATCP <xref target='LS01' /> propose enhancements
            to identify different types of packet loss by introducing a layer
            between TCP and IP. They utilize ICMP destination unreachable
            messages to set TCP's receiver advertised window to zero and thus
            forcing the TCP sender to perform zero window probing with an
            exponential backoff. ICMP destination unreachable messages, which
            arrive during this probing period, are ignored. This approach is
            nearly orthogonal to this document, which exploits ICMP messages to
            undo a retransmission timer backoff when TCP is already probing. In
            principle both mechanisms could be combined, however, due to security
            considerations it does not seem appropriate to adopt ATCP's reaction
            as discussed in <xref target='alg_discuss' />.</t>

            <t>Schuetz et al. describe in
            <xref target='I-D.schuetz-tcpm-tcp-rlci' /> a set of TCP extensions
            that improve TCP's behavior when transmitting over paths whose
            characteristics can change on short time-scales. Their proposed
            extensions modify the local behavior of TCP and introduce a new TCP
            option to signal locally received connectivity-change indications
            (CCIs) to remote peers. Upon reception of a CCI, they re-probe the
            path characteristics either by performing a speculative
            retransmission or by sending a single segment of new data, depending
            on whether the connection is currently stalled in exponential
            backoff or transmitting in steady-state, respectively. The authors
            focus on specifying TCP response mechanisms, nevertheless
            underlying layers would have to be modified to explicitly send CCIs
            to make these immediate responses possible.</t>
        </section>

        <!-- ***** Section: IANA Considerations ***** -->
        <section anchor="iana" title="IANA Considerations">
            <t>This memo includes no request to IANA.</t>
        </section>

        <!-- ***** Section: Security Considerations ***** -->
        <section anchor="security" title="Security Considerations">
            <t>The proposed algorithm is considered to be secure. For example an
            attacker cannot make a TCP modified with the proposed scheme flood the
            network just by sending forged ICMP unreachable messages to attempt
            to maliciously shorten the retransmission timer. An attacker would
            need to guess the correct sequence number of the current
            retransmission, which seems very unlikely. Even in case of an
            omniscient attacker, the impact on network load would be low, since
            the retransmission frequency is limited by the RTO which was
            computed before TCP has entered the timeout-based loss recovery.
            (The highest probing frequency is expected to be even lower than
            once per minimum RTO, that is 1s as specified by
            <xref target='RFC2988' />.)</t>
        </section>

        <!-- ***** Section: Acknowledgments ***** -->
        <section anchor="acks" title="Acknowledgments">
            <t>We would like to thank Timothy Shepard and Joe Touch for feedback
            on earlier versions of this draft. We also thank Michael Faber,
            Daniel Schaffrath, and Damian Lukowski for implementing and testing
            the algorithm in Linux. Special thanks go to Ilpo Jarvinen, who
            gave valuable feedback regarding the Linux implementation.</t>

            <t>This document was written with the xml2rfc tool described
            in <xref target="RFC2629" />.</t>
        </section>

    </middle>

    <!--  ***** BACK MATTER ***** -->

    <back>

        <!-- There are 2 ways to insert reference entries from the citation
             libraries:
             1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
             2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
             (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

             Both are cited textually in the same manner: by using xref elements.
             If you use the PI option, xml2rfc will, by default, try to find
             included files in the same directory as the including file. You can
             also define the XML_LIBRARY environment variable with a value
             containing a set of directories to search. These can be either in
             the local filing system or remote ones accessed by
             http (http://domain/dir/... ).-->

        <!-- References split into informative and normative -->
        <references title="Normative References">
            &rfc0792;

            &rfc0793;

            &rfc1812;

            &rfc1323bis;

            &rfc2581bis;

            &rfc2988;

            &rfc4443;
        </references>

        <references title="Informative References">
            &rfc0791;

            &rfc0826;

            &rfc1122;

            &rfc2119;

            &rfc2629;

<!--        &rfc2914; -->

            &rfc3522;

            &rfc3819;

            &rfc4138bis;

            &rfc4884;

            &retransmit-now;

            &tcp-rlci;

            <reference anchor="SESB05" target="">
                <front>
                    <title>Protocol enhancements for intermittently connected
                    hosts</title>
                    <author surname="Schuetz" initials="S."
                            fullname="Simon Schuetz">
                        <organization />
                    </author>
                    <author surname="Eggert" initials="L."
                            fullname="Lars Eggert">
                        <organization />
                    </author>
                    <author surname="Schmid" initials="S."
                            fullname="Stefan Schmid">
                        <organization />
                    </author>
                    <author surname="Brunner" initials="M."
                            fullname="Marcus Brunner">
                        <organization />
                    </author>
                    <date year="2005" month="December"/>
                </front>
                <seriesInfo name="SIGCOMM Computer Communication Review"
                value="vol. 35, no. 3, pp. 5-18" />
            </reference>

            <reference anchor="SM03" target="">
                <front>
                    <title>Link layer-based TCP optimisation for disconnecting
                    networks</title>
                    <author surname="Scott" initials="J."
                            fullname="James Scott">
                        <organization />
                    </author>
                    <author surname="Mapp" initials="G."
                            fullname="Glenford Mapp">
                        <organization />
                    </author>
                    <date year="2003" month="October"/>
                </front>
                <seriesInfo name="SIGCOMM Computer Communication Review"
                value="vol. 33, no. 5, pp. 31-42" />
            </reference>

            <reference anchor="CRVP01" target="">
                <front>
                    <title>A feedback-based scheme for improving TCP performance
                    in ad hoc wireless networks</title>
                    <author surname="Chandran" initials="K."
                            fullname="Kartik Chandran">
                        <organization />
                    </author>
                    <author surname="Raghunathan" initials="S."
                            fullname="Sudarshan Raghunathan">
                        <organization />
                    </author>
                    <author surname="Venkatesan" initials="S."
                            fullname="Subbarayan Venkatesan">
                        <organization />
                    </author>
                    <author surname="Prakash" initials="R."
                            fullname="Ravi Prakash">
                        <organization />
                    </author>
                    <date year="2001" month="February"/>
                </front>
                <seriesInfo name="IEEE Personal Communications"
                value="vol. 8, no. 1, pp. 34-39" />
            </reference>

            <reference anchor="HV02" target="">
                <front>
                    <title>Analysis of TCP performance over mobile ad hoc
                    networks</title>
                    <author surname="Holland" initials="G."
                            fullname="Gavin Holland">
                        <organization />
                    </author>
                    <author surname="Vaidya" initials="N."
                            fullname="Nitin Vaidya">
                        <organization />
                    </author>
                    <date year="2002" month="March"/>
                </front>
                <seriesInfo name="Wireless Networks"
                value="vol. 8, no. 2-3, pp. 275-288" />
            </reference>

            <reference anchor="LS01" target="">
                <front>
                   <title>ATCP: TCP for mobile ad hoc networks</title>
                    <author surname="Liu" initials="J."
                            fullname="Jian Liu">
                        <organization />
                    </author>
                    <author surname="Singh" initials="S."
                            fullname="Suresh Singh">
                        <organization />
                    </author>
                    <date year="2001" month="July"/>
                </front>
                <seriesInfo name="IEEE Journal on Selected Areas in
                Communications" value="vol. 19, no. 7, pp. 1300-1315" />
            </reference>
<!--
            <reference anchor="ZSH08" target="">
                <front>
                    <title>Improving TCP's Robustness to Long Connectivity
                    Disruptions</title>
                    <author surname="Zimmermann" initials="A."
                            fullname="Alexander Zimmermann">
                        <organization />
                    </author>
                    <author surname="Schaffrath" initials="D."
                            fullname="Daniel Schaffrath">
                        <organization />
                    </author>
                    <author surname="Hannemann" initials="A."
                            fullname="Arnd Hannemann">
                        <organization />
                    </author>
                    <date year="2008" month="November"/>
                </front>
                <seriesInfo name="Proceedings of the 20th IEEE Global
                Communications Conference (GLOBECOM'08)" value="" />
            </reference>
-->
        </references>

        <!-- ***** Section: TODO list ***** -->
        <section anchor="todo" title="TODO list">
            <t>
                <list style="symbols">
                    <t>Extend the Security Sections
                    <xref target='alg_save' format="counter" /> and
                    <xref target='security' format="counter" />.</t>

                    <t>Extend discussion in <xref target='alg_discuss' />
                        <list style="symbols">
                            <t>ICMPv6. See <xref target='RFC4443' /> and
                            <xref target='RFC4884' />.</t>

                            <t>Explicit Congestion Notification (ECN).</t>

                            <t>More about congestion in general.</t>
                        </list>
                    </t>

                    <t>Mention the possible side-effect on TCP implementations
                    that measure the thresholds R1 and R2 (Section 4.2.3.5 of
                    <xref target='RFC1122' />) as a count of retransmissions
                    instead of time units.</t>

                    <t>Discuss the influence of packet duplication on the
                    algorithm (Thanks to Ilpo).</t>
                </list>
            </t>
        </section>

        <!-- ***** Section: Changes from previous versions of the draft ***** -->
        <section anchor="changes" title="Changes from previous versions of the draft">
            <section anchor="changes_02" title="Changes from draft-zimmermann-tcp-lcd-01">
                <t>
                    <list style="symbols">
                        <t>The algorithm in <xref target='alg' /> was
                        slightly changed. Instead of reverting the RTO by
                        halving it, it is recalculated with help of the
                        "Backoff_cnt" variable. This fixes an issue that
                        occurred when the retransmission timer was backed off
                        but bounded by a maximum value. The algorithm in the
                        previous version of the draft would have "reverted" to
                        half of that maximum value, instead of using the value,
                        before the RTO was doubled (and then bounded).</t>

                        <t>Miscellaneous editorial changes.</t>

                        <t>Extended the TODO list (<xref target='todo' />).</t>
                    </list>
                </t>
            </section>

            <section anchor="changes_01" title="Changes from draft-zimmermann-tcp-lcd-00">
                <t>
                    <list style="symbols">
                        <t>Miscellaneous editorial changes in Section
                        <xref target='terminology' format="counter" />,
                        <xref target='intro' format="counter" /> and
                        <xref target='cdi' format="counter" />.</t>

                        <t>The document was restructured in Section
                        <xref target='terminology' format="counter" />,
                        <xref target='intro' format="counter" /> and
                        <xref target='cdi' format="counter" /> for easier
                        reading. The motivation for the algorithm is changed
                        according to TCP's problem of disambiguating congestion
                        from non-congestion loss.</t>

                        <t>Added <xref target='alg_idea' />.</t>

                        <t>The algorithm in <xref target='alg' /> was
                        restructured and simplified:
                            <list style="symbols">
                                <t>The special case of the first received ICMP
                                destination unreachable message after an RTO was
                                removed.</t>

                                <t>The "Backoff_cnt" variable was introduced so
                                it is no longer possible to perform more reverts
                                than backoffs.</t>
                            </list>
                        </t>

                        <t>The discussion in <xref target='alg_discuss' /> was
                        improved and expanded according to the algorithm
                        changes.</t>

                        <t>Added <xref target='alg_save' />.</t>
                    </list>
                </t>
            </section>

        </section>

    </back>
</rfc>

PAFTECH AB 2003-20262026-04-21 20:27:01