<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "http://xml.resource.org/authoring/rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
    There has to be one entity for each item to be referenced.
    An alternate method (rfc include) is described in the references. -->

<!ENTITY RFC0793 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0793.xml">
<!ENTITY RFC0896 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.0896.xml">
<!ENTITY RFC1122 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1122.xml">
<!ENTITY RFC1323 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1323.xml">
<!ENTITY RFC2018 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2018.xml">
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2883 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2883.xml">
<!ENTITY RFC2988 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2988.xml">
<!ENTITY RFC3517 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3517.xml">
<!ENTITY RFC3522 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3522.xml">
<!ENTITY RFC3782 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3782.xml">
<!ENTITY RFC4015 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4015.xml">
<!ENTITY RFC4987 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4987.xml">
<!ENTITY RFC5681 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5681.xml">
<!ENTITY RFC5827 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5827.xml">
<!ENTITY RFC6013 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6013.xml">
<!ENTITY RFC6247 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6247.xml">
<!ENTITY RFC6298 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6298.xml">


<!ENTITY I-D.narten-iana-considerations-rfc2434bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.narten-iana-considerations-rfc2434bis.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs),
    please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
    (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
    (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->

<?rfc comments="yes" ?>

<!--
VER - may change to TYPE (because a ver0 client may always respond 
with ver0, also to ver1, 2, 3...)
MASK VALID Bit - use 4 MASK bits, 1 VLD bit; invalid MASK bits -> 
more codepoints in header, blank out TSval for lower version receivers (no paws, owd,...)
-->

<rfc 
  category="exp" 
	docName="draft-scheffenegger-tcpm-timestamp-negotiation-03" 
	ipr='trust200902'
	updates="1323">
  <!-- category values: std, bcp, info, exp, and historic
    ipr values: full3667, noModification3667, noDerivatives3667
    you can add the attributes updates="NNNN" and obsoletes="NNNN"
    they will automatically be output with "(if approved)" -->

  <!-- ***** FRONT MATTER ***** -->

  <front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the
        full title is longer than 39 characters -->

    <title abbrev="Timestamp Negotiation">Additional negotiation in the TCP Timestamp Option field
                    during the TCP handshake
    </title>

    <!-- add 'role="editor"' below for the editors if appropriate -->

    <!-- Another author who claims to be an editor -->

    
    <author fullname="Richard Scheffenegger" initials="R." 
           surname="Scheffenegger">
     <organization>NetApp, Inc.</organization>
     <address>
       <postal>
         <street>Am Euro Platz 2</street>
         <code>1120</code>
         <city>Vienna</city>
         <region></region>
         <country>Austria</country>
       </postal>
       <phone>+43 1 3676811 3146</phone>
       <email>rs@netapp.com</email>
     </address>
    </author>
  
    <author fullname="Mirja Kuehlewind" initials="M."
    	surname="Kuehlewind">
    	<organization>University of Stuttgart</organization>
    	<address>
    		<postal>
    			<street>Pfaffenwaldring 47</street>
    			<code>70569</code>
    			<city>Stuttgart</city>
    			<country>Germany</country>
    		</postal>
    		<email>mirja.kuehlewind@ikr.uni-stuttgart.de</email>
    	</address>
    </author>
<!--
    <author fullname="Bob Briscoe" initials="B."
    	surname="Briscoe">
    	<organization>BT Research</organization>
    	<address>
    		<postal>
    			<street>B54/77, Sirius House Adastral Park Martlesham Heath</street>
    			<code>IP5 3RE</code>
    			<city>Ipswich, Suffolk</city>
    			<country>United Kingdom</country>
    		</postal>
    		<email>bob.briscoe@bt.com</email>
    	</address>
    </author>
-->
    <date year="2011" />

    <!-- If the month and year are both specified and are the current ones, xml2rfc will fill
        in the current day for you. If only the current year is specified, xml2rfc will fill
    in the current day and month for you. If the year is not the current one, it is
    necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the
    purpose of calculating the expiry date).  With drafts it is normally sufficient to
    specify just the year. -->

    <!-- Meta-data Declarations -->

    <area>Transport</area>

    <workgroup>TCP Maintenance and Minor Extensions (tcpm)</workgroup>

    <!-- WG name at the upperleft corner of the doc,
        IETF is fine for individual submissions.  
    If this element is not present, the default is "Network Working Group",
        which is used by the RFC Editor as a nod to the history of the IETF. -->

    <keyword>Internet-Draft</keyword>
    <keyword>I-D</keyword>

    <!-- Keywords will be incorporated into HTML output
        files in a meta tag but they have no effect on text or nroff
        output. If you submit your draft to the RFC Editor, the
        keywords will be used for the search engine. -->

    <abstract>
<!--     <t>The Timestamp option defined in RFC1323 carries an opaque value
     with certain properties, for the soletary purpose of measuring the
     round trip time per segment on the sender side. That protocol requires
     only minimal state in both sender and receiver. </t>
     <t>However, --> <t>A number of TCP enhancements in fields as diverse
     as congestion control, loss recovery or side-band signaling
     could be improved by allowing both ends of a TCP session to interpret
     the values carried in the Timestamp option. Further enhancements are
     enabled by changing the receiver side processing of timestamps in the 
     presence of Selective Acknowledgements.</t> 
     <t> This document updates RFC1323 and specifies a backwards compatible 
     	way of negotiating for Timestamp capabilities, and lists a number of 
     	benefits and drawbacks of this approach. </t>
     	
<!--     	the use of the TSecr field during the initial SYN
     to negotiate capabilities and signal additional information about 
     the content of the TSopt fields as well as the behavior of the receiver. 
     if the receiver understands this extension, it will use the TSecr field 
     of the SYN/ACK to reply a combination
     of the TSval and the receivers capabilities. 
     Otherwise the receiver will ignore the TSecr field and set a 
     timestamp in the TSecr field as specified in RFC 1323.</t>
     <t>Specifying detailed use cases enabled by this modification in
     	Timestamp capability signaling, or providing detailed guidelines as to how
     	the changed reflected timestamps interact with legacy uses of the
     	timestamp option are out of scope of this document.</t>-->
    </abstract>
  </front>

  <middle>
  	<section anchor="intro" title="Introduction">
  		<t>The timestamp option originally introduced in <xref target="RFC1323"/> was designed 
  			solely for two-way delay measurement and to support a particular TCP 
  			algorithm (Reno). It would be useful to be able to support one-way 
  			delay measurement and to take advantage of developments since TCP 
  			Reno, such as selective acknowledgements (SACK) <xref target="RFC2018"/>.
  		</t> 
  		<t>This specification defines a protocol for the two ends of a TCP session 
  			to negotiate alternative semantics for the timestamps they will exchange 
  			during the rest of the session. It updates RFC1323 but it is backwards 
  			compatible with implementations of RFC1323 timestamp options.
  		</t>
  		<t>The RFC1323 timestamp protocol presents the following problems when 
  			trying to extend it for alternative uses:
  			<list style="letters">
  				<t>Unclear meaning of the value in a timestamp.
  					<list style="symbols">
							<t>A timestamp value (TSval) as defined in <xref target="RFC1323"/>
								is deliberately only meaningful to the end that sends it. The 
								other end is merely meant to echo the value without understanding 
								it. This is fine if one end is trying to measure two-way delay 
								(round trip time). However, to measure one-way delay, timestamps 
								from both ends need to be compared by one end, which needs to 
								relate the values in timestamps from both ends to a notion of 
								the passage of time that both ends share.
							</t>
						</list>
					</t>
					<t>No control over which timestamp to echo.
						<list style="symbols">
							<t>A host implementing <xref target="RFC1323"/> is meant to echo 
								the timestamp value of the most recent in-order segment received. 
								This was fine for TCP Reno, but it is not the best choice for 
								TCP sessions using selective acknowledgement (SACK) 
								<xref target="RFC2018"/>.
							</t>
							<t>A <xref target="RFC1323"/> host is meant to echo the timestamp 
								value of the earliest unacknowledged segment, e.g. if a host 
								delays ACKs for one segment, it echoes the first timestamp not 
								the second. It is desirable to include delay due to ACK withholding 
								when a host is conservatively measuring RTT. However, it is not 
								useful to include the delay due to ACK withholding when measuring 
								one-way delay.
							</t>
						</list>
					</t>
					<t>Alternative protection against wrapped sequence numbers.
						<list style="symbols">
							<t><xref target="RFC1323"/> also points out that the timestamps it 
								specifies will always strictly monotonically increase in each window 
								so they can be used to protect against wrapped sequence numbers 
								(PAWS). If the endpoints negotiate an alternative timestamp 
								scheme in which timestamps may not monotonically increase per 
								window, then it needs to be possible to negotiate alternative 
								protection against wrapped sequence numbers.
							</t>
						</list>
					</t>
				</list>
			</t>
			<t>To solve these problems this specification changes the wire protocol 
				of the TCP timestamp option in three main ways:
				<list style="numbers">
					<t>It updates <xref target="RFC1323"/> to add the ability to negotiate 
						the semantics of timestamp options. The initiator of a TCP session 
						starts the negotiation in the TSecr field in the first &lt;SYN&gt;, which is 
						currently unused. This specification defines the semantics of the 
						TSecr field in a segment with the SYN flag set. A version number is 
						included to allow further extension of capability negotiation in 
						future.
					</t>
					<t>A version independent ability to mask a specified number of the 
						lower significant bits of the timestamp values is present. These
						masked bits are not considered for timestamp calculations, or in 
						an algorithm to protect	against wrapped sequence numbers. Future
						extensions can thereby change the timestamp signaling
						without changing the modified treatment on the receiver side.
					</t>
					<t>It updates <xref target="RFC1323"/> to define version 0 of 
						timestamp capabilities to include:
						<list style="symbols">
							<t>the duration in seconds of a tick of the timestamp clock using 
								a floating point representation
							</t>
							<t>agreement that both ends will echo the timestamp on the most 
								recently received segment, rather than the one that would be 
								echoed by an <xref target="RFC1323"/> host. There is no specific 
								option to request this behavior, however it is implied by 
								successful negotiation of both SACK and timestamp capabilities.
							</t>
							
						</list>
					</t>
				</list>
			</t>
			<t>With this new wire protocol, a number of new use-cases for the TCP 
				timestamp option become possible. <xref target="uses"/> gives 
				some examples. Further extensions might be required in future. 
				<xref target="AppA"/> gives an example of a further version of 
				timestamp capability negotiation that could be defined in the
				future.
			</t>
      <t><vspace blankLines='100' /></t>
		</section>

    <section title="Terminology"> 
    	<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 
    		"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
        document are to be interpreted as described in 
        <xref target="RFC2119"/>.
      </t>
      <t>The reader is expected to be familiar with the definitions given in 
        <xref target="RFC1323"/>.
      </t>
			<t>Further terminology used within this document:
				<list style="hanging" hangIndent="4">
					<t hangText="Timestamp clock interval"><vspace />
						<!--For signaling purposes, the interval is not directly indicated 
						in the protocol in the SI standard unit of one second, but with
						a much higher precision as the basic interval. The reason is to have high 
						precision	at long intervals (low frequencies) available in
						the encoding (see <xref target="signal"/> for details).-->
						The Timestamp value is derived from a clock source running at a 
						reasonably constant frequency. The interval between two ticks of 
						that clock is signaled during the timestamp capability 
						negotiation. Note that the timestamp clock is not required to be 
						identical with the TCP clock, even though most implementations
				    use the same clock for practical purposes.
					</t>
					<t hangText="Timestamp option"><vspace />
						This refers to the entire TCP timestamp option, including both 
						TSval and TSecr fields.
					</t>
					<t hangText="Timestamp capabilities"><vspace />
						Refers only to the values and bits carried in the TSecr field
						of &lt;SYN&gt; and &lt;SYN,ACK&gt; segments during a TCP handshake. For signaling
						purposes, the timestamp capabilities are sent in the clear
						with the &lt;SYN&gt; segment, and in an encoded form (see 
						<xref target="signal"/> for details) in the &lt;SYN,ACK&gt; segment.
					</t>
				</list>
			</t>
      <t><vspace blankLines='100' /></t>
    </section>

    <section title="Overview">
 	    <section anchor="overview" title="Overview of the TCP Timestamp Option">
        <t>The TCP Timestamp option (TSopt) provides timestamp echoing for
          round-trip time (RTT) measurements.  TSopt is widely deployed and
       		activated by default in many systems. <xref target="RFC1323"/> specifies
     	  	TSopt in the following way:
     	  </t>
<figure anchor="f_tsopt" title="RFC1323 TSopt" align="center">
<artwork align="center"><![CDATA[
   Kind: 8

   Length: 10 bytes

   +-------+-------+---------------------+---------------------+
   |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
   +-------+-------+---------------------+---------------------+
       1       1              4                     4
]]></artwork></figure>
   		  <t>
   			  <list style="empty">
     			  <t>"The Timestamps option carries two four-byte timestamp fields.
              The Timestamp Value field (TSval) contains the current value of
              the timestamp clock of the TCP sending the option.
            </t>
            <t>The Timestamp Echo Reply field (TSecr) is only valid if the ACK
              bit is set in the TCP header; if it is valid, it echos a timestamp
              value that was sent by the remote TCP in the TSval field
              of a Timestamps option.  When TSecr is not valid, its value
              must be zero.  The TSecr value will generally be from the most
              recent Timestamp option that was received; however, there are
              exceptions that are explained below.
            </t>
            <t>A TCP may send the Timestamps option (TSopt) in an initial
              &lt;SYN&gt; segment (i.e., segment containing a SYN bit and no ACK
              bit), and may send a TSopt in other segments only if it received 
              a TSopt in the initial &lt;SYN&gt; segment for the connection."
            </t>
          </list>
        </t>
        <t>The comparison of the timestamp in the TSecr field to the current 
      	  timestamp clock gives an estimation of the two-way delay (RTT).
      	  With <xref target="RFC1323"/> the receiver is not supposed to interpret 
	        the TSval field for timing purposes, e.g. one-way delay measurements, 
	        but only to echo the content in the TSecr field.
         	<xref target="RFC1323"/> specifies various cases when more than one 
      	  timestamp is available to echo.  The approach taken by 
      	  <xref target="RFC1323"/> is not always the best choice, e.g. when 
      	  the TCP Selective Acknowledgment option (SACK) is used in 
      	  conjunction.  
      	</t>
      </section>
      	
      <section title="Overview of the Timestamp Capabilities">
      		
<!--
      	In addition there are use cases where one-way delay 
      	(OWD) measurements are needed.  These mechanisms usually also rely 
      	on the TSopt to estimated the variation in OWD. Current 
      	implementations are based around certain assumptions,
      	<list><t>
      	<list style="symbols">
      		<t>sender using one specific timestamp clock interval, or 
      		</t>
      		<t>one specific	interval from a limited set of possible timestamp 
      			clock intervals, or
      		</t> 
      		<t>the network conditions do not change for a short training 
      			period while timestamp values are sampled, and 
      		</t>
      		<t>the sender using all bits of TSval to reflect the timestamp 
      			clock value directly with no bits used for different purposes 
      			such as covert channels or integrity verification.
      		</t>
      	</list>
      	</t></list>
      	These assumptions may not be valid in general in the 
      	public internet.
      </t>-->
        <t>This document specifies a way of negotiating the timestamp 
      	  capabilities available between the end hosts.  This is enabled 
      	  by using the TSecr field in the TCP &lt;SYN&gt; segment. In order to remain 
      	  backwards compatible, a receiver capable of timestamp capability 
      	  negotiation has to XOR the receiver's (local) capability flags 
      	  with the received TSval, before echoing the result back in the 
      	  TSecr field. During the initial handshake, the sender has to store 
      	  the sent initial TSval, in order to determine if the receiver can 
      	  support this timestamp capability negotiation.
        </t><!--
      <t>Enhancements in the area of TCP congestion control can use the 
      	measurement of the one-way delay variation as one input. However, 
      	without explicit knowledge of the partner's timestamp clock, 
      	arriving at a good estimate requires a training phase over 
      	multiple segment exchanges. In this phase, the network conditions 
      	need remain nearly static to arrive at good measurements. In 
      	addition, the receiver has to assume that the full TSval 
      	represents the timestamp clock value of the sender, with no 
      	different use of some bits of the TSval. Covert channels or 
      	fingerprinting a timestamp value artificially increase the 
      	measurement noise, and a receiver may be lead to assume a smaller 
      	timestamp clock interval than what is actually implemented by the 
      	sender. In order to assist such algorithms, explicit knowledge 
      	at an early phase of the session needs to be negotiated.
      </t>
      <t>In addition, by using synergistic signaling between timestamps 
      	<xref target="RFC1323"/> and selective acknowledgments 
      	<xref target="RFC2018"/>, enhancements in loss recovery are 
      	possible by removing any remaining retransmission and acknowledgment 
      	ambiguity. See <xref target="uses"/> for a detailed discussion.
      </t>
     	<t>Receivers conforming to <xref target="RFC1323"/> are required 
     		to only reflect the timestamp of the last segment that was 
     		received in order, or the timestamp of the last not yet 
     		acknowledged segment in the case of delayed acknowledgments. 
     		In order to allow progressive deployment of changed timestamp 
     		option semantics, a backwards compatible way of negotiating 
     		the semantic is required.
     	</t>-->
     	  <t>As there is some benefit in changing the receiver side treatment 
		      of which timestamp value to echo, the negotiation protocol itself 
		      must also provide some backwards compatibility. Therefore, even 
		      when a sender tries to negotiate for a higher version than supported 
		      by the receiver, the receiver MUST respond with at least version 0.
		      Also, a future protocol enhancement MUST make sure that any extension
		      is compatible with at least version 0. 
    	  </t>
     	  <t>As the importance of the timestamp option increases by using 
     		  it in more aspects of a TCP sender's operation e.g. congestion 
     		  control, so increases the 
     		  importance of maintaining the integrity of the reflected 
     		  timestamps. At the same time this must not prevent the receiver 
     		  from interpreting a received timestamp in TSval.
     	  </t>
     	  <t>This is achieved by indicating how many LSB bits of the 
     		  timestamp value MUST NOT be interpreted by the receiver. Apart
     		  from the purpose of maintaining timestamp integrity for the use 
     		  as input signal into congestion control algorithms, this also 
     		  allows the use of timestamp based methods to discriminate at 
     		  the earliest possible moment (within 1 RTT after the 
     		  retransmission) between spurious retransmissions and genuine 
     		  loss even when using slow running TCP timestamp clocks.
     	  </t>
     	  <t>In addition, by using synergistic signaling between timestamps 
		      <xref target="RFC1323"/> and selective acknowledgments 
		      <xref target="RFC2018"/>, enhancements in loss recovery are 
		      possible by removing any remaining retransmission and acknowledgment 
		      ambiguity. See <xref target="uses"/> for a detailed discussion.
        </t>
        <t>As an optional extension, a timestamp clock interval range 
      	  negotiation is also briefly introduced in <xref target="AppA"/>. This 
      	  is only included as one potential example of further enhancements.
        </t>
        <t><vspace blankLines='100' /></t>
		  </section>
		</section>
    
    <section anchor="problem" title="Problem statement">
    	<t>Timestamp values are carried in each segment if negotiated for. 
    		However, the content of these values is to be treated as an 
    		immutable and largely uninterpreted entity by the receiver. This 
    		document describes an enhancement to the timestamp negotiation, 
    		which must meet the following criteria:
    		<list style="symbols">
    			<t>Indicate the (approximate) timestamp clock interval used by the sender 
    				in a wide range. The longest interval should be around 10 seconds, 
    				while the shortest interval should allow unique timestamps per 
    				segment, even at extremely high link speeds. At the time of  
    				writing, the shortest meaningful duration was found to be that of a
    				64-byte packet (e.g. an ACK segment) sent at a rate of 100 
    				Gbit/s. This corresponds to a maximum timestamp clock rate
    				of around 200 MHz, or an interval between clock ticks of around 5 ns. 
    			</t>
    			<t>Allow for timestamps that are not directly related to real 
    				time (i.e. segment counting, or use of the timestamp value 
    				as a true extension of sequence numbers).
    			</t>
    			<t>Provide means to prevent or at least detect tampering with 
    				the echoed timestamp value, allowing for basic integrity and 
    				consistency checks.
    			</t>
    			<t>Allow for future extensions that may use some of the 
    				timestamp value bits for other signaling purposes during the
    				remainder of the session.
    			</t>
    			<t>Signaling must be backwards compatible with existing TCP 
    				stacks implementing	basic <xref target="RFC1323"/> 
    			  timestamps. Current methods for timestamp value generation
    			  must be supported.
    			</t>
    			<t>Allow timing information to be stated explicitly during the 
    				initial handshake, to avoid a training phase extending 
    				beyond the initial handshake.
    			</t>
    			<t>Provide a means to disambiguate between resent &lt;SYN&gt; 
    				segments.
    			</t> 
    			<t>Cater for broken implementations that either send a non-zero 
    				TSecr value in the initial &lt;SYN&gt;, or a zero TSecr value 
    				in &lt;SYN,ACK&gt;.
    			</t>
    		</list>
    	</t>
    	<t>Some legacy implementations exist that violate 
    		<xref target="RFC1323"/> in that the TSecr field in a &lt;SYN&gt; is not 
    		cleared (see <xref target="I-D.ietf-tcpm-tcp-security"/>). The 
    		protocol should have some resiliency in the presence of such
    		misbehaving senders, and must not lead to an unfair advantage
    		for such wrongly negotiated sessions.  
    	</t>
    	<t>As there is some benefit in changing the receiver side treatment 
    		of which timestamp value to echo, the negotiation protocol itself 
    		must also provide some backwards compatibility. Therefore, even 
    		when a sender tries to negotiate for a higher version than supported 
    		by the receiver, the receiver MUST respond with at least version 0.
    		Also, a future protocol enhancement MUST make sure that any extension
    		is compatible with at least version 0. 
    	</t>
      <t><vspace blankLines='100' /></t>
		</section>

    <section anchor="signal" title="Signaling">
      <section title="Capability Flags">
        <t>In order to signal the supported capabilities, both the sender 
        	and the receiver will independently generate a timestamp 
        	capability negotiation field, as indicated below. The TSecr 
        	value field of the <xref target="RFC1323"/> TSopt is overloaded 
        	with the following flags and fields during the initial 
        	&lt;SYN&gt; and &lt;SYN,ACK&gt; segments.  The connection 
        	initiator will send the timestamp capabilities in plain, as 
        	with <xref target="RFC1323"/> the TSecr is not used in the 
        	initial &lt;SYN&gt;. The receiver will XOR the local timestamp 
        	capabilities with the TSval received from the sender and send 
        	the result in the TSecr field. The initiating host of a 
        	session with timestamp capability negotiation has to keep 
        	minimal state to decode the returned capabilities XOR'ed with 
        	the sent TSval.
        </t>

<figure anchor="f_tscap" title="Timestamp Capability flags" align="center">
<artwork align="center"><![CDATA[
  Kind: 8

  Length: 10 bytes

  +-------+-------+---------------------+---------------------+
  |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
  +-------+-------+---------------------+---------------------+
      1       1              4          |           4         |
                                       /                      |
  .-----------------------------------'                       |
 /                                                             \
|                                                               |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|E|   |         #                                               |
|X|VER|   MSK   #           version specific contents           |
|O|   |         #                                               |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
]]></artwork></figure>
    	  <t>Common fields to all versions:
    		  <list style="hanging" hangIndent="4">
    		    <t hangText="EXO - Extended Options (1 bit)"><vspace />
    				  Indicates that the sender supports extended timestamp 
    				  capabilities as defined by this document, and MUST be 
    				  set to one by a compliant implementation. This flag 
    				  also enables the immediate echoing of the TSval with 
    				  the next ACK, if both timestamp capabilities and 
    				  selective acknowledgement <xref target="RFC2018"/>
    				  are successfully negotiated during the initial handshake (see <xref target="implicit"/>).
    				  This change in semantics is independent of the version 
    				  in the signaled timestamp capabilities.
    			  </t>
    			  <t hangText="VER - Version (2 bits)"><vspace />
    				  Version of the capabilities fields definition. This document 
    				  specifies codepoint 0. With the exception of the
              immediate mirroring - simplifying the receiver side 
              processing - and the masking of some LSB bits before 
              performing the Protection Against Wrapped Sequence Numbers 
    				  (PAWS) test, hosts must not interpret the received timestamps
    				  and must not use a timestamp value as input into advanced heuristics, 
    				  if the version received is not supported. This is the same
    				  requirement as for current <xref target="RFC1323"/> compliant
    				  implementations. The lower 3 octets of the 
    				  timestamp capability flags MUST be ignored if an unsupported
    				  version is received. It is expected that a host will implement
    				  at least version 0. A receiver MUST respond with the 
    				  appropriate (equal or version 0) version when responding to 
    				  a new session request.
    			  </t>
    			  <t hangText="MSK - Mask Timestamps (5 bits)"><vspace />
    				  The MaSK field indicates how many least significant bits 
    				  should be excluded by the receiver, before further
    				  processing the timestamp (i.e. PAWS, or for timing purposes).
    				  The unmasked portion of a TSval has to comply with the
    				  constraints imposed by <xref target="RFC1323"/> on the
    				  generation of valid timestamps, e.g. must be monotonically
    				  increasing between segments, and strictly monotonically 
    				  increasing for each TCP window. 
    				  Note that this does not impact the reflected timestamp in
    				  any way - TSecr will always be equal to an appropriate TSval. 
    				  This field MUST be present in all future versions of 
    				  timestamp capability fields. A value of 31 (all bits set) 
    				  MUST be interpreted by a receiver to mean that the full TSval is to
    				  be ignored by any legacy heuristics, including PAWS.
    				  For PAWS to be effective, at least 2 bits are required to 
    				  discriminate between an increase (and roll-over) versus 
    				  outdated segments. 
    		    </t>
    		  </list>
        </t>
      </section>
      <section anchor="ver0" title="Version 0 specific fields">
      	<t>
<figure anchor="f_tscapv0" title="Timestamp Capability flags - version 0" align="center">
<artwork align="center"><![CDATA[
  Kind: 8

  Length: 10 bytes

  +-------+-------+---------------------+---------------------+
  |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
  +-------+-------+---------------------+---------------------+
      1       1              4          |           4         |
                                       /                      |
  .-----------------------------------'                       |
 /                                                             \
|                                                               |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|E|   |         #               |         |                     |
|X|VER|   MSK   #      RES      |   ADJ   |         INT         |
|O|   |         #               |         |                     |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
]]></artwork></figure>

    		  <list style="hanging" hangIndent="4">
           	<t hangText="RES - Reserved (8 bits)"><vspace />
              Reserved for future use, and MUST be zero ("0") with version 0. 
              If timestamp capabilities are received with version set to 0,  but 
             	some of these bits set, the receiver MUST ignore the
             	extended options field and react as if the TSecr was zero 
              (compatibility mode). 
            </t>
            <t hangText="ADJ - Adjustment factor (5 bits)"><vspace />
    	   			The scaling factor by which the signaled interval has to be
    	   			left-shifted. This is similar to the way the Window Scale
    	   			option is defined in <xref target="RFC1323"/>. All values
    	   			between zero and 31 are valid.  This allows timestamp clock ticks of 
        			up to 15.99 s.  <vspace/>
        			See	<xref target="owd"/> for details.
            </t> 
           	<t hangText="INT - Interval (11 bits)"><vspace />
        			The integer part of the timestamp clock interval can be signaled with 
        			up to 11 bits of precision. This allows a range with the highest resolution to cover 
        			clock intervals between	7.45 ns (INT=0x400, ADJ=0) and 15.99 s 
        			(INT=0x7FF, ADJ=31). If a sender is using a less precise clock source, 
        			fewer significant bits can be used to implicitly signal this. For example,
        			a timestamp clock interval of approximately 1 ms (1/1024th sec) can 
        			be represented by both (INT=0x001, ADJ=28) and (INT=0x400, ADJ=18). A
        			more accurate representation of 1 ms would be (INT=0x418, ADJ=18).
        			The latter representation carries more significant bits, indicating a
        			more stable clock source with low jitter.<vspace />
        			Only non-zero values are valid when ADJ is non-zero. An invalid 
        			combination of ADJ and INT MUST be treated as if no timestamp 
        			capability negotiation is attempted. A compliant 
        			sender can choose the value of the &lt;SYN&gt; TSval in such a way
        			that either the EXO bit, or some of the RES bits are set, or all the 
        			INT bits are cleared, in the encoded response from the receiver. 
        			A receiver that does not reflect the initial TSval
        			in its &lt;SYN,ACK&gt; and instead sends a zero value in TSecr will
        			not erroneously negotiate for timestamp capabilities. 
    		    </t>
    		  </list>
    		</t>
    		<t>Conceptually, the timestamp clock
         	interval can be represented as an unsigned integer of 42 bits 
         	in length. In this form, the least significant bit represents an 
         	interval of 2^-38 sec (3.64 ps), while still allowing a maximum interval of 
         	16 sec. This value is then shifted to the right, until it can be 
         	represented by only 11 significant bits, and the number of shift operations 
         	is stored as scaling adjustment factor (ADJ).
        </t>
        <t>A value of 
         	zero (both ADJ and INT are set to zero) is supported and indicates, 
        	that the timestamp values are NOT	correlated to wall-clock 
          time (i.e. the sender may perform some form of segment counting
          or sequence number extension instead). A host receiving an 
          interval of zero from the other end host MUST NOT perform 
          time-based heuristics which take the received TSval into 
          account, but SHOULD apply the regular PAWS test. 
        </t>
        <t>Timestamp clock periods faster than 1 ms SHOULD be implemented 
        	by inserting the timestamp "late" before transmitting a segment
        	to avoid unnecessary timing jitter. Shortest clock periods, 
        	with intervals of only a few microseconds or less, are provided
        	for hardware-assisted implementations. 
        </t>
        <t>The range of possible values runs from 15.99 s to 7.45 ns with
        	highest precision, and down to 3.64 ps with reducing precision,
        	which is also the shortest difference in tick duration, that 
        	could be resolved. This equates to clock frequencies of 0.06 Hz, 
        	134 MHz and 275 GHz respectively.
        </t>
        <t>Despite the provision of such
          a large dynamic range, a receiver should consider, that a 
          timestamp clock may deviate from the indicated rate by a large 
          fraction. Similarly, a sender SHOULD refrain from signaling
          the clock interval with too much precision (significant bits),
          if the clock cannot be sampled with low variance over time.
        </t>
        <t>Example of a timestamp capability negotiation, to indicate that the 
        	sender's timestamp clock (tcp clock) is running with 1 ms per tick, and 
        	using a clock source of typical quality (e.g. software timer interrupt):
        </t>
        <t>SYN, TSval=&lt;X&gt;, TSecr=EXO|MSK|ADJ=22|INT=0x041 
        </t>
    
        <texttable anchor="Tab1" align="center"
        	title="Commonly used TCP Timestamp Clock intervals">
        	<ttcol align="right">tick interval</ttcol>
        	<ttcol align="right">tick frequency</ttcol>
        	<ttcol align="center">encoding at highest precision</ttcol>
        	<ttcol align="center">encoding at lowest precision</ttcol>
        	<c>16 s </c>  <c>0.06 Hz </c> <c>ADJ=31, INT=0x7FF</c> <c>ADJ=31, INT=0x7FF</c>
        	<c>1 s </c>   <c> 1 Hz </c>   <c>ADJ=28, INT=0x400</c> <c>ADJ=31, INT=0x080</c>
        	<c>0.5 s </c> <c> 2 Hz </c>   <c>ADJ=27, INT=0x400</c> <c>ADJ=31, INT=0x040</c>
        	<c>100 ms</c>      <c>10 Hz </c>   <c>ADJ=24, INT=0x666</c> <c>ADJ=31, INT=0x00C</c>
        	<c> 10 ms</c>      <c>100 Hz </c>  <c>ADJ=21, INT=0x51F</c> <c>ADJ=31, INT=0x001</c>
        	<c>  4 ms</c>      <c>250 Hz </c>  <c>ADJ=20, INT=0x419</c> <c>ADJ=30, INT=0x001</c>
        	<c>  1 ms</c>      <c> 1 kHz</c>        <c>ADJ=18, INT=0x418</c> <c>ADJ=28, INT=0x001</c>
        	<c>200 us</c>      <c> 5 kHz</c>        <c>ADJ=15, INT=0x68E</c> <c>ADJ=25, INT=0x001</c>
        	<c> 50 us</c>      <c>20 kHz</c>        <c>ADJ=13, INT=0x68E</c> <c>ADJ=23, INT=0x001</c>
        	<c>  1 us</c>      <c> 1 MHz</c>        <c> ADJ=8, INT=0x432</c>  <c>ADJ=18, INT=0x001</c>
        	<c> 60 ns</c>      <c>16.7 MHz</c>      <c> ADJ=4, INT=0x407</c>  <c>ADJ=14, INT=0x001</c>
        </texttable>
    	<t>The wide range of indicated timestamp clock intervals (spanning  
    		9 orders of (decimal) magnitude, or 28 binary digits) and the 
    		limitation to no more than 24 bits require the use of a logarithmic 
    		encoding. Since the precision of the timestamp clock value is most
    		valuable at low frequencies (long tick durations), the clock rate
    		is encoded as a time duration. This results in full precision for
    		commonly used timestamp clock tick durations, while allowing even
    		shorter intervals at reduced precision. A format was chosen 
    	  that is simple to implement and poses no risk of confusion with
    	  common floating point representations.
    	</t>
    	<t>The timestamp clock values a host is using need not necessarily 
    		run synchronously with the internal TCP clock. Different clock 
    		sources, such as an NTP stratum, RTC, CPU cycle counters, or other 
    		independent clocks can be used to derive the TSval. This allows the
    		de-coupling of the coarse-grained TCP clock used for retransmission 
    		and delayed	ACK timeouts, from the clock frequency indicated in
    		the TSval itself. Since <xref target="RFC1323"/> timestamp clocks
    		used to be only useful for RTT measurement, and calculation of the
    		RTO, the straightforward use of the TCP timer directly seemed 
    		natural to minimize subsequent RTT calculations.
    	</t> 
    	<t>Most stacks will at first not be able to 
    	  dynamically adjust their timestamp clock interval. Therefore, the
    	  indicated clock duration can be a static, compile time value. To 
    	  use the indicated clock interval, for example to perform one-way 
    	  delay variation calculations, simple integer operations can be 
    	  used after an initial conversion of the wire presentation to
    	  longer (i.e. 32 or 64 bit) integer values.
    	</t>
    
    
       
      </section>
      <section anchor="nego" title="Timestamp Capability Negotiation">
      	<t>During the initial TCP three-way handshake, timestamp capabilities 
    		  are negotiated using the TSecr field. Timestamp capabilities MAY only 
    		  be negotiated in TSecr when the SYN bit is set. A host detects the 
    		  presence of timestamp capability flags when the EXO bit is set in the 
    		  TSecr field of the received &lt;SYN&gt; segment. When receiving a session 
    		  request (&lt;SYN&gt; segment with timestamp capabilities), a compliant TCP 
          receiver is required to XOR the received TSval with the receiver's 
          timestamp capabilities. The resulting value is then sent in the 
          &lt;SYN,ACK&gt; response.
        </t>
    	  <t>To support the design goals stated in <xref target="problem"/>, only 
    		  the TSecr field in the initial &lt;SYN&gt; can be used directly. The response 
    		  from the receiver has to be encoded, since no unused field is available
    		  in the &lt;SYN,ACK&gt;. The most straightforward encoding is an XOR with a 
    		  value that is 
    		  known to the sender. Therefore, the receiver also uses TSecr to indicate 
    		  its capabilities, but calculates the XOR sum with the received TSval. 
    		  This allows the receiver to remain stateless and functionalities like 
    		  syncache (see <xref target="RFC4987"/>) can be maintained with no 
    		  change.
    	  </t>
    	  <t>If a sender has to retransmit the &lt;SYN&gt;, this encoding also 
    	  	allows detecting which segment was received and responded to. This is
    	  	possible by changing
    	  	the timestamp clock offset between retransmissions in such a way that 
    	  	the decoding on the sender side would result in an invalid timestamp
    	  	capability negotiation field (e.g. some RES bits are set). If the
    	  	sender does not require the capability to differentiate which &lt;SYN&gt;
    	  	was received, the timestamp clock offset for each new &lt;SYN&gt; can be set
    	    in such a way that the TSopt of the &lt;SYN&gt; is identical for each
    	    retransmission. 
    	  </t>
        <t>As a receiver MAY report back a zero value at any time, in 
    		  particular during the &lt;SYN,ACK&gt;, the sender is slightly constrained
    		  in its selection of an initial Timestamp value. The Timestamp value sent
    		  in the &lt;SYN&gt; should be selected in such a way that it does not resemble
    		  a valid Timestamp capabilities field. This prevents a compliant sender from
    		  erroneously detecting a compliant receiver, if the returned TSecr value is zero.
    	  </t>
    	  <t>A host initiating a TCP session must verify if the partner also 
      	  supports timestamp capability negotiation and a supported version, 
      	  before using enhanced algorithms. Note that this change in 
      	  semantics does not necessarily change the signaling of timestamps 
      	  on the wire after initial negotiation.
        </t>
        <t>To mitigate the effect from misbehaving TCP senders appearing to
      	  negotiate for timestamp capabilities, a receiver MUST verify that 
      	  one specific bit (EXO) is set, and any reserved bits (currently 8, 
      	  RES field) are cleared. This limits the chance for a receiver
      	  to mistakenly negotiate	for version 0 capabilities to around 0.05%.
      	  However, as a receiver has to use changed semantics when reflecting
      	  TSval also for higher values in the version field, a misbehaving 
      	  sender negotiating for SACK, but not properly clearing TSecr, may have
      	  a 37.5% chance of receiving timestamp values with modified receiver
      	  behavior. This may lead to an increased number of spurious 
      	  retransmission timeouts, putting such a session to a disadvantage.
        </t>
        <t>Once timestamp capabilities are successfully negotiated, the 
      	  receiver MUST ignore an indicated number of masked, low-order bits, 
      	  before applying the heuristics defined in <xref target="RFC1323"/>. 
      	  The	monotonic increase of the timestamp value for each new  
      	  segment could be violated if the full 32 bit field, including the 
      	  masked bits, is used. This conflicts with the constraints
      	  imposed by PAWS. The use of generic (secure) hash algorithms makes it 
      	  possible to protect the integrity of the timestamp value, without
      	  any compromise to fulfill the PAWS requirement of monotonically increasing
      	  values.
        </t>
        <t>The presented distribution of the common three fields, EXO, VER and 
      	  MSK, that MUST be present regardless of which version is implemented
      	  in a compliant TCP stack, is a result of the previously mentioned 
      	  design goals. The lower three octets MAY be redefined freely with 
      	  subsequent versions of the timestamp capability negotiation protocol.
      	  This allows a future version to be implemented in such a way, that
      	  a receiver can still operate with the modified behavior, and a 
      	  minimum amount of processing (PAWS).
        </t>

        <section anchor="implicit" title="Implicit extended negotiation">
   <!-- 	      <t>When selective acknowledgements <xref target="RFC2018"/> are also 
      	negotiated for, the immediate echoing of the last received timestamp 
      	value has to be enabled, regardless of the senders version of the 
      	timestamp capabilities.
    	</t>  
-->
          <t>If both Timestamp capabilities and Selective Acknowledgement options 
      	    <xref target="RFC2018"/> are negotiated (both hosts send these 
      	    options in their respective segments), both hosts MUST echo the 
            timestamp value of the last received segment, irrespective of the 
            order of delivery. Note that this is in conflict with 
            <xref target="RFC1323"/>, where only the timestamp of the last segment 
            received in sequence is mirrored. As SACK allows discrimination of 
            reordered or lost segments, the reflected timestamps are not required 
            to convey the most conservative information. If SACK indicates lost 
            or reordered packets at the receiver, the sender MUST take appropriate 
            action such as ignoring the received timestamps for calculating the 
            round trip time, or assuming a delayed packet (with appropriate
            handling). An updated algorithm to calculate the retransmission
            timeout timer (RTO) is not described in this document.
          </t>
          <t>The immediate echoing of the last received timestamp value allowed by 
      	    the synergistic use of the timestamp option with the SACK option 
      	    enables enhancements to improve loss recovery, round trip time (RTT) 
      	    and one-way delay (OWD) variation measurements (see 
      	    <xref target="uses"/>) even during loss or reordering episodes. This 
      	    is enabled by removing any retransmission ambiguity using unique 
      	    timestamps for every retransmission, while simultaneously the SACK 
      	    option indicates the ordering of received segments even in the 
      	    presence of ACK loss or reordering.
          </t>
      	  <t>The use of RTT and OWD measurements during loss episodes is an 
      		  open research topic. A sender has to accommodate the changed
      		  timestamp semantics in order to maintain a conservative 
      		  estimate of the Retransmission Timer as defined in <xref target="RFC6298"/>,
      		  when a TCP sender has negotiated for an immediate reflection
      		  of the timestamp triggering an ACK (i.e. both timestamp capability
      		  negotiation and Selective Acknowledgements are enabled for the session).
      		  As the presence of a SACK option in an ACK signals an ongoing reordering
      		  or loss episode, timestamps conveyed in such segments MUST NOT be used
      		  to update the retransmission timeout. Also note that the presence of
      		  a SACK option alleviates the need of the receiver to reflect the last
      		  in-order timestamp, as lost ACKs can no longer cause erroneous updates
      		  of the retransmission timeout.
      	  </t>
        </section>
        <section title="Interaction with the Retransmission Timer">
      	
      	  <t>The above stated rule, to ignore timestamps as soon as a SACK option
      		  is present, is fully consistent with the guidance given in 
      		  <xref target="RFC1323"/>, even though most implementations skip over 
      		  such an additional verification step in the presence of the SACK option.
      	  </t>
      	  <t>To address the additional delay imposed by delayed ACKs, a compliant
      		  sender SHOULD modify the update procedure when receiving normal, in-sequence
      		  ACKs that acknowledge more than SMSS bytes, so that the input (denoted R in 
      		  <xref target="RFC6298"/>) is calculated as
      	  </t>
      	  <t>R = RTT * ( 1 + 1/(cwnd/smss) )
      	  </t>
      	  <t>If RTT (as measured in units of the timestamp clock) is smaller than the
      		  congestion window measured in full sized segments, the above heuristic
      		  MAY be bypassed before updating the retransmission timeout value.
      	  </t>
          <t><vspace blankLines='100' /></t>
		
        </section>
      
    
      </section>
       
    </section>
    
    <section anchor="uses" title="Possible use cases">
      <section anchor="owd" title="One-way delay variation measurement">
        <t>New congestion control algorithms are currently proposed, that 
        	react on the measured one-way delay variation (i.e.
        	<xref target="I-D.ietf-ledbat-congestion"/>, <xref target="Chirp"/>). 
        	This control variable is updated after each received ACK:
        </t>
        <t>C(t) = TSval(t) - TSecr(t)
        </t>
        <t>V(t) = C(t) - C(t-1) 
        </t>
        <t>provided that the timestamp clocks at both ends are running 
        	at roughly the same rate. Without prior knowledge of the timestamp 
        	clock interval used by the partner, a sender can try to learn this interval 
        	by observing the exchanged segments for a duration of a few RTTs. 
        	However, such a scheme fails if the partner uses some form of implicit 
        	integrity check of the timestamp values,  which would appear as 
      		either random scrambling of LSB bits in the timestamp, or give the 
      		impression of much shorter clock intervals than what is actually used. 
      		If the partner uses some form of segment counting as timestamp value, 
      		without any direct relationship to the wall-clock time, the above 
      		formula will fail to yield meaningful results. Finally the network 
      		conditions need to remain stable during any such training phase, so
      		that the sender can arrive at reasonable estimates of the partners 
      		timestamp clock tick duration.
      	</t>      
        <t>This note addresses these concerns by providing a means by which 
        	both hosts are required to use a timestamp clock that is closely 
        	related to the wall-clock time, with known clock rate, and also provides 
        	means by which a host can signal the use of a few LSB bits for timestamp 
        	value integrity checks. To arrive at a valid one-way delay (OWD)
        	variation, first the timestamp received from the partner has to be 
        	right-shifted by a known amount of bits as defined by the mask field.
        	Next the local and remote timestamp values need to be normalized to a 
        	common base clock interval (typically, the local clock interval):
        </t>
<figure><artwork align="left"><![CDATA[
                                                      remote interval
C  = (TSecr >> local mask) - (TSval >> remote mask) * ---------------
 t                                                    local interval
]]></artwork></figure>

        <t>V(t) = C(t) - C(t-1)
        </t>
				<t>The adjustment factor can be calculated once during the
					timestamp capability negotiation phase, and pure integer 
					arithmetic can be used during per-segment processing:
				</t>
		  	<t>EXP.min = min(EXP.loc, EXP.rem)
				</t>
				<t>EXP.rem -= EXP.min
				</t>
				<t>EXP.loc -= EXP.min
				</t>
				<t>FRAC.rem = (0x800 | FRAC.rem) << EXP.rem
				</t>
				<t>FRAC.loc = (0x800 | FRAC.loc) << EXP.loc
				</t>
				<t>and assuming that the local clock tick duration is lower
				</t>
				<t>ADJ = FRAC.rem / FRAC.loc
				</t>
				<t>with ADJ being an integer variable. For higher precision, two 
					appropriately calculated integers can be used.
				</t>
				<t>Any previously required training on the remote clock interval can 
					be removed, resulting in a simpler and more dependable algorithm. 
					Furthermore, transient network effects during the training phase 
					which may result in a wrong inference of the remote clock interval 
					are eliminated completely.
				</t>
      </section>
      <!-- VERIFY
      <section anchor="delack" title="Autotuning of delayed acknowledgments">
      	<t>A receiver can infer the number of packets outstanding per round
      		trip time, if the sender immediately reflects the last received TSval.
      		With <xref target="RFC1323"/>, the sender will keep reflecting the
      		timestamp of the last segment that advanced the receivers sequence
      		number, e.g. the TSval seen in the initial <SYN,ACK>. This
      		prevents a receiver to determine if the number of packets sent by
      		the sender per RTT is high enough, to warrant the use of delayed
      		ACKs. For senders transmitting at a rate below two segments per RTT,
      		a receiver could disable the use of delayed ACKs. Furthermore, 
      		the delayed ACK timeout interval can be adjusted to match the RTT.
      		As the delayed ACK timeout is measured in units of the TCP clock,
      		a minimum value of one tick has to be maintained when delayed ACKs
      		are active.
      		
      	</t>
      </section>
      -->
      <section anchor="spurrtx" title="Early spurious retransmit detection">
      <t>Using the provided timestamp negotiation scheme, clients utilizing slow running 
      timestamp clocks can set aside a small number of least significant bits in the 
      timestamps. These bits can be used to differentiate between original and 
      retransmitted segments, even within the same timestamp clock tick (i.e. when RTT 
      is shorter than the TCP timestamp clock interval). It is recommended to use only a 
      single bit (mask = 1), unless the sender can also perform lost retransmission 
      detection. Using more than 2 bits for this purpose is discouraged due 
      to the diminishing probability of losing retransmitted packets more than one 
      time. A simple scheme could send out normal data segments with the so masked bits 
      all cleared. Each advance of the timestamp clock also clears those bits again. When 
      a segment is retransmitted without the timestamp clock increasing, these bits 
      are increased by one for each consecutive retry of the same segment, until the maximum 
      value is reached. Newly sent segments (during the same clock interval) should 
      maintain these bits, in order to
      maintain monotonically increasing values, even though compliant end hosts do not
      require this property. This scheme maintains monotonically increasing timestamp values 
      - including the masked bits. Even without negotiating the immediate mirroring of 
      timestamps (done by simultaneously doing timestamp capabilities negotiation, 
      and selective acknowledgments), this extends the use of the Eifel Detection 
      <xref target="RFC3522"/> and Eifel Response <xref target="RFC4015"/> algorithm to detect and react to spurious 
      retransmissions under all circumstances. Also, currently experimental schemes
      such as ER-SRTO <xref target="Cho08"/> could be deployed without requiring the
      receiver to explicitly support that capability.</t>
<figure anchor="f_SRTO" title="timestamp for spurious retransmit detection" align="center">
<artwork align="center"><![CDATA[
Seg0 Seg1 Seg2 Seg3 Seg4 
TS00 TS00 TS00 TS00 TS00 
       X    

     Seg1                Seg5 
     TS01                TS01 
     
                              Seg6 Seg7
                              TS01 TS10
                              
]]></artwork></figure> 
      <t>Masked bits are the 2nd digit, the timestamp value is represented by the first 
      digit. The timestamp clock "ticks" between segment 6 and 7.</t>
      </section>
      
      <section anchor="earlrd" title="Early lost retransmission detection">
      <t>During phases where multiple segments in short succession (but not necessarily 
      successive segments) are lost, there is a high likelihood that at least one segment 
      is retransmitted, while the cause of loss (i.e. congestion, fading) is still 
      persisting. The best current algorithms can 
      recover such a lost retransmission with a few constraints, for example, that the 
      session has to have at least DupThresh more segments to send beyond the current 
      recovery phase. During loss recovery, when a retransmission is lost again, 
      currently the timestamp can also not be used as means of conveying additional 
      information, to allow more rapid loss recovery while maintaining packet 
      conservation principles. Only the timestamp of the last segment preceding the 
      continuous loss will be reflected. Using the extended timestamp option negotiation 
      together with selective acknowledgements, the receiver will immediately reflect 
      the timestamp of the last seen segment. Using both SACK and TS information 
      synergistically, a sender can infer the exact order in which original and 
      retransmitted segments are received. This allows a slightly less conservative 
      and faster approach to retransmit lost retransmitted segments.
      </t>
      <t>This can be implemented in combination with the masked bit approach
      described in the previous paragraph, or without. However, if the timestamp
      clock interval is longer than 1/2 RTT, both the original and the retransmitted segment
      may carry an identical timestamp. If the sender cannot discriminate between the
      original and the retransmitted segments, it MUST refrain from
      taking any action before such a determination can be made.</t>

      <t>In this example, masked bits are used, with a simple marking method. As the 
      timestamp value of the retransmission itself is already different from the original 
      segments, such an additional discrimination would not strictly be required here.
      The timestamp clock ticks in the first digit and the dupthresh value is 3.</t>

<figure anchor="f_TSloss" title="timestamp under loss" align="center">
<artwork align="center"><![CDATA[
Seg0 Seg1 Seg2 Seg3 Seg4 Seg5 Seg6 Seg7
TS00 TS10 TS10 TS10 TS10 TS10 TS10 TS20
       X    X    X    *

     Seg1 Seg2 Seg3 Seg4
     TS21 TS30 TS30 TS30
       X
       
     Seg1                               Seg8 Seg9
     TS31                               TS31 TS40
]]></artwork></figure>

      <t>If Seg1,TS00 is lost twice, and Seg4,TS10 is also lost, the sender could 
      resend Seg1 once more after seeing dupthresh number of segments sent after 
      the first retransmission of Seg1 being received (i.e., when Seg4 is SACKed). 
      However, there is an ambiguity between retransmitted segments and original 
      segments, as the sender cannot know, if a SACK for one particular segment 
      was due to the retransmitted segment, or a delayed original segment. The 
      timestamp value will not help in this case, as per RFC1323 it will be held 
      at TS00 for the entire loss recovery episode. Therefore, currently a 
      sender has to assume that any SACKed segments may be due to delayed original 
      sent segments, and can only resolve this conflict by injecting additional, 
      previously unsent segments. Once dupthresh newly injected segments are 
      SACKed, continuous loss (and not further delay) of Seg1 can safely be 
      assumed, and that segment be resent. This approach is conservative but
      constrained by the requirement that additional segments can be sent, and 
      thereby delayed in the response.</t>

      <t>With the synergistic use of timestamp extended options together with 
      selective acknowledgments, the receiver would immediately reflect back the 
      timestamp of the last received segment. This allows the sender to 
      discriminate between a SACK due to a delayed Seg4,TS10, or a SACK because 
      of Seg4,TS30. Therefore, the appropriate decision (retransmission of Seg1 
      once more, or addressing the observed reordering/delay accordingly 
      <xref target="I-D.blanton-tcp-reordering"/>) can be taken with 
      high confidence.</t>
      </section>
      
      <section title="Integrity of the Timestamp value">
        <t>If the timestamp is used for congestion control purposes, an 
        	incentive exists for malicious receivers to reflect tampered 
        	timestamps, as demonstrated with some exploits 
        	<xref target="CUBIC"/>.
        </t>
        <t>One way to address this is to not use timestamp information 
          directly, but to keep state in the sender for each sent segment, 
          and track the round trip time independent of sent timestamps. 
          Such an approach has the drawback, that it is not straightforward 
          to make it work during loss recovery phases for those segments 
          possibly lost (or reordered). In addition there is processing and 
          memory overhead to maintain possibly extensive lists in the 
          sender that need to be consulted with each ACK. Despite these 
          drawbacks, this approach is currently implemented due to lack of 
          alternatives (see <xref target="Linux"/>, and 
          <xref target="BSD10"/>).
        </t>
        <t>The preferred approach is that the sender MAY choose to protect 
        	timestamps from such modifications by including a fingerprint
        	(secure hash of some kind) in some of the least significant bits. 
        	However, doing so prevents a receiver from using the timestamp 
        	for other purposes, unless the receiver has prior knowledge about 
        	this use of some bits in the timestamp value. Furthermore, strict 
        	monotonic increasing values are still to be maintained. That 
        	constraint restricts this approach somewhat and limits or inhibits 
          the use of timestamp values for direct use by the receiver (i.e. 
          for one-way delay variation measurement, as the hash bits would 
          look like random noise in the delay measurement).
        </t>
      </section>
      <section title="Disambiguation with slow Timestamp clock">
        <t>In addition, but somewhat orthogonal to maintaining timestamp 
        	value integrity, there is a use case when the sender does not 
        	support a timestamp clock interval that can guarantee unique timestamps
        	for retransmitted segments. This may happen whenever the TCP 
        	timestamp clock interval is higher than the round-trip time of the 
        	path. For unambiguously distinguishing regular from retransmitted
          segments, the timestamp must be unique for otherwise identical 
          segments. Reserving the least significant bits for this purpose 
          allows senders with slow running timestamp clocks to make use of 
          this feature. However, without modifying the receiver behavior, 
          only limited benefits can be extracted from such an approach. 
          Furthermore the use of this option has implications in the 
          protection against wrapped sequence numbers (PAWS - 
          <xref target="RFC1323"/>), as the more bits are set aside for 
          tamper prevention, the faster the timestamp number space cycles.
        </t>
        <t>Using Timestamp capabilities to explicitly negotiate mask bits, 
        	and set aside a (low) number of least significant bits for the above 
        	listed purposes, allows a sender to use more reliable integrity 
        	checks. These masked bits are not to be considered part of the 
        	timestamp value, for the purposes described in <xref target="RFC1323"/>
        	(i.e. PAWS) and subsequent heuristics using timestamp values (i.e. 
        	Eifel Detection), thereby lifting the strict requirement of always 
        	monotonically increasing timestamp values. However, care should be 
        	taken to not mask too many bits, for the reasons outlined in 
        	<xref target="RFC1323"/>. Using a mask value higher than 8 is 
          therefore discouraged.
        </t>
        <t>The reason for having 5 bits for the mask field nevertheless is to 
        	allow the implementation of this protocol in conjunction with TCP 
        	cookie transaction (TCPCT) extended timestamps <xref target="RFC6013"/>. 
        	That allows for nearly a quarter of a 128 bit timestamp to be set 
        	aside.
        </t>
      </section>
      
      <section anchor="tcpcrc" title="Masked timestamps as segment digest">
      	<t>After making TCP alternate checksums historic (see <xref target="RFC6247"/>), 
      	  there still remains a need to address increased corruption probabilities when
      	  segment sizes are increased (see 
      	  <xref target="I-D.ietf-tcpm-anumita-tcp-stronger-checksum"/>).
      	</t>
      	<t>Utilizing a completely masked TSval field allows the sender to include a stronger
      		CRC32, with semantics independent of the fixed TCP header fields. However,
      		such a use would again exclude the use of PAWS on the receiver side, and
      		a receiver would need to know the specifics of the digest for processing.
      		It is assumed, that such a digest would only cover the data payload of a 
      		TCP segment. In order to allow disambiguation of retransmissions, a special
      		TSval can be defined (e.g. TSval=0) which bypasses regular CRC processing
      		but allows the identification of retransmitted segments.
      	</t>
      	<t>The full semantics of such a data-only CRC scheme are beyond the scope
      		of this document, but would require a different version of the timestamp
      		capability. Nevertheless, allowing the full TSval to remain unprocessed
      		by the receiver for the purpose of PAWS even in version 0 could still allow
      		the successful negotiation of sender-side enhancements such as loss recovery 
      		improvements (see <xref target="spurrtx"/>, and <xref target="earlrd"/>).
      	</t>
        <t>In effect, the masked portion of the timestamp value represents an 
        	unreliable out of band signal channel, that could also be used for other 
        	purposes than solely performing timestamp integrity checks (for example, 
        	this would allow ER-SRTO algorithms <xref target="Cho08"/>).
        </t>
      </section>
      	
      <section anchor="covert" title="Timestamp value as covert channel">
        <t>Covert channels SHOULD NOT be implemented by using the mask field, as the 
        	explicit masking clearly points to such a channel. As the regular operation 
        	of the timestamp clock is still maintained, covert channels working by 
        	artificially delaying data segments in an application (and thereby 
        	influencing the timestamp inserted into the segment) work 
        	unaffected. The received TSval would need to be shifted by the 
        	appropriate number of bits, before extracting the data from the covert
        	channel by the receiver.
        </t>
        <t><vspace blankLines='100' /></t>

      </section>		
    </section>
     
    <section title="Discussion">
      <t>RTT and OWD variation during loss episodes is not deeply researched. 
      	Current heuristics (<xref target="RFC1122"/>, <xref target="RFC1323"/>, 
      	Karn's algorithm <xref target="RFC2988"/>) explicitly exclude (and prevent) 
      	the use of RTT samples when loss occurs. However, solving the retransmission 
      	ambiguity problem - and the related reliable ACK delivery problem - would 
      	enable new functionality to improve TCP processing. Also, having an immediate 
      	echo of the last received timestamp value would enable new research to distinguish between 
      	corruption loss (assumed to have no RTT / OWD impact) and congestion 
      	loss (assumed to have RTT / OWD impact). Research into this field appears to 
      	be rather neglected, especially when it comes to large scale, public internet 
      	investigations. Due to the very nature of this, passive investigations without 
      	signals contained within the headers are only of limited use in empirical 
      	research.
      </t>
      <t>Retransmission ambiguity detection during loss recovery would allow an 
      	additional level of loss recovery control without reverting to timer-based 
      	methods. As with the deployment of SACK, separating "what" to send from 
      	"when" to send it could be driven one step further. In particular, less 
      	conservative loss recovery schemes which do not trade principles of packet 
      	conservation against timeliness, require a reliable way of prompt and best 
      	possible feedback from the receiver about any delivered segment and their 
      	ordering. <xref target="RFC2018"/> SACK alone goes quite a long way, but 
      	using timestamp information in addition could remove any ambiguity. However, 
      	the current specs in <xref target="RFC1323"/> make that use impossible, thus 
      	a modified semantic (receiver behavior) is a necessity.
      </t>
      <t>A synergistic signaling with immediate timestamp value echoes would however 
      	break	legacy, per-packet RTT measurements. The reason is, that delayed ACKs 
      	would not be covered. Research has shown, that per-packet updates of the RTT 
      	estimation (for the purpose of calculating a reasonable RTO value) are only 
      	of limited benefit (see <xref target="Path99"/>, and <xref target="PH04"/>). 
      	This is the most serious implication of the proposed synergistic signaling 
      	scheme with directly echoing	the timestamp value of the segment triggering 
      	the ACK. Even when using the directly reflected timestamp values in an 
      	unmodified RTT estimator, the immediate impact would be limited to causing 
      	premature RTOs when the sending rate suddenly drops below two segments per RTT.
      	That is, assuming the receiver implements delayed ACK and sends one ACK
      	for every other data segment received. If the receiver has D-SACK 
      	<xref target="RFC2883"/> enabled,	such premature RTOs can be detected and 
      	mitigated by the sender (for example, by increasing minRTO for low bandwidth 
      	flows).
      </t>
     
    </section>
   
    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>The authors would like to thank Dragana Damjanovic for some initial 
      	 thoughts around Timestamps and their extended potential use.
      </t>
      <t>The editor would like to thank Bob Briscoe for his insightful
      	comments, and the gratuitous donation of text, that have resulted 
      	in a substantially improved document.
      </t>	
    </section>

    <!-- Possibly a 'Contributors' section ... -->
    
    <section anchor="updates" title="Updates to Existing RFCs">
      <t>Care has been taken to make sure the updates in this specification
        can be deployed incrementally.
      </t>
      <t>Updates to existing <xref target="RFC1323"/> implementations are 
      	only REQUIRED if they do not clear the TSecr value in the initial 
      	&lt;SYN&gt; segment. This is a misinterpretation of <xref target="RFC1323"/> 
      	and may leak data anyway (see 
      	<xref target="I-D.ietf-tcpm-tcp-security"/>). Otherwise, there will 
      	be no need to	update an RFC1323-compliant TCP stack unless the 
      	timestamp	capabilities negotiation is to be used.
      </t>
      <t>Implementations compliant with the definitions in this document 
        shall be prepared to encounter misbehaving senders that don't clear
        TSecr in their initial &lt;SYN&gt;. It is believed that checking the reserved
        bits to be all zero provides enough protection against misbehaving 
        senders.
      </t>
      <t>In the unlikely case of an erroneous negotiation of timestamp capabilities
      	between a compliant receiver, and a misbehaving sender, the proposed
      	semantic changes will trigger a higher rate of spurious retransmissions,
      	while time-based heuristics on the receiver side may further negatively
      	impact congestion control decisions. Overall, misbehaving receivers
      	will suffer from self-inflicted reductions in TCP performance.
      </t>
    </section>

    <section anchor="IANA" title="IANA Considerations">
      <t>With this document, the IANA is requested to establish a new registry
     	  to record the timestamp capability flags defined with future versions 
     	  (codepoints 1, 2 and 3).
     	</t>
      <t>The lower 24 bits (3 octets) of the timestamp capabilities field may 
      	be freely assigned in future versions. The first octet must always 
      	contain the EXO, VER and MASK fields for compatibility, and the MASK
      	field MUST be set to allow interoperation with a version 0 receiver.
      </t>
      <t>This document specifies version 0 and the use of the last 
      	three octets to signal the sender's timestamp clock interval to the
      	receiver.
      </t>
      
   <t><vspace blankLines='100' /></t>
	
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>The algorithm presented in this paper shares security considerations
        with <xref target="RFC1323"/> (see <xref target="I-D.ietf-tcpm-tcp-security"/>).
      </t>
      <t>An implementation can address the vulnerabilities of 
      	 <xref target="RFC1323"/>, by dedicating a few low-order bits of the 
      	 timestamp fields for use with a (secure) hash, that protects against 
      	 malicious modification of returned timestamp value by the receiver.	 A MASK field has 
      	 been provided to explicitly notify the receiver about that 
        alternate use of low-order bits. This allows the use of timestamps for
        purposes requiring higher integrity and security  while allowing  
        the receiver to extract useful information nevertheless.
      </t>
    </section>
  </middle>

 <!--  *****BACK MATTER ***** -->

 <back>
    <!-- References split into informative and normative -->

    <!-- There are 2 ways to insert reference entries from the citation libraries:
    1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
    2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
       (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

    Both are cited textually in the same manner: by using xref elements.
    If you use the PI option, xml2rfc will, by default, try to find included files in the same
    directory as the including file. You can also define the XML_LIBRARY environment variable
    with a value containing a set of directories to search.  These can be either in the local
    filing system or remote ones accessed by http (http://domain/dir/... ).-->

    <references title="Normative References">
     <!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml"?-->
     
     
     &RFC1323;
     
     &RFC2018;

     &RFC2119;
          
<!--    
     <reference anchor="min_ref">
     
       <front>
         <title>Minimal Reference</title>

         <author initials="authInitials" surname="authSurName">
           <organization></organization>
         </author>

         <date year="2006" />
       </front>
     </reference>
-->
    </references>

    <references title="Informative References">
      <!-- Here we use entities that we defined at the beginning. -->
     
      &RFC1122;
     
      &RFC2883;
      
      &RFC2988;
     
      &RFC3522;
     
      &RFC4015;
      
      &RFC4987;
     
      &RFC6013;
     
      &RFC6247;
      
      &RFC6298;
     
      <?rfc include="reference.I-D.ietf-tcpm-tcp-security.xml"?>
      
      <?rfc include="reference.I-D.ietf-tcpm-anumita-tcp-stronger-checksum"?>

			<?rfc include="reference.I-D.ietf-ledbat-congestion"?>
			
			<?rfc include="reference.I-D.blanton-tcp-reordering"?>

 <!-- 
     <reference anchor="sack-recovery-entry"
                target="http://tools.ietf.org/html/draft-ietf-tcpm-sack-recovery-entry-01">
       <front>
         <title>Using TCP Selective Acknowledgement (SACK) Information 
         to Determine Duplicate Acknowledgements for Loss Recovery Initiation</title>
         
         <author initials="I." surname="Jarvinen">
           <organization>University of Helsinki</organization>
         </author>
         <author initials="M." surname="Kojo">
           <organization>University of Helsinki</organization>
         </author>
         <date month = "Mar" year = "2010"/>
       </front>
     </reference>
-->

	    <reference anchor="Chirp"
		      target="http://bobbriscoe.net/projects/netsvc_i-f/chirp_pfldnet10.pdf">
		    <front>
			    <title>Chirping for Congestion Control - 
				    Implementation Feasibility</title>
				
			    <author initials="M." surname="Kuehlewind">
				    <organization>University of Stuttgart</organization>
			    </author>
			    <author initials="B." surname="Briscoe">
				    <organization>British Telekom</organization>
			    </author>
			    <date month="Nov" year="2010"/>
		    </front>
	    </reference>

	    <reference anchor="Cho08"
		      target="http://ubinet.yonsei.ac.kr/v2/publication/hpmn_papaers/ic/2008_Enhanced%20Response%20Algorithm%20for%20Spurious%20TCP.pdf">
		    <front>
			    <title>Enhanced Response Algorithm for Spurious TCP
            Timeout (ER-SRTO)</title>
				
			    <author initials="I." surname="Cho">
				    <organization>Yonsei University</organization>
			    </author>
			    <author initials="J." surname="Han">
				    <organization>Yonsei University</organization>
			    </author>
			    <author initials="J." surname="Lee">
				    <organization>Yonsei University</organization>
			    </author>
			    <date month="Jan" year="2008"/>
		    </front>
	    </reference>
	    
	    <reference anchor="CUBIC"
		      target="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.153.3152&amp;rep=rep1&amp;type=pdf">
		    <front>
			    <title>CUBIC: A New TCP-Friendly High-Speed
            TCP Variant</title>
				
			    <author initials="I." surname="Rhee">
				    <organization>NC State University</organization>
			    </author>
			    <author initials="S." surname="Ha">
				    <organization>NC State University</organization>
			    </author>
			    <author initials="L." surname="Xu">
				    <organization>University of Nebraska</organization>
			    </author>
			    <date month="Feb" year="2005"/>
		    </front>
	    </reference>
	   
	    <reference anchor="BSD10"
		      target="http://caia.swin.edu.au/reports/100219A/CAIA-TR-100219A.pdf">
		    <front>
			    <title>Timing enhancements to the FreeBSD kernel to
            support delay and rate based TCP mechanisms</title>
				
			    <author initials="D." surname="Hayes">
				    <organization>Swinburne University of Technology</organization>
			    </author>
			    
			    <date month="Feb" year="2010"/>
		    </front>
	    </reference>
          
	    <reference anchor="Linux"
		      target="http://www.cs.clemson.edu/~westall/853/linuxtcp.pdf">
		    <front>
			    <title>Linux TCP</title>
				
			    <author initials="P." surname="Sarolahti">
				    <organization>Nokia</organization>
			    </author>
			    
			    <date month="Apr" year="2007"/>
		    </front>
	    </reference>
      
      
	    <reference anchor="PH04"
		      target="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.76.2748&amp;rep=rep1&amp;type=pdf">
		    <front>
			    <title>The Peak-Hopper: A New End-to-End
            Retransmission Timer for Reliable Unicast Transport</title>
				
			    <author initials="H." surname="Eckstroem">
				    <organization>Ericsson</organization>
			    </author>
			    <author initials="R." surname="Ludwig">
				    <organization>Ericsson</organization>
			    </author>
			    
			    <date month="Apr" year="2004"/>
		    </front>
	    </reference>
      
        
	    <reference anchor="Path99"
		      target="http://www.icir.org/mallman/papers/estimation.ps">
		    <front>
			    <title>On Estimating End-to-End Network Path Properties</title>
				
			    <author initials="M." surname="Allman">
				    <organization>NASA</organization>
			    </author>
			    <author initials="V." surname="Paxson">
				    <organization>ICSI</organization>
			    </author>
			    
			    <date month="Sep" year="1999"/>
		    </front>
	    </reference>
      
          

          
    </references>
   <?rfc needLines="100" ?>

    <section anchor="AppA" title="Possible Extension">
    	<t>This section is not intended as normative description of an extension, 
    		but	merely as an example of a possible extension. Future extensions MUST 
    		set the	common fields in such a way that a receiver capable of version 0 
    		only can react appropriately.
    	</t>
      <t>Certain hosts may want to negotiate a common optimal timestamp clock 
      	interval between each other for various purposes. For example, the balance 
      	between PAWS (<xref target="RFC1323"/>) and the timestamp clock resolution 
      	should be more towards one or the other. Also, if a host wants to have 
      	identical timestamp clock intervals both at the sender and receiver to 
      	simplify one-way delay variation calculation, negotiating the clock interval 
      	could be useful. With identical timestamp clock intervals, instead of 
      	multiplications and divisions, only additions and subtractions are 
      	required for OWD variation calculation.
      </t>
      <t>Without a full three way handshake, full negotiation of the timestamp 
      	clock intervals is not possible. For this reason, a special semantic is 
      	required during negotiation. This allows both ends to know the exact 
      	timestamp clock interval with only two exchanged segments, while at the 
      	same time remaining compatible with version 0.
      </t>
      <t>For this purpose, the following extension (version 1) of this protocol is 
      	one suggestion. Depending on the exact requirements, a different signaling
      	may be more appropriate. For example, only the two different EXP fields
      	could be required, while a single, but higher precision FRAC field for
      	both low and high boundaries could suffice, and some additional 
      	signaling bits could be made available. 
      </t>
      
      <section title="Capability Flags">
<figure anchor="f_TScap1" title="Timestamp Capability enhanced flags" align="center">
<artwork align="center"><![CDATA[
  Kind: 8

  Length: 10 bytes

  +-------+-------+---------------------+---------------------+
  |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
  +-------+-------+---------------------+---------------------+
      1       1              4          |           4         |
                                       /                      |
  .-----------------------------------´                       |
 /                                                             \
|                                                               |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|E|   |         #         DUR12lo       |         DUR12hi       | 
|X|VER|  MASK   #-----------------------|-----------------------|                                               
|O|   |         # ADJ12lo |   INT12lo   | ADJ12hi |   INT12hi   |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
]]></artwork></figure>

        <t>The following additional fields are defined:
          <list style="hanging" hangIndent="4">
            <t hangText="VER - version (2 bits)"><vspace />
              Version 1 could indicate that the sender is capable of 
              adjusting the timestamp clock interval within the bounds of 
              the two 12 bit fields (see <xref target="_12bit"/>). A 
              receiver that only implements version 0 SHOULD NOT ignore 
              the timestamp capability negotiation entirely when 
              encountering an unsupported version, and SHOULD respond 
              with a version 0 response nevertheless (see below) - 
              thereby enabling enhanced uses of the timestamp value 
              and the modification of the receiver side timestamp 
              processing.
            </t>
            <t hangText="DUR12lo"> and
            </t>
            <t hangText="DUR12hi - Duration (12 bits each)"><vspace />
            	The sender provides a range of two timestamp clock
            	intervals in the initial &lt;SYN&gt; to ask the receiver
            	to preferably operate within this range.
            </t>
            <t hangText="ADJ12lo"> and
            </t>
            <t hangText="ADJ12hi - Adjustment factor (5 bits each)"><vspace />
            	The scale adjustment factor indicating the possible timestamp clock
            	ranges. All values between zero and 31 are allowed, with the only
            	limitation that ADJ12hi must be equal or greater than ADJ12lo. As the 
            	base value representation is shorter by 4 bits than the single interval
            	representation, the values always need to be left-shifted by 4.
            </t>
            <t hangText="INT12lo"> and
            </t>
            <t hangText="INT12hi - Base Interval (7 bits each)"><vspace />
            	The integer part of the timestamp clock interval before being
            	left-shifted. A value of zero would have a special 
            	meaning, and is not a valid number for range negotiation. 
            	The properly scaled intervals MUST be given in the correct
            	order (lower interval in DUR12lo and higher interval
            	in DUR12hi).
            </t>
          </list>
        </t>
      </section>
      <section title="Range Negotiation" anchor="_12bit">
        <t>Only the host initiating a TCP session MAY offer a timestamp clock 
        	interval, while the receiver SHOULD select a timestamp clock interval within 
        	these bounds. If the receiver cannot adjust its timestamp clock to 
        	match the range, it MAY use a timestamp clock rate outside these 
        	bounds. If the receiver indicated a timestamp clock interval within the 
        	indicated bounds, the sender MUST set its timestamp clock interval to 
        	the negotiated rate. If the receiver uses a timestamp clock interval
        	outside the indicated bounds, the sender MUST set the local 
        	timestamp clock interval to the value indicated by the closer boundary.
        </t>
        <t>The following example sequence is provided to demonstrate how 
        	timestamp clock range negotiation works.  Both sender and receiver 
        	finally know the clock interval of their respective partner.
        </t>
        <t>SYN, TSopt=&lt;X&gt;, TSecr=EXO|VER=1|MSK|DUR12lo=1ms|DUR12hi=100ms 
        </t>
        <t>SYN,ACK, TSopt=&lt;Y&gt;, TSecr=&lt;X&gt;^EXO|VER=0|MSK|DUR=10ms
        </t>
        <t>In this example, both hosts would run their respective timestamp 
        	clocks with one tick every 10 ms.
        </t>
        <t>SYN, TSopt=&lt;X&gt;, TSecr=EXO|VER=1|MSK|DUR12lo=1ms|DUR12hi=100ms 
        </t>
        <t>SYN,ACK, TSopt=&lt;Y&gt;, TSecr=&lt;X&gt;^EXO|VER=0|MSK|DUR=1000ms
        </t>
        <t>In this example, the sender would set the timestamp clock interval to 
        	100 ms (closer to the receiver's clock interval of 1 sec), 
        	while the receiver will have a timestamp clock interval running at 1 sec.
        </t>
        <t>SYN, TSopt=&lt;X&gt;, TSecr=EXO|VER=1|MSK|DUR12lo=1ms|DUR12hi=100ms 
        </t>
        <t>SYN,ACK, TSopt=&lt;Y&gt;, TSecr=&lt;X&gt;^EXO|VER=0|MSK|DUR=100us
        </t>
        <t>In this example, the sender would set the timestamp clock rate to 
        	one tick every 1 ms (closest to the receiver's clock interval of 100 us), 
        	while the receiver will have the timestamp clock running at 100 us per tick.
        </t>
        <t><vspace blankLines='100' /></t>
		
      </section>
    </section>
    
    <section title="Revision history">
      <t>00 ... initial draft, early submission to meet deadline.
      </t>
	    <t>01 ... refined draft, focusing only on those capabilities that 
	    	have an immediate use case. Also excluding flags that can be 
	    	substituted by other means (MIR - synergistic with SACK option 
	    	only, RNG moved to appendix A, BIA removed and the exponent bias 
	    	set to a fixed value). Also extended other paragraphs.
	    </t>
	    <t>02 ... updated document after IETF80 - referrals to "timestamp 
	    	options" were seen to be ambiguous with "timestamp option", and 
	    	therefore replaced by "timestamp capabilities". Also,
	  	  the document was reworked to better align with RFC4101. Removed
	  	  SGN and increased FRAC to allow higher precision.
	  	</t>
	  	<t>03 ... removed references to "opaque" and "transparent". 
	  		Substituted "timestamp clock interval" for all instances of 
	  		rate. Changed signal encoding to resemble a scale/value 
	  		approach like what is done with Window Scaling. As an added
	  		benefit, clock quality can be implicitly signaled, since
	  		multiple representations can map to identical time intervals.
	  		Added discussion around resilience against broken RFC1323 
	  		implementations (Win95, Linux 2.3.41+), which deviate from
	  		expected Timestamp signaling behavior.
	  	</t>
	    
	    <t><vspace blankLines='100' /></t>
		
	  </section> 
  </back>
</rfc>


PAFTECH AB 2003-20262026-04-23 13:21:39