<?xml version="1.0" encoding="US-ASCII"?>
<!DOCTYPE rfc SYSTEM "rfc2629.dtd">
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- Some of the more generally applicable PIs that most I-Ds might want to use -->
<!-- Try to enforce the ID-nits conventions and DTD validity -->
<?rfc strict="yes" ?>
<!-- Items used when reviewing the document -->
<?rfc comments="no" ?>
<!-- Controls display of <cref> elements -->
<?rfc inline="no" ?>
<!-- When no, put comments at end in comments section,
                                 otherwise, put inline -->
<?rfc editing="no" ?>
<!-- When yes, insert editing marks: editing marks consist of a 
                                 string such as <29> printed in the blank line at the 
                                 beginning of each paragraph of text. -->
<!-- Create Table of Contents (ToC) and set some options for it.  
         Note the ToC may be omitted for very short documents, but idnits insists on a ToC 
         if the document has more than 15 pages. -->
<?rfc toc="yes"?>
<?rfc tocompact="yes"?>
<!-- If "yes" eliminates blank lines before main section entries. -->
<?rfc tocdepth="3"?>
<!-- Sets the number of levels of sections/subsections... in ToC -->
<!-- Choose the options for the references. 
         Some like symbolic tags in the references (and citations) and others prefer 
         numbers. The RFC Editor always uses symbolic tags.
         The tags used are the anchor attributes of the references. -->
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes" ?>
<!-- If "yes", causes the references to be sorted in order of tags.
                                 This doesn't have any effect unless symrefs is "yes" also. -->
<!-- These two save paper: Just setting compact to "yes" makes savings by not starting each 
         main section on a new page but does not omit the blank lines between list items. 
         If subcompact is also "yes" the blank lines between list items are also omitted. -->
<?rfc compact="yes" ?>
<?rfc subcompact="no" ?>
<!-- end of list of popular I-D processing instructions -->
<!-- end of list of processing instructions -->
<rfc category="info" docName="draft-baker-openstack-ipv6-model-00"
     ipr="trust200902">
  <front>
    <title abbrev="">A Model for IPv6 Operation in OpenStack</title>
    <author fullname="Fred Baker" initials="F." surname="Baker">
      <organization>Cisco Systems</organization>
      <address>
        <postal>
          <street/>
          <city>Santa Barbara</city>
          <code>93117</code>
          <region>California</region>
          <country>USA</country>
        </postal>
        <email>fred@cisco.com</email>
      </address>
    </author>
    <author fullname="Chris Marino" initials="C." surname="Marino">
      <organization>Cisco Systems</organization>
      <address>
        <postal>
          <street/>
          <city>San Jose</city>
          <code>95134</code>
          <region>California</region>
          <country>USA</country>
        </postal>
        <email>chrmarin@cisco.com</email>
      </address>
    </author>
    <author fullname="Ian Wells" initials="I." surname="Wells">
      <organization>Cisco Systems</organization>
      <address>
        <postal>
          <street/>
          <city>San Jose</city>
          <code>95134</code>
          <region>California</region>
          <country>USA</country>
        </postal>
        <email>iawells@cisco.com</email>
      </address>
    </author>
    <date/>
    <area/>
    <workgroup/>
    <abstract>
      <t>This is an overview of a network model for OpenStack, designed to
      dramatically simplify scalable network deployment and operations.</t>
    </abstract>
    <!--		
		<note title="Foreword">
		</note>
		-->
    <!--
      <texttable anchor="table_example" title="A Very Simple Table">
      <preamble>Tables use ttcol to define column headers and widths.
      Every cell then has a "c" element for its content.</preamble>
          <ttcol align="center">ttcol #1</ttcol>
                                    <ttcol align="center">ttcol #2</ttcol>
                      <c>c #1</c>		<c>c #2</c>
                      <c>c #3</c>		<c>c #4</c>
                      <c>c #5</c>		<c>c #6</c>
      <postamble>which is a very simple example.</postamble>
      </texttable>
    -->
  </front>
  <middle>
    <!--		
      <t>There are multiple list styles: "symbols", "letters", "numbers", "hanging", "format", etc.</t>
      <t>
	<list style="symbols">
	    <t>First bullet</t>
	    <t>Second bullet</t>
	</list>
     </t>
    -->
    <!--
	<figure anchor="reference" title="Figure">
	<artwork align="center">
	<![CDATA[
		ASCII artwork goes here... 
	]]>
	</artwork>
	</figure>
    -->
    <section anchor="introduction" title="Introduction">
      <t>OpenStack, and its issues.</t>
      <section anchor="projects" title="What is OpenStack?">
        <t>OpenStack is a cloud computing orchestration solution developed
        using an open source community process. It consists of a collection of
        'projects', each implementing the creation, control and administration
        of tenant resources. There are separate OpenStack projects for
        managing compute, storage and network resources.</t>
        <t>Neutron is the project that manages OpenStack networking. It
        exposes a northbound API to the other OpenStack projects for
        programmatic control over tenant network connectivity. The southbound
        interface is implemented as one or more device driver plugins that are
        built to interact with specific devices in the network. This approach
        provides the flexibility to deploy OpenStack networking using a range
        of alternative techniques.</t>
        <t>An OpenStack tenant is required to create what OpenStack identifies
        as a 'Network' connecting their virtual machines. This Network is
        instantiated via the plugins as either a layer 2 network, a layer 3
        network, or as an overlay network. The actual implementation is
        unknown to the tenant. The technology used to provide these networks
        is selected by the OpenStack operator based upon the requirements of
        the cloud deployment.</t>
        <t>The tenant also is required to specify a 'Subnet' for each Network.
        This specification is made by providing a CIDR prefix for IPv4 address
        allocation via DHCP and for IPv6 address allocation via DHCP or SLAAC.
        This address range may be from within the address range of the
        datacenter (non-overlapping), or overlapping RFC 1918 addresses.
        Tenants may create multiple Networks, each with its own Subnet.</t>
        <t>An OpenStack Subnet is a logical layer 2 network and requires layer
        3 routing for packets to exit the Subnet. This is achieved by
        attaching the Subnet to a Neutron Router. The Neutron router
        implements Network Address Translation for external traffic from
        tenant networks as well for providing connectivity to tenant networks
        from the outside. Using Linux utilities, OpenStack can support
        overlapping RFC 1918 addresses between tenants.</t>
        <t>OpenStack Subnets are typically implemented as VLANs in a
        datacenter. When tenant scalability requirements grow large, an overlay
        approach is typically used. Because of the difficulties in scaling and
        administering large layer 2 and/or overlay networks, some OpenStack
        integrations chose not to provide isolated Subnets and simply offer
        tenants a layer 3 based network alternative.</t>
        <t>OpenStack uses Layer 3 and Layer 2 Linux utilities on hosts to
        provide protection against IP/MAC spoofing and ARP poisoning.</t>
      </section>
      <section anchor="openstack-issues" title="OpenStack Scaling Issues">
        <t>One of the fundamental requirements of OpenStack Networking
        (Neutron) is to provide scalable, isolated tenant networks. Today this
        is achieved via L2 segmentation using either a) standard 802.1Q VLANs
        or b) an overlay approach based on one of several L2 over L3
        encapsulation techniques available today such as 802.1ad, VXLAN, STT
        or NVGRE.</t>
        <t>However, these approaches still struggle to provide scalable,
        transparent, manageable, high performing isolated tenant networks.
        VLANs don't scale beyond 4096 (2^12) networks and have complex
        trunking requirements when tenants span host and racks. IEEE 802.1ad
        (QinQ) partially solves that, but adds another limit - at most 2^12
        tenants, each of which may have 2^12 VLANs. IP Encapsulation
        introduces additional complexity on host computers running hypervisors
        as well as impacting the performance of tenant applications running on
        virtual machines. Overlay based isolation techniques may also impair
        traditional network monitoring and performance management tools.
        Moreover, when these isolated (L2) networks require external access to
        other networks or the public Internet, they require even more complex
        solutions to accommodate overlapping IP prefixes and network address
        translation (NAT).</t>
        <t>As more capabilities are built on to these layer 2 based
        ‘virtual’ networks, complexity continues to grow.</t>
        <t>This draft presents a new Layer 3 based approach to OpenStack
        networking using IPv6 that can be deployed natively on IPv6 networks.
        It will be shown that this approach can provide tenant isolation
        without the limitations of existing L2 based alternatives, as well as
        deliver high performance networks transparently using a simplified
        tenant network connectivity model without the overhead of
        encapsulation or managing overlapping IP addresses and address
        translations. We note that some large content providers, notably
        Google and <xref target="FaceBook-IPv6">Facebook</xref>, are going in
        exactly this direction.</t>
      </section>
    </section>
    <section anchor="require" title="Requirements">
      <t>In this section, we attempt to list critical requirements.</t>
      <section anchor="approach" title="Design approach">
        <t>As a design approach, we presume an IPv6-only data center in a
        world that might have IPv4 clients outside of it. This design
        explicitly does not depend on VLANs, QinQ, VXLAN, MPLS, Segment
        Routing, IP/IP or GRE tunnels, or anything else. Data center operators
        remain free to use any of those tools, but they are not required. If
        we can do everything required for OpenStack networking with IPv6
        alone, these other networking technologies may be used as
        optimizations. If we are unable to satisfy the OpenStack requirements
        that also is something we wish to know and understand.</t>
        <t>OpenStack is designed to be used by many cloud users or tenants.
        Scalable, secure and isolated tenant networks are a requirement for
        building a multi-tenant cloud datacenter. The OpenStack
        administrator/operator can design and configure a cloud environment to
        provide network isolation using the approach described in this
        document, alone, or in combination with any of the above network
        technologies. However, all the details of the underlying technology
        and implementation details are completely transparent to the tenant
        itself.</t>
      </section>
      <section title="Requirements Language">
        <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
        "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
        document are to be interpreted as described in <xref
        target="RFC2119"/>.</t>
      </section>
      <section anchor="multicenter" title="Multiple Data Centers">
        <t>A common requirement in network and data center operations is
        reliability, serviceability, and maintainability of their operations
        in the presence of an outage. At minimum, this implies multihoming in
        the sense of having multiple upstream ISPs; in many cases, it also
        implies multiple and at times duplicate data centers, and tenants
        stretched or able to be readily moved or recreated across multiple
        data centers.</t>
      </section>
      <section anchor="large" title="Large Data Centers">
        <t><xref target="Microsoft-Azure">Microsoft Azure</xref> has purchased
        a 100 acre piece of land for the construction of a single data center.
        In terms of physical space, that is enough for a data center with
        about half a million 19" RETMA racks.</t>
        <t>With even modest virtual machine density, infrastructure at this
        scale easily exhausts the 16M available RFC 1918 private addresses
        (i.e. 10.0.0.0/8) and explains the recent efforts by webscale cloud
        providers to deploy IPv6 throughout their new datacenters.</t>
      </section>
      <section anchor="multitenant" title="Multi-tenancy">
        <t>While it is possible that a single tenant would require a 100 acre
        data center, it would be unusual. In most such data centers, one would
        expect a large number of tenants.</t>
      </section>
      <section anchor="isolation" title="Isolation">
        <t>Isolation is required between tenants, and at times between
        components of a single tenant.</t>
        <section anchor="interisolation" title="Inter-tenant isolation">
          <t>A "tenant" is defined as a set of resources under common
          administrative control. It may be appropriate for tenants to
          communicate with each other within the context of an application or
          relationships among their owners or operators. However, unless
          specified otherwise, tenants are intended to operate as if they were
          on their own company's premises and be isolated from one
          another.</t>
        </section>
        <section anchor="intraisolation" title="Intra-tenant isolation">
          <t>There are often security compartments within a corporate network,
          just as there are security barriers between companies. As a result,
          there is a recursive isolation requirement: it must be possible to
          isolate an identified part of a tenant from another part of the same
          tenant.</t>
        </section>
      </section>
      <section anchor="requirement5" title="Operational Simplicity">
        <t>To the extent possible (and, for operators, the concept will bring
        a smile), operation of a multi-location multi-tenant data center, and
        the design of an application that runs in one, should be simple and
        uncoupled.</t>
        <t>As discussed in <xref target="RFC3439"/>, this requires that the
        operational model required to support a tenant with only two physical
        machines, or virtual machines in the same physical chassis, should be
        the same as that required to support a tenant running a million
        machines in a multiple data center application. Additionally, this
        same operational model should scale from running a single tenant up to
        many thousands of tenants.</t>
      </section>
      <section anchor="requirement8" title="Address space ">
        <t>As described in <xref target="projects"/>, currently, an OpenStack
        tenant is required to specify a Subnet's CIDR prefix for IP address
        allocation. With this proposal, this is no longer required.</t>
      </section>
      <section anchor="requirement9" title="Data Center Federation">
        <t>It must be possible to extend the architecture across multiple data
        centers. These data centers may be operated by distinct entities, with
        security policies that apply to their interconnection.</t>
      </section>
    </section>
    <section anchor="models" title="Models">
      <t/>
      <section anchor="configuration" title="Configuration Model">
        <t>In the OpenStack model, the cloud computing user, or tenant, is
        building something Edward Yourdon might call a "structured design" for
        the application they are building. In the 1960's, when Yourdon started
        specifying process and data flow diagrams, these were job steps in a
        deck of Job Control Language cards; in OpenStack, they are multiple,
        individual machines, virtual or physical, running parts of a
        structured application.</t>
        <t>In these, one might find a load balancer that receives and
        distributes requests to request processors, a set of stored data
        processing applications, and the storage they depend on. What is
        important to the OpenStack tenant is that "this" and "that"
        communicate, potentially using or not using multicast communications,
        and don't communicate with "the other". Typically unnecessary is any
        and all information regarding how this communication actually needs to
        occur (i.e. placement of routers, switches, and IP subnets, prefixes,
        etc.).</t>
        <t>An IPv6 based networking model simplifies the configuration of
        tenant connectivity requirements. Global reachability eliminates the
        need for network address translation devices as well as
        tenant-specified Subnet prefixes (<xref target="requirement8"/>),
        although tenant-specified ULA prefixes or prefixes from the owner of
        the tenant's address space are usable with it. With the exception of
        network security functions, no network devices need to be specified or
        configured to provide connectivity.</t>
      </section>
      <section anchor="addressing" title="Data Center Model">
        <t>The premises of the routing and addressing models are that <list
            style="symbols">
            <t>The address tells the routing system what topological location
            to deliver a packet to, and within that, what interface to deliver
            it to, and</t>
            <t>The routing system should deliver traffic to a resource if and
            only if the sender is authorized to communicate with that
            resource.</t>
            <t>Contrary to the OpenStack Neutron Networking Model, tunnels are
            not necessary to provide tenant network isolation; we include
            resources in a tenant network by a Role-based Access Control
            model, but address the tenant resources within the data center in
            a manner that scales for the data center.</t>
          </list></t>
        <t>We expect to find the data center to be composed of some minimal
        unit of connectivity and maintenance, such as a rack or row, and
        equipped with one or more Top-of-Rack or End-of-Row switch(es); each
        configured with at least one subnet prefix, perhaps one per such
        switch. For the purposes of this note, these will be called Racks and
        Top-of-Rack switches, and when applied to other architectures the
        appropriate translation needs to be imposed.</t>
        <t><xref target="zynga"/> describes a relatively typical rack design.
        It is a simple fat-tree architecture, with every device in a pair, so
        that any failure has an immediate hot backup. There are other common
        designs, such as those that consider each rack to be in a "row" and in
        a "column", with a distribution switch in each.</t>
        <figure anchor="zynga" title="Typical Rack Design">
          <artwork align="center"><![CDATA[
       Distribution   Switches connecting
       / Layer /      racks in a pod, and
      /       /       connecting pods
     /       /
  +-+-+   +-+-+       Mutual backup TOR
+-+TOR+---+TOR+-+     switches
| +---+   +---+ |
| +-----------+ |
+-+    host   +-+     Each host has two
| +-----------+ |     Ethernet interfaces
+-+    host   +-+     with separate subnets
| +-----------+ |
| .           . |
| .           . |
| .           . |     Design premise: complete
| +-----------+ |     redundancy, with every
+-+    host   +-+     switch and every cable
| +-----------+ |     backed up by a doppelganger
+-+    host   +-+
  +-----------+
]]></artwork>
        </figure>
        <section anchor="tenant" title="Tenant Address Model">
          <t>Tenant resources need to be told, by configuration or naming, the
          addresses of resources they communicate with. This is true
          regardless of their location or relationship to a given tenant. In
          environments with well-known addresses, this gets complex and
          unscalable. This was learned very early with Internet hostnames; a
          single "hostfile" was maintained by a central entity and updated
          daily, which quickly became unmanageable. The result was the
          development of the Domain Name System; the level of indirection
          between names and addresses made the system more scalable. It also
          facilitated ongoing maintenance. If a service needed multiple
          servers, or a server needed to change its address, that was
          trivially solved by changing the DNS Resource Record; every resource
          that needed the new address would obtain it the next time it queried
          the DNS. It has also facilitated the IPv4/IPv6 transition; a
          resource that has an IPv6 address is given a AAAA record in addition
          to, or to replace, its IPv4 A record.</t>
          <t>Similarly, today's reliance on NAPT technology frequently limits
          the capabilities of an application. It works reasonably well for
          clients accessing a client/server application when the protocol does
          not carry addressing information. If there is an expectation that
          one resource's private address will be meaningful to a peer, such as
          when a SIP client presents its address in SDP or an HTTP server
          presents an address in a redirection, either the resource needs to
          understand the difference between an "inside" and an "outside"
          address and know which is which, or it needs a traversal algorithm
          that changes the addresses. For peer-to-peer applications, this
          ultimately means providing a network design in which those issues
          don't apply.</t>
          <t>IPv6 provides global addresses, enough of them that there is no
          real expectation of running out any time soon, making these issues
          go away. In addition, with the IPv4 address space running out, both
          globally and within today's large datacenters, there aren't
          necessarily addresses available for an IPv4 application to use, even
          as a floating IP address.</t>
          <t>Hence, the model we propose is that a resource in a tenant is
          told the addresses of the other resources with which it
          communicates. They are IPv6 addresses, and the data center takes
          care to ensure that inappropriate communications do not take
          place.</t>
          <section anchor="gua-tenant"
                   title="Use of Global Unicast Addresses by Tenants">
            <t>A unicast address in an IP network identifies a topological
            location, by association with an IP prefix (which might be for a
            subnet or any aggregate of subnets). It also identifies a single
            interface located within that subnet, which may or may not be
            instantiated at the time. We assume that there is a subnet
            associated with a top-of-rack switch or whatever its counterpart
            would be in a given network design, and that the physical and
            virtual machines located in that rack have addresses in that
            subnet. This is the same prefix that is used by the datacenter
            administrator.</t>
          </section>
          <section anchor="ula" title="Unique Local Addresses">
            <t>A common requirement is that tenants have the use of some form
            of private address space. In an IPv6 network, a <xref
            target="RFC4193">Unique Local IPv6 Unicast Address</xref> may be
            used to accomplish this. In this case, however, the addresses will
            need to be explicitly assigned to physical or virtual machines
            used by the tenant, perhaps using DHCP or YANG, where a standard
            IPv6 address could be allocated using SLAAC or other
            technologies.</t>
            <t>The value of this is that Are we suggesting 2.8 address model
            is for GUA and that ULAs are a corner case in the data center.
            Tenants have no routing information or other awareness of the
            prefix. This is not intended for use behind a NAPT; resources that
            need accessibility to or from resources outside the tenant, and
            especially outside the data center, need global addresses.</t>
          </section>
          <section anchor="multicast" title="Multicast Domains">
            <t>Multicast capability is a capability enjoyed by some groups of
            resources, that they can send a single message and have it
            delivered to multiple destinations roughly simultaneously. At the
            link layer, this means sending a message once that is received by
            a specified set of recipient resources using hardware
            capabilities. IP multicast can be implemented on a LAN as
            specified in <xref target="RFC4291"/>, and can also cross multiple
            subnets directly, using routing protocols such as <xref
            target="RFC4601">Protocol Independent Multicast</xref> <xref
            target="RFC4602"/> <xref target="RFC4604"/> <xref
            target="RFC4605"/> <xref target="RFC4607"/>. In IPv6, the model
            would be that when a group of resources is created with a
            multicast capability, it is allocated one or more source-specific
            transient group addresses as defined in section 2.7 of <xref
            target="RFC4291"/>.</t>
          </section>
          <section anchor="ipv4" title="IPv4 Interaction Model">
            <t>OpenStack IPv4 Neutron uses "floating IPv4
            addresses" - global or public IPv4 addresses - to
            enable remote resources to connect to tenant private network
            endpoints. Tenant end points can connect out to remote resources
            through an "External Default Gateway". Both of these
            depend on NAPT (DNAT/SNAT) to ensure that IPv4 end points are able
            to communicate and at the same time ensure tenant isolation.</t>
            <t>If IPv6 is deployed in a data center, there are fundamentally
            two ways a tenant can interact with IPv4 peers: <list
                style="symbols">
                <t>it can run existing IPv4 OpenStack technology in parallel
                with the IPv6 deployment, or</t>
                <t>It can have a translator at the data center edge (such as
                described in <xref target="I-D.anderson-v6ops-siit-dc"/>) that
                associates an IPv4 address or address plus port with an IPv6
                address or address plus port. The IPv4 address, in this model,
                becomes a floating IPv4 address attached to an internal IPv6
                address. The "data center edge" is, by definition, a system
                that has IPv4 reachability to at least the data center's
                upstream ISP and all IPv4 systems in the data center, IPv6
                connectivity to all of the IPv6 systems in the data center,
                and (if the upstream offers IPv6 service) IPv6 connectivity to
                the upstream as well.</t>
              </list></t>
            <t>The first model is complex, if for no other reason than that
            there are two fundamental models in use, one with various
            encapsulations hiding overlapping address space and one with
            non-overlapping address space.</t>
            <t>To simplify the network, as noted in <xref target="approach"/>,
            we suggest that the data center be internally IPv6-only, and IPv4
            be translated to IPv6 at the data center edge. The advantage is
            that it enables IPv4 access while that remains in use, and as IPv6
            takes over, it reduces the impact of vestigial support for
            IPv4.</t>
            <t>The SIIT Translation model in <xref
            target="I-D.anderson-v6ops-siit-dc"/> has IPv4 traffic come to an
            <xref target="RFC6145">translator</xref><xref target="RFC6146"/>
            having a pre-configured translation, resulting in an IPv6 packet
            indistinguishable from the packet the remote resource might have
            sent had it been IPv6-capable, with one exception. The IPv6
            destination address is that of the endpoint (the same address
            advertised in a AAAA record), but the source address is an <xref
            target="RFC6052">IPv4-Embedded IPv6 Address</xref> with the IPv4
            address of the sender embedded in a prefix used by the
            translator.</t>
            <t>Access to external IPv4 resources is provided in the same way:
            a <xref target="RFC6147">DNS64</xref> server is implemented that
            contains AAAA records with an <xref target="RFC6052">IPv4-Embedded
            IPv6 Address</xref> with the IPv4 address of the remote resource
            embedded in a prefix used by the translator.</t>
            <t>This follows the <xref target="RFC6144">Framework for IPv4/IPv6
            Translation</xref>, making the internal IPv4 address a floating IP
            address attached to an internal IPv6 address, and the external
            "dial-out" address indistinguishable from a native IPv6
            address.</t>
          </section>
          <section title="Legacy IPv4 OpenStack">
            <t>The other possible model, applicable to IPv4-only devices, is to
            run a legacy OpenStack environment inside IPv6 tunnels. This
            preserves the data center IPv6-only, and enables IPv4-only
            applications, notably those whose licenses tie them to IPv4
            addresses, to run. However, it adds significant overhead in terms
            of encapsulation size and network management complexity.</t>
          </section>
        </section>
        <section anchor="dc"
                 title="Use of Global Addresses by the Data Center">
          <t>Every rack and physical host requires an IP prefix that is
          reachable by the OpenStack operator. This will normally be a global
          IPv6 unicast address. For scalability purposes, as isolation is
          handled separately, this is normally the same prefix as is used by
          tenants in the rack.</t>
        </section>
      </section>
      <section anchor="security-isolation"
               title="Inter-tenant security services">
        <t>In this model, a label is used to identify a project/tenant or
        part of a project/tenant, and is used to facilitate access control
        based on the label value. In <xref target="introduction"/>, we noted
        the limitation of 802.1ad QinQ, in that with Metro Ethernet networks
        it assigns a VLAN ID to a customer and a second VLAN id to a VLAN used
        by that customer, and is in that sense limited to 2^12 customers.
        Alternatively, it could be considered to be 2^12 geographies, with
        2^12 tenant VLANs in each, meaning that the data center operator has
        to think about placement of tenant VMs in places their VLANs reach.
        Labels manage that space with different limits.</t>
        <section anchor="label" title="Label Definition">
          <t>Three different types of labels are in view here: the IPv6 Flow
          Label, a federated identity in the IPv6 Destination Header, or a
          federated identity in the IPv6 Hop-by-Hop Header. These have
          different capabilities and implications.</t>
          <section anchor="ipv6label" title="Flow Label">
            <t>The IPv6 flow label may be used to identify a tenant or part of
            a tenant, and to facilitate access control based on the flow label
            value. The flow label is a flat 20 bits, facilitating the
            designation of 2^20 (1,048,576) tenants without regard to their
            location. 1,048,576 is less than infinity, but compared to current
            data centers is large, and much simpler to manage.</t>
            <t>Note that this usage differs from the current <xref
            target="RFC6437">IPv6 Flow Label Specification</xref>. It also
            differs from the use of a flow label recommended by the <xref
            target="RFC2460">IPv6 Specification</xref>, and the respective
            usages of the flow label in the <xref target="RFC2205">Resource
            ReSerVation Protocol</xref> and the previous <xref
            target="RFC3697">IPv6 Flow Label Specification</xref>, and the
            projected usage in <xref target="RFC5548">Low-Power and Lossy
            Networks</xref><xref target="RFC5673"/>. Within a target domain,
            the usage may be specified by the domain. That is the viewpoint
            taken in this specification.</t>
          </section>
          <section anchor="optlabel" title="Federated Identity">
            <t>Alternatively, <xref
            target="I-D.baker-openstack-rbac-federated-identity"/> defines a
            numeric label usable in network layer Role-based Access Control.
            The syntax is a sequence of positive integers; the semantics of
            the integers are defined by the administration(s) using them. A
            single integer may be used in the same way as the Flow Label in
            <xref target="ipv6label"/>, but without the 20 bit limitation. A
            pair of integers might, for example, signify a data center and its
            tenants, or a company and its departments. Three integers might,
            again as an example, signify one data center operator among a set
            of data center operators, the second the clients of those
            operators, and the third subsets of those clients. In any event,
            as in <xref target="ipv6label"/>, it identifies a set of machines,
            physical or virtual, that are authorized to communicate freely
            among themselves, but may or may not be authorized to communicate
            with other equipment.</t>
            <t>Carried in the Destination Options Extension Header, this
            option is visible to Neutron in the originating and terminating
            chassis. This limits overheads in intermediate switches, and
            enables filtering in the destination system.</t>
            <t>Carried in the Hop-by-Hop Extension Header, this option is
            visible to routers en route in addition to Neutron in the
            originating and terminating chassis, at the possible cost of some
            processing overhead. This facilitates filtering at any system.</t>
          </section>
        </section>
        <section anchor="ipv6-isolation"
                 title="IPv6 Tenant Isolation using the Label">
          <t>Neutron today already implements a form of <xref
          target="RFC2827">Network Ingress Filtering</xref>. It prevents the
          VM from emitting traffic with an unauthorized MAC, IPv4, or IPv6
          source address.</t>
          <t>In addition to this, in this model Neutron prevents the VM from
          transmitting a network packet with an unauthorized label value. The
          VM MAY be configured with and authorized to use one of a short list
          of authorized label values, as opposed to simply having its choice
          overridden; in that case, Neutron verifies the value and overwrites
          one not in the list.</t>
          <t>When a hypervisor is about to deliver an IPv6 packet to a VM, it
          checks the label value against a list of values that the VM is
          permitted to receive. If it contains an unauthorized value, the
          hypervisor discards the packet rather than deliver it. If the Flow
          Label is in use, Neutron zeros the label prior to delivery.</t>
          <t>The intention is to hide the label value from malware potentially
          found in the VM, and enable the label to be used as a form of first
          and last hop security. This provides basic tenant isolation, if the
          label is assigned as a tenant identifier, and may be used more
          creatively such as to identify a network management application as
          separate from a managed resource.</t>
        </section>
        <section anchor="routing" title="Isolation in Routing">
          <t>This concept has the weakness that if a packet is not dropped at
          its source, it is dropped at its destination. It would be preferable
          for the packet to be dropped in flight, such as at the top-of-rack
          switch or an aggregation router.</t>
          <t>Concepts discussed in <xref
          target="I-D.baker-ipv6-isis-dst-flowlabel-routing">IS-IS LSP
          Extendibility</xref><xref target="RFC5120"/><xref target="RFC5308"/>
          and <xref target="I-D.baker-ipv6-ospf-dst-flowlabel-routing">OSPFv3
          LSA Extendibility</xref> <xref
          target="I-D.ietf-ospf-ospfv3-lsa-extend"/><xref target="RFC5340"/>
          may be used to isolate tenants in the routing of the data center
          backbone. This is not strictly necessary, if <xref
          target="ipv6-isolation"/> is uniformly and correctly implemented. It
          does, however, present a second defense against misconfiguration, as
          the filter becomes ubiquitous in the data center and as scalable as
          routing.</t>
        </section>
      </section>
      <section anchor="bcp38" title="BCP 38 Ingress Filtering">
        <t>As noted in <xref target="ipv6-isolation"/>, Neutron today
        implements a form of <xref target="RFC2827">Network Ingress
        Filtering</xref>. It prevents the VM from emitting traffic with an
        unauthorized MAC, IPv4, or IPv6 source address.</t>
        <t>In IPv6, this is readily handled when the address or addresses used
        by a VM are selected by the OpenStack operator. It may then configure
        a per-VM filter with the addresses it has chosen, following logic
        similar to the <xref target="I-D.ietf-savi-dhcp">Source Address
        Validation Solution for DHCP</xref> or <xref
        target="RFC7219">SEND</xref>. This is also true of <xref
        target="RFC4862">IPv6 Stateless Address Autoconfiguration
        (SLAAC)</xref> when the MAC address is known and not shared.</t>
        <t>However, when SLAAC is in use and either the MAC address is unknown
        or SLAAC's <xref target="RFC4941">Privacy Extensions </xref><xref
        target="RFC7217"/>, are in use, Neutron will need to implement the
        provisions of <xref target="RFC6620">FCFS SAVI: First-Come,
        First-Served Source Address Validation</xref> in order to learn the
        addresses that a VM is using and include them in the per-VM
        filter.</t>
      </section>
      <section title="Moving virtual machines">
        <t>This design supports these kinds of required layer 2 networks with
        the additional use of a layer 2 over layer 3 encapsulation and
        tunneling protocol, such as <xref
        target="I-D.mahalingam-dutt-dcops-vxlan">VXLAN</xref>. The important
        point here being that these overlays are used to address specific
        tenant network requirements and NOT deployed to remove the scalability
        limitations of OpenStack networking.</t>
        <t>There are at least three ways VM movement can be accomplished:
        <list style="symbols">
            <t>Recreation of the VM</t>
            <t>VLAN Modification</t>
            <t>Live Migration of a Running Virtual Machine</t>
          </list></t>
        <section anchor="motion-new-vm" title="Recreation of the VM">
          <t>The simplest and most reliable is to <list style="numbers">
              <t>Create a new VM in the new location,</t>
              <t>Add its address to the DNS Resource Record for the name,
              allowing new references to the name to send transactions
              there,</t>
              <t>Remove the old address from the DNS Resource Record
              (including the SIIT translation, if one exists), ending the use
              of the old VM for new transactions,</t>
              <t>Wait for the period of the DNS Resource Record's lifetime
              (including the SIIT translation, if one exists), as it will get
              new requests throughout that interval,</t>
              <t>Wait for the old VM to finish any outstanding
              transactions, and then</t>
              <t>Kill the old VM.</t>
            </list></t>
          <t>This is obviously not movement of an existing VM, but
          preservation of the same number and function of VMs by creation of a
          new VM and killing the old.</t>
        </section>
        <section anchor="vmotion"
                 title="Live Migration of a Running Virtual Machine">
          <t>At
          http://blogs.vmware.com/vsphere/2011/02/vmotion-whats-going-on-under-the-covers.html,
          VMWare describes its capability, called vMotion, in the following
          terms: <list style="numbers">
              <t>Shadow VM created on the destination host.</t>
              <t>Copy each memory page from the source to the destination via
              the vMotion network. This is known as preCopy.</t>
              <t>Perform another pass over the VM's memory, copying any pages
              that changed during the last preCopy iteration.</t>
              <t>Continue this iterative memory copying until no changed pages
              (outstanding to be-copied pages) remain or 100 seconds
              elapse.</t>
              <t>Stun the VM on the source and resume it on the
              destination.</t>
            </list></t>
          <t>In a native-address environment, we add three steps:<list
              style="numbers">
              <t>Shadow VM created on the destination host.</t>
              <t>Copy each memory page from the source to the destination via
              the vMotion network. This is known as preCopy.</t>
              <t>Perform another pass over the VM's memory, copying any pages
              that changed during the last preCopy iteration.</t>
              <t>Continue this iterative memory copying until no changed pages
              (outstanding to be-copied pages) remain or 100 seconds
              elapse.</t>
              <t>Stitch routing for the old address.</t>
              <t>Stun the VM on the source and resume it on the
              destination.</t>
              <t>Renumber the VM as instructed in <xref
              target="RFC4192"/>.</t>
              <t>Unstitch routing for the old address.</t>
            </list></t>
          <t>If the VM is moved within the same subnet (which usually implies
          the same rack), there is no stitching or renumbering apart from
          ensuring that the MAC address moves with the VM. When the VM moves
          to a different subnet, however, we need to restitch routing, at
          least temporarily. This obviously calls for some definitions. <list
              style="hanging">
              <t hangText="Stitching Routing:">The VM is potentially in
              communication with two sets of peers: VMs in the same subnet,
              and VMs in different subnets. <list style="symbols">
                  <t>The router in the new subnet is instructed to advertise a
                  host route (/128) to the moved VM, and to install a static
                  route to the old address with the VM's address in the new
                  subnet as its next hop address. Traffic from VMs from other
                  subnets will now follow the host route to the VM in its new
                  location.</t>
                  <t>The router in the old subnet is instructed to direct LAN
                  traffic to the VM's MAC Address to its IPv6 forwarding
                  logic. Traffic from other VMs in the old subnet will now
                  follow the host route to the moved VM.</t>
                </list></t>
              <t hangText="Renumbering:">This step is optional, but is good
              hygiene if the VM will be there a while. If the VM will reside
              in its new location only temporarily, it can be skipped. <vspace
              blankLines="1"/> Note that every IPv6 address, unlike an IPv4
              address, has a lifetime. At least in theory, when the lifetime
              expires, neighbor relationships with the address must be
              extended or the address removed from the system. The <xref
              target="RFC4861">Neighbor Discovery</xref> process in the subnet
              router will periodically emit a Router Advertisement; the VM
              will gain an IPv6 address in the new subnet at that time if not
              earlier. As described in <xref target="RFC4192"/>, DNS should be
              changed to report the new address instead of the old. The DNS
              lifetime and any ambient sessions using the old address are now
              allowed to expire. At this point, any new sessions will be
              using the new address, and the old is vestigial. <vspace
              blankLines="1"/> Waiting for sessions using the address to
              expire can take an arbitrarily long interval, because the
              session generally has no knowledge of the lifetime of the IPv6
              address.</t>
              <t hangText="Unstitching Routing:">This is the reverse process
              of stitching. If the VM is renumbered, when the old address
              becomes vestigial, the address will be discarded by the VM; if
              the VM is subsequently taken out of service, it has the same
              effect. At that point, the host route is withdrawn, and the MAC
              address in the old subnet router's tables is removed.</t>
            </list></t>
        </section>
      </section>
    </section>
    <section anchor="implications" title="OpenStack implications">
      <section anchor="config-implications" title="Configuration implications">
        <t><list style="numbers">
            <t>Neutron MUST be configured with a pre-determined default label
            value for each tenant virtual network <xref
            target="ipv6-isolation"/>.</t>
            <t>Neutron MAY be configured with a set of authorized label values
            for each tenant virtual network <xref
            target="ipv6-isolation"/>.</t>
            <t>A virtual tenant network MAY be configured with a set of
            authorized label values <xref target="ipv6-isolation"/>.</t>
            <t>Neutron MUST be configured with one or more label values per
            virtual tenant network that the network is permitted to receive
            <xref target="ipv6-isolation"/>.</t>
          </list></t>
      </section>
      <section anchor="vSwitch-implications" title="vSwitch implications">
        <t>On messages transmitted by a virtual machine <list style="hanging">
            <t hangText="Label Correctness:">As described in <xref
            target="ipv6label"/> or <xref target="optlabel"/>, ensure that the
            label in the packet is one that the VM is authorized to use.
            Depending on configuration, it may be in the IPv6 Flow Label, an
            option in the Hop-by-Hop header, or an option in the Destination
            Header. Again depending on configuration, the vSwitch may
            overwrite whatever value is there, or may ratify that the value
            there is as specified in a VM-specific list.</t>
            <t hangText="Source Address Validation:">As described in <xref
            target="bcp38"/>, force the source address to be among those the
            VM is authorized to use. The VM may simultaneously be authorized
            to use several addresses. </t>
            <t hangText="Destination Address Validation:">OpenStack for IPv4
            permits a NAT translation, called a "floating IP address", to
            enable a VM to communicate outside the domain; without that, it
            cannot. For IPv6, the destination address should be permitted by
            some access list, which may permit all addresses, or addresses
            matching one or more CIDR prefixes such as permitted multicast
            addresses, and the prefix of the data center.</t>
          </list></t>
        <t>On messages received for delivery to a virtual machine <list
            style="hanging">
            <t hangText="Label Authorization:">As described in <xref
            target="ipv6-isolation"/>, the vSwitch only delivers a packet to a
            VM if the VM is authorized to receive it. The VM may have been
            authorized to receive several such labels.</t>
          </list></t>
      </section>
    </section>
    <section anchor="IANA" title="IANA Considerations">
      <t> This document does not ask IANA to do anything.  </t>
      </section>
    <section anchor="Security" title="Security Considerations">
      <t>In <xref target="isolation"/> and <xref
      target="security-isolation"/>, this specification considers inter-tenant
      and intra-tenant network isolation. It is intended to contribute to the
      security of a network, much like encapsulation in a maze of tunnels or
      VLANs might, but without the complexities and overhead of the management
      of such resources. This does not replace the use of IPsec, SSH, or TLS
      encryption or the use of authentication technologies; if these would be
      appropriate in an on-premises corporate data center, they remain
      appropriate in a multi-tenant data center regardless of the isolation
      technology. However, one can think of this as a simple inter-tenant
      firewall based on the concepts of role-based access control; if it can
      be readily determined that a sender is not authorized to communicate
      with a receiver, such a transmission is prevented.</t>
    </section>
    <section anchor="Privacy" title="Privacy Considerations">
      <t>This specification places no personally identifying information in an
      unencrypted part of a packet, unless a person is explicitly represented
      by an IPv6 address or label value, which is beyond its scope. That said,
      if the RBAC Identifier in <xref
      target="I-D.baker-openstack-rbac-federated-identity"/> is used, the
      security and privacy considerations of that document are relevant
      here.</t>
    </section>
    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>This document grew out of a discussion among the authors and
      contributors.</t>
    </section>
    <section anchor="Contributors" title="Contributors">
      <figure anchor="a">
        <artwork align="left"><![CDATA[
   Puneet Konghot
   Cisco Systems
   San Jose, California  95134
   USA
   Email: pkonghot@cisco.com

   Rohit Agarwalla
   Cisco Systems
   San Jose, California  95134
   USA
   Email: roagarwa@cisco.com

   Shannon McFarland
   Cisco Systems
   Boulder, Colorado  80301
   USA
   Email: shmcfarl@cisco.com
	]]></artwork>
      </figure>
    </section>
  </middle>
  <back>
    <!-- references split to informative and normative -->
    <references title="Normative References">
      <?rfc include="reference.RFC.2119" ?>
      <?rfc include="reference.RFC.2460" ?>
    </references>
    <references title="Informative References">
      <reference anchor="Microsoft-Azure">
        <front>
          <title>Report: Microsoft Buys 100 Acres of Iowa land for Data Center
          http://www.datacenterknowledge.com/archives/2014/08/01/report-microsoft-buys-100-acres-iowa-land-data-center/</title>
          <author fullname="Y. Sverdlik" initials="Y." surname="Sverdlik">
            <organization>Data Center Knowledge</organization>
          </author>
          <date month="August" year="2014"/>
        </front>
        <format target="http://www.datacenterknowledge.com/archives/2014/08/01/report-microsoft-buys-100-acres-iowa-land-data-center/"
                type="HTML"/>
      </reference>
      <reference anchor="FaceBook-IPv6">
        <front>
          <title>Facebook Is Close to Having an IPv6-only Data Center
          http://blog.ipspace.net/2014/03/facebook-is-close-to-having-ipv6-only.html</title>
          <author fullname="I. Pepelnjak" initials="I." surname="Pepelnjak">
            <organization>Internetworking perspectives by Ivan
            Pepelnjak</organization>
          </author>
          <date month="March" year="2014"/>
        </front>
        <format target="http://blog.ipspace.net/2014/03/facebook-is-close-to-having-ipv6-only.html"
                type="HTML"/>
      </reference>
      <?rfc include="reference.I-D.anderson-v6ops-siit-dc" ?>
      <?rfc include="reference.I-D.ietf-savi-dhcp" ?>
      <?rfc include="reference.I-D.mahalingam-dutt-dcops-vxlan" ?>
      <reference anchor="I-D.baker-openstack-rbac-federated-identity">
        <front>
          <title>Federated Identity for IPv6 Role-based Access Control</title>
          <author fullname="Fred Baker" initials="F" surname="Baker">
            <organization/>
          </author>
          <date month="September" year="2014"/>
          <abstract>
            <t>This document provides advice on the filtering of IPv6 packets
            based on the IPv6 Extension Headers and the IPv6 options they
            contain. Additionally, it discusses the operational and
            interoperability implications of discarding packets based on the
            IPv6 Extension Headers and IPv6 options they contain.</t>
          </abstract>
        </front>
      </reference>
      <?rfc include="reference.I-D.baker-ipv6-isis-dst-flowlabel-routing" ?>
      <?rfc include="reference.I-D.baker-ipv6-ospf-dst-flowlabel-routing" ?>
      <?rfc include="reference.I-D.ietf-ospf-ospfv3-lsa-extend" ?>
      <?rfc include="reference.RFC.2205" ?>
      <?rfc include="reference.RFC.2827" ?>
      <?rfc include="reference.RFC.3439" ?>
      <?rfc include="reference.RFC.4192" ?>
      <?rfc include="reference.RFC.4601" ?>
      <?rfc include="reference.RFC.4602" ?>
      <?rfc include="reference.RFC.4604" ?>
      <?rfc include="reference.RFC.4605" ?>
      <?rfc include="reference.RFC.4607" ?>
      <?rfc include="reference.RFC.5548" ?>
      <?rfc include="reference.RFC.5673" ?>
      <?rfc include="reference.RFC.3697" ?>
      <?rfc include="reference.RFC.4193" ?>
      <?rfc include="reference.RFC.4291" ?>
      <?rfc include="reference.RFC.4861" ?>
      <?rfc include="reference.RFC.4862" ?>
      <?rfc include="reference.RFC.4941" ?>
      <?rfc include="reference.RFC.5120" ?>
      <?rfc include="reference.RFC.5308" ?>
      <?rfc include="reference.RFC.5340" ?>
      <?rfc include="reference.RFC.6052" ?>
      <?rfc include="reference.RFC.6144" ?>
      <?rfc include="reference.RFC.6145" ?>
      <?rfc include="reference.RFC.6146" ?>
      <?rfc include="reference.RFC.6147" ?>
      <?rfc include="reference.RFC.6437" ?>
      <?rfc include="reference.RFC.6620" ?>
      <?rfc include="reference.RFC.7217" ?>
      <?rfc include="reference.RFC.7219" ?>
    </references>
    <section anchor="log" title="Change Log">
      <t><list style="hanging">
          <t hangText="Initial Version:">October 2014</t>
        </list></t>
    </section>
  </back>
</rfc>

<!-- PAFTECH AB 2003-2026 2026-04-23 18:28:04 -->