diff --git a/README.md b/README.md
index 88e8020..76d006a 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,7 @@ glider -config CONFIGPATH -listen :8080 -verbose
 ## Usage
 
 ```bash
-glider v0.6.7 usage:
+glider v0.6.8 usage:
   -checkduration int
         proxy check interval(seconds) (default 30)
   -checkwebsite string
@@ -172,8 +172,8 @@ SS scheme:
   ss://method:pass@host:port
 
 Available methods for ss:
-  AEAD_AES_128_GCM AEAD_AES_192_GCM AEAD_AES_256_GCM AEAD_CHACHA20_POLY1305 AES-128-CFB AES-128-CTR AES-192-CFB AES-192-CTR AES-256-CFB AES-256-CTR CHACHA20-IETF XCHACHA20
-  NOTE: chacha20-ietf-poly1305 = AEAD_CHACHA20_POLY1305
+  AEAD_AES_128_GCM AEAD_AES_192_GCM AEAD_AES_256_GCM AEAD_CHACHA20_POLY1305 AEAD_XCHACHA20_POLY1305 AES-128-CFB AES-128-CTR AES-192-CFB AES-192-CTR AES-256-CFB AES-256-CTR CHACHA20-IETF XCHACHA20 CHACHA20
+  NOTE: chacha20-ietf-poly1305 = AEAD_CHACHA20_POLY1305, xchacha20-ietf-poly1305 = AEAD_XCHACHA20_POLY1305
 
 SSR scheme:
   ssr://method:pass@host:port?protocol=xxx&protocol_param=yyy&obfs=zzz&obfs_param=xyz
diff --git a/conf.go b/conf.go
index ae45663..c0b43c8 100644
--- a/conf.go
+++ b/conf.go
@@ -136,9 +136,9 @@ func usage() {
 	fmt.Fprintf(os.Stderr, "\n")
 
 	fmt.Fprintf(os.Stderr, "Available methods for ss:\n")
-	fmt.Fprintf(os.Stderr, "  AEAD_AES_128_GCM AEAD_AES_192_GCM AEAD_AES_256_GCM AEAD_CHACHA20_POLY1305 AES-128-CFB AES-128-CTR AES-192-CFB AES-192-CTR AES-256-CFB AES-256-CTR CHACHA20-IETF XCHACHA20")
+	fmt.Fprintf(os.Stderr, "  AEAD_AES_128_GCM AEAD_AES_192_GCM AEAD_AES_256_GCM AEAD_CHACHA20_POLY1305 AEAD_XCHACHA20_POLY1305 AES-128-CFB AES-128-CTR AES-192-CFB AES-192-CTR AES-256-CFB AES-256-CTR CHACHA20-IETF XCHACHA20 CHACHA20")
 	fmt.Fprintf(os.Stderr, "\n")
-	fmt.Fprintf(os.Stderr, "  NOTE: chacha20-ietf-poly1305 = AEAD_CHACHA20_POLY1305\n")
+	fmt.Fprintf(os.Stderr, "  NOTE: chacha20-ietf-poly1305 = AEAD_CHACHA20_POLY1305, xchacha20-ietf-poly1305 = AEAD_XCHACHA20_POLY1305\n")
 	fmt.Fprintf(os.Stderr, "\n")
 
 	fmt.Fprintf(os.Stderr, "SSR scheme:\n")
diff --git a/proxy/ss/ss.go b/proxy/ss/ss.go
index 919ac95..7273e9d 100644
--- a/proxy/ss/ss.go
+++ b/proxy/ss/ss.go
@@ -8,7 +8,7 @@ import (
 	"sync"
 	"time"
 
-	"github.com/shadowsocks/go-shadowsocks2/core"
+	"github.com/nadoo/go-shadowsocks2/core"
 
 	"github.com/nadoo/glider/common/conn"
 	"github.com/nadoo/glider/common/log"
diff --git a/proxy/vmess/aead.go b/proxy/vmess/aead.go
index 68729a0..d0e334c 100644
--- a/proxy/vmess/aead.go
+++ b/proxy/vmess/aead.go
@@ -105,13 +105,13 @@ func (r *aeadReader) Read(b []byte) (int, error) {
 	}
 
 	// if length == 0, then this is the end
-	len := binary.BigEndian.Uint16(r.buf[:lenSize])
-	if len == 0 {
+	l := binary.BigEndian.Uint16(r.buf[:lenSize])
+	if l == 0 {
 		return 0, nil
 	}
 
 	// get payload
-	buf := r.buf[:len]
+	buf := r.buf[:l]
 	_, err = io.ReadFull(r.Reader, buf)
 	if err != nil {
 		return 0, err
@@ -126,7 +126,7 @@ func (r *aeadReader) Read(b []byte) (int, error) {
 		return 0, err
 	}
 
-	dataLen := int(len) - r.Overhead()
+	dataLen := int(l) - r.Overhead()
 	m := copy(b, r.buf[:dataLen])
 	if m < int(dataLen) {
 		r.leftover = r.buf[m:dataLen]
diff --git a/vendor/github.com/Yawning/chacha20/LICENSE b/vendor/github.com/Yawning/chacha20/LICENSE
deleted file mode 100644
index 6ca207e..0000000
--- a/vendor/github.com/Yawning/chacha20/LICENSE
+++ /dev/null
@@ -1,122 +0,0 @@
-Creative Commons Legal Code
-
-CC0 1.0 Universal
-
-    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
-    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
-    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
-    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
-    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
-    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
-    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
-    HEREUNDER.
-
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer
-exclusive Copyright and Related Rights (defined below) upon the creator
-and subsequent owner(s) (each and all, an "owner") of an original work of
-authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for
-the purpose of contributing to a commons of creative, cultural and
-scientific works ("Commons") that the public can reliably and without fear
-of later claims of infringement build upon, modify, incorporate in other
-works, reuse and redistribute as freely as possible in any form whatsoever
-and for any purposes, including without limitation commercial purposes.
-These owners may contribute to the Commons to promote the ideal of a free
-culture and the further production of creative, cultural and scientific
-works, or to gain reputation or greater distribution for their Work in
-part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any
-expectation of additional consideration or compensation, the person
-associating CC0 with a Work (the "Affirmer"), to the extent that he or she
-is an owner of Copyright and Related Rights in the Work, voluntarily
-elects to apply CC0 to the Work and publicly distribute the Work under its
-terms, with knowledge of his or her Copyright and Related Rights in the
-Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be
-protected by copyright and related or neighboring rights ("Copyright and
-Related Rights"). Copyright and Related Rights include, but are not
-limited to, the following:
-
-  i. the right to reproduce, adapt, distribute, perform, display,
-     communicate, and translate a Work;
- ii. moral rights retained by the original author(s) and/or performer(s);
-iii. publicity and privacy rights pertaining to a person's image or
-     likeness depicted in a Work;
- iv. rights protecting against unfair competition in regards to a Work,
-     subject to the limitations in paragraph 4(a), below;
-  v. rights protecting the extraction, dissemination, use and reuse of data
-     in a Work;
- vi. database rights (such as those arising under Directive 96/9/EC of the
-     European Parliament and of the Council of 11 March 1996 on the legal
-     protection of databases, and under any national implementation
-     thereof, including any amended or successor version of such
-     directive); and
-vii. other similar, equivalent or corresponding rights throughout the
-     world based on applicable law or treaty, and any national
-     implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention
-of, applicable law, Affirmer hereby overtly, fully, permanently,
-irrevocably and unconditionally waives, abandons, and surrenders all of
-Affirmer's Copyright and Related Rights and associated claims and causes
-of action, whether now known or unknown (including existing as well as
-future claims and causes of action), in the Work (i) in all territories
-worldwide, (ii) for the maximum duration provided by applicable law or
-treaty (including future time extensions), (iii) in any current or future
-medium and for any number of copies, and (iv) for any purpose whatsoever,
-including without limitation commercial, advertising or promotional
-purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
-member of the public at large and to the detriment of Affirmer's heirs and
-successors, fully intending that such Waiver shall not be subject to
-revocation, rescission, cancellation, termination, or any other legal or
-equitable action to disrupt the quiet enjoyment of the Work by the public
-as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason
-be judged legally invalid or ineffective under applicable law, then the
-Waiver shall be preserved to the maximum extent permitted taking into
-account Affirmer's express Statement of Purpose. In addition, to the
-extent the Waiver is so judged Affirmer hereby grants to each affected
-person a royalty-free, non transferable, non sublicensable, non exclusive,
-irrevocable and unconditional license to exercise Affirmer's Copyright and
-Related Rights in the Work (i) in all territories worldwide, (ii) for the
-maximum duration provided by applicable law or treaty (including future
-time extensions), (iii) in any current or future medium and for any number
-of copies, and (iv) for any purpose whatsoever, including without
-limitation commercial, advertising or promotional purposes (the
-"License"). The License shall be deemed effective as of the date CC0 was
-applied by Affirmer to the Work. Should any part of the License for any
-reason be judged legally invalid or ineffective under applicable law, such
-partial invalidity or ineffectiveness shall not invalidate the remainder
-of the License, and in such case Affirmer hereby affirms that he or she
-will not (i) exercise any of his or her remaining Copyright and Related
-Rights in the Work or (ii) assert any associated claims and causes of
-action with respect to the Work, in either case contrary to Affirmer's
-express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- a. No trademark or patent rights held by Affirmer are waived, abandoned,
-    surrendered, licensed or otherwise affected by this document.
- b. Affirmer offers the Work as-is and makes no representations or
-    warranties of any kind concerning the Work, express, implied,
-    statutory or otherwise, including without limitation warranties of
-    title, merchantability, fitness for a particular purpose, non
-    infringement, or the absence of latent or other defects, accuracy, or
-    the present or absence of errors, whether or not discoverable, all to
-    the greatest extent permissible under applicable law.
- c. Affirmer disclaims responsibility for clearing rights of other persons
-    that may apply to the Work or any use thereof, including without
-    limitation any person's Copyright and Related Rights in the Work.
-    Further, Affirmer disclaims responsibility for obtaining any necessary
-    consents, permissions or other rights required for any use of the
-    Work.
- d. Affirmer understands and acknowledges that Creative Commons is not a
-    party to this document and has no duty or obligation with respect to
-    this CC0 or use of the Work.
-
diff --git a/vendor/github.com/Yawning/chacha20/README.md b/vendor/github.com/Yawning/chacha20/README.md
deleted file mode 100644
index 9080a84..0000000
--- a/vendor/github.com/Yawning/chacha20/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-### chacha20 - ChaCha20
-#### Yawning Angel (yawning at schwanenlied dot me)
-
-Yet another Go ChaCha20 implementation.  Everything else I found  was slow,
-didn't support all the variants I need to use, or relied on cgo to go fast.
-
-Features:
-
- * 20 round, 256 bit key only.  Everything else is pointless and stupid.
- * IETF 96 bit nonce variant.
- * XChaCha 24 byte nonce variant.
- * SSE2 and AVX2 support on amd64 targets.
- * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20.
-
diff --git a/vendor/github.com/Yawning/chacha20/chacha20.go b/vendor/github.com/Yawning/chacha20/chacha20.go
deleted file mode 100644
index 07d5e4b..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20.go
+++ /dev/null
@@ -1,273 +0,0 @@
-// chacha20.go - A ChaCha stream cipher implementation.
-//
-// To the extent possible under law, Yawning Angel has waived all copyright
-// and related or neighboring rights to chacha20, using the Creative
-// Commons "CC0" public domain dedication. See LICENSE or
-// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
-
-package chacha20
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"errors"
-	"math"
-	"runtime"
-)
-
-const (
-	// KeySize is the ChaCha20 key size in bytes.
-	KeySize = 32
-
-	// NonceSize is the ChaCha20 nonce size in bytes.
-	NonceSize = 8
-
-	// INonceSize is the IETF ChaCha20 nonce size in bytes.
-	INonceSize = 12
-
-	// XNonceSize is the XChaCha20 nonce size in bytes.
-	XNonceSize = 24
-
-	// HNonceSize is the HChaCha20 nonce size in bytes.
-	HNonceSize = 16
-
-	// BlockSize is the ChaCha20 block size in bytes.
-	BlockSize = 64
-
-	stateSize    = 16
-	chachaRounds = 20
-
-	// The constant "expand 32-byte k" as little endian uint32s.
-	sigma0 = uint32(0x61707865)
-	sigma1 = uint32(0x3320646e)
-	sigma2 = uint32(0x79622d32)
-	sigma3 = uint32(0x6b206574)
-)
-
-var (
-	// ErrInvalidKey is the error returned when the key is invalid.
-	ErrInvalidKey = errors.New("key length must be KeySize bytes")
-
-	// ErrInvalidNonce is the error returned when the nonce is invalid.
-	ErrInvalidNonce = errors.New("nonce length must be NonceSize/INonceSize/XNonceSize bytes")
-
-	// ErrInvalidCounter is the error returned when the counter is invalid.
-	ErrInvalidCounter = errors.New("block counter is invalid (out of range)")
-
-	useUnsafe    = false
-	usingVectors = false
-	blocksFn     = blocksRef
-)
-
-// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and
-// nonce.
-type Cipher struct {
-	state [stateSize]uint32
-
-	buf  [BlockSize]byte
-	off  int
-	ietf bool
-}
-
-// Reset zeros the key data so that it will no longer appear in the process's
-// memory.
-func (c *Cipher) Reset() {
-	for i := range c.state {
-		c.state[i] = 0
-	}
-	for i := range c.buf {
-		c.buf[i] = 0
-	}
-}
-
-// XORKeyStream sets dst to the result of XORing src with the key stream.  Dst
-// and src may be the same slice but otherwise should not overlap.
-func (c *Cipher) XORKeyStream(dst, src []byte) {
-	if len(dst) < len(src) {
-		src = src[:len(dst)]
-	}
-
-	for remaining := len(src); remaining > 0; {
-		// Process multiple blocks at once.
-		if c.off == BlockSize {
-			nrBlocks := remaining / BlockSize
-			directBytes := nrBlocks * BlockSize
-			if nrBlocks > 0 {
-				blocksFn(&c.state, src, dst, nrBlocks, c.ietf)
-				remaining -= directBytes
-				if remaining == 0 {
-					return
-				}
-				dst = dst[directBytes:]
-				src = src[directBytes:]
-			}
-
-			// If there's a partial block, generate 1 block of keystream into
-			// the internal buffer.
-			blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
-			c.off = 0
-		}
-
-		// Process partial blocks from the buffered keystream.
-		toXor := BlockSize - c.off
-		if remaining < toXor {
-			toXor = remaining
-		}
-		if toXor > 0 {
-			for i, v := range src[:toXor] {
-				dst[i] = v ^ c.buf[c.off+i]
-			}
-			dst = dst[toXor:]
-			src = src[toXor:]
-
-			remaining -= toXor
-			c.off += toXor
-		}
-	}
-}
-
-// KeyStream sets dst to the raw keystream.
-func (c *Cipher) KeyStream(dst []byte) {
-	for remaining := len(dst); remaining > 0; {
-		// Process multiple blocks at once.
-		if c.off == BlockSize {
-			nrBlocks := remaining / BlockSize
-			directBytes := nrBlocks * BlockSize
-			if nrBlocks > 0 {
-				blocksFn(&c.state, nil, dst, nrBlocks, c.ietf)
-				remaining -= directBytes
-				if remaining == 0 {
-					return
-				}
-				dst = dst[directBytes:]
-			}
-
-			// If there's a partial block, generate 1 block of keystream into
-			// the internal buffer.
-			blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
-			c.off = 0
-		}
-
-		// Process partial blocks from the buffered keystream.
-		toCopy := BlockSize - c.off
-		if remaining < toCopy {
-			toCopy = remaining
-		}
-		if toCopy > 0 {
-			copy(dst[:toCopy], c.buf[c.off:c.off+toCopy])
-			dst = dst[toCopy:]
-			remaining -= toCopy
-			c.off += toCopy
-		}
-	}
-}
-
-// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key
-// and nonce.
-func (c *Cipher) ReKey(key, nonce []byte) error {
-	if len(key) != KeySize {
-		return ErrInvalidKey
-	}
-
-	switch len(nonce) {
-	case NonceSize:
-	case INonceSize:
-	case XNonceSize:
-		var subkey [KeySize]byte
-		var subnonce [HNonceSize]byte
-		copy(subnonce[:], nonce[0:16])
-		HChaCha(key, &subnonce, &subkey)
-		key = subkey[:]
-		nonce = nonce[16:24]
-		defer func() {
-			for i := range subkey {
-				subkey[i] = 0
-			}
-		}()
-	default:
-		return ErrInvalidNonce
-	}
-
-	c.Reset()
-	c.state[0] = sigma0
-	c.state[1] = sigma1
-	c.state[2] = sigma2
-	c.state[3] = sigma3
-	c.state[4] = binary.LittleEndian.Uint32(key[0:4])
-	c.state[5] = binary.LittleEndian.Uint32(key[4:8])
-	c.state[6] = binary.LittleEndian.Uint32(key[8:12])
-	c.state[7] = binary.LittleEndian.Uint32(key[12:16])
-	c.state[8] = binary.LittleEndian.Uint32(key[16:20])
-	c.state[9] = binary.LittleEndian.Uint32(key[20:24])
-	c.state[10] = binary.LittleEndian.Uint32(key[24:28])
-	c.state[11] = binary.LittleEndian.Uint32(key[28:32])
-	c.state[12] = 0
-	if len(nonce) == INonceSize {
-		c.state[13] = binary.LittleEndian.Uint32(nonce[0:4])
-		c.state[14] = binary.LittleEndian.Uint32(nonce[4:8])
-		c.state[15] = binary.LittleEndian.Uint32(nonce[8:12])
-		c.ietf = true
-	} else {
-		c.state[13] = 0
-		c.state[14] = binary.LittleEndian.Uint32(nonce[0:4])
-		c.state[15] = binary.LittleEndian.Uint32(nonce[4:8])
-		c.ietf = false
-	}
-	c.off = BlockSize
-	return nil
-
-}
-
-// Seek sets the block counter to a given offset.
-func (c *Cipher) Seek(blockCounter uint64) error {
-	if c.ietf {
-		if blockCounter > math.MaxUint32 {
-			return ErrInvalidCounter
-		}
-		c.state[12] = uint32(blockCounter)
-	} else {
-		c.state[12] = uint32(blockCounter)
-		c.state[13] = uint32(blockCounter >> 32)
-	}
-	c.off = BlockSize
-	return nil
-}
-
-// NewCipher returns a new ChaCha20/XChaCha20 instance.
-func NewCipher(key, nonce []byte) (*Cipher, error) {
-	c := new(Cipher)
-	if err := c.ReKey(key, nonce); err != nil {
-		return nil, err
-	}
-	return c, nil
-}
-
-// HChaCha is the HChaCha20 hash function used to make XChaCha.
-func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) {
-	var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded.
-	x[0] = binary.LittleEndian.Uint32(key[0:4])
-	x[1] = binary.LittleEndian.Uint32(key[4:8])
-	x[2] = binary.LittleEndian.Uint32(key[8:12])
-	x[3] = binary.LittleEndian.Uint32(key[12:16])
-	x[4] = binary.LittleEndian.Uint32(key[16:20])
-	x[5] = binary.LittleEndian.Uint32(key[20:24])
-	x[6] = binary.LittleEndian.Uint32(key[24:28])
-	x[7] = binary.LittleEndian.Uint32(key[28:32])
-	x[8] = binary.LittleEndian.Uint32(nonce[0:4])
-	x[9] = binary.LittleEndian.Uint32(nonce[4:8])
-	x[10] = binary.LittleEndian.Uint32(nonce[8:12])
-	x[11] = binary.LittleEndian.Uint32(nonce[12:16])
-	hChaChaRef(&x, out)
-}
-
-func init() {
-	switch runtime.GOARCH {
-	case "386", "amd64":
-		// Abuse unsafe to skip calling binary.LittleEndian.PutUint32
-		// in the critical path.  This is a big boost on systems that are
-		// little endian and not overly picky about alignment.
-		useUnsafe = true
-	}
-}
-
-var _ cipher.Stream = (*Cipher)(nil)
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.go b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go
deleted file mode 100644
index 05adad1..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20_amd64.go
+++ /dev/null
@@ -1,95 +0,0 @@
-// chacha20_amd64.go - AMD64 optimized chacha20.
-//
-// To the extent possible under law, Yawning Angel has waived all copyright
-// and related or neighboring rights to chacha20, using the Creative
-// Commons "CC0" public domain dedication. See LICENSE or
-// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
-
-// +build amd64,!gccgo,!appengine
-
-package chacha20
-
-import (
-	"math"
-)
-
-var usingAVX2 = false
-
-func blocksAmd64SSE2(x *uint32, inp, outp *byte, nrBlocks uint)
-
-func blocksAmd64AVX2(x *uint32, inp, outp *byte, nrBlocks uint)
-
-func cpuidAmd64(cpuidParams *uint32)
-
-func xgetbv0Amd64(xcrVec *uint32)
-
-func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
-	// Probably unneeded, but stating this explicitly simplifies the assembly.
-	if nrBlocks == 0 {
-		return
-	}
-
-	if isIetf {
-		var totalBlocks uint64
-		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
-		if totalBlocks > math.MaxUint32 {
-			panic("chacha20: Exceeded keystream per nonce limit")
-		}
-	}
-
-	if in == nil {
-		for i := range out {
-			out[i] = 0
-		}
-		in = out
-	}
-
-	// Pointless to call the AVX2 code for just a single block, since half of
-	// the output gets discarded...
-	if usingAVX2 && nrBlocks > 1 {
-		blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks))
-	} else {
-		blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks))
-	}
-}
-
-func supportsAVX2() bool {
-	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
-	const (
-		osXsaveBit = 1 << 27
-		avx2Bit    = 1 << 5
-	)
-
-	// Check to see if CPUID actually supports the leaf that indicates AVX2.
-	// CPUID.(EAX=0H, ECX=0H) >= 7
-	regs := [4]uint32{0x00}
-	cpuidAmd64(&regs[0])
-	if regs[0] < 7 {
-		return false
-	}
-
-	// Check to see if the OS knows how to save/restore XMM/YMM state.
-	// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
-	regs = [4]uint32{0x01}
-	cpuidAmd64(&regs[0])
-	if regs[2]&osXsaveBit == 0 {
-		return false
-	}
-	xcrRegs := [2]uint32{}
-	xgetbv0Amd64(&xcrRegs[0])
-	if xcrRegs[0]&6 != 6 {
-		return false
-	}
-
-	// Check for AVX2 support.
-	// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
-	regs = [4]uint32{0x07}
-	cpuidAmd64(&regs[0])
-	return regs[1]&avx2Bit != 0
-}
-
-func init() {
-	blocksFn = blocksAmd64
-	usingVectors = true
-	usingAVX2 = supportsAVX2()
-}
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.py b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py
deleted file mode 100644
index 3bfebf4..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20_amd64.py
+++ /dev/null
@@ -1,1295 +0,0 @@
-#!/usr/bin/env python3
-#
-# To the extent possible under law, Yawning Angel has waived all copyright
-# and related or neighboring rights to chacha20, using the Creative
-# Commons "CC0" public domain dedication. See LICENSE or
-# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
-
-#
-# cgo sucks.  Plan 9 assembly sucks.  Real languages have SIMD intrinsics.
-# The least terrible/retarded option is to use a Python code generator, so
-# that's what I did.
-#
-# Code based on Ted Krovetz's vec128 C implementation, with corrections
-# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
-# output pointers.
-#
-# Dependencies: https://github.com/Maratyszcza/PeachPy
-#
-# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
-#
-
-from peachpy import *
-from peachpy.x86_64 import *
-
-x = Argument(ptr(uint32_t))
-inp = Argument(ptr(const_uint8_t))
-outp = Argument(ptr(uint8_t))
-nrBlocks = Argument(ptr(size_t))
-
-#
-# SSE2 helper functions.  A temporary register is explicitly passed in because
-# the main fast loop uses every single register (and even spills) so manual
-# control is needed.
-#
-# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
-# in the C code, but the C code has the luxury of an optimizer reordering
-# everything, while this does not.
-#
-
-def ROTW16_sse2(tmp, d):
-    MOVDQA(tmp, d)
-    PSLLD(tmp, 16)
-    PSRLD(d, 16)
-    PXOR(d, tmp)
-
-def ROTW12_sse2(tmp, b):
-    MOVDQA(tmp, b)
-    PSLLD(tmp, 12)
-    PSRLD(b, 20)
-    PXOR(b, tmp)
-
-def ROTW8_sse2(tmp, d):
-    MOVDQA(tmp, d)
-    PSLLD(tmp, 8)
-    PSRLD(d, 24)
-    PXOR(d, tmp)
-
-def ROTW7_sse2(tmp, b):
-    MOVDQA(tmp, b)
-    PSLLD(tmp, 7)
-    PSRLD(b, 25)
-    PXOR(b, tmp)
-
-def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
-    MOVDQU(tmp, [inp+d])
-    PXOR(tmp, v0)
-    MOVDQU([outp+d], tmp)
-    MOVDQU(tmp, [inp+d+16])
-    PXOR(tmp, v1)
-    MOVDQU([outp+d+16], tmp)
-    MOVDQU(tmp, [inp+d+32])
-    PXOR(tmp, v2)
-    MOVDQU([outp+d+32], tmp)
-    MOVDQU(tmp, [inp+d+48])
-    PXOR(tmp, v3)
-    MOVDQU([outp+d+48], tmp)
-
-# SSE2 ChaCha20 (aka vec128).  Does not handle partial blocks, and will
-# process 4/2/1 blocks at a time.
-with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
-    reg_x = GeneralPurposeRegister64()
-    reg_inp = GeneralPurposeRegister64()
-    reg_outp = GeneralPurposeRegister64()
-    reg_blocks = GeneralPurposeRegister64()
-    reg_sp_save = GeneralPurposeRegister64()
-
-    LOAD.ARGUMENT(reg_x, x)
-    LOAD.ARGUMENT(reg_inp, inp)
-    LOAD.ARGUMENT(reg_outp, outp)
-    LOAD.ARGUMENT(reg_blocks, nrBlocks)
-
-    # Align the stack to a 32 byte boundary.
-    MOV(reg_sp_save, registers.rsp)
-    AND(registers.rsp, 0xffffffffffffffe0)
-    SUB(registers.rsp, 0x20)
-
-    # Build the counter increment vector on the stack, and allocate the scratch
-    # space
-    xmm_v0 = XMMRegister()
-    PXOR(xmm_v0, xmm_v0)
-    SUB(registers.rsp, 16+16)
-    MOVDQA([registers.rsp], xmm_v0)
-    reg_tmp = GeneralPurposeRegister32()
-    MOV(reg_tmp, 0x00000001)
-    MOV([registers.rsp], reg_tmp)
-    mem_one = [registers.rsp]     # (Stack) Counter increment vector
-    mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
-
-    mem_s0 = [reg_x]           # (Memory) Cipher state [0..3]
-    mem_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
-    mem_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
-    mem_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
-
-    # xmm_v0 allocated above...
-    xmm_v1 = XMMRegister()
-    xmm_v2 = XMMRegister()
-    xmm_v3 = XMMRegister()
-
-    xmm_v4 = XMMRegister()
-    xmm_v5 = XMMRegister()
-    xmm_v6 = XMMRegister()
-    xmm_v7 = XMMRegister()
-
-    xmm_v8 = XMMRegister()
-    xmm_v9 = XMMRegister()
-    xmm_v10 = XMMRegister()
-    xmm_v11 = XMMRegister()
-
-    xmm_v12 = XMMRegister()
-    xmm_v13 = XMMRegister()
-    xmm_v14 = XMMRegister()
-    xmm_v15 = XMMRegister()
-
-    xmm_tmp = xmm_v12
-
-    #
-    # 4 blocks at a time.
-    #
-
-    reg_rounds = GeneralPurposeRegister64()
-
-    vector_loop4 = Loop()
-    SUB(reg_blocks, 4)
-    JB(vector_loop4.end)
-    with vector_loop4:
-        MOVDQU(xmm_v0, mem_s0)
-        MOVDQU(xmm_v1, mem_s1)
-        MOVDQU(xmm_v2, mem_s2)
-        MOVDQU(xmm_v3, mem_s3)
-
-        MOVDQA(xmm_v4, xmm_v0)
-        MOVDQA(xmm_v5, xmm_v1)
-        MOVDQA(xmm_v6, xmm_v2)
-        MOVDQA(xmm_v7, xmm_v3)
-        PADDQ(xmm_v7, mem_one)
-
-        MOVDQA(xmm_v8, xmm_v0)
-        MOVDQA(xmm_v9, xmm_v1)
-        MOVDQA(xmm_v10, xmm_v2)
-        MOVDQA(xmm_v11, xmm_v7)
-        PADDQ(xmm_v11, mem_one)
-
-        MOVDQA(xmm_v12, xmm_v0)
-        MOVDQA(xmm_v13, xmm_v1)
-        MOVDQA(xmm_v14, xmm_v2)
-        MOVDQA(xmm_v15, xmm_v11)
-        PADDQ(xmm_v15, mem_one)
-
-        MOV(reg_rounds, 20)
-        rounds_loop4 = Loop()
-        with rounds_loop4:
-            # a += b; d ^= a; d = ROTW16(d);
-            PADDD(xmm_v0, xmm_v1)
-            PADDD(xmm_v4, xmm_v5)
-            PADDD(xmm_v8, xmm_v9)
-            PADDD(xmm_v12, xmm_v13)
-            PXOR(xmm_v3, xmm_v0)
-            PXOR(xmm_v7, xmm_v4)
-            PXOR(xmm_v11, xmm_v8)
-            PXOR(xmm_v15, xmm_v12)
-
-            MOVDQA(mem_tmp0, xmm_tmp) # Save
-
-            ROTW16_sse2(xmm_tmp, xmm_v3)
-            ROTW16_sse2(xmm_tmp, xmm_v7)
-            ROTW16_sse2(xmm_tmp, xmm_v11)
-            ROTW16_sse2(xmm_tmp, xmm_v15)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            PADDD(xmm_v2, xmm_v3)
-            PADDD(xmm_v6, xmm_v7)
-            PADDD(xmm_v10, xmm_v11)
-            PADDD(xmm_v14, xmm_v15)
-            PXOR(xmm_v1, xmm_v2)
-            PXOR(xmm_v5, xmm_v6)
-            PXOR(xmm_v9, xmm_v10)
-            PXOR(xmm_v13, xmm_v14)
-            ROTW12_sse2(xmm_tmp, xmm_v1)
-            ROTW12_sse2(xmm_tmp, xmm_v5)
-            ROTW12_sse2(xmm_tmp, xmm_v9)
-            ROTW12_sse2(xmm_tmp, xmm_v13)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            MOVDQA(xmm_tmp, mem_tmp0) # Restore
-
-            PADDD(xmm_v0, xmm_v1)
-            PADDD(xmm_v4, xmm_v5)
-            PADDD(xmm_v8, xmm_v9)
-            PADDD(xmm_v12, xmm_v13)
-            PXOR(xmm_v3, xmm_v0)
-            PXOR(xmm_v7, xmm_v4)
-            PXOR(xmm_v11, xmm_v8)
-            PXOR(xmm_v15, xmm_v12)
-
-            MOVDQA(mem_tmp0, xmm_tmp) # Save
-
-            ROTW8_sse2(xmm_tmp, xmm_v3)
-            ROTW8_sse2(xmm_tmp, xmm_v7)
-            ROTW8_sse2(xmm_tmp, xmm_v11)
-            ROTW8_sse2(xmm_tmp, xmm_v15)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            PADDD(xmm_v2, xmm_v3)
-            PADDD(xmm_v6, xmm_v7)
-            PADDD(xmm_v10, xmm_v11)
-            PADDD(xmm_v14, xmm_v15)
-            PXOR(xmm_v1, xmm_v2)
-            PXOR(xmm_v5, xmm_v6)
-            PXOR(xmm_v9, xmm_v10)
-            PXOR(xmm_v13, xmm_v14)
-            ROTW7_sse2(xmm_tmp, xmm_v1)
-            ROTW7_sse2(xmm_tmp, xmm_v5)
-            ROTW7_sse2(xmm_tmp, xmm_v9)
-            ROTW7_sse2(xmm_tmp, xmm_v13)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            PSHUFD(xmm_v1, xmm_v1, 0x39)
-            PSHUFD(xmm_v5, xmm_v5, 0x39)
-            PSHUFD(xmm_v9, xmm_v9, 0x39)
-            PSHUFD(xmm_v13, xmm_v13, 0x39)
-            PSHUFD(xmm_v2, xmm_v2, 0x4e)
-            PSHUFD(xmm_v6, xmm_v6, 0x4e)
-            PSHUFD(xmm_v10, xmm_v10, 0x4e)
-            PSHUFD(xmm_v14, xmm_v14, 0x4e)
-            PSHUFD(xmm_v3, xmm_v3, 0x93)
-            PSHUFD(xmm_v7, xmm_v7, 0x93)
-            PSHUFD(xmm_v11, xmm_v11, 0x93)
-            PSHUFD(xmm_v15, xmm_v15, 0x93)
-
-            MOVDQA(xmm_tmp, mem_tmp0) # Restore
-
-            # a += b; d ^= a; d = ROTW16(d);
-            PADDD(xmm_v0, xmm_v1)
-            PADDD(xmm_v4, xmm_v5)
-            PADDD(xmm_v8, xmm_v9)
-            PADDD(xmm_v12, xmm_v13)
-            PXOR(xmm_v3, xmm_v0)
-            PXOR(xmm_v7, xmm_v4)
-            PXOR(xmm_v11, xmm_v8)
-            PXOR(xmm_v15, xmm_v12)
-
-            MOVDQA(mem_tmp0, xmm_tmp) # Save
-
-            ROTW16_sse2(xmm_tmp, xmm_v3)
-            ROTW16_sse2(xmm_tmp, xmm_v7)
-            ROTW16_sse2(xmm_tmp, xmm_v11)
-            ROTW16_sse2(xmm_tmp, xmm_v15)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            PADDD(xmm_v2, xmm_v3)
-            PADDD(xmm_v6, xmm_v7)
-            PADDD(xmm_v10, xmm_v11)
-            PADDD(xmm_v14, xmm_v15)
-            PXOR(xmm_v1, xmm_v2)
-            PXOR(xmm_v5, xmm_v6)
-            PXOR(xmm_v9, xmm_v10)
-            PXOR(xmm_v13, xmm_v14)
-            ROTW12_sse2(xmm_tmp, xmm_v1)
-            ROTW12_sse2(xmm_tmp, xmm_v5)
-            ROTW12_sse2(xmm_tmp, xmm_v9)
-            ROTW12_sse2(xmm_tmp, xmm_v13)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            MOVDQA(xmm_tmp, mem_tmp0) # Restore
-
-            PADDD(xmm_v0, xmm_v1)
-            PADDD(xmm_v4, xmm_v5)
-            PADDD(xmm_v8, xmm_v9)
-            PADDD(xmm_v12, xmm_v13)
-            PXOR(xmm_v3, xmm_v0)
-            PXOR(xmm_v7, xmm_v4)
-            PXOR(xmm_v11, xmm_v8)
-            PXOR(xmm_v15, xmm_v12)
-
-            MOVDQA(mem_tmp0, xmm_tmp) # Save
-
-            ROTW8_sse2(xmm_tmp, xmm_v3)
-            ROTW8_sse2(xmm_tmp, xmm_v7)
-            ROTW8_sse2(xmm_tmp, xmm_v11)
-            ROTW8_sse2(xmm_tmp, xmm_v15)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            PADDD(xmm_v2, xmm_v3)
-            PADDD(xmm_v6, xmm_v7)
-            PADDD(xmm_v10, xmm_v11)
-            PADDD(xmm_v14, xmm_v15)
-            PXOR(xmm_v1, xmm_v2)
-            PXOR(xmm_v5, xmm_v6)
-            PXOR(xmm_v9, xmm_v10)
-            PXOR(xmm_v13, xmm_v14)
-            ROTW7_sse2(xmm_tmp, xmm_v1)
-            ROTW7_sse2(xmm_tmp, xmm_v5)
-            ROTW7_sse2(xmm_tmp, xmm_v9)
-            ROTW7_sse2(xmm_tmp, xmm_v13)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            PSHUFD(xmm_v1, xmm_v1, 0x93)
-            PSHUFD(xmm_v5, xmm_v5, 0x93)
-            PSHUFD(xmm_v9, xmm_v9, 0x93)
-            PSHUFD(xmm_v13, xmm_v13, 0x93)
-            PSHUFD(xmm_v2, xmm_v2, 0x4e)
-            PSHUFD(xmm_v6, xmm_v6, 0x4e)
-            PSHUFD(xmm_v10, xmm_v10, 0x4e)
-            PSHUFD(xmm_v14, xmm_v14, 0x4e)
-            PSHUFD(xmm_v3, xmm_v3, 0x39)
-            PSHUFD(xmm_v7, xmm_v7, 0x39)
-            PSHUFD(xmm_v11, xmm_v11, 0x39)
-            PSHUFD(xmm_v15, xmm_v15, 0x39)
-
-            MOVDQA(xmm_tmp, mem_tmp0) # Restore
-
-            SUB(reg_rounds, 2)
-            JNZ(rounds_loop4.begin)
-
-        MOVDQA(mem_tmp0, xmm_tmp)
-
-        PADDD(xmm_v0, mem_s0)
-        PADDD(xmm_v1, mem_s1)
-        PADDD(xmm_v2, mem_s2)
-        PADDD(xmm_v3, mem_s3)
-        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
-        MOVDQU(xmm_v3, mem_s3)
-        PADDQ(xmm_v3, mem_one)
-
-        PADDD(xmm_v4, mem_s0)
-        PADDD(xmm_v5, mem_s1)
-        PADDD(xmm_v6, mem_s2)
-        PADDD(xmm_v7, xmm_v3)
-        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
-        PADDQ(xmm_v3, mem_one)
-
-        PADDD(xmm_v8, mem_s0)
-        PADDD(xmm_v9, mem_s1)
-        PADDD(xmm_v10, mem_s2)
-        PADDD(xmm_v11, xmm_v3)
-        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
-        PADDQ(xmm_v3, mem_one)
-
-        MOVDQA(xmm_tmp, mem_tmp0)
-
-        PADDD(xmm_v12, mem_s0)
-        PADDD(xmm_v13, mem_s1)
-        PADDD(xmm_v14, mem_s2)
-        PADDD(xmm_v15, xmm_v3)
-        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
-        PADDQ(xmm_v3, mem_one)
-
-        MOVDQU(mem_s3, xmm_v3)
-
-        ADD(reg_inp, 4 * 64)
-        ADD(reg_outp, 4 * 64)
-
-        SUB(reg_blocks, 4)
-        JAE(vector_loop4.begin)
-
-    ADD(reg_blocks, 4)
-    out = Label()
-    JZ(out)
-
-    # Past this point, we no longer need to use every single register to hold
-    # the in progress state.
-
-    xmm_s0 = xmm_v8
-    xmm_s1 = xmm_v9
-    xmm_s2 = xmm_v10
-    xmm_s3 = xmm_v11
-    xmm_one = xmm_v13
-    MOVDQU(xmm_s0, mem_s0)
-    MOVDQU(xmm_s1, mem_s1)
-    MOVDQU(xmm_s2, mem_s2)
-    MOVDQU(xmm_s3, mem_s3)
-    MOVDQA(xmm_one, mem_one)
-
-    #
-    # 2 blocks at a time.
-    #
-
-    process_1_block = Label()
-    SUB(reg_blocks, 2)
-    JB(process_1_block) # < 2 blocks remaining.
-
-    MOVDQA(xmm_v0, xmm_s0)
-    MOVDQA(xmm_v1, xmm_s1)
-    MOVDQA(xmm_v2, xmm_s2)
-    MOVDQA(xmm_v3, xmm_s3)
-
-    MOVDQA(xmm_v4, xmm_v0)
-    MOVDQA(xmm_v5, xmm_v1)
-    MOVDQA(xmm_v6, xmm_v2)
-    MOVDQA(xmm_v7, xmm_v3)
-    PADDQ(xmm_v7, xmm_one)
-
-    MOV(reg_rounds, 20)
-    rounds_loop2 = Loop()
-    with rounds_loop2:
-        # a += b; d ^= a; d = ROTW16(d);
-        PADDD(xmm_v0, xmm_v1)
-        PADDD(xmm_v4, xmm_v5)
-        PXOR(xmm_v3, xmm_v0)
-        PXOR(xmm_v7, xmm_v4)
-        ROTW16_sse2(xmm_tmp, xmm_v3)
-        ROTW16_sse2(xmm_tmp, xmm_v7)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        PADDD(xmm_v2, xmm_v3)
-        PADDD(xmm_v6, xmm_v7)
-        PXOR(xmm_v1, xmm_v2)
-        PXOR(xmm_v5, xmm_v6)
-        ROTW12_sse2(xmm_tmp, xmm_v1)
-        ROTW12_sse2(xmm_tmp, xmm_v5)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        PADDD(xmm_v0, xmm_v1)
-        PADDD(xmm_v4, xmm_v5)
-        PXOR(xmm_v3, xmm_v0)
-        PXOR(xmm_v7, xmm_v4)
-        ROTW8_sse2(xmm_tmp, xmm_v3)
-        ROTW8_sse2(xmm_tmp, xmm_v7)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        PADDD(xmm_v2, xmm_v3)
-        PADDD(xmm_v6, xmm_v7)
-        PXOR(xmm_v1, xmm_v2)
-        PXOR(xmm_v5, xmm_v6)
-        ROTW7_sse2(xmm_tmp, xmm_v1)
-        ROTW7_sse2(xmm_tmp, xmm_v5)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        PSHUFD(xmm_v1, xmm_v1, 0x39)
-        PSHUFD(xmm_v5, xmm_v5, 0x39)
-        PSHUFD(xmm_v2, xmm_v2, 0x4e)
-        PSHUFD(xmm_v6, xmm_v6, 0x4e)
-        PSHUFD(xmm_v3, xmm_v3, 0x93)
-        PSHUFD(xmm_v7, xmm_v7, 0x93)
-
-        # a += b; d ^= a; d = ROTW16(d);
-        PADDD(xmm_v0, xmm_v1)
-        PADDD(xmm_v4, xmm_v5)
-        PXOR(xmm_v3, xmm_v0)
-        PXOR(xmm_v7, xmm_v4)
-        ROTW16_sse2(xmm_tmp, xmm_v3)
-        ROTW16_sse2(xmm_tmp, xmm_v7)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        PADDD(xmm_v2, xmm_v3)
-        PADDD(xmm_v6, xmm_v7)
-        PXOR(xmm_v1, xmm_v2)
-        PXOR(xmm_v5, xmm_v6)
-        ROTW12_sse2(xmm_tmp, xmm_v1)
-        ROTW12_sse2(xmm_tmp, xmm_v5)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        PADDD(xmm_v0, xmm_v1)
-        PADDD(xmm_v4, xmm_v5)
-        PXOR(xmm_v3, xmm_v0)
-        PXOR(xmm_v7, xmm_v4)
-        ROTW8_sse2(xmm_tmp, xmm_v3)
-        ROTW8_sse2(xmm_tmp, xmm_v7)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        PADDD(xmm_v2, xmm_v3)
-        PADDD(xmm_v6, xmm_v7)
-        PXOR(xmm_v1, xmm_v2)
-        PXOR(xmm_v5, xmm_v6)
-        ROTW7_sse2(xmm_tmp, xmm_v1)
-        ROTW7_sse2(xmm_tmp, xmm_v5)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        PSHUFD(xmm_v1, xmm_v1, 0x93)
-        PSHUFD(xmm_v5, xmm_v5, 0x93)
-        PSHUFD(xmm_v2, xmm_v2, 0x4e)
-        PSHUFD(xmm_v6, xmm_v6, 0x4e)
-        PSHUFD(xmm_v3, xmm_v3, 0x39)
-        PSHUFD(xmm_v7, xmm_v7, 0x39)
-
-        SUB(reg_rounds, 2)
-        JNZ(rounds_loop2.begin)
-
-    PADDD(xmm_v0, xmm_s0)
-    PADDD(xmm_v1, xmm_s1)
-    PADDD(xmm_v2, xmm_s2)
-    PADDD(xmm_v3, xmm_s3)
-    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
-    PADDQ(xmm_s3, xmm_one)
-
-    PADDD(xmm_v4, xmm_s0)
-    PADDD(xmm_v5, xmm_s1)
-    PADDD(xmm_v6, xmm_s2)
-    PADDD(xmm_v7, xmm_s3)
-    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
-    PADDQ(xmm_s3, xmm_one)
-
-    ADD(reg_inp, 2 * 64)
-    ADD(reg_outp, 2 * 64)
-    SUB(reg_blocks, 2)
-
-    LABEL(process_1_block)
-    ADD(reg_blocks, 2)
-    out_serial = Label()
-    JZ(out_serial)
-
-    #
-    # 1 block at a time.  Only executed once, because if there was > 1,
-    # the parallel code would have processed it already.
-    #
-
-    MOVDQA(xmm_v0, xmm_s0)
-    MOVDQA(xmm_v1, xmm_s1)
-    MOVDQA(xmm_v2, xmm_s2)
-    MOVDQA(xmm_v3, xmm_s3)
-
-    MOV(reg_rounds, 20)
-    rounds_loop1 = Loop()
-    with rounds_loop1:
-        # a += b; d ^= a; d = ROTW16(d);
-        PADDD(xmm_v0, xmm_v1)
-        PXOR(xmm_v3, xmm_v0)
-        ROTW16_sse2(xmm_tmp, xmm_v3)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        PADDD(xmm_v2, xmm_v3)
-        PXOR(xmm_v1, xmm_v2)
-        ROTW12_sse2(xmm_tmp, xmm_v1)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        PADDD(xmm_v0, xmm_v1)
-        PXOR(xmm_v3, xmm_v0)
-        ROTW8_sse2(xmm_tmp, xmm_v3)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        PADDD(xmm_v2, xmm_v3)
-        PXOR(xmm_v1, xmm_v2)
-        ROTW7_sse2(xmm_tmp, xmm_v1)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        PSHUFD(xmm_v1, xmm_v1, 0x39)
-        PSHUFD(xmm_v2, xmm_v2, 0x4e)
-        PSHUFD(xmm_v3, xmm_v3, 0x93)
-
-        # a += b; d ^= a; d = ROTW16(d);
-        PADDD(xmm_v0, xmm_v1)
-        PXOR(xmm_v3, xmm_v0)
-        ROTW16_sse2(xmm_tmp, xmm_v3)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        PADDD(xmm_v2, xmm_v3)
-        PXOR(xmm_v1, xmm_v2)
-        ROTW12_sse2(xmm_tmp, xmm_v1)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        PADDD(xmm_v0, xmm_v1)
-        PXOR(xmm_v3, xmm_v0)
-        ROTW8_sse2(xmm_tmp, xmm_v3)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        PADDD(xmm_v2, xmm_v3)
-        PXOR(xmm_v1, xmm_v2)
-        ROTW7_sse2(xmm_tmp, xmm_v1)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        PSHUFD(xmm_v1, xmm_v1, 0x93)
-        PSHUFD(xmm_v2, xmm_v2, 0x4e)
-        PSHUFD(xmm_v3, xmm_v3, 0x39)
-
-        SUB(reg_rounds, 2)
-        JNZ(rounds_loop1.begin)
-
-    PADDD(xmm_v0, xmm_s0)
-    PADDD(xmm_v1, xmm_s1)
-    PADDD(xmm_v2, xmm_s2)
-    PADDD(xmm_v3, xmm_s3)
-    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
-    PADDQ(xmm_s3, xmm_one)
-
-    LABEL(out_serial)
-
-    # Write back the updated counter.  Stoping at 2^70 bytes is the user's
-    # problem, not mine.  (Skipped if there's exactly a multiple of 4 blocks
-    # because the counter is incremented in memory while looping.)
-    MOVDQU(mem_s3, xmm_s3)
-
-    LABEL(out)
-
-    # Paranoia, cleanse the scratch space.
-    PXOR(xmm_v0, xmm_v0)
-    MOVDQA(mem_tmp0, xmm_v0)
-
-    # Remove our stack allocation.
-    MOV(registers.rsp, reg_sp_save)
-
-    RETURN()
-
-#
-# AVX2 helpers.  Like the SSE2 equivalents, the scratch register is explicit,
-# and more helpers are used to increase readability for destructive operations.
-#
-# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFFB.
-#
-
-def ADD_avx2(dst, src):
-    VPADDD(dst, dst, src)
-
-def XOR_avx2(dst, src):
-    VPXOR(dst, dst, src)
-
-def ROTW16_avx2(tmp, d):
-    VPSLLD(tmp, d, 16)
-    VPSRLD(d, d, 16)
-    XOR_avx2(d, tmp)
-
-def ROTW12_avx2(tmp, b):
-    VPSLLD(tmp, b, 12)
-    VPSRLD(b, b, 20)
-    XOR_avx2(b, tmp)
-
-def ROTW8_avx2(tmp, d):
-    VPSLLD(tmp, d, 8)
-    VPSRLD(d, d, 24)
-    XOR_avx2(d, tmp)
-
-def ROTW7_avx2(tmp, b):
-    VPSLLD(tmp, b, 7)
-    VPSRLD(b, b, 25)
-    XOR_avx2(b, tmp)
-
-def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
-    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
-    VPERM2I128(tmp, v0, v1, 0x20)
-    VPXOR(tmp, tmp, [inp+d])
-    VMOVDQU([outp+d], tmp)
-
-    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
-    VPERM2I128(tmp, v2, v3, 0x20)
-    VPXOR(tmp, tmp, [inp+d+32])
-    VMOVDQU([outp+d+32], tmp)
-
-    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
-    VPERM2I128(tmp, v0, v1, 0x31)
-    VPXOR(tmp, tmp, [inp+d+64])
-    VMOVDQU([outp+d+64], tmp)
-
-    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
-    VPERM2I128(tmp, v2, v3, 0x31)
-    VPXOR(tmp, tmp, [inp+d+96])
-    VMOVDQU([outp+d+96], tmp)
-
-# AVX2 ChaCha20 (aka avx2).  Does not handle partial blocks, will process
-# 8/4/2 blocks at a time.
-with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
-    reg_x = GeneralPurposeRegister64()
-    reg_inp = GeneralPurposeRegister64()
-    reg_outp = GeneralPurposeRegister64()
-    reg_blocks = GeneralPurposeRegister64()
-    reg_sp_save = GeneralPurposeRegister64()
-
-    LOAD.ARGUMENT(reg_x, x)
-    LOAD.ARGUMENT(reg_inp, inp)
-    LOAD.ARGUMENT(reg_outp, outp)
-    LOAD.ARGUMENT(reg_blocks, nrBlocks)
-
-    # Align the stack to a 32 byte boundary.
-    MOV(reg_sp_save, registers.rsp)
-    AND(registers.rsp, 0xffffffffffffffe0)
-    SUB(registers.rsp, 0x20)
-
-    x_s0 = [reg_x]           # (Memory) Cipher state [0..3]
-    x_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
-    x_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
-    x_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
-
-    ymm_v0 = YMMRegister()
-    ymm_v1 = YMMRegister()
-    ymm_v2 = YMMRegister()
-    ymm_v3 = YMMRegister()
-
-    ymm_v4 = YMMRegister()
-    ymm_v5 = YMMRegister()
-    ymm_v6 = YMMRegister()
-    ymm_v7 = YMMRegister()
-
-    ymm_v8 = YMMRegister()
-    ymm_v9 = YMMRegister()
-    ymm_v10 = YMMRegister()
-    ymm_v11 = YMMRegister()
-
-    ymm_v12 = YMMRegister()
-    ymm_v13 = YMMRegister()
-    ymm_v14 = YMMRegister()
-    ymm_v15 = YMMRegister()
-
-    ymm_tmp0 = ymm_v12
-
-    # Allocate the neccecary stack space for the counter vector and two ymm
-    # registers that we will spill.
-    SUB(registers.rsp, 96)
-    mem_tmp0 = [registers.rsp+64]  # (Stack) Scratch space.
-    mem_s3 = [registers.rsp+32]    # (Stack) Working copy of s3. (8x)
-    mem_inc = [registers.rsp]      # (Stack) Counter increment vector.
-
-    # Increment the counter for one side of the state vector.
-    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
-    VMOVDQU(mem_inc, ymm_tmp0)
-    reg_tmp = GeneralPurposeRegister32()
-    MOV(reg_tmp, 0x00000001)
-    MOV([registers.rsp+16], reg_tmp)
-    VBROADCASTI128(ymm_v3, x_s3)
-    VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
-    VMOVDQA(mem_s3, ymm_v3)
-
-    # As we process 2xN blocks at a time, so the counter increment for both
-    # sides of the state vector is 2.
-    MOV(reg_tmp, 0x00000002)
-    MOV([registers.rsp], reg_tmp)
-    MOV([registers.rsp+16], reg_tmp)
-
-    out_write_even = Label()
-    out_write_odd = Label()
-
-    #
-    # 8 blocks at a time.  Ted Krovetz's avx2 code does not do this, but it's
-    # a decent gain despite all the pain...
-    #
-
-    reg_rounds = GeneralPurposeRegister64()
-
-    vector_loop8 = Loop()
-    SUB(reg_blocks, 8)
-    JB(vector_loop8.end)
-    with vector_loop8:
-        VBROADCASTI128(ymm_v0, x_s0)
-        VBROADCASTI128(ymm_v1, x_s1)
-        VBROADCASTI128(ymm_v2, x_s2)
-        VMOVDQA(ymm_v3, mem_s3)
-
-        VMOVDQA(ymm_v4, ymm_v0)
-        VMOVDQA(ymm_v5, ymm_v1)
-        VMOVDQA(ymm_v6, ymm_v2)
-        VPADDQ(ymm_v7, ymm_v3, mem_inc)
-
-        VMOVDQA(ymm_v8, ymm_v0)
-        VMOVDQA(ymm_v9, ymm_v1)
-        VMOVDQA(ymm_v10, ymm_v2)
-        VPADDQ(ymm_v11, ymm_v7, mem_inc)
-
-        VMOVDQA(ymm_v12, ymm_v0)
-        VMOVDQA(ymm_v13, ymm_v1)
-        VMOVDQA(ymm_v14, ymm_v2)
-        VPADDQ(ymm_v15, ymm_v11, mem_inc)
-
-        MOV(reg_rounds, 20)
-        rounds_loop8 = Loop()
-        with rounds_loop8:
-            # a += b; d ^= a; d = ROTW16(d);
-            ADD_avx2(ymm_v0, ymm_v1)
-            ADD_avx2(ymm_v4, ymm_v5)
-            ADD_avx2(ymm_v8, ymm_v9)
-            ADD_avx2(ymm_v12, ymm_v13)
-            XOR_avx2(ymm_v3, ymm_v0)
-            XOR_avx2(ymm_v7, ymm_v4)
-            XOR_avx2(ymm_v11, ymm_v8)
-            XOR_avx2(ymm_v15, ymm_v12)
-
-            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
-
-            ROTW16_avx2(ymm_tmp0, ymm_v3)
-            ROTW16_avx2(ymm_tmp0, ymm_v7)
-            ROTW16_avx2(ymm_tmp0, ymm_v11)
-            ROTW16_avx2(ymm_tmp0, ymm_v15)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            ADD_avx2(ymm_v2, ymm_v3)
-            ADD_avx2(ymm_v6, ymm_v7)
-            ADD_avx2(ymm_v10, ymm_v11)
-            ADD_avx2(ymm_v14, ymm_v15)
-            XOR_avx2(ymm_v1, ymm_v2)
-            XOR_avx2(ymm_v5, ymm_v6)
-            XOR_avx2(ymm_v9, ymm_v10)
-            XOR_avx2(ymm_v13, ymm_v14)
-            ROTW12_avx2(ymm_tmp0, ymm_v1)
-            ROTW12_avx2(ymm_tmp0, ymm_v5)
-            ROTW12_avx2(ymm_tmp0, ymm_v9)
-            ROTW12_avx2(ymm_tmp0, ymm_v13)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
-
-            ADD_avx2(ymm_v0, ymm_v1)
-            ADD_avx2(ymm_v4, ymm_v5)
-            ADD_avx2(ymm_v8, ymm_v9)
-            ADD_avx2(ymm_v12, ymm_v13)
-            XOR_avx2(ymm_v3, ymm_v0)
-            XOR_avx2(ymm_v7, ymm_v4)
-            XOR_avx2(ymm_v11, ymm_v8)
-            XOR_avx2(ymm_v15, ymm_v12)
-
-            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
-
-            ROTW8_avx2(ymm_tmp0, ymm_v3)
-            ROTW8_avx2(ymm_tmp0, ymm_v7)
-            ROTW8_avx2(ymm_tmp0, ymm_v11)
-            ROTW8_avx2(ymm_tmp0, ymm_v15)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            ADD_avx2(ymm_v2, ymm_v3)
-            ADD_avx2(ymm_v6, ymm_v7)
-            ADD_avx2(ymm_v10, ymm_v11)
-            ADD_avx2(ymm_v14, ymm_v15)
-            XOR_avx2(ymm_v1, ymm_v2)
-            XOR_avx2(ymm_v5, ymm_v6)
-            XOR_avx2(ymm_v9, ymm_v10)
-            XOR_avx2(ymm_v13, ymm_v14)
-            ROTW7_avx2(ymm_tmp0, ymm_v1)
-            ROTW7_avx2(ymm_tmp0, ymm_v5)
-            ROTW7_avx2(ymm_tmp0, ymm_v9)
-            ROTW7_avx2(ymm_tmp0, ymm_v13)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            VPSHUFD(ymm_v1, ymm_v1, 0x39)
-            VPSHUFD(ymm_v5, ymm_v5, 0x39)
-            VPSHUFD(ymm_v9, ymm_v9, 0x39)
-            VPSHUFD(ymm_v13, ymm_v13, 0x39)
-            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
-            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
-            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
-            VPSHUFD(ymm_v3, ymm_v3, 0x93)
-            VPSHUFD(ymm_v7, ymm_v7, 0x93)
-            VPSHUFD(ymm_v11, ymm_v11, 0x93)
-            VPSHUFD(ymm_v15, ymm_v15, 0x93)
-
-            # a += b; d ^= a; d = ROTW16(d);
-            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
-
-            ADD_avx2(ymm_v0, ymm_v1)
-            ADD_avx2(ymm_v4, ymm_v5)
-            ADD_avx2(ymm_v8, ymm_v9)
-            ADD_avx2(ymm_v12, ymm_v13)
-            XOR_avx2(ymm_v3, ymm_v0)
-            XOR_avx2(ymm_v7, ymm_v4)
-            XOR_avx2(ymm_v11, ymm_v8)
-            XOR_avx2(ymm_v15, ymm_v12)
-
-            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
-
-            ROTW16_avx2(ymm_tmp0, ymm_v3)
-            ROTW16_avx2(ymm_tmp0, ymm_v7)
-            ROTW16_avx2(ymm_tmp0, ymm_v11)
-            ROTW16_avx2(ymm_tmp0, ymm_v15)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            ADD_avx2(ymm_v2, ymm_v3)
-            ADD_avx2(ymm_v6, ymm_v7)
-            ADD_avx2(ymm_v10, ymm_v11)
-            ADD_avx2(ymm_v14, ymm_v15)
-            XOR_avx2(ymm_v1, ymm_v2)
-            XOR_avx2(ymm_v5, ymm_v6)
-            XOR_avx2(ymm_v9, ymm_v10)
-            XOR_avx2(ymm_v13, ymm_v14)
-            ROTW12_avx2(ymm_tmp0, ymm_v1)
-            ROTW12_avx2(ymm_tmp0, ymm_v5)
-            ROTW12_avx2(ymm_tmp0, ymm_v9)
-            ROTW12_avx2(ymm_tmp0, ymm_v13)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
-
-            ADD_avx2(ymm_v0, ymm_v1)
-            ADD_avx2(ymm_v4, ymm_v5)
-            ADD_avx2(ymm_v8, ymm_v9)
-            ADD_avx2(ymm_v12, ymm_v13)
-            XOR_avx2(ymm_v3, ymm_v0)
-            XOR_avx2(ymm_v7, ymm_v4)
-            XOR_avx2(ymm_v11, ymm_v8)
-            XOR_avx2(ymm_v15, ymm_v12)
-
-            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
-
-            ROTW8_avx2(ymm_tmp0, ymm_v3)
-            ROTW8_avx2(ymm_tmp0, ymm_v7)
-            ROTW8_avx2(ymm_tmp0, ymm_v11)
-            ROTW8_avx2(ymm_tmp0, ymm_v15)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            ADD_avx2(ymm_v2, ymm_v3)
-            ADD_avx2(ymm_v6, ymm_v7)
-            ADD_avx2(ymm_v10, ymm_v11)
-            ADD_avx2(ymm_v14, ymm_v15)
-            XOR_avx2(ymm_v1, ymm_v2)
-            XOR_avx2(ymm_v5, ymm_v6)
-            XOR_avx2(ymm_v9, ymm_v10)
-            XOR_avx2(ymm_v13, ymm_v14)
-            ROTW7_avx2(ymm_tmp0, ymm_v1)
-            ROTW7_avx2(ymm_tmp0, ymm_v5)
-            ROTW7_avx2(ymm_tmp0, ymm_v9)
-            ROTW7_avx2(ymm_tmp0, ymm_v13)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            VPSHUFD(ymm_v1, ymm_v1, 0x93)
-            VPSHUFD(ymm_v5, ymm_v5, 0x93)
-            VPSHUFD(ymm_v9, ymm_v9, 0x93)
-            VPSHUFD(ymm_v13, ymm_v13, 0x93)
-            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
-            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
-            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
-            VPSHUFD(ymm_v3, ymm_v3, 0x39)
-            VPSHUFD(ymm_v7, ymm_v7, 0x39)
-            VPSHUFD(ymm_v11, ymm_v11, 0x39)
-            VPSHUFD(ymm_v15, ymm_v15, 0x39)
-
-            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
-
-            SUB(reg_rounds, 2)
-            JNZ(rounds_loop8.begin)
-
-        # ymm_v12 is in mem_tmp0 and is current....
-
-        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
-        VBROADCASTI128(ymm_tmp0, x_s0)
-        ADD_avx2(ymm_v0, ymm_tmp0)
-        ADD_avx2(ymm_v4, ymm_tmp0)
-        ADD_avx2(ymm_v8, ymm_tmp0)
-        ADD_avx2(ymm_tmp0, mem_tmp0)
-        VMOVDQA(mem_tmp0, ymm_tmp0)
-
-        VBROADCASTI128(ymm_tmp0, x_s1)
-        ADD_avx2(ymm_v1, ymm_tmp0)
-        ADD_avx2(ymm_v5, ymm_tmp0)
-        ADD_avx2(ymm_v9, ymm_tmp0)
-        ADD_avx2(ymm_v13, ymm_tmp0)
-
-        VBROADCASTI128(ymm_tmp0, x_s2)
-        ADD_avx2(ymm_v2, ymm_tmp0)
-        ADD_avx2(ymm_v6, ymm_tmp0)
-        ADD_avx2(ymm_v10, ymm_tmp0)
-        ADD_avx2(ymm_v14, ymm_tmp0)
-
-        ADD_avx2(ymm_v3, mem_s3)
-        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
-        VMOVDQA(ymm_v3, mem_s3)
-        ADD_avx2(ymm_v3, mem_inc)
-
-        ADD_avx2(ymm_v7, ymm_v3)
-        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
-        ADD_avx2(ymm_v3, mem_inc)
-
-        ADD_avx2(ymm_v11, ymm_v3)
-        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
-        ADD_avx2(ymm_v3, mem_inc)
-
-        VMOVDQA(ymm_v12, mem_tmp0)
-        ADD_avx2(ymm_v15, ymm_v3)
-        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
-        ADD_avx2(ymm_v3, mem_inc)
-
-        VMOVDQA(mem_s3, ymm_v3)
-
-        ADD(reg_inp, 8 * 64)
-        ADD(reg_outp, 8 * 64)
-
-        SUB(reg_blocks, 8)
-        JAE(vector_loop8.begin)
-
-    # ymm_v3 contains a current copy of mem_s3 either from when it was built,
-    # or because the loop updates it.  Copy this before we mess with the block
-    # counter in case we need to write it back and return.
-    ymm_s3 = ymm_v11
-    VMOVDQA(ymm_s3, ymm_v3)
-
-    ADD(reg_blocks, 8)
-    JZ(out_write_even)
-
-    # We now actually can do everything in registers.
-    ymm_s0 = ymm_v8
-    VBROADCASTI128(ymm_s0, x_s0)
-    ymm_s1 = ymm_v9
-    VBROADCASTI128(ymm_s1, x_s1)
-    ymm_s2 = ymm_v10
-    VBROADCASTI128(ymm_s2, x_s2)
-    ymm_inc = ymm_v14
-    VMOVDQA(ymm_inc, mem_inc)
-
-    #
-    # 4 blocks at a time.
-    #
-
-    process_2_blocks = Label()
-    SUB(reg_blocks, 4)
-    JB(process_2_blocks) # < 4 blocks remaining.
-
-    VMOVDQA(ymm_v0, ymm_s0)
-    VMOVDQA(ymm_v1, ymm_s1)
-    VMOVDQA(ymm_v2, ymm_s2)
-    VMOVDQA(ymm_v3, ymm_s3)
-
-    VMOVDQA(ymm_v4, ymm_v0)
-    VMOVDQA(ymm_v5, ymm_v1)
-    VMOVDQA(ymm_v6, ymm_v2)
-    VPADDQ(ymm_v7, ymm_v3, ymm_inc)
-
-    MOV(reg_rounds, 20)
-    rounds_loop4 = Loop()
-    with rounds_loop4:
-        # a += b; d ^= a; d = ROTW16(d);
-        ADD_avx2(ymm_v0, ymm_v1)
-        ADD_avx2(ymm_v4, ymm_v5)
-        XOR_avx2(ymm_v3, ymm_v0)
-        XOR_avx2(ymm_v7, ymm_v4)
-        ROTW16_avx2(ymm_tmp0, ymm_v3)
-        ROTW16_avx2(ymm_tmp0, ymm_v7)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        ADD_avx2(ymm_v2, ymm_v3)
-        ADD_avx2(ymm_v6, ymm_v7)
-        XOR_avx2(ymm_v1, ymm_v2)
-        XOR_avx2(ymm_v5, ymm_v6)
-        ROTW12_avx2(ymm_tmp0, ymm_v1)
-        ROTW12_avx2(ymm_tmp0, ymm_v5)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        ADD_avx2(ymm_v0, ymm_v1)
-        ADD_avx2(ymm_v4, ymm_v5)
-        XOR_avx2(ymm_v3, ymm_v0)
-        XOR_avx2(ymm_v7, ymm_v4)
-        ROTW8_avx2(ymm_tmp0, ymm_v3)
-        ROTW8_avx2(ymm_tmp0, ymm_v7)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        ADD_avx2(ymm_v2, ymm_v3)
-        ADD_avx2(ymm_v6, ymm_v7)
-        XOR_avx2(ymm_v1, ymm_v2)
-        XOR_avx2(ymm_v5, ymm_v6)
-        ROTW7_avx2(ymm_tmp0, ymm_v1)
-        ROTW7_avx2(ymm_tmp0, ymm_v5)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        VPSHUFD(ymm_v1, ymm_v1, 0x39)
-        VPSHUFD(ymm_v5, ymm_v5, 0x39)
-        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
-        VPSHUFD(ymm_v3, ymm_v3, 0x93)
-        VPSHUFD(ymm_v7, ymm_v7, 0x93)
-
-        # a += b; d ^= a; d = ROTW16(d);
-        ADD_avx2(ymm_v0, ymm_v1)
-        ADD_avx2(ymm_v4, ymm_v5)
-        XOR_avx2(ymm_v3, ymm_v0)
-        XOR_avx2(ymm_v7, ymm_v4)
-        ROTW16_avx2(ymm_tmp0, ymm_v3)
-        ROTW16_avx2(ymm_tmp0, ymm_v7)
-
-        # c += d; b ^= c; b = ROTW12(b);
-        ADD_avx2(ymm_v2, ymm_v3)
-        ADD_avx2(ymm_v6, ymm_v7)
-        XOR_avx2(ymm_v1, ymm_v2)
-        XOR_avx2(ymm_v5, ymm_v6)
-        ROTW12_avx2(ymm_tmp0, ymm_v1)
-        ROTW12_avx2(ymm_tmp0, ymm_v5)
-
-        # a += b; d ^= a; d = ROTW8(d);
-        ADD_avx2(ymm_v0, ymm_v1)
-        ADD_avx2(ymm_v4, ymm_v5)
-        XOR_avx2(ymm_v3, ymm_v0)
-        XOR_avx2(ymm_v7, ymm_v4)
-        ROTW8_avx2(ymm_tmp0, ymm_v3)
-        ROTW8_avx2(ymm_tmp0, ymm_v7)
-
-        # c += d; b ^= c; b = ROTW7(b)
-        ADD_avx2(ymm_v2, ymm_v3)
-        ADD_avx2(ymm_v6, ymm_v7)
-        XOR_avx2(ymm_v1, ymm_v2)
-        XOR_avx2(ymm_v5, ymm_v6)
-        ROTW7_avx2(ymm_tmp0, ymm_v1)
-        ROTW7_avx2(ymm_tmp0, ymm_v5)
-
-        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-        VPSHUFD(ymm_v1, ymm_v1, 0x93)
-        VPSHUFD(ymm_v5, ymm_v5, 0x93)
-        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
-        VPSHUFD(ymm_v3, ymm_v3, 0x39)
-        VPSHUFD(ymm_v7, ymm_v7, 0x39)
-
-        SUB(reg_rounds, 2)
-        JNZ(rounds_loop4.begin)
-
-    ADD_avx2(ymm_v0, ymm_s0)
-    ADD_avx2(ymm_v1, ymm_s1)
-    ADD_avx2(ymm_v2, ymm_s2)
-    ADD_avx2(ymm_v3, ymm_s3)
-    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
-    ADD_avx2(ymm_s3, ymm_inc)
-
-    ADD_avx2(ymm_v4, ymm_s0)
-    ADD_avx2(ymm_v5, ymm_s1)
-    ADD_avx2(ymm_v6, ymm_s2)
-    ADD_avx2(ymm_v7, ymm_s3)
-    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
-    ADD_avx2(ymm_s3, ymm_inc)
-
-    ADD(reg_inp, 4 * 64)
-    ADD(reg_outp, 4 * 64)
-    SUB(reg_blocks, 4)
-
-    LABEL(process_2_blocks)
-    ADD(reg_blocks, 4)
-    JZ(out_write_even) # 0 blocks left.
-
-    #
-    # 2/1 blocks at a time.  The two codepaths are unified because
-    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
-    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
-    # much.
-    #
-
-    vector_loop2 = Loop()
-    with vector_loop2:
-        VMOVDQA(ymm_v0, ymm_s0)
-        VMOVDQA(ymm_v1, ymm_s1)
-        VMOVDQA(ymm_v2, ymm_s2)
-        VMOVDQA(ymm_v3, ymm_s3)
-
-        MOV(reg_rounds, 20)
-        rounds_loop2 = Loop()
-        with rounds_loop2:
-            # a += b; d ^= a; d = ROTW16(d);
-            ADD_avx2(ymm_v0, ymm_v1)
-            XOR_avx2(ymm_v3, ymm_v0)
-            ROTW16_avx2(ymm_tmp0, ymm_v3)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            ADD_avx2(ymm_v2, ymm_v3)
-            XOR_avx2(ymm_v1, ymm_v2)
-            ROTW12_avx2(ymm_tmp0, ymm_v1)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            ADD_avx2(ymm_v0, ymm_v1)
-            XOR_avx2(ymm_v3, ymm_v0)
-            ROTW8_avx2(ymm_tmp0, ymm_v3)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            ADD_avx2(ymm_v2, ymm_v3)
-            XOR_avx2(ymm_v1, ymm_v2)
-            ROTW7_avx2(ymm_tmp0, ymm_v1)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            VPSHUFD(ymm_v1, ymm_v1, 0x39)
-            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-            VPSHUFD(ymm_v3, ymm_v3, 0x93)
-
-            # a += b; d ^= a; d = ROTW16(d);
-            ADD_avx2(ymm_v0, ymm_v1)
-            XOR_avx2(ymm_v3, ymm_v0)
-            ROTW16_avx2(ymm_tmp0, ymm_v3)
-
-            # c += d; b ^= c; b = ROTW12(b);
-            ADD_avx2(ymm_v2, ymm_v3)
-            XOR_avx2(ymm_v1, ymm_v2)
-            ROTW12_avx2(ymm_tmp0, ymm_v1)
-
-            # a += b; d ^= a; d = ROTW8(d);
-            ADD_avx2(ymm_v0, ymm_v1)
-            XOR_avx2(ymm_v3, ymm_v0)
-            ROTW8_avx2(ymm_tmp0, ymm_v3)
-
-            # c += d; b ^= c; b = ROTW7(b)
-            ADD_avx2(ymm_v2, ymm_v3)
-            XOR_avx2(ymm_v1, ymm_v2)
-            ROTW7_avx2(ymm_tmp0, ymm_v1)
-
-            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
-            VPSHUFD(ymm_v1, ymm_v1, 0x93)
-            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
-            VPSHUFD(ymm_v3, ymm_v3, 0x39)
-
-            SUB(reg_rounds, 2)
-            JNZ(rounds_loop2.begin)
-
-        ADD_avx2(ymm_v0, ymm_s0)
-        ADD_avx2(ymm_v1, ymm_s1)
-        ADD_avx2(ymm_v2, ymm_s2)
-        ADD_avx2(ymm_v3, ymm_s3)
-
-        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
-        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
-        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
-        VMOVDQU([reg_outp], ymm_tmp0)
-
-        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
-        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
-        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
-        VMOVDQU([reg_outp+32], ymm_tmp0)
-
-        SUB(reg_blocks, 1)
-        JZ(out_write_odd)
-
-        ADD_avx2(ymm_s3, ymm_inc)
-
-        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
-        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
-        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
-        VMOVDQU([reg_outp+64], ymm_tmp0)
-
-        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
-        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
-        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
-        VMOVDQU([reg_outp+96], ymm_tmp0)
-
-        SUB(reg_blocks, 1)
-        JZ(out_write_even)
-
-        ADD(reg_inp, 2 * 64)
-        ADD(reg_outp, 2 * 64)
-        JMP(vector_loop2.begin)
-
-    LABEL(out_write_odd)
-    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
-
-    LABEL(out_write_even)
-    VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_v3
-
-    # Paranoia, cleanse the scratch space.
-    VPXOR(ymm_v0, ymm_v0, ymm_v0)
-    VMOVDQA(mem_tmp0, ymm_v0)
-    VMOVDQA(mem_s3, ymm_v0)
-
-    # Clear all YMM (and XMM) registers.
-    VZEROALL()
-
-    # Remove our stack allocation.
-    MOV(registers.rsp, reg_sp_save)
-
-    RETURN()
-
-#
-# CPUID
-#
-
-cpuidParams = Argument(ptr(uint32_t))
-
-with Function("cpuidAmd64", (cpuidParams,)):
-    reg_params = registers.r15
-    LOAD.ARGUMENT(reg_params, cpuidParams)
-
-    MOV(registers.eax, [reg_params])
-    MOV(registers.ecx, [reg_params+8])
-
-    CPUID()
-
-    MOV([reg_params], registers.eax)
-    MOV([reg_params+4], registers.ebx)
-    MOV([reg_params+8], registers.ecx)
-    MOV([reg_params+12], registers.edx)
-
-    RETURN()
-
-#
-# XGETBV (ECX = 0)
-#
-
-xcrVec = Argument(ptr(uint32_t))
-
-with Function("xgetbv0Amd64", (xcrVec,)):
-    reg_vec = GeneralPurposeRegister64()
-
-    LOAD.ARGUMENT(reg_vec, xcrVec)
-
-    XOR(registers.ecx, registers.ecx)
-
-    XGETBV()
-
-    MOV([reg_vec], registers.eax)
-    MOV([reg_vec+4], registers.edx)
-
-    RETURN()
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.s b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s
deleted file mode 100644
index e3792af..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20_amd64.s
+++ /dev/null
@@ -1,1180 +0,0 @@
-// +build !noasm
-// Generated by PeachPy 0.2.0 from chacha20_amd64.py
-
-
-// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
-TEXT ·blocksAmd64SSE2(SB),4,$0-32
-	MOVQ x+0(FP), AX
-	MOVQ inp+8(FP), BX
-	MOVQ outp+16(FP), CX
-	MOVQ nrBlocks+24(FP), DX
-	MOVQ SP, DI
-	ANDQ $18446744073709551584, SP
-	SUBQ $32, SP
-	PXOR X0, X0
-	SUBQ $32, SP
-	MOVO X0, 0(SP)
-	MOVL $1, SI
-	MOVL SI, 0(SP)
-	SUBQ $4, DX
-	JCS vector_loop4_end
-vector_loop4_begin:
-		MOVOU 0(AX), X0
-		MOVOU 16(AX), X1
-		MOVOU 32(AX), X2
-		MOVOU 48(AX), X3
-		MOVO X0, X4
-		MOVO X1, X5
-		MOVO X2, X6
-		MOVO X3, X7
-		PADDQ 0(SP), X7
-		MOVO X0, X8
-		MOVO X1, X9
-		MOVO X2, X10
-		MOVO X7, X11
-		PADDQ 0(SP), X11
-		MOVO X0, X12
-		MOVO X1, X13
-		MOVO X2, X14
-		MOVO X11, X15
-		PADDQ 0(SP), X15
-		MOVQ $20, SI
-rounds_loop4_begin:
-			PADDL X1, X0
-			PADDL X5, X4
-			PADDL X9, X8
-			PADDL X13, X12
-			PXOR X0, X3
-			PXOR X4, X7
-			PXOR X8, X11
-			PXOR X12, X15
-			MOVO X12, 16(SP)
-			MOVO X3, X12
-			PSLLL $16, X12
-			PSRLL $16, X3
-			PXOR X12, X3
-			MOVO X7, X12
-			PSLLL $16, X12
-			PSRLL $16, X7
-			PXOR X12, X7
-			MOVO X11, X12
-			PSLLL $16, X12
-			PSRLL $16, X11
-			PXOR X12, X11
-			MOVO X15, X12
-			PSLLL $16, X12
-			PSRLL $16, X15
-			PXOR X12, X15
-			PADDL X3, X2
-			PADDL X7, X6
-			PADDL X11, X10
-			PADDL X15, X14
-			PXOR X2, X1
-			PXOR X6, X5
-			PXOR X10, X9
-			PXOR X14, X13
-			MOVO X1, X12
-			PSLLL $12, X12
-			PSRLL $20, X1
-			PXOR X12, X1
-			MOVO X5, X12
-			PSLLL $12, X12
-			PSRLL $20, X5
-			PXOR X12, X5
-			MOVO X9, X12
-			PSLLL $12, X12
-			PSRLL $20, X9
-			PXOR X12, X9
-			MOVO X13, X12
-			PSLLL $12, X12
-			PSRLL $20, X13
-			PXOR X12, X13
-			MOVO 16(SP), X12
-			PADDL X1, X0
-			PADDL X5, X4
-			PADDL X9, X8
-			PADDL X13, X12
-			PXOR X0, X3
-			PXOR X4, X7
-			PXOR X8, X11
-			PXOR X12, X15
-			MOVO X12, 16(SP)
-			MOVO X3, X12
-			PSLLL $8, X12
-			PSRLL $24, X3
-			PXOR X12, X3
-			MOVO X7, X12
-			PSLLL $8, X12
-			PSRLL $24, X7
-			PXOR X12, X7
-			MOVO X11, X12
-			PSLLL $8, X12
-			PSRLL $24, X11
-			PXOR X12, X11
-			MOVO X15, X12
-			PSLLL $8, X12
-			PSRLL $24, X15
-			PXOR X12, X15
-			PADDL X3, X2
-			PADDL X7, X6
-			PADDL X11, X10
-			PADDL X15, X14
-			PXOR X2, X1
-			PXOR X6, X5
-			PXOR X10, X9
-			PXOR X14, X13
-			MOVO X1, X12
-			PSLLL $7, X12
-			PSRLL $25, X1
-			PXOR X12, X1
-			MOVO X5, X12
-			PSLLL $7, X12
-			PSRLL $25, X5
-			PXOR X12, X5
-			MOVO X9, X12
-			PSLLL $7, X12
-			PSRLL $25, X9
-			PXOR X12, X9
-			MOVO X13, X12
-			PSLLL $7, X12
-			PSRLL $25, X13
-			PXOR X12, X13
-			PSHUFL $57, X1, X1
-			PSHUFL $57, X5, X5
-			PSHUFL $57, X9, X9
-			PSHUFL $57, X13, X13
-			PSHUFL $78, X2, X2
-			PSHUFL $78, X6, X6
-			PSHUFL $78, X10, X10
-			PSHUFL $78, X14, X14
-			PSHUFL $147, X3, X3
-			PSHUFL $147, X7, X7
-			PSHUFL $147, X11, X11
-			PSHUFL $147, X15, X15
-			MOVO 16(SP), X12
-			PADDL X1, X0
-			PADDL X5, X4
-			PADDL X9, X8
-			PADDL X13, X12
-			PXOR X0, X3
-			PXOR X4, X7
-			PXOR X8, X11
-			PXOR X12, X15
-			MOVO X12, 16(SP)
-			MOVO X3, X12
-			PSLLL $16, X12
-			PSRLL $16, X3
-			PXOR X12, X3
-			MOVO X7, X12
-			PSLLL $16, X12
-			PSRLL $16, X7
-			PXOR X12, X7
-			MOVO X11, X12
-			PSLLL $16, X12
-			PSRLL $16, X11
-			PXOR X12, X11
-			MOVO X15, X12
-			PSLLL $16, X12
-			PSRLL $16, X15
-			PXOR X12, X15
-			PADDL X3, X2
-			PADDL X7, X6
-			PADDL X11, X10
-			PADDL X15, X14
-			PXOR X2, X1
-			PXOR X6, X5
-			PXOR X10, X9
-			PXOR X14, X13
-			MOVO X1, X12
-			PSLLL $12, X12
-			PSRLL $20, X1
-			PXOR X12, X1
-			MOVO X5, X12
-			PSLLL $12, X12
-			PSRLL $20, X5
-			PXOR X12, X5
-			MOVO X9, X12
-			PSLLL $12, X12
-			PSRLL $20, X9
-			PXOR X12, X9
-			MOVO X13, X12
-			PSLLL $12, X12
-			PSRLL $20, X13
-			PXOR X12, X13
-			MOVO 16(SP), X12
-			PADDL X1, X0
-			PADDL X5, X4
-			PADDL X9, X8
-			PADDL X13, X12
-			PXOR X0, X3
-			PXOR X4, X7
-			PXOR X8, X11
-			PXOR X12, X15
-			MOVO X12, 16(SP)
-			MOVO X3, X12
-			PSLLL $8, X12
-			PSRLL $24, X3
-			PXOR X12, X3
-			MOVO X7, X12
-			PSLLL $8, X12
-			PSRLL $24, X7
-			PXOR X12, X7
-			MOVO X11, X12
-			PSLLL $8, X12
-			PSRLL $24, X11
-			PXOR X12, X11
-			MOVO X15, X12
-			PSLLL $8, X12
-			PSRLL $24, X15
-			PXOR X12, X15
-			PADDL X3, X2
-			PADDL X7, X6
-			PADDL X11, X10
-			PADDL X15, X14
-			PXOR X2, X1
-			PXOR X6, X5
-			PXOR X10, X9
-			PXOR X14, X13
-			MOVO X1, X12
-			PSLLL $7, X12
-			PSRLL $25, X1
-			PXOR X12, X1
-			MOVO X5, X12
-			PSLLL $7, X12
-			PSRLL $25, X5
-			PXOR X12, X5
-			MOVO X9, X12
-			PSLLL $7, X12
-			PSRLL $25, X9
-			PXOR X12, X9
-			MOVO X13, X12
-			PSLLL $7, X12
-			PSRLL $25, X13
-			PXOR X12, X13
-			PSHUFL $147, X1, X1
-			PSHUFL $147, X5, X5
-			PSHUFL $147, X9, X9
-			PSHUFL $147, X13, X13
-			PSHUFL $78, X2, X2
-			PSHUFL $78, X6, X6
-			PSHUFL $78, X10, X10
-			PSHUFL $78, X14, X14
-			PSHUFL $57, X3, X3
-			PSHUFL $57, X7, X7
-			PSHUFL $57, X11, X11
-			PSHUFL $57, X15, X15
-			MOVO 16(SP), X12
-			SUBQ $2, SI
-			JNE rounds_loop4_begin
-		MOVO X12, 16(SP)
-		PADDL 0(AX), X0
-		PADDL 16(AX), X1
-		PADDL 32(AX), X2
-		PADDL 48(AX), X3
-		MOVOU 0(BX), X12
-		PXOR X0, X12
-		MOVOU X12, 0(CX)
-		MOVOU 16(BX), X12
-		PXOR X1, X12
-		MOVOU X12, 16(CX)
-		MOVOU 32(BX), X12
-		PXOR X2, X12
-		MOVOU X12, 32(CX)
-		MOVOU 48(BX), X12
-		PXOR X3, X12
-		MOVOU X12, 48(CX)
-		MOVOU 48(AX), X3
-		PADDQ 0(SP), X3
-		PADDL 0(AX), X4
-		PADDL 16(AX), X5
-		PADDL 32(AX), X6
-		PADDL X3, X7
-		MOVOU 64(BX), X12
-		PXOR X4, X12
-		MOVOU X12, 64(CX)
-		MOVOU 80(BX), X12
-		PXOR X5, X12
-		MOVOU X12, 80(CX)
-		MOVOU 96(BX), X12
-		PXOR X6, X12
-		MOVOU X12, 96(CX)
-		MOVOU 112(BX), X12
-		PXOR X7, X12
-		MOVOU X12, 112(CX)
-		PADDQ 0(SP), X3
-		PADDL 0(AX), X8
-		PADDL 16(AX), X9
-		PADDL 32(AX), X10
-		PADDL X3, X11
-		MOVOU 128(BX), X12
-		PXOR X8, X12
-		MOVOU X12, 128(CX)
-		MOVOU 144(BX), X12
-		PXOR X9, X12
-		MOVOU X12, 144(CX)
-		MOVOU 160(BX), X12
-		PXOR X10, X12
-		MOVOU X12, 160(CX)
-		MOVOU 176(BX), X12
-		PXOR X11, X12
-		MOVOU X12, 176(CX)
-		PADDQ 0(SP), X3
-		MOVO 16(SP), X12
-		PADDL 0(AX), X12
-		PADDL 16(AX), X13
-		PADDL 32(AX), X14
-		PADDL X3, X15
-		MOVOU 192(BX), X0
-		PXOR X12, X0
-		MOVOU X0, 192(CX)
-		MOVOU 208(BX), X0
-		PXOR X13, X0
-		MOVOU X0, 208(CX)
-		MOVOU 224(BX), X0
-		PXOR X14, X0
-		MOVOU X0, 224(CX)
-		MOVOU 240(BX), X0
-		PXOR X15, X0
-		MOVOU X0, 240(CX)
-		PADDQ 0(SP), X3
-		MOVOU X3, 48(AX)
-		ADDQ $256, BX
-		ADDQ $256, CX
-		SUBQ $4, DX
-		JCC vector_loop4_begin
-vector_loop4_end:
-	ADDQ $4, DX
-	JEQ out
-	MOVOU 0(AX), X8
-	MOVOU 16(AX), X9
-	MOVOU 32(AX), X10
-	MOVOU 48(AX), X11
-	MOVO 0(SP), X13
-	SUBQ $2, DX
-	JCS process_1_block
-	MOVO X8, X0
-	MOVO X9, X1
-	MOVO X10, X2
-	MOVO X11, X3
-	MOVO X0, X4
-	MOVO X1, X5
-	MOVO X2, X6
-	MOVO X3, X7
-	PADDQ X13, X7
-	MOVQ $20, SI
-rounds_loop2_begin:
-		PADDL X1, X0
-		PADDL X5, X4
-		PXOR X0, X3
-		PXOR X4, X7
-		MOVO X3, X12
-		PSLLL $16, X12
-		PSRLL $16, X3
-		PXOR X12, X3
-		MOVO X7, X12
-		PSLLL $16, X12
-		PSRLL $16, X7
-		PXOR X12, X7
-		PADDL X3, X2
-		PADDL X7, X6
-		PXOR X2, X1
-		PXOR X6, X5
-		MOVO X1, X12
-		PSLLL $12, X12
-		PSRLL $20, X1
-		PXOR X12, X1
-		MOVO X5, X12
-		PSLLL $12, X12
-		PSRLL $20, X5
-		PXOR X12, X5
-		PADDL X1, X0
-		PADDL X5, X4
-		PXOR X0, X3
-		PXOR X4, X7
-		MOVO X3, X12
-		PSLLL $8, X12
-		PSRLL $24, X3
-		PXOR X12, X3
-		MOVO X7, X12
-		PSLLL $8, X12
-		PSRLL $24, X7
-		PXOR X12, X7
-		PADDL X3, X2
-		PADDL X7, X6
-		PXOR X2, X1
-		PXOR X6, X5
-		MOVO X1, X12
-		PSLLL $7, X12
-		PSRLL $25, X1
-		PXOR X12, X1
-		MOVO X5, X12
-		PSLLL $7, X12
-		PSRLL $25, X5
-		PXOR X12, X5
-		PSHUFL $57, X1, X1
-		PSHUFL $57, X5, X5
-		PSHUFL $78, X2, X2
-		PSHUFL $78, X6, X6
-		PSHUFL $147, X3, X3
-		PSHUFL $147, X7, X7
-		PADDL X1, X0
-		PADDL X5, X4
-		PXOR X0, X3
-		PXOR X4, X7
-		MOVO X3, X12
-		PSLLL $16, X12
-		PSRLL $16, X3
-		PXOR X12, X3
-		MOVO X7, X12
-		PSLLL $16, X12
-		PSRLL $16, X7
-		PXOR X12, X7
-		PADDL X3, X2
-		PADDL X7, X6
-		PXOR X2, X1
-		PXOR X6, X5
-		MOVO X1, X12
-		PSLLL $12, X12
-		PSRLL $20, X1
-		PXOR X12, X1
-		MOVO X5, X12
-		PSLLL $12, X12
-		PSRLL $20, X5
-		PXOR X12, X5
-		PADDL X1, X0
-		PADDL X5, X4
-		PXOR X0, X3
-		PXOR X4, X7
-		MOVO X3, X12
-		PSLLL $8, X12
-		PSRLL $24, X3
-		PXOR X12, X3
-		MOVO X7, X12
-		PSLLL $8, X12
-		PSRLL $24, X7
-		PXOR X12, X7
-		PADDL X3, X2
-		PADDL X7, X6
-		PXOR X2, X1
-		PXOR X6, X5
-		MOVO X1, X12
-		PSLLL $7, X12
-		PSRLL $25, X1
-		PXOR X12, X1
-		MOVO X5, X12
-		PSLLL $7, X12
-		PSRLL $25, X5
-		PXOR X12, X5
-		PSHUFL $147, X1, X1
-		PSHUFL $147, X5, X5
-		PSHUFL $78, X2, X2
-		PSHUFL $78, X6, X6
-		PSHUFL $57, X3, X3
-		PSHUFL $57, X7, X7
-		SUBQ $2, SI
-		JNE rounds_loop2_begin
-	PADDL X8, X0
-	PADDL X9, X1
-	PADDL X10, X2
-	PADDL X11, X3
-	MOVOU 0(BX), X12
-	PXOR X0, X12
-	MOVOU X12, 0(CX)
-	MOVOU 16(BX), X12
-	PXOR X1, X12
-	MOVOU X12, 16(CX)
-	MOVOU 32(BX), X12
-	PXOR X2, X12
-	MOVOU X12, 32(CX)
-	MOVOU 48(BX), X12
-	PXOR X3, X12
-	MOVOU X12, 48(CX)
-	PADDQ X13, X11
-	PADDL X8, X4
-	PADDL X9, X5
-	PADDL X10, X6
-	PADDL X11, X7
-	MOVOU 64(BX), X12
-	PXOR X4, X12
-	MOVOU X12, 64(CX)
-	MOVOU 80(BX), X12
-	PXOR X5, X12
-	MOVOU X12, 80(CX)
-	MOVOU 96(BX), X12
-	PXOR X6, X12
-	MOVOU X12, 96(CX)
-	MOVOU 112(BX), X12
-	PXOR X7, X12
-	MOVOU X12, 112(CX)
-	PADDQ X13, X11
-	ADDQ $128, BX
-	ADDQ $128, CX
-	SUBQ $2, DX
-process_1_block:
-	ADDQ $2, DX
-	JEQ out_serial
-	MOVO X8, X0
-	MOVO X9, X1
-	MOVO X10, X2
-	MOVO X11, X3
-	MOVQ $20, SI
-rounds_loop1_begin:
-		PADDL X1, X0
-		PXOR X0, X3
-		MOVO X3, X12
-		PSLLL $16, X12
-		PSRLL $16, X3
-		PXOR X12, X3
-		PADDL X3, X2
-		PXOR X2, X1
-		MOVO X1, X12
-		PSLLL $12, X12
-		PSRLL $20, X1
-		PXOR X12, X1
-		PADDL X1, X0
-		PXOR X0, X3
-		MOVO X3, X12
-		PSLLL $8, X12
-		PSRLL $24, X3
-		PXOR X12, X3
-		PADDL X3, X2
-		PXOR X2, X1
-		MOVO X1, X12
-		PSLLL $7, X12
-		PSRLL $25, X1
-		PXOR X12, X1
-		PSHUFL $57, X1, X1
-		PSHUFL $78, X2, X2
-		PSHUFL $147, X3, X3
-		PADDL X1, X0
-		PXOR X0, X3
-		MOVO X3, X12
-		PSLLL $16, X12
-		PSRLL $16, X3
-		PXOR X12, X3
-		PADDL X3, X2
-		PXOR X2, X1
-		MOVO X1, X12
-		PSLLL $12, X12
-		PSRLL $20, X1
-		PXOR X12, X1
-		PADDL X1, X0
-		PXOR X0, X3
-		MOVO X3, X12
-		PSLLL $8, X12
-		PSRLL $24, X3
-		PXOR X12, X3
-		PADDL X3, X2
-		PXOR X2, X1
-		MOVO X1, X12
-		PSLLL $7, X12
-		PSRLL $25, X1
-		PXOR X12, X1
-		PSHUFL $147, X1, X1
-		PSHUFL $78, X2, X2
-		PSHUFL $57, X3, X3
-		SUBQ $2, SI
-		JNE rounds_loop1_begin
-	PADDL X8, X0
-	PADDL X9, X1
-	PADDL X10, X2
-	PADDL X11, X3
-	MOVOU 0(BX), X12
-	PXOR X0, X12
-	MOVOU X12, 0(CX)
-	MOVOU 16(BX), X12
-	PXOR X1, X12
-	MOVOU X12, 16(CX)
-	MOVOU 32(BX), X12
-	PXOR X2, X12
-	MOVOU X12, 32(CX)
-	MOVOU 48(BX), X12
-	PXOR X3, X12
-	MOVOU X12, 48(CX)
-	PADDQ X13, X11
-out_serial:
-	MOVOU X11, 48(AX)
-out:
-	PXOR X0, X0
-	MOVO X0, 16(SP)
-	MOVQ DI, SP
-	RET
-
-// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
-TEXT ·blocksAmd64AVX2(SB),4,$0-32
-	MOVQ x+0(FP), AX
-	MOVQ inp+8(FP), BX
-	MOVQ outp+16(FP), CX
-	MOVQ nrBlocks+24(FP), DX
-	MOVQ SP, DI
-	ANDQ $18446744073709551584, SP
-	SUBQ $32, SP
-	SUBQ $96, SP
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0
-	MOVL $1, SI
-	MOVL SI, 16(SP)
-	BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x30 // VBROADCASTI128 ymm1, [rax + 48]
-	BYTE $0xC5; BYTE $0xF5; BYTE $0xD4; BYTE $0x0C; BYTE $0x24 // VPADDQ ymm1, ymm1, [rsp]
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
-	MOVL $2, SI
-	MOVL SI, 0(SP)
-	MOVL SI, 16(SP)
-	SUBQ $8, DX
-	JCS vector_loop8_end
-vector_loop8_begin:
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x10 // VBROADCASTI128 ymm2, [rax]
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x10 // VBROADCASTI128 ymm3, [rax + 16]
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x60; BYTE $0x20 // VBROADCASTI128 ymm4, [rax + 32]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
-		BYTE $0xC5; BYTE $0x75; BYTE $0xD4; BYTE $0x04; BYTE $0x24 // VPADDQ ymm8, ymm1, [rsp]
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCA // VMOVDQA ymm9, ymm2
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD3 // VMOVDQA ymm10, ymm3
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xDC // VMOVDQA ymm11, ymm4
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xD4; BYTE $0x24; BYTE $0x24 // VPADDQ ymm12, ymm8, [rsp]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xC2 // VMOVDQA ymm0, ymm2
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xEB // VMOVDQA ymm13, ymm3
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF4 // VMOVDQA ymm14, ymm4
-		BYTE $0xC5; BYTE $0x1D; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm12, [rsp]
-		MOVQ $20, SI
-rounds_loop8_begin:
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
-			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
-			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
-			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
-			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
-			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
-			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
-			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
-			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
-			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
-			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
-			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
-			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x39 // VPSHUFD ymm10, ymm10, 57
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x93 // VPSHUFD ymm12, ymm12, 147
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
-			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
-			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
-			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
-			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
-			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
-			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
-			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
-			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
-			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
-			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
-			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
-			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
-			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
-			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
-			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
-			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x93 // VPSHUFD ymm10, ymm10, 147
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x39 // VPSHUFD ymm12, ymm12, 57
-			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
-			SUBQ $2, SI
-			JNE rounds_loop8_begin
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax]
-		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm2, ymm2, ymm0
-		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm5, ymm5, ymm0
-		BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC8 // VPADDD ymm9, ymm9, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xFE; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VPADDD ymm0, ymm0, [rsp + 64]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x10 // VBROADCASTI128 ymm0, [rax + 16]
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm3, ymm3, ymm0
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm6, ymm6, ymm0
-		BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm10, ymm10, ymm0
-		BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm13, ymm13, ymm0
-		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x20 // VBROADCASTI128 ymm0, [rax + 32]
-		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0
-		BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm0
-		BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm11, ymm11, ymm0
-		BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm14, ymm14, ymm0
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VPADDD ymm1, ymm1, [rsp + 32]
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm1
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
-		BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm12, ymm12, ymm1
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x20 // VPERM2I128 ymm0, ymm9, ymm10, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x20 // VPERM2I128 ymm0, ymm11, ymm12, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x31 // VPERM2I128 ymm0, ymm9, ymm10, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x31 // VPERM2I128 ymm0, ymm11, ymm12, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
-		BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xF9 // VPADDD ymm15, ymm15, ymm1
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x20 // VPERM2I128 ymm2, ymm0, ymm13, 32
-		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 384]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 384], ymm2
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x20 // VPERM2I128 ymm2, ymm14, ymm15, 32
-		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 416]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 416], ymm2
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x31 // VPERM2I128 ymm2, ymm0, ymm13, 49
-		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 448]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 448], ymm2
-		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x31 // VPERM2I128 ymm2, ymm14, ymm15, 49
-		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 480]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 480], ymm2
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
-		ADDQ $512, BX
-		ADDQ $512, CX
-		SUBQ $8, DX
-		JCC vector_loop8_begin
-vector_loop8_end:
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE1 // VMOVDQA ymm12, ymm1
-	ADDQ $8, DX
-	JEQ out_write_even
-	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x08 // VBROADCASTI128 ymm9, [rax]
-	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x10 // VBROADCASTI128 ymm10, [rax + 16]
-	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x20 // VBROADCASTI128 ymm11, [rax + 32]
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x34; BYTE $0x24 // VMOVDQA ymm14, [rsp]
-	SUBQ $4, DX
-	JCS process_2_blocks
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
-	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
-	BYTE $0xC4; BYTE $0x41; BYTE $0x75; BYTE $0xD4; BYTE $0xC6 // VPADDQ ymm8, ymm1, ymm14
-	MOVQ $20, SI
-rounds_loop4_begin:
-		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
-		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
-		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
-		BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
-		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
-		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
-		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
-		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
-		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
-		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
-		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
-		BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
-		SUBQ $2, SI
-		JNE rounds_loop4_begin
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
-	BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm9
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x4D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm10
-	BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm11
-	BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
-	BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
-	BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
-	BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
-	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
-	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
-	BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
-	ADDQ $256, BX
-	ADDQ $256, CX
-	SUBQ $4, DX
-process_2_blocks:
-	ADDQ $4, DX
-	JEQ out_write_even
-vector_loop2_begin:
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
-		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
-		MOVQ $20, SI
-rounds_loop2_begin:
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
-			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
-			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
-			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
-			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
-			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
-			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
-			SUBQ $2, SI
-			JNE rounds_loop2_begin
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
-		BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
-		SUBQ $1, DX
-		JEQ out_write_odd
-		BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
-		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
-		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
-		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
-		SUBQ $1, DX
-		JEQ out_write_even
-		ADDQ $128, BX
-		ADDQ $128, CX
-		JMP vector_loop2_begin
-out_write_odd:
-	BYTE $0xC4; BYTE $0x43; BYTE $0x1D; BYTE $0x46; BYTE $0xE4; BYTE $0x01 // VPERM2I128 ymm12, ymm12, ymm12, 1
-out_write_even:
-	BYTE $0xC5; BYTE $0x7A; BYTE $0x7F; BYTE $0x60; BYTE $0x30 // VMOVDQU [rax + 48], xmm12
-	BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0xD2 // VPXOR ymm2, ymm2, ymm2
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm2
-	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm2
-	BYTE $0xC5; BYTE $0xFC; BYTE $0x77 // VZEROALL
-	MOVQ DI, SP
-	RET
-
-// func cpuidAmd64(cpuidParams *uint32)
-TEXT ·cpuidAmd64(SB),4,$0-8
-	MOVQ cpuidParams+0(FP), R15
-	MOVL 0(R15), AX
-	MOVL 8(R15), CX
-	CPUID
-	MOVL AX, 0(R15)
-	MOVL BX, 4(R15)
-	MOVL CX, 8(R15)
-	MOVL DX, 12(R15)
-	RET
-
-// func xgetbv0Amd64(xcrVec *uint32)
-TEXT ·xgetbv0Amd64(SB),4,$0-8
-	MOVQ xcrVec+0(FP), BX
-	XORL CX, CX
-	BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV
-	MOVL AX, 0(BX)
-	MOVL DX, 4(BX)
-	RET
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref.go b/vendor/github.com/Yawning/chacha20/chacha20_ref.go
deleted file mode 100644
index fcdc8c6..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20_ref.go
+++ /dev/null
@@ -1,394 +0,0 @@
-// chacha20_ref.go - Reference ChaCha20.
-//
-// To the extent possible under law, Yawning Angel has waived all copyright
-// and related or neighboring rights to chacha20, using the Creative
-// Commons "CC0" public domain dedication. See LICENSE or
-// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
-
-// +build !go1.9
-
-package chacha20
-
-import (
-	"encoding/binary"
-	"math"
-	"unsafe"
-)
-
-func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
-	if isIetf {
-		var totalBlocks uint64
-		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
-		if totalBlocks > math.MaxUint32 {
-			panic("chacha20: Exceeded keystream per nonce limit")
-		}
-	}
-
-	// This routine ignores x[0]...x[4] in favor the const values since it's
-	// ever so slightly faster.
-
-	for n := 0; n < nrBlocks; n++ {
-		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
-
-		for i := chachaRounds; i > 0; i -= 2 {
-			// quarterround(x, 0, 4, 8, 12)
-			x0 += x4
-			x12 ^= x0
-			x12 = (x12 << 16) | (x12 >> 16)
-			x8 += x12
-			x4 ^= x8
-			x4 = (x4 << 12) | (x4 >> 20)
-			x0 += x4
-			x12 ^= x0
-			x12 = (x12 << 8) | (x12 >> 24)
-			x8 += x12
-			x4 ^= x8
-			x4 = (x4 << 7) | (x4 >> 25)
-
-			// quarterround(x, 1, 5, 9, 13)
-			x1 += x5
-			x13 ^= x1
-			x13 = (x13 << 16) | (x13 >> 16)
-			x9 += x13
-			x5 ^= x9
-			x5 = (x5 << 12) | (x5 >> 20)
-			x1 += x5
-			x13 ^= x1
-			x13 = (x13 << 8) | (x13 >> 24)
-			x9 += x13
-			x5 ^= x9
-			x5 = (x5 << 7) | (x5 >> 25)
-
-			// quarterround(x, 2, 6, 10, 14)
-			x2 += x6
-			x14 ^= x2
-			x14 = (x14 << 16) | (x14 >> 16)
-			x10 += x14
-			x6 ^= x10
-			x6 = (x6 << 12) | (x6 >> 20)
-			x2 += x6
-			x14 ^= x2
-			x14 = (x14 << 8) | (x14 >> 24)
-			x10 += x14
-			x6 ^= x10
-			x6 = (x6 << 7) | (x6 >> 25)
-
-			// quarterround(x, 3, 7, 11, 15)
-			x3 += x7
-			x15 ^= x3
-			x15 = (x15 << 16) | (x15 >> 16)
-			x11 += x15
-			x7 ^= x11
-			x7 = (x7 << 12) | (x7 >> 20)
-			x3 += x7
-			x15 ^= x3
-			x15 = (x15 << 8) | (x15 >> 24)
-			x11 += x15
-			x7 ^= x11
-			x7 = (x7 << 7) | (x7 >> 25)
-
-			// quarterround(x, 0, 5, 10, 15)
-			x0 += x5
-			x15 ^= x0
-			x15 = (x15 << 16) | (x15 >> 16)
-			x10 += x15
-			x5 ^= x10
-			x5 = (x5 << 12) | (x5 >> 20)
-			x0 += x5
-			x15 ^= x0
-			x15 = (x15 << 8) | (x15 >> 24)
-			x10 += x15
-			x5 ^= x10
-			x5 = (x5 << 7) | (x5 >> 25)
-
-			// quarterround(x, 1, 6, 11, 12)
-			x1 += x6
-			x12 ^= x1
-			x12 = (x12 << 16) | (x12 >> 16)
-			x11 += x12
-			x6 ^= x11
-			x6 = (x6 << 12) | (x6 >> 20)
-			x1 += x6
-			x12 ^= x1
-			x12 = (x12 << 8) | (x12 >> 24)
-			x11 += x12
-			x6 ^= x11
-			x6 = (x6 << 7) | (x6 >> 25)
-
-			// quarterround(x, 2, 7, 8, 13)
-			x2 += x7
-			x13 ^= x2
-			x13 = (x13 << 16) | (x13 >> 16)
-			x8 += x13
-			x7 ^= x8
-			x7 = (x7 << 12) | (x7 >> 20)
-			x2 += x7
-			x13 ^= x2
-			x13 = (x13 << 8) | (x13 >> 24)
-			x8 += x13
-			x7 ^= x8
-			x7 = (x7 << 7) | (x7 >> 25)
-
-			// quarterround(x, 3, 4, 9, 14)
-			x3 += x4
-			x14 ^= x3
-			x14 = (x14 << 16) | (x14 >> 16)
-			x9 += x14
-			x4 ^= x9
-			x4 = (x4 << 12) | (x4 >> 20)
-			x3 += x4
-			x14 ^= x3
-			x14 = (x14 << 8) | (x14 >> 24)
-			x9 += x14
-			x4 ^= x9
-			x4 = (x4 << 7) | (x4 >> 25)
-		}
-
-		// On amd64 at least, this is a rather big boost.
-		if useUnsafe {
-			if in != nil {
-				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
-				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
-				outArr[0] = inArr[0] ^ (x0 + sigma0)
-				outArr[1] = inArr[1] ^ (x1 + sigma1)
-				outArr[2] = inArr[2] ^ (x2 + sigma2)
-				outArr[3] = inArr[3] ^ (x3 + sigma3)
-				outArr[4] = inArr[4] ^ (x4 + x[4])
-				outArr[5] = inArr[5] ^ (x5 + x[5])
-				outArr[6] = inArr[6] ^ (x6 + x[6])
-				outArr[7] = inArr[7] ^ (x7 + x[7])
-				outArr[8] = inArr[8] ^ (x8 + x[8])
-				outArr[9] = inArr[9] ^ (x9 + x[9])
-				outArr[10] = inArr[10] ^ (x10 + x[10])
-				outArr[11] = inArr[11] ^ (x11 + x[11])
-				outArr[12] = inArr[12] ^ (x12 + x[12])
-				outArr[13] = inArr[13] ^ (x13 + x[13])
-				outArr[14] = inArr[14] ^ (x14 + x[14])
-				outArr[15] = inArr[15] ^ (x15 + x[15])
-			} else {
-				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
-				outArr[0] = x0 + sigma0
-				outArr[1] = x1 + sigma1
-				outArr[2] = x2 + sigma2
-				outArr[3] = x3 + sigma3
-				outArr[4] = x4 + x[4]
-				outArr[5] = x5 + x[5]
-				outArr[6] = x6 + x[6]
-				outArr[7] = x7 + x[7]
-				outArr[8] = x8 + x[8]
-				outArr[9] = x9 + x[9]
-				outArr[10] = x10 + x[10]
-				outArr[11] = x11 + x[11]
-				outArr[12] = x12 + x[12]
-				outArr[13] = x13 + x[13]
-				outArr[14] = x14 + x[14]
-				outArr[15] = x15 + x[15]
-			}
-		} else {
-			// Slow path, either the architecture cares about alignment, or is not little endian.
-			x0 += sigma0
-			x1 += sigma1
-			x2 += sigma2
-			x3 += sigma3
-			x4 += x[4]
-			x5 += x[5]
-			x6 += x[6]
-			x7 += x[7]
-			x8 += x[8]
-			x9 += x[9]
-			x10 += x[10]
-			x11 += x[11]
-			x12 += x[12]
-			x13 += x[13]
-			x14 += x[14]
-			x15 += x[15]
-			if in != nil {
-				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
-				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
-				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
-				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
-				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
-				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
-				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
-				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
-				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
-				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
-				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
-				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
-				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
-				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
-				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
-				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
-				in = in[BlockSize:]
-			} else {
-				binary.LittleEndian.PutUint32(out[0:4], x0)
-				binary.LittleEndian.PutUint32(out[4:8], x1)
-				binary.LittleEndian.PutUint32(out[8:12], x2)
-				binary.LittleEndian.PutUint32(out[12:16], x3)
-				binary.LittleEndian.PutUint32(out[16:20], x4)
-				binary.LittleEndian.PutUint32(out[20:24], x5)
-				binary.LittleEndian.PutUint32(out[24:28], x6)
-				binary.LittleEndian.PutUint32(out[28:32], x7)
-				binary.LittleEndian.PutUint32(out[32:36], x8)
-				binary.LittleEndian.PutUint32(out[36:40], x9)
-				binary.LittleEndian.PutUint32(out[40:44], x10)
-				binary.LittleEndian.PutUint32(out[44:48], x11)
-				binary.LittleEndian.PutUint32(out[48:52], x12)
-				binary.LittleEndian.PutUint32(out[52:56], x13)
-				binary.LittleEndian.PutUint32(out[56:60], x14)
-				binary.LittleEndian.PutUint32(out[60:64], x15)
-			}
-			out = out[BlockSize:]
-		}
-
-		// Stoping at 2^70 bytes per nonce is the user's responsibility.
-		ctr := uint64(x[13])<<32 | uint64(x[12])
-		ctr++
-		x[12] = uint32(ctr)
-		x[13] = uint32(ctr >> 32)
-	}
-}
-
-func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
-	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
-	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
-
-	for i := chachaRounds; i > 0; i -= 2 {
-		// quarterround(x, 0, 4, 8, 12)
-		x0 += x4
-		x12 ^= x0
-		x12 = (x12 << 16) | (x12 >> 16)
-		x8 += x12
-		x4 ^= x8
-		x4 = (x4 << 12) | (x4 >> 20)
-		x0 += x4
-		x12 ^= x0
-		x12 = (x12 << 8) | (x12 >> 24)
-		x8 += x12
-		x4 ^= x8
-		x4 = (x4 << 7) | (x4 >> 25)
-
-		// quarterround(x, 1, 5, 9, 13)
-		x1 += x5
-		x13 ^= x1
-		x13 = (x13 << 16) | (x13 >> 16)
-		x9 += x13
-		x5 ^= x9
-		x5 = (x5 << 12) | (x5 >> 20)
-		x1 += x5
-		x13 ^= x1
-		x13 = (x13 << 8) | (x13 >> 24)
-		x9 += x13
-		x5 ^= x9
-		x5 = (x5 << 7) | (x5 >> 25)
-
-		// quarterround(x, 2, 6, 10, 14)
-		x2 += x6
-		x14 ^= x2
-		x14 = (x14 << 16) | (x14 >> 16)
-		x10 += x14
-		x6 ^= x10
-		x6 = (x6 << 12) | (x6 >> 20)
-		x2 += x6
-		x14 ^= x2
-		x14 = (x14 << 8) | (x14 >> 24)
-		x10 += x14
-		x6 ^= x10
-		x6 = (x6 << 7) | (x6 >> 25)
-
-		// quarterround(x, 3, 7, 11, 15)
-		x3 += x7
-		x15 ^= x3
-		x15 = (x15 << 16) | (x15 >> 16)
-		x11 += x15
-		x7 ^= x11
-		x7 = (x7 << 12) | (x7 >> 20)
-		x3 += x7
-		x15 ^= x3
-		x15 = (x15 << 8) | (x15 >> 24)
-		x11 += x15
-		x7 ^= x11
-		x7 = (x7 << 7) | (x7 >> 25)
-
-		// quarterround(x, 0, 5, 10, 15)
-		x0 += x5
-		x15 ^= x0
-		x15 = (x15 << 16) | (x15 >> 16)
-		x10 += x15
-		x5 ^= x10
-		x5 = (x5 << 12) | (x5 >> 20)
-		x0 += x5
-		x15 ^= x0
-		x15 = (x15 << 8) | (x15 >> 24)
-		x10 += x15
-		x5 ^= x10
-		x5 = (x5 << 7) | (x5 >> 25)
-
-		// quarterround(x, 1, 6, 11, 12)
-		x1 += x6
-		x12 ^= x1
-		x12 = (x12 << 16) | (x12 >> 16)
-		x11 += x12
-		x6 ^= x11
-		x6 = (x6 << 12) | (x6 >> 20)
-		x1 += x6
-		x12 ^= x1
-		x12 = (x12 << 8) | (x12 >> 24)
-		x11 += x12
-		x6 ^= x11
-		x6 = (x6 << 7) | (x6 >> 25)
-
-		// quarterround(x, 2, 7, 8, 13)
-		x2 += x7
-		x13 ^= x2
-		x13 = (x13 << 16) | (x13 >> 16)
-		x8 += x13
-		x7 ^= x8
-		x7 = (x7 << 12) | (x7 >> 20)
-		x2 += x7
-		x13 ^= x2
-		x13 = (x13 << 8) | (x13 >> 24)
-		x8 += x13
-		x7 ^= x8
-		x7 = (x7 << 7) | (x7 >> 25)
-
-		// quarterround(x, 3, 4, 9, 14)
-		x3 += x4
-		x14 ^= x3
-		x14 = (x14 << 16) | (x14 >> 16)
-		x9 += x14
-		x4 ^= x9
-		x4 = (x4 << 12) | (x4 >> 20)
-		x3 += x4
-		x14 ^= x3
-		x14 = (x14 << 8) | (x14 >> 24)
-		x9 += x14
-		x4 ^= x9
-		x4 = (x4 << 7) | (x4 >> 25)
-	}
-
-	// HChaCha returns x0...x3 | x12...x15, which corresponds to the
-	// indexes of the ChaCha constant and the indexes of the IV.
-	if useUnsafe {
-		outArr := (*[16]uint32)(unsafe.Pointer(&out[0]))
-		outArr[0] = x0
-		outArr[1] = x1
-		outArr[2] = x2
-		outArr[3] = x3
-		outArr[4] = x12
-		outArr[5] = x13
-		outArr[6] = x14
-		outArr[7] = x15
-	} else {
-		binary.LittleEndian.PutUint32(out[0:4], x0)
-		binary.LittleEndian.PutUint32(out[4:8], x1)
-		binary.LittleEndian.PutUint32(out[8:12], x2)
-		binary.LittleEndian.PutUint32(out[12:16], x3)
-		binary.LittleEndian.PutUint32(out[16:20], x12)
-		binary.LittleEndian.PutUint32(out[20:24], x13)
-		binary.LittleEndian.PutUint32(out[24:28], x14)
-		binary.LittleEndian.PutUint32(out[28:32], x15)
-	}
-	return
-}
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
deleted file mode 100644
index 8405c22..0000000
--- a/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
+++ /dev/null
@@ -1,395 +0,0 @@
-// chacha20_ref.go - Reference ChaCha20.
-//
-// To the extent possible under law, Yawning Angel has waived all copyright
-// and related or neighboring rights to chacha20, using the Creative
-// Commons "CC0" public domain dedication. See LICENSE or
-// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
-
-// +build go1.9
-
-package chacha20
-
-import (
-	"encoding/binary"
-	"math"
-	"math/bits"
-	"unsafe"
-)
-
-func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
-	if isIetf {
-		var totalBlocks uint64
-		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
-		if totalBlocks > math.MaxUint32 {
-			panic("chacha20: Exceeded keystream per nonce limit")
-		}
-	}
-
-	// This routine ignores x[0]...x[4] in favor the const values since it's
-	// ever so slightly faster.
-
-	for n := 0; n < nrBlocks; n++ {
-		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
-
-		for i := chachaRounds; i > 0; i -= 2 {
-			// quarterround(x, 0, 4, 8, 12)
-			x0 += x4
-			x12 ^= x0
-			x12 = bits.RotateLeft32(x12, 16)
-			x8 += x12
-			x4 ^= x8
-			x4 = bits.RotateLeft32(x4, 12)
-			x0 += x4
-			x12 ^= x0
-			x12 = bits.RotateLeft32(x12, 8)
-			x8 += x12
-			x4 ^= x8
-			x4 = bits.RotateLeft32(x4, 7)
-
-			// quarterround(x, 1, 5, 9, 13)
-			x1 += x5
-			x13 ^= x1
-			x13 = bits.RotateLeft32(x13, 16)
-			x9 += x13
-			x5 ^= x9
-			x5 = bits.RotateLeft32(x5, 12)
-			x1 += x5
-			x13 ^= x1
-			x13 = bits.RotateLeft32(x13, 8)
-			x9 += x13
-			x5 ^= x9
-			x5 = bits.RotateLeft32(x5, 7)
-
-			// quarterround(x, 2, 6, 10, 14)
-			x2 += x6
-			x14 ^= x2
-			x14 = bits.RotateLeft32(x14, 16)
-			x10 += x14
-			x6 ^= x10
-			x6 = bits.RotateLeft32(x6, 12)
-			x2 += x6
-			x14 ^= x2
-			x14 = bits.RotateLeft32(x14, 8)
-			x10 += x14
-			x6 ^= x10
-			x6 = bits.RotateLeft32(x6, 7)
-
-			// quarterround(x, 3, 7, 11, 15)
-			x3 += x7
-			x15 ^= x3
-			x15 = bits.RotateLeft32(x15, 16)
-			x11 += x15
-			x7 ^= x11
-			x7 = bits.RotateLeft32(x7, 12)
-			x3 += x7
-			x15 ^= x3
-			x15 = bits.RotateLeft32(x15, 8)
-			x11 += x15
-			x7 ^= x11
-			x7 = bits.RotateLeft32(x7, 7)
-
-			// quarterround(x, 0, 5, 10, 15)
-			x0 += x5
-			x15 ^= x0
-			x15 = bits.RotateLeft32(x15, 16)
-			x10 += x15
-			x5 ^= x10
-			x5 = bits.RotateLeft32(x5, 12)
-			x0 += x5
-			x15 ^= x0
-			x15 = bits.RotateLeft32(x15, 8)
-			x10 += x15
-			x5 ^= x10
-			x5 = bits.RotateLeft32(x5, 7)
-
-			// quarterround(x, 1, 6, 11, 12)
-			x1 += x6
-			x12 ^= x1
-			x12 = bits.RotateLeft32(x12, 16)
-			x11 += x12
-			x6 ^= x11
-			x6 = bits.RotateLeft32(x6, 12)
-			x1 += x6
-			x12 ^= x1
-			x12 = bits.RotateLeft32(x12, 8)
-			x11 += x12
-			x6 ^= x11
-			x6 = bits.RotateLeft32(x6, 7)
-
-			// quarterround(x, 2, 7, 8, 13)
-			x2 += x7
-			x13 ^= x2
-			x13 = bits.RotateLeft32(x13, 16)
-			x8 += x13
-			x7 ^= x8
-			x7 = bits.RotateLeft32(x7, 12)
-			x2 += x7
-			x13 ^= x2
-			x13 = bits.RotateLeft32(x13, 8)
-			x8 += x13
-			x7 ^= x8
-			x7 = bits.RotateLeft32(x7, 7)
-
-			// quarterround(x, 3, 4, 9, 14)
-			x3 += x4
-			x14 ^= x3
-			x14 = bits.RotateLeft32(x14, 16)
-			x9 += x14
-			x4 ^= x9
-			x4 = bits.RotateLeft32(x4, 12)
-			x3 += x4
-			x14 ^= x3
-			x14 = bits.RotateLeft32(x14, 8)
-			x9 += x14
-			x4 ^= x9
-			x4 = bits.RotateLeft32(x4, 7)
-		}
-
-		// On amd64 at least, this is a rather big boost.
-		if useUnsafe {
-			if in != nil {
-				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
-				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
-				outArr[0] = inArr[0] ^ (x0 + sigma0)
-				outArr[1] = inArr[1] ^ (x1 + sigma1)
-				outArr[2] = inArr[2] ^ (x2 + sigma2)
-				outArr[3] = inArr[3] ^ (x3 + sigma3)
-				outArr[4] = inArr[4] ^ (x4 + x[4])
-				outArr[5] = inArr[5] ^ (x5 + x[5])
-				outArr[6] = inArr[6] ^ (x6 + x[6])
-				outArr[7] = inArr[7] ^ (x7 + x[7])
-				outArr[8] = inArr[8] ^ (x8 + x[8])
-				outArr[9] = inArr[9] ^ (x9 + x[9])
-				outArr[10] = inArr[10] ^ (x10 + x[10])
-				outArr[11] = inArr[11] ^ (x11 + x[11])
-				outArr[12] = inArr[12] ^ (x12 + x[12])
-				outArr[13] = inArr[13] ^ (x13 + x[13])
-				outArr[14] = inArr[14] ^ (x14 + x[14])
-				outArr[15] = inArr[15] ^ (x15 + x[15])
-			} else {
-				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
-				outArr[0] = x0 + sigma0
-				outArr[1] = x1 + sigma1
-				outArr[2] = x2 + sigma2
-				outArr[3] = x3 + sigma3
-				outArr[4] = x4 + x[4]
-				outArr[5] = x5 + x[5]
-				outArr[6] = x6 + x[6]
-				outArr[7] = x7 + x[7]
-				outArr[8] = x8 + x[8]
-				outArr[9] = x9 + x[9]
-				outArr[10] = x10 + x[10]
-				outArr[11] = x11 + x[11]
-				outArr[12] = x12 + x[12]
-				outArr[13] = x13 + x[13]
-				outArr[14] = x14 + x[14]
-				outArr[15] = x15 + x[15]
-			}
-		} else {
-			// Slow path, either the architecture cares about alignment, or is not little endian.
-			x0 += sigma0
-			x1 += sigma1
-			x2 += sigma2
-			x3 += sigma3
-			x4 += x[4]
-			x5 += x[5]
-			x6 += x[6]
-			x7 += x[7]
-			x8 += x[8]
-			x9 += x[9]
-			x10 += x[10]
-			x11 += x[11]
-			x12 += x[12]
-			x13 += x[13]
-			x14 += x[14]
-			x15 += x[15]
-			if in != nil {
-				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
-				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
-				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
-				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
-				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
-				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
-				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
-				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
-				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
-				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
-				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
-				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
-				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
-				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
-				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
-				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
-				in = in[BlockSize:]
-			} else {
-				binary.LittleEndian.PutUint32(out[0:4], x0)
-				binary.LittleEndian.PutUint32(out[4:8], x1)
-				binary.LittleEndian.PutUint32(out[8:12], x2)
-				binary.LittleEndian.PutUint32(out[12:16], x3)
-				binary.LittleEndian.PutUint32(out[16:20], x4)
-				binary.LittleEndian.PutUint32(out[20:24], x5)
-				binary.LittleEndian.PutUint32(out[24:28], x6)
-				binary.LittleEndian.PutUint32(out[28:32], x7)
-				binary.LittleEndian.PutUint32(out[32:36], x8)
-				binary.LittleEndian.PutUint32(out[36:40], x9)
-				binary.LittleEndian.PutUint32(out[40:44], x10)
-				binary.LittleEndian.PutUint32(out[44:48], x11)
-				binary.LittleEndian.PutUint32(out[48:52], x12)
-				binary.LittleEndian.PutUint32(out[52:56], x13)
-				binary.LittleEndian.PutUint32(out[56:60], x14)
-				binary.LittleEndian.PutUint32(out[60:64], x15)
-			}
-			out = out[BlockSize:]
-		}
-
-		// Stoping at 2^70 bytes per nonce is the user's responsibility.
-		ctr := uint64(x[13])<<32 | uint64(x[12])
-		ctr++
-		x[12] = uint32(ctr)
-		x[13] = uint32(ctr >> 32)
-	}
-}
-
-func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
-	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
-	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
-
-	for i := chachaRounds; i > 0; i -= 2 {
-		// quarterround(x, 0, 4, 8, 12)
-		x0 += x4
-		x12 ^= x0
-		x12 = bits.RotateLeft32(x12, 16)
-		x8 += x12
-		x4 ^= x8
-		x4 = bits.RotateLeft32(x4, 12)
-		x0 += x4
-		x12 ^= x0
-		x12 = bits.RotateLeft32(x12, 8)
-		x8 += x12
-		x4 ^= x8
-		x4 = bits.RotateLeft32(x4, 7)
-
-		// quarterround(x, 1, 5, 9, 13)
-		x1 += x5
-		x13 ^= x1
-		x13 = bits.RotateLeft32(x13, 16)
-		x9 += x13
-		x5 ^= x9
-		x5 = bits.RotateLeft32(x5, 12)
-		x1 += x5
-		x13 ^= x1
-		x13 = bits.RotateLeft32(x13, 8)
-		x9 += x13
-		x5 ^= x9
-		x5 = bits.RotateLeft32(x5, 7)
-
-		// quarterround(x, 2, 6, 10, 14)
-		x2 += x6
-		x14 ^= x2
-		x14 = bits.RotateLeft32(x14, 16)
-		x10 += x14
-		x6 ^= x10
-		x6 = bits.RotateLeft32(x6, 12)
-		x2 += x6
-		x14 ^= x2
-		x14 = bits.RotateLeft32(x14, 8)
-		x10 += x14
-		x6 ^= x10
-		x6 = bits.RotateLeft32(x6, 7)
-
-		// quarterround(x, 3, 7, 11, 15)
-		x3 += x7
-		x15 ^= x3
-		x15 = bits.RotateLeft32(x15, 16)
-		x11 += x15
-		x7 ^= x11
-		x7 = bits.RotateLeft32(x7, 12)
-		x3 += x7
-		x15 ^= x3
-		x15 = bits.RotateLeft32(x15, 8)
-		x11 += x15
-		x7 ^= x11
-		x7 = bits.RotateLeft32(x7, 7)
-
-		// quarterround(x, 0, 5, 10, 15)
-		x0 += x5
-		x15 ^= x0
-		x15 = bits.RotateLeft32(x15, 16)
-		x10 += x15
-		x5 ^= x10
-		x5 = bits.RotateLeft32(x5, 12)
-		x0 += x5
-		x15 ^= x0
-		x15 = bits.RotateLeft32(x15, 8)
-		x10 += x15
-		x5 ^= x10
-		x5 = bits.RotateLeft32(x5, 7)
-
-		// quarterround(x, 1, 6, 11, 12)
-		x1 += x6
-		x12 ^= x1
-		x12 = bits.RotateLeft32(x12, 16)
-		x11 += x12
-		x6 ^= x11
-		x6 = bits.RotateLeft32(x6, 12)
-		x1 += x6
-		x12 ^= x1
-		x12 = bits.RotateLeft32(x12, 8)
-		x11 += x12
-		x6 ^= x11
-		x6 = bits.RotateLeft32(x6, 7)
-
-		// quarterround(x, 2, 7, 8, 13)
-		x2 += x7
-		x13 ^= x2
-		x13 = bits.RotateLeft32(x13, 16)
-		x8 += x13
-		x7 ^= x8
-		x7 = bits.RotateLeft32(x7, 12)
-		x2 += x7
-		x13 ^= x2
-		x13 = bits.RotateLeft32(x13, 8)
-		x8 += x13
-		x7 ^= x8
-		x7 = bits.RotateLeft32(x7, 7)
-
-		// quarterround(x, 3, 4, 9, 14)
-		x3 += x4
-		x14 ^= x3
-		x14 = bits.RotateLeft32(x14, 16)
-		x9 += x14
-		x4 ^= x9
-		x4 = bits.RotateLeft32(x4, 12)
-		x3 += x4
-		x14 ^= x3
-		x14 = bits.RotateLeft32(x14, 8)
-		x9 += x14
-		x4 ^= x9
-		x4 = bits.RotateLeft32(x4, 7)
-	}
-
-	// HChaCha returns x0...x3 | x12...x15, which corresponds to the
-	// indexes of the ChaCha constant and the indexes of the IV.
-	if useUnsafe {
-		outArr := (*[16]uint32)(unsafe.Pointer(&out[0]))
-		outArr[0] = x0
-		outArr[1] = x1
-		outArr[2] = x2
-		outArr[3] = x3
-		outArr[4] = x12
-		outArr[5] = x13
-		outArr[6] = x14
-		outArr[7] = x15
-	} else {
-		binary.LittleEndian.PutUint32(out[0:4], x0)
-		binary.LittleEndian.PutUint32(out[4:8], x1)
-		binary.LittleEndian.PutUint32(out[8:12], x2)
-		binary.LittleEndian.PutUint32(out[12:16], x3)
-		binary.LittleEndian.PutUint32(out[16:20], x12)
-		binary.LittleEndian.PutUint32(out[20:24], x13)
-		binary.LittleEndian.PutUint32(out[24:28], x14)
-		binary.LittleEndian.PutUint32(out[28:32], x15)
-	}
-	return
-}
diff --git a/vendor/github.com/dgryski/go-camellia/camellia.go b/vendor/github.com/dgryski/go-camellia/camellia.go
deleted file mode 100644
index 048b2e3..0000000
--- a/vendor/github.com/dgryski/go-camellia/camellia.go
+++ /dev/null
@@ -1,368 +0,0 @@
-// Copyright (c) 2013 Damian Gryski <damian@gryski.com>
-// Licensed under the GPLv3 or, at your option, any later version.
-
-// Package camellia is an implementation of the CAMELLIA encryption algorithm
-/*
-
-   This is an unoptimized version based on the description in RFC 3713.
-
-   References:
-   http://en.wikipedia.org/wiki/Camellia_%28cipher%29
-   https://info.isl.ntt.co.jp/crypt/eng/camellia/
-*/
-package camellia
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"strconv"
-)
-
-const BlockSize = 16
-
-type KeySizeError int
-
-func (k KeySizeError) Error() string {
-	return "camellia: invalid key size " + strconv.Itoa(int(k))
-}
-
-type camelliaCipher struct {
-	kw   [5]uint64
-	k    [25]uint64
-	ke   [7]uint64
-	klen int
-}
-
-const (
-	sigma1 = 0xA09E667F3BCC908B
-	sigma2 = 0xB67AE8584CAA73B2
-	sigma3 = 0xC6EF372FE94F82BE
-	sigma4 = 0x54FF53A5F1D36F1C
-	sigma5 = 0x10E527FADE682D1D
-	sigma6 = 0xB05688C2B3E6C1FD
-)
-
-func init() {
-	// initialize other sboxes
-	for i := range sbox1 {
-		sbox2[i] = rotl8(sbox1[i], 1)
-		sbox3[i] = rotl8(sbox1[i], 7)
-		sbox4[i] = sbox1[rotl8(uint8(i), 1)]
-	}
-}
-
-func rotl128(k [2]uint64, rot uint) (hi, lo uint64) {
-
-	if rot > 64 {
-		rot -= 64
-		k[0], k[1] = k[1], k[0]
-	}
-
-	t := k[0] >> (64 - rot)
-	hi = (k[0] << rot) | (k[1] >> (64 - rot))
-	lo = (k[1] << rot) | t
-	return hi, lo
-}
-
-func rotl32(k uint32, rot uint) uint32 {
-	return (k << rot) | (k >> (32 - rot))
-}
-
-func rotl8(k byte, rot uint) byte {
-	return (k << rot) | (k >> (8 - rot))
-}
-
-// New creates and returns a new cipher.Block.
-// The key argument should be 16, 24, or 32 bytes.
-func New(key []byte) (cipher.Block, error) {
-
-	klen := len(key)
-	switch klen {
-	default:
-		return nil, KeySizeError(klen)
-	case 16, 24, 32:
-		break
-	}
-
-	var d1, d2 uint64
-
-	var kl [2]uint64
-	var kr [2]uint64
-	var ka [2]uint64
-	var kb [2]uint64
-
-	kl[0] = binary.BigEndian.Uint64(key[0:])
-	kl[1] = binary.BigEndian.Uint64(key[8:])
-
-	switch klen {
-	case 24:
-		kr[0] = binary.BigEndian.Uint64(key[16:])
-		kr[1] = ^kr[0]
-	case 32:
-		kr[0] = binary.BigEndian.Uint64(key[16:])
-		kr[1] = binary.BigEndian.Uint64(key[24:])
-
-	}
-
-	d1 = (kl[0] ^ kr[0])
-	d2 = (kl[1] ^ kr[1])
-
-	d2 = d2 ^ f(d1, sigma1)
-	d1 = d1 ^ f(d2, sigma2)
-
-	d1 = d1 ^ (kl[0])
-	d2 = d2 ^ (kl[1])
-	d2 = d2 ^ f(d1, sigma3)
-	d1 = d1 ^ f(d2, sigma4)
-	ka[0] = d1
-	ka[1] = d2
-	d1 = (ka[0] ^ kr[0])
-	d2 = (ka[1] ^ kr[1])
-	d2 = d2 ^ f(d1, sigma5)
-	d1 = d1 ^ f(d2, sigma6)
-	kb[0] = d1
-	kb[1] = d2
-
-	// here we generate our keys
-	c := new(camelliaCipher)
-
-	c.klen = klen
-
-	if klen == 16 {
-
-		c.kw[1], c.kw[2] = rotl128(kl, 0)
-
-		c.k[1], c.k[2] = rotl128(ka, 0)
-		c.k[3], c.k[4] = rotl128(kl, 15)
-		c.k[5], c.k[6] = rotl128(ka, 15)
-
-		c.ke[1], c.ke[2] = rotl128(ka, 30)
-
-		c.k[7], c.k[8] = rotl128(kl, 45)
-		c.k[9], _ = rotl128(ka, 45)
-		_, c.k[10] = rotl128(kl, 60)
-		c.k[11], c.k[12] = rotl128(ka, 60)
-
-		c.ke[3], c.ke[4] = rotl128(kl, 77)
-
-		c.k[13], c.k[14] = rotl128(kl, 94)
-		c.k[15], c.k[16] = rotl128(ka, 94)
-		c.k[17], c.k[18] = rotl128(kl, 111)
-
-		c.kw[3], c.kw[4] = rotl128(ka, 111)
-
-	} else {
-		// 24 or 32
-
-		c.kw[1], c.kw[2] = rotl128(kl, 0)
-
-		c.k[1], c.k[2] = rotl128(kb, 0)
-		c.k[3], c.k[4] = rotl128(kr, 15)
-		c.k[5], c.k[6] = rotl128(ka, 15)
-
-		c.ke[1], c.ke[2] = rotl128(kr, 30)
-
-		c.k[7], c.k[8] = rotl128(kb, 30)
-		c.k[9], c.k[10] = rotl128(kl, 45)
-		c.k[11], c.k[12] = rotl128(ka, 45)
-
-		c.ke[3], c.ke[4] = rotl128(kl, 60)
-
-		c.k[13], c.k[14] = rotl128(kr, 60)
-		c.k[15], c.k[16] = rotl128(kb, 60)
-		c.k[17], c.k[18] = rotl128(kl, 77)
-
-		c.ke[5], c.ke[6] = rotl128(ka, 77)
-
-		c.k[19], c.k[20] = rotl128(kr, 94)
-		c.k[21], c.k[22] = rotl128(ka, 94)
-		c.k[23], c.k[24] = rotl128(kl, 111)
-
-		c.kw[3], c.kw[4] = rotl128(kb, 111)
-	}
-
-	return c, nil
-}
-
-func (c *camelliaCipher) Encrypt(dst, src []byte) {
-
-	d1 := binary.BigEndian.Uint64(src[0:])
-	d2 := binary.BigEndian.Uint64(src[8:])
-
-	d1 ^= c.kw[1]
-	d2 ^= c.kw[2]
-
-	d2 = d2 ^ f(d1, c.k[1])
-	d1 = d1 ^ f(d2, c.k[2])
-	d2 = d2 ^ f(d1, c.k[3])
-	d1 = d1 ^ f(d2, c.k[4])
-	d2 = d2 ^ f(d1, c.k[5])
-	d1 = d1 ^ f(d2, c.k[6])
-
-	d1 = fl(d1, c.ke[1])
-	d2 = flinv(d2, c.ke[2])
-
-	d2 = d2 ^ f(d1, c.k[7])
-	d1 = d1 ^ f(d2, c.k[8])
-	d2 = d2 ^ f(d1, c.k[9])
-	d1 = d1 ^ f(d2, c.k[10])
-	d2 = d2 ^ f(d1, c.k[11])
-	d1 = d1 ^ f(d2, c.k[12])
-
-	d1 = fl(d1, c.ke[3])
-	d2 = flinv(d2, c.ke[4])
-
-	d2 = d2 ^ f(d1, c.k[13])
-	d1 = d1 ^ f(d2, c.k[14])
-	d2 = d2 ^ f(d1, c.k[15])
-	d1 = d1 ^ f(d2, c.k[16])
-	d2 = d2 ^ f(d1, c.k[17])
-	d1 = d1 ^ f(d2, c.k[18])
-
-	if c.klen > 16 {
-		// 24 or 32
-
-		d1 = fl(d1, c.ke[5])
-		d2 = flinv(d2, c.ke[6])
-
-		d2 = d2 ^ f(d1, c.k[19])
-		d1 = d1 ^ f(d2, c.k[20])
-		d2 = d2 ^ f(d1, c.k[21])
-		d1 = d1 ^ f(d2, c.k[22])
-		d2 = d2 ^ f(d1, c.k[23])
-		d1 = d1 ^ f(d2, c.k[24])
-	}
-
-	d2 = d2 ^ c.kw[3]
-	d1 = d1 ^ c.kw[4]
-
-	binary.BigEndian.PutUint64(dst[0:], d2)
-	binary.BigEndian.PutUint64(dst[8:], d1)
-}
-
-func (c *camelliaCipher) Decrypt(dst, src []byte) {
-
-	d2 := binary.BigEndian.Uint64(src[0:])
-	d1 := binary.BigEndian.Uint64(src[8:])
-
-	d1 = d1 ^ c.kw[4]
-	d2 = d2 ^ c.kw[3]
-
-	if c.klen > 16 {
-		// 24 or 32
-
-		d1 = d1 ^ f(d2, c.k[24])
-		d2 = d2 ^ f(d1, c.k[23])
-		d1 = d1 ^ f(d2, c.k[22])
-		d2 = d2 ^ f(d1, c.k[21])
-		d1 = d1 ^ f(d2, c.k[20])
-		d2 = d2 ^ f(d1, c.k[19])
-
-		d2 = fl(d2, c.ke[6])
-		d1 = flinv(d1, c.ke[5])
-	}
-
-	d1 = d1 ^ f(d2, c.k[18])
-	d2 = d2 ^ f(d1, c.k[17])
-	d1 = d1 ^ f(d2, c.k[16])
-	d2 = d2 ^ f(d1, c.k[15])
-	d1 = d1 ^ f(d2, c.k[14])
-	d2 = d2 ^ f(d1, c.k[13])
-
-	d2 = fl(d2, c.ke[4])
-	d1 = flinv(d1, c.ke[3])
-
-	d1 = d1 ^ f(d2, c.k[12])
-	d2 = d2 ^ f(d1, c.k[11])
-	d1 = d1 ^ f(d2, c.k[10])
-	d2 = d2 ^ f(d1, c.k[9])
-	d1 = d1 ^ f(d2, c.k[8])
-	d2 = d2 ^ f(d1, c.k[7])
-
-	d2 = fl(d2, c.ke[2])
-	d1 = flinv(d1, c.ke[1])
-
-	d1 = d1 ^ f(d2, c.k[6])
-	d2 = d2 ^ f(d1, c.k[5])
-	d1 = d1 ^ f(d2, c.k[4])
-	d2 = d2 ^ f(d1, c.k[3])
-	d1 = d1 ^ f(d2, c.k[2])
-	d2 = d2 ^ f(d1, c.k[1])
-
-	d2 ^= c.kw[2]
-	d1 ^= c.kw[1]
-
-	binary.BigEndian.PutUint64(dst[0:], d1)
-	binary.BigEndian.PutUint64(dst[8:], d2)
-}
-
-func (c *camelliaCipher) BlockSize() int {
-	return BlockSize
-}
-
-func f(fin, ke uint64) uint64 {
-	var x uint64
-	x = fin ^ ke
-	t1 := sbox1[uint8(x>>56)]
-	t2 := sbox2[uint8(x>>48)]
-	t3 := sbox3[uint8(x>>40)]
-	t4 := sbox4[uint8(x>>32)]
-	t5 := sbox2[uint8(x>>24)]
-	t6 := sbox3[uint8(x>>16)]
-	t7 := sbox4[uint8(x>>8)]
-	t8 := sbox1[uint8(x)]
-	y1 := t1 ^ t3 ^ t4 ^ t6 ^ t7 ^ t8
-	y2 := t1 ^ t2 ^ t4 ^ t5 ^ t7 ^ t8
-	y3 := t1 ^ t2 ^ t3 ^ t5 ^ t6 ^ t8
-	y4 := t2 ^ t3 ^ t4 ^ t5 ^ t6 ^ t7
-	y5 := t1 ^ t2 ^ t6 ^ t7 ^ t8
-	y6 := t2 ^ t3 ^ t5 ^ t7 ^ t8
-	y7 := t3 ^ t4 ^ t5 ^ t6 ^ t8
-	y8 := t1 ^ t4 ^ t5 ^ t6 ^ t7
-	return uint64(y1)<<56 | uint64(y2)<<48 | uint64(y3)<<40 | uint64(y4)<<32 | uint64(y5)<<24 | uint64(y6)<<16 | uint64(y7)<<8 | uint64(y8)
-}
-
-func fl(flin, ke uint64) uint64 {
-	x1 := uint32(flin >> 32)
-	x2 := uint32(flin & 0xffffffff)
-	k1 := uint32(ke >> 32)
-	k2 := uint32(ke & 0xffffffff)
-	x2 = x2 ^ rotl32(x1&k1, 1)
-	x1 = x1 ^ (x2 | k2)
-	return uint64(x1)<<32 | uint64(x2)
-}
-
-func flinv(flin, ke uint64) uint64 {
-	y1 := uint32(flin >> 32)
-	y2 := uint32(flin & 0xffffffff)
-	k1 := uint32(ke >> 32)
-	k2 := uint32(ke & 0xffffffff)
-	y1 = y1 ^ (y2 | k2)
-	y2 = y2 ^ rotl32(y1&k1, 1)
-	return uint64(y1)<<32 | uint64(y2)
-}
-
-var sbox1 = [...]byte{
-	0x70, 0x82, 0x2c, 0xec, 0xb3, 0x27, 0xc0, 0xe5, 0xe4, 0x85, 0x57, 0x35, 0xea, 0x0c, 0xae, 0x41,
-	0x23, 0xef, 0x6b, 0x93, 0x45, 0x19, 0xa5, 0x21, 0xed, 0x0e, 0x4f, 0x4e, 0x1d, 0x65, 0x92, 0xbd,
-	0x86, 0xb8, 0xaf, 0x8f, 0x7c, 0xeb, 0x1f, 0xce, 0x3e, 0x30, 0xdc, 0x5f, 0x5e, 0xc5, 0x0b, 0x1a,
-	0xa6, 0xe1, 0x39, 0xca, 0xd5, 0x47, 0x5d, 0x3d, 0xd9, 0x01, 0x5a, 0xd6, 0x51, 0x56, 0x6c, 0x4d,
-	0x8b, 0x0d, 0x9a, 0x66, 0xfb, 0xcc, 0xb0, 0x2d, 0x74, 0x12, 0x2b, 0x20, 0xf0, 0xb1, 0x84, 0x99,
-	0xdf, 0x4c, 0xcb, 0xc2, 0x34, 0x7e, 0x76, 0x05, 0x6d, 0xb7, 0xa9, 0x31, 0xd1, 0x17, 0x04, 0xd7,
-	0x14, 0x58, 0x3a, 0x61, 0xde, 0x1b, 0x11, 0x1c, 0x32, 0x0f, 0x9c, 0x16, 0x53, 0x18, 0xf2, 0x22,
-	0xfe, 0x44, 0xcf, 0xb2, 0xc3, 0xb5, 0x7a, 0x91, 0x24, 0x08, 0xe8, 0xa8, 0x60, 0xfc, 0x69, 0x50,
-	0xaa, 0xd0, 0xa0, 0x7d, 0xa1, 0x89, 0x62, 0x97, 0x54, 0x5b, 0x1e, 0x95, 0xe0, 0xff, 0x64, 0xd2,
-	0x10, 0xc4, 0x00, 0x48, 0xa3, 0xf7, 0x75, 0xdb, 0x8a, 0x03, 0xe6, 0xda, 0x09, 0x3f, 0xdd, 0x94,
-	0x87, 0x5c, 0x83, 0x02, 0xcd, 0x4a, 0x90, 0x33, 0x73, 0x67, 0xf6, 0xf3, 0x9d, 0x7f, 0xbf, 0xe2,
-	0x52, 0x9b, 0xd8, 0x26, 0xc8, 0x37, 0xc6, 0x3b, 0x81, 0x96, 0x6f, 0x4b, 0x13, 0xbe, 0x63, 0x2e,
-	0xe9, 0x79, 0xa7, 0x8c, 0x9f, 0x6e, 0xbc, 0x8e, 0x29, 0xf5, 0xf9, 0xb6, 0x2f, 0xfd, 0xb4, 0x59,
-	0x78, 0x98, 0x06, 0x6a, 0xe7, 0x46, 0x71, 0xba, 0xd4, 0x25, 0xab, 0x42, 0x88, 0xa2, 0x8d, 0xfa,
-	0x72, 0x07, 0xb9, 0x55, 0xf8, 0xee, 0xac, 0x0a, 0x36, 0x49, 0x2a, 0x68, 0x3c, 0x38, 0xf1, 0xa4,
-	0x40, 0x28, 0xd3, 0x7b, 0xbb, 0xc9, 0x43, 0xc1, 0x15, 0xe3, 0xad, 0xf4, 0x77, 0xc7, 0x80, 0x9e,
-}
-
-var sbox2 [256]byte
-var sbox3 [256]byte
-var sbox4 [256]byte
-
-var _ cipher.Block = &camelliaCipher{}
diff --git a/vendor/github.com/dgryski/go-camellia/t_camellia.pl b/vendor/github.com/dgryski/go-camellia/t_camellia.pl
deleted file mode 100644
index f95ba48..0000000
--- a/vendor/github.com/dgryski/go-camellia/t_camellia.pl
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/perl
-
-# to run the full verification suite:
-# curl http://info.isl.ntt.co.jp/crypt/eng/camellia/dl/cryptrec/t_camellia.txt |perl ./t_camellia.pl >cfull_test.go
-# perl -pi -e 's/range camelliaTests/$&Full/' camellia_test.go
-# go test
-
-print <<GOCODE;
-package camellia
-
-var camelliaTestsFull = []struct {
-        key    []byte
-        plain  []byte
-        cipher []byte
-}{
-GOCODE
-
-my $k;
-my $p;
-my $c;
-
-# K No.010 : EF CD AB 89 67 45 23 01 10 32 54 76 98 BA DC FE 10 32 54 76 98 BA DC FE 
-sub linetostr {
-    my $l = shift;
-
-    $l =~ s/.*: *//;
-    $l =~ s/\s*$//;
-    my @h = split / /, $l;
-    $l = join(",", map "0x$_", @h);
-    return $l;
-}
-
-
-while(<>) {
-    next if /^\s*$/ or /^Camellia/;
-    if (/^K/) {
-        $k = linetostr($_);
-    }
-
-    if (/^P/) {
-        $p = linetostr($_);
-    }
-
-    if (/^C/) {
-        $c = linetostr($_);
-        print <<"GOCODE";
-            {
-                []byte{$k},
-                []byte{$p},
-                []byte{$c},
-            },
-GOCODE
-    }
-}
-
-print <<GOCODE
-}
-GOCODE
diff --git a/vendor/github.com/dgryski/go-idea/idea.go b/vendor/github.com/dgryski/go-idea/idea.go
deleted file mode 100644
index 5f6d735..0000000
--- a/vendor/github.com/dgryski/go-idea/idea.go
+++ /dev/null
@@ -1,250 +0,0 @@
-// Package idea implements the IDEA block cipher
-/*
-For more information, please see https://en.wikipedia.org/wiki/International_Data_Encryption_Algorithm
-
-This implementation derived from Public Domain code by Colin Plumb available at https://www.schneier.com/book-applied-source.html
-*/
-package idea
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"strconv"
-)
-
-const rounds = 8
-const keyLen = (6*rounds + 4)
-
-// KeySizeError is returned for incorrect key sizes
-type KeySizeError int
-
-func (k KeySizeError) Error() string {
-	return "idea: invalid key size " + strconv.Itoa(int(k))
-}
-
-type ideaCipher struct {
-	ek [keyLen]uint16
-	dk [keyLen]uint16
-}
-
-// NewCipher returns a cipher.Block implementing the IDEA block cipher.  The key argument should be 16 bytes.
-func NewCipher(key []byte) (cipher.Block, error) {
-
-	if l := len(key); l != 16 {
-		return nil, KeySizeError(l)
-	}
-
-	cipher := &ideaCipher{}
-
-	expandKey(key, cipher.ek[:])
-
-	// key inversion is expensive, we could do this lazily
-	invertKey(cipher.ek[:], cipher.dk[:])
-
-	return cipher, nil
-}
-
-func (c *ideaCipher) BlockSize() int          { return 8 }
-func (c *ideaCipher) Encrypt(dst, src []byte) { crypt(src, dst, c.ek[:]) }
-func (c *ideaCipher) Decrypt(dst, src []byte) { crypt(src, dst, c.dk[:]) }
-
-// mulInv computes the multiplicative inverse of x mod 2^16+1
-func mulInv(x uint16) (ret uint16) {
-
-	if x <= 1 {
-		return x // 0 and 1 are self-inverse
-	}
-
-	t1 := uint16(0x10001 / uint32(x)) // Since x >= 2, this fits into 16 bits
-	y := uint16(0x10001 % uint32(x))
-
-	if y == 1 {
-		return 1 - t1
-	}
-
-	var t0 uint16 = 1
-	var q uint16
-
-	for y != 1 {
-		q = x / y
-		x = x % y
-		t0 += q * t1
-		if x == 1 {
-			return t0
-		}
-		q = y / x
-		y = y % x
-		t1 += q * t0
-	}
-	return 1 - t1
-}
-
-// mul computes x*y mod 2^16+1
-func mul(x, y uint16) uint16 {
-
-	if y == 0 {
-		return 1 - x
-	}
-
-	if x == 0 {
-		return 1 - y
-	}
-
-	t32 := uint32(x) * uint32(y)
-	x = uint16(t32)
-	y = uint16(t32 >> 16)
-
-	if x < y {
-		return x - y + 1
-	}
-
-	return x - y
-}
-
-// expandKey computes encryption round-keys from a user-supplied key
-func expandKey(key []byte, EK []uint16) {
-	var i, j int
-
-	for j = 0; j < 8; j++ {
-		EK[j] = (uint16(key[0]) << 8) + uint16(key[1])
-		key = key[2:]
-	}
-	for i = 0; j < keyLen; j++ {
-		i++
-		EK[i+7] = EK[i&7]<<9 | EK[(i+1)&7]>>7
-		EK = EK[i&8:]
-		i &= 7
-	}
-}
-
-// invertKey computes the decryption round-keys from a set of encryption round-keys
-func invertKey(EK []uint16, DK []uint16) {
-
-	var t1, t2, t3 uint16
-	var p [keyLen]uint16
-	pidx := keyLen
-	ekidx := 0
-
-	t1 = mulInv(EK[ekidx])
-	ekidx++
-	t2 = -EK[ekidx]
-	ekidx++
-	t3 = -EK[ekidx]
-	ekidx++
-	pidx--
-	p[pidx] = mulInv(EK[ekidx])
-	ekidx++
-	pidx--
-	p[pidx] = t3
-	pidx--
-	p[pidx] = t2
-	pidx--
-	p[pidx] = t1
-
-	for i := 0; i < rounds-1; i++ {
-		t1 = EK[ekidx]
-		ekidx++
-		pidx--
-		p[pidx] = EK[ekidx]
-		ekidx++
-		pidx--
-		p[pidx] = t1
-
-		t1 = mulInv(EK[ekidx])
-		ekidx++
-		t2 = -EK[ekidx]
-		ekidx++
-		t3 = -EK[ekidx]
-		ekidx++
-		pidx--
-		p[pidx] = mulInv(EK[ekidx])
-		ekidx++
-		pidx--
-		p[pidx] = t2
-		pidx--
-		p[pidx] = t3
-		pidx--
-		p[pidx] = t1
-	}
-
-	t1 = EK[ekidx]
-	ekidx++
-	pidx--
-	p[pidx] = EK[ekidx]
-	ekidx++
-	pidx--
-	p[pidx] = t1
-
-	t1 = mulInv(EK[ekidx])
-	ekidx++
-	t2 = -EK[ekidx]
-	ekidx++
-	t3 = -EK[ekidx]
-	ekidx++
-	pidx--
-	p[pidx] = mulInv(EK[ekidx])
-	pidx--
-	p[pidx] = t3
-	pidx--
-	p[pidx] = t2
-	pidx--
-	p[pidx] = t1
-
-	copy(DK, p[:])
-}
-
-// crypt performs IDEA encryption given input/output buffers and a set of round-keys
-func crypt(inbuf, outbuf []byte, key []uint16) {
-
-	var x1, x2, x3, x4, s2, s3 uint16
-
-	x1 = binary.BigEndian.Uint16(inbuf[0:])
-	x2 = binary.BigEndian.Uint16(inbuf[2:])
-	x3 = binary.BigEndian.Uint16(inbuf[4:])
-	x4 = binary.BigEndian.Uint16(inbuf[6:])
-
-	for r := rounds; r > 0; r-- {
-
-		x1 = mul(x1, key[0])
-		key = key[1:]
-		x2 += key[0]
-		key = key[1:]
-		x3 += key[0]
-		key = key[1:]
-
-		x4 = mul(x4, key[0])
-		key = key[1:]
-
-		s3 = x3
-		x3 ^= x1
-		x3 = mul(x3, key[0])
-		key = key[1:]
-		s2 = x2
-
-		x2 ^= x4
-		x2 += x3
-		x2 = mul(x2, key[0])
-		key = key[1:]
-		x3 += x2
-
-		x1 ^= x2
-		x4 ^= x3
-
-		x2 ^= s3
-		x3 ^= s2
-
-	}
-	x1 = mul(x1, key[0])
-	key = key[1:]
-
-	x3 += key[0]
-	key = key[1:]
-	x2 += key[0]
-	key = key[1:]
-	x4 = mul(x4, key[0])
-
-	binary.BigEndian.PutUint16(outbuf[0:], x1)
-	binary.BigEndian.PutUint16(outbuf[2:], x3)
-	binary.BigEndian.PutUint16(outbuf[4:], x2)
-	binary.BigEndian.PutUint16(outbuf[6:], x4)
-}
diff --git a/vendor/github.com/dgryski/go-rc2/LICENSE b/vendor/github.com/dgryski/go-rc2/LICENSE
deleted file mode 100644
index 039a2e4..0000000
--- a/vendor/github.com/dgryski/go-rc2/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2015 Damian Gryski <damian@gryski.com>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/vendor/github.com/dgryski/go-rc2/rc2.go b/vendor/github.com/dgryski/go-rc2/rc2.go
deleted file mode 100644
index aa194e5..0000000
--- a/vendor/github.com/dgryski/go-rc2/rc2.go
+++ /dev/null
@@ -1,284 +0,0 @@
-// Package rc2 implements the RC2 cipher
-/*
-https://www.ietf.org/rfc/rfc2268.txt
-http://people.csail.mit.edu/rivest/pubs/KRRR98.pdf
-
-This code is licensed under the MIT license.
-*/
-package rc2
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"strconv"
-)
-
-// The rc2 block size in bytes
-const BlockSize = 8
-
-type rc2Cipher struct {
-	k [64]uint16
-}
-
-// KeySizeError indicates the supplied key was invalid
-type KeySizeError int
-
-func (k KeySizeError) Error() string { return "rc2: invalid key size " + strconv.Itoa(int(k)) }
-
-// EffectiveKeySizeError indicates the supplied effective key length was invalid
-type EffectiveKeySizeError int
-
-func (k EffectiveKeySizeError) Error() string {
-	return "rc2: invalid effective key size " + strconv.Itoa(int(k))
-}
-
-// New returns a new rc2 cipher with the given key and effective key length t1
-func New(key []byte, t1 int) (cipher.Block, error) {
-	if l := len(key); l == 0 || l > 128 {
-		return nil, KeySizeError(l)
-	}
-
-	if t1 < 8 || t1 > 1024 {
-		return nil, EffectiveKeySizeError(t1)
-	}
-
-	return &rc2Cipher{
-		k: expandKey(key, t1),
-	}, nil
-}
-
-func (c *rc2Cipher) BlockSize() int { return BlockSize }
-
-var piTable = [256]byte{
-	0xd9, 0x78, 0xf9, 0xc4, 0x19, 0xdd, 0xb5, 0xed, 0x28, 0xe9, 0xfd, 0x79, 0x4a, 0xa0, 0xd8, 0x9d,
-	0xc6, 0x7e, 0x37, 0x83, 0x2b, 0x76, 0x53, 0x8e, 0x62, 0x4c, 0x64, 0x88, 0x44, 0x8b, 0xfb, 0xa2,
-	0x17, 0x9a, 0x59, 0xf5, 0x87, 0xb3, 0x4f, 0x13, 0x61, 0x45, 0x6d, 0x8d, 0x09, 0x81, 0x7d, 0x32,
-	0xbd, 0x8f, 0x40, 0xeb, 0x86, 0xb7, 0x7b, 0x0b, 0xf0, 0x95, 0x21, 0x22, 0x5c, 0x6b, 0x4e, 0x82,
-	0x54, 0xd6, 0x65, 0x93, 0xce, 0x60, 0xb2, 0x1c, 0x73, 0x56, 0xc0, 0x14, 0xa7, 0x8c, 0xf1, 0xdc,
-	0x12, 0x75, 0xca, 0x1f, 0x3b, 0xbe, 0xe4, 0xd1, 0x42, 0x3d, 0xd4, 0x30, 0xa3, 0x3c, 0xb6, 0x26,
-	0x6f, 0xbf, 0x0e, 0xda, 0x46, 0x69, 0x07, 0x57, 0x27, 0xf2, 0x1d, 0x9b, 0xbc, 0x94, 0x43, 0x03,
-	0xf8, 0x11, 0xc7, 0xf6, 0x90, 0xef, 0x3e, 0xe7, 0x06, 0xc3, 0xd5, 0x2f, 0xc8, 0x66, 0x1e, 0xd7,
-	0x08, 0xe8, 0xea, 0xde, 0x80, 0x52, 0xee, 0xf7, 0x84, 0xaa, 0x72, 0xac, 0x35, 0x4d, 0x6a, 0x2a,
-	0x96, 0x1a, 0xd2, 0x71, 0x5a, 0x15, 0x49, 0x74, 0x4b, 0x9f, 0xd0, 0x5e, 0x04, 0x18, 0xa4, 0xec,
-	0xc2, 0xe0, 0x41, 0x6e, 0x0f, 0x51, 0xcb, 0xcc, 0x24, 0x91, 0xaf, 0x50, 0xa1, 0xf4, 0x70, 0x39,
-	0x99, 0x7c, 0x3a, 0x85, 0x23, 0xb8, 0xb4, 0x7a, 0xfc, 0x02, 0x36, 0x5b, 0x25, 0x55, 0x97, 0x31,
-	0x2d, 0x5d, 0xfa, 0x98, 0xe3, 0x8a, 0x92, 0xae, 0x05, 0xdf, 0x29, 0x10, 0x67, 0x6c, 0xba, 0xc9,
-	0xd3, 0x00, 0xe6, 0xcf, 0xe1, 0x9e, 0xa8, 0x2c, 0x63, 0x16, 0x01, 0x3f, 0x58, 0xe2, 0x89, 0xa9,
-	0x0d, 0x38, 0x34, 0x1b, 0xab, 0x33, 0xff, 0xb0, 0xbb, 0x48, 0x0c, 0x5f, 0xb9, 0xb1, 0xcd, 0x2e,
-	0xc5, 0xf3, 0xdb, 0x47, 0xe5, 0xa5, 0x9c, 0x77, 0x0a, 0xa6, 0x20, 0x68, 0xfe, 0x7f, 0xc1, 0xad,
-}
-
-func expandKey(key []byte, t1 int) [64]uint16 {
-
-	l := make([]byte, 128)
-	copy(l, key)
-
-	var t = len(key)
-	var t8 = (t1 + 7) / 8
-	var tm = byte(255 % uint(1<<(8+uint(t1)-8*uint(t8))))
-
-	for i := len(key); i < 128; i++ {
-		l[i] = piTable[l[i-1]+l[uint8(i-t)]]
-	}
-
-	l[128-t8] = piTable[l[128-t8]&tm]
-
-	for i := 127 - t8; i >= 0; i-- {
-		l[i] = piTable[l[i+1]^l[i+t8]]
-	}
-
-	var k [64]uint16
-
-	for i := range k {
-		k[i] = uint16(l[2*i]) + uint16(l[2*i+1])*256
-	}
-
-	return k
-}
-
-func rotl16(x uint16, b uint) uint16 {
-	return (x >> (16 - b)) | (x << b)
-}
-
-func (c *rc2Cipher) Encrypt(dst, src []byte) {
-
-	r0 := binary.LittleEndian.Uint16(src[0:])
-	r1 := binary.LittleEndian.Uint16(src[2:])
-	r2 := binary.LittleEndian.Uint16(src[4:])
-	r3 := binary.LittleEndian.Uint16(src[6:])
-
-	var j int
-
-	// These three mix blocks have not been extracted to a common function for to performance reasons.
-	for j <= 16 {
-		// mix r0
-		r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1)
-		r0 = rotl16(r0, 1)
-		j++
-
-		// mix r1
-		r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2)
-		r1 = rotl16(r1, 2)
-		j++
-
-		// mix r2
-		r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3)
-		r2 = rotl16(r2, 3)
-		j++
-
-		// mix r3
-		r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0)
-		r3 = rotl16(r3, 5)
-		j++
-	}
-
-	r0 = r0 + c.k[r3&63]
-	r1 = r1 + c.k[r0&63]
-	r2 = r2 + c.k[r1&63]
-	r3 = r3 + c.k[r2&63]
-
-	for j <= 40 {
-		// mix r0
-		r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1)
-		r0 = rotl16(r0, 1)
-		j++
-
-		// mix r1
-		r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2)
-		r1 = rotl16(r1, 2)
-		j++
-
-		// mix r2
-		r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3)
-		r2 = rotl16(r2, 3)
-		j++
-
-		// mix r3
-		r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0)
-		r3 = rotl16(r3, 5)
-		j++
-	}
-
-	r0 = r0 + c.k[r3&63]
-	r1 = r1 + c.k[r0&63]
-	r2 = r2 + c.k[r1&63]
-	r3 = r3 + c.k[r2&63]
-
-	for j <= 60 {
-		// mix r0
-		r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1)
-		r0 = rotl16(r0, 1)
-		j++
-
-		// mix r1
-		r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2)
-		r1 = rotl16(r1, 2)
-		j++
-
-		// mix r2
-		r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3)
-		r2 = rotl16(r2, 3)
-		j++
-
-		// mix r3
-		r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0)
-		r3 = rotl16(r3, 5)
-		j++
-	}
-
-	binary.LittleEndian.PutUint16(dst[0:], r0)
-	binary.LittleEndian.PutUint16(dst[2:], r1)
-	binary.LittleEndian.PutUint16(dst[4:], r2)
-	binary.LittleEndian.PutUint16(dst[6:], r3)
-}
-
-func (c *rc2Cipher) Decrypt(dst, src []byte) {
-
-	r0 := binary.LittleEndian.Uint16(src[0:])
-	r1 := binary.LittleEndian.Uint16(src[2:])
-	r2 := binary.LittleEndian.Uint16(src[4:])
-	r3 := binary.LittleEndian.Uint16(src[6:])
-
-	j := 63
-
-	for j >= 44 {
-		// unmix r3
-		r3 = rotl16(r3, 16-5)
-		r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0)
-		j--
-
-		// unmix r2
-		r2 = rotl16(r2, 16-3)
-		r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3)
-		j--
-
-		// unmix r1
-		r1 = rotl16(r1, 16-2)
-		r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2)
-		j--
-
-		// unmix r0
-		r0 = rotl16(r0, 16-1)
-		r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1)
-		j--
-	}
-
-	r3 = r3 - c.k[r2&63]
-	r2 = r2 - c.k[r1&63]
-	r1 = r1 - c.k[r0&63]
-	r0 = r0 - c.k[r3&63]
-
-	for j >= 20 {
-		// unmix r3
-		r3 = rotl16(r3, 16-5)
-		r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0)
-		j--
-
-		// unmix r2
-		r2 = rotl16(r2, 16-3)
-		r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3)
-		j--
-
-		// unmix r1
-		r1 = rotl16(r1, 16-2)
-		r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2)
-		j--
-
-		// unmix r0
-		r0 = rotl16(r0, 16-1)
-		r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1)
-		j--
-	}
-
-	r3 = r3 - c.k[r2&63]
-	r2 = r2 - c.k[r1&63]
-	r1 = r1 - c.k[r0&63]
-	r0 = r0 - c.k[r3&63]
-
-	for j >= 0 {
-		// unmix r3
-		r3 = rotl16(r3, 16-5)
-		r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0)
-		j--
-
-		// unmix r2
-		r2 = rotl16(r2, 16-3)
-		r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3)
-		j--
-
-		// unmix r1
-		r1 = rotl16(r1, 16-2)
-		r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2)
-		j--
-
-		// unmix r0
-		r0 = rotl16(r0, 16-1)
-		r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1)
-		j--
-	}
-
-	binary.LittleEndian.PutUint16(dst[0:], r0)
-	binary.LittleEndian.PutUint16(dst[2:], r1)
-	binary.LittleEndian.PutUint16(dst[4:], r2)
-	binary.LittleEndian.PutUint16(dst[6:], r3)
-}
diff --git a/vendor/github.com/nadoo/conflag/LICENSE b/vendor/github.com/nadoo/conflag/LICENSE
deleted file mode 100644
index 9cecc1d..0000000
--- a/vendor/github.com/nadoo/conflag/LICENSE
+++ /dev/null
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    {one line to give the program's name and a brief idea of what it does.}
-    Copyright (C) {year}  {name of author}
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    {project}  Copyright (C) {year}  {fullname}
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/vendor/github.com/nadoo/conflag/README.md b/vendor/github.com/nadoo/conflag/README.md
deleted file mode 100644
index 6a1b368..0000000
--- a/vendor/github.com/nadoo/conflag/README.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# conflag
-conflag is a config file and command line parser based on Go's standard flag package.
-
-## Usage
-
-### Your code:
-```Go
-package main
-
-import (
-	"fmt"
-
-	"github.com/nadoo/conflag"
-)
-
-var conf struct {
-	Name string
-	Age  int
-	Male bool
-}
-
-func main() {
-	// get a new conflag instance
-	flag := conflag.New()
-
-	// setup flags as the standard flag package
-	flag.StringVar(&conf.Name, "name", "", "your name")
-	flag.IntVar(&conf.Age, "age", 0, "your age")
-	flag.BoolVar(&conf.Male, "male", false, "your sex")
-
-	// parse before access flags
-	flag.Parse()
-
-	// now you're able to get the parsed flag values
-	fmt.Printf("  Name: %s\n", conf.Name)
-	fmt.Printf("  Age: %d\n", conf.Age)
-	fmt.Printf("  Male: %v\n", conf.Male)
-}
-```
-
-### Run without config file:
-command:
-```bash
-sample -name Jay -age 30
-```
-output:
-```bash
-  Name: Jay
-  Age: 30
-  Male: false
-```
-
-### Run with config file(-config):
-sample.conf:
-```bash
-name=Jason
-age=20
-male
-```
-command: **use "-config" flag to specify the config file path.**
-```bash
-sample -config sample.conf
-```
-output:
-```bash
-  Name: Jason
-  Age: 20
-  Male: true
-```
-
-### Run with config file and OVERRIDE a flag value using commandline:
-sample.conf:
-```bash
-name=Jason
-age=20
-male
-```
-command:
-```bash
-sample -config sample.conf -name Michael
-```
-output:
-```bash
-  Name: Michael
-  Age: 20
-  Male: true
-```
-
-## Config File
-- format: KEY=VALUE
-
-**just use the command line flag name as the key name**:
-
-```bash
-## config file
-# comment line starts with "#"
-
-# format:
-#KEY=VALUE, 
-# just use the command line flag name as the key name
-
-# your name
-name=Jason
-
-# your age
-age=20
-
-# are you male?
-male
-```
-See [simple.conf](examples/simple/simple.conf)
\ No newline at end of file
diff --git a/vendor/github.com/nadoo/conflag/conflag.go b/vendor/github.com/nadoo/conflag/conflag.go
deleted file mode 100644
index c011540..0000000
--- a/vendor/github.com/nadoo/conflag/conflag.go
+++ /dev/null
@@ -1,151 +0,0 @@
-package conflag
-
-import (
-	"bufio"
-	"flag"
-	"os"
-	"path/filepath"
-	"strings"
-)
-
-// Conflag .
-type Conflag struct {
-	*flag.FlagSet
-
-	app     string
-	osArgs  []string
-	cfgFile string
-	args    []string
-
-	includes []string
-
-	// TODO: add shorthand? or just use pflag?
-	// shorthand map[byte]string
-}
-
-// New ...
-func New(args ...string) *Conflag {
-	if args == nil {
-		args = os.Args
-	}
-
-	c := &Conflag{}
-	c.app = args[0]
-	c.osArgs = args[1:]
-	c.FlagSet = flag.NewFlagSet(c.app, flag.ExitOnError)
-	c.FlagSet.StringVar(&c.cfgFile, "config", "", "config file path")
-
-	return c
-}
-
-// NewFromFile ...
-func NewFromFile(app, cfgFile string) *Conflag {
-	c := &Conflag{}
-
-	if app != "" {
-		c.app = app
-	} else {
-		c.app = os.Args[0]
-	}
-
-	c.cfgFile = cfgFile
-	c.FlagSet = flag.NewFlagSet(c.app, flag.ExitOnError)
-
-	c.StringSliceUniqVar(&c.includes, "include", nil, "include file")
-
-	return c
-}
-
-// Parse ...
-func (c *Conflag) Parse() (err error) {
-	// parse 1st time and see whether there is a conf file.
-	err = c.FlagSet.Parse(c.osArgs)
-	if err != nil {
-		return err
-	}
-
-	// if there is no args, just try to load the app.conf file.
-	if c.cfgFile == "" && len(c.osArgs) == 0 {
-		// trim app exetension
-		for i := len(c.app) - 1; i >= 0 && c.app[i] != '/' && c.app[i] != '\\'; i-- {
-			if c.app[i] == '.' {
-				c.cfgFile = c.app[:i]
-				break
-			}
-		}
-
-		if c.cfgFile == "" {
-			c.cfgFile = c.app
-		}
-
-		c.cfgFile += ".conf"
-	}
-
-	if c.cfgFile == "" {
-		return nil
-	}
-
-	fargs, err := parseFile(c.cfgFile)
-	if err != nil {
-		return err
-	}
-
-	c.args = fargs
-	c.args = append(c.args, c.osArgs...)
-
-	// parse 2nd time to get the include file values
-	err = c.FlagSet.Parse(c.args)
-	if err != nil {
-		return err
-	}
-
-	dir := filepath.Dir(c.cfgFile)
-
-	// parse 3rd time to parse flags in include file
-	for _, include := range c.includes {
-		include = filepath.Join(dir, include)
-		fargs, err := parseFile(include)
-		if err != nil {
-			return err
-		}
-
-		c.args = fargs
-		c.args = append(c.args, c.osArgs...)
-
-		err = c.FlagSet.Parse(c.args)
-	}
-
-	return err
-}
-
-func parseFile(cfgFile string) ([]string, error) {
-	var s []string
-
-	fp, err := os.Open(cfgFile)
-	if err != nil {
-		return nil, err
-	}
-	defer fp.Close()
-
-	scanner := bufio.NewScanner(fp)
-	for scanner.Scan() {
-		line := scanner.Text()
-		line = strings.TrimSpace(line)
-		if len(line) == 0 || line[:1] == "#" {
-			continue
-		}
-		s = append(s, "-"+line)
-	}
-
-	return s, nil
-}
-
-// AppDir returns the app dir
-func (c *Conflag) AppDir() string {
-	return filepath.Dir(os.Args[0])
-}
-
-// ConfDir returns the config file dir
-func (c *Conflag) ConfDir() string {
-	return filepath.Dir(c.cfgFile)
-}
diff --git a/vendor/github.com/nadoo/conflag/string_slice.go b/vendor/github.com/nadoo/conflag/string_slice.go
deleted file mode 100644
index e7a02f1..0000000
--- a/vendor/github.com/nadoo/conflag/string_slice.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// source: https://github.com/spf13/pflag/blob/master/string_slice.go
-
-package conflag
-
-type stringSliceValue struct {
-	value   *[]string
-	changed bool
-}
-
-func newStringSliceValue(val []string, p *[]string) *stringSliceValue {
-	ssv := new(stringSliceValue)
-	ssv.value = p
-	*ssv.value = val
-	return ssv
-}
-
-func (s *stringSliceValue) Set(val string) error {
-	if !s.changed {
-		*s.value = []string{val}
-		s.changed = true
-	} else {
-		*s.value = append(*s.value, val)
-	}
-	return nil
-}
-
-func (s *stringSliceValue) Type() string {
-	return "stringSlice"
-}
-
-func (s *stringSliceValue) String() string {
-	return ""
-}
-
-// StringSliceVar defines a string flag with specified name, default value, and usage string.
-// The argument p points to a []string variable in which to store the value of the flag.
-func (c *Conflag) StringSliceVar(p *[]string, name string, value []string, usage string) {
-	c.Var(newStringSliceValue(value, p), name, usage)
-}
-
-// StringSlice defines a string flag with specified name, default value, and usage string.
-// The return value is the address of a []string variable that stores the value of the flag.
-func (c *Conflag) StringSlice(name string, value []string, usage string) *[]string {
-	p := []string{}
-	c.StringSliceVar(&p, name, value, usage)
-	return &p
-}
diff --git a/vendor/github.com/nadoo/conflag/string_slice_uniq.go b/vendor/github.com/nadoo/conflag/string_slice_uniq.go
deleted file mode 100644
index 70d7389..0000000
--- a/vendor/github.com/nadoo/conflag/string_slice_uniq.go
+++ /dev/null
@@ -1,51 +0,0 @@
-package conflag
-
-type stringSliceUniqValue struct {
-	*stringSliceValue
-}
-
-func newStringSliceUniqValue(val []string, p *[]string) *stringSliceUniqValue {
-	return &stringSliceUniqValue{stringSliceValue: newStringSliceValue(val, p)}
-}
-
-func (s *stringSliceUniqValue) Set(val string) error {
-	if !s.changed {
-		*s.value = []string{val}
-		s.changed = true
-	}
-
-	dup := false
-	for _, v := range *s.value {
-		if v == val {
-			dup = true
-		}
-	}
-
-	if !dup {
-		*s.value = append(*s.value, val)
-	}
-
-	return nil
-}
-
-func (s *stringSliceUniqValue) Type() string {
-	return "stringSliceUniq"
-}
-
-func (s *stringSliceUniqValue) String() string {
-	return ""
-}
-
-// StringSliceUniqVar defines a string flag with specified name, default value, and usage string.
-// The argument p points to a []string variable in which to store the value of the flag.
-func (c *Conflag) StringSliceUniqVar(p *[]string, name string, value []string, usage string) {
-	c.Var(newStringSliceUniqValue(value, p), name, usage)
-}
-
-// StringUniqSlice defines a string flag with specified name, default value, and usage string.
-// The return value is the address of a []string variable that stores the value of the flag.
-func (c *Conflag) StringUniqSlice(name string, value []string, usage string) *[]string {
-	p := []string{}
-	c.StringSliceUniqVar(&p, name, value, usage)
-	return &p
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE b/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE
deleted file mode 100644
index d645695..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go
deleted file mode 100644
index c672510..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go
+++ /dev/null
@@ -1,144 +0,0 @@
-package core
-
-import (
-	"crypto/md5"
-	"errors"
-	"net"
-	"sort"
-	"strings"
-
-	"github.com/shadowsocks/go-shadowsocks2/shadowaead"
-	"github.com/shadowsocks/go-shadowsocks2/shadowstream"
-)
-
-type Cipher interface {
-	StreamConnCipher
-	PacketConnCipher
-}
-
-type StreamConnCipher interface {
-	StreamConn(net.Conn) net.Conn
-}
-
-type PacketConnCipher interface {
-	PacketConn(net.PacketConn) net.PacketConn
-}
-
-// ErrCipherNotSupported occurs when a cipher is not supported (likely because of security concerns).
-var ErrCipherNotSupported = errors.New("cipher not supported")
-
-// List of AEAD ciphers: key size in bytes and constructor
-var aeadList = map[string]struct {
-	KeySize int
-	New     func([]byte) (shadowaead.Cipher, error)
-}{
-	"AEAD_AES_128_GCM":       {16, shadowaead.AESGCM},
-	"AEAD_AES_192_GCM":       {24, shadowaead.AESGCM},
-	"AEAD_AES_256_GCM":       {32, shadowaead.AESGCM},
-	"AEAD_CHACHA20_POLY1305": {32, shadowaead.Chacha20Poly1305},
-}
-
-// List of stream ciphers: key size in bytes and constructor
-var streamList = map[string]struct {
-	KeySize int
-	New     func(key []byte) (shadowstream.Cipher, error)
-}{
-	"AES-128-CTR":   {16, shadowstream.AESCTR},
-	"AES-192-CTR":   {24, shadowstream.AESCTR},
-	"AES-256-CTR":   {32, shadowstream.AESCTR},
-	"AES-128-CFB":   {16, shadowstream.AESCFB},
-	"AES-192-CFB":   {24, shadowstream.AESCFB},
-	"AES-256-CFB":   {32, shadowstream.AESCFB},
-	"CHACHA20-IETF": {32, shadowstream.Chacha20IETF},
-	"XCHACHA20":     {32, shadowstream.Xchacha20},
-}
-
-// ListCipher returns a list of available cipher names sorted alphabetically.
-func ListCipher() []string {
-	var l []string
-	for k := range aeadList {
-		l = append(l, k)
-	}
-	for k := range streamList {
-		l = append(l, k)
-	}
-	sort.Strings(l)
-	return l
-}
-
-// PickCipher returns a Cipher of the given name. Derive key from password if given key is empty.
-func PickCipher(name string, key []byte, password string) (Cipher, error) {
-	name = strings.ToUpper(name)
-
-	switch name {
-	case "DUMMY":
-		return &dummy{}, nil
-	case "CHACHA20-IETF-POLY1305":
-		name = "AEAD_CHACHA20_POLY1305"
-	case "AES-128-GCM":
-		name = "AEAD_AES_128_GCM"
-	case "AES-196-GCM":
-		name = "AEAD_AES_196_GCM"
-	case "AES-256-GCM":
-		name = "AEAD_AES_256_GCM"
-	}
-
-	if choice, ok := aeadList[name]; ok {
-		if len(key) == 0 {
-			key = kdf(password, choice.KeySize)
-		}
-		if len(key) != choice.KeySize {
-			return nil, shadowaead.KeySizeError(choice.KeySize)
-		}
-		aead, err := choice.New(key)
-		return &aeadCipher{aead}, err
-	}
-
-	if choice, ok := streamList[name]; ok {
-		if len(key) == 0 {
-			key = kdf(password, choice.KeySize)
-		}
-		if len(key) != choice.KeySize {
-			return nil, shadowstream.KeySizeError(choice.KeySize)
-		}
-		ciph, err := choice.New(key)
-		return &streamCipher{ciph}, err
-	}
-
-	return nil, ErrCipherNotSupported
-}
-
-type aeadCipher struct{ shadowaead.Cipher }
-
-func (aead *aeadCipher) StreamConn(c net.Conn) net.Conn { return shadowaead.NewConn(c, aead) }
-func (aead *aeadCipher) PacketConn(c net.PacketConn) net.PacketConn {
-	return shadowaead.NewPacketConn(c, aead)
-}
-
-type streamCipher struct{ shadowstream.Cipher }
-
-func (ciph *streamCipher) StreamConn(c net.Conn) net.Conn { return shadowstream.NewConn(c, ciph) }
-func (ciph *streamCipher) PacketConn(c net.PacketConn) net.PacketConn {
-	return shadowstream.NewPacketConn(c, ciph)
-}
-
-// dummy cipher does not encrypt
-
-type dummy struct{}
-
-func (dummy) StreamConn(c net.Conn) net.Conn             { return c }
-func (dummy) PacketConn(c net.PacketConn) net.PacketConn { return c }
-
-// key-derivation function from original Shadowsocks
-func kdf(password string, keyLen int) []byte {
-	var b, prev []byte
-	h := md5.New()
-	for len(b) < keyLen {
-		h.Write(prev)
-		h.Write([]byte(password))
-		b = h.Sum(b)
-		prev = b[len(b)-h.Size():]
-		h.Reset()
-	}
-	return b[:keyLen]
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go
deleted file mode 100644
index 4001c10..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go
+++ /dev/null
@@ -1,2 +0,0 @@
-// Package core implements essential parts of Shadowsocks
-package core
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go
deleted file mode 100644
index 641aa13..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package core
-
-import "net"
-
-func ListenPacket(network, address string, ciph PacketConnCipher) (net.PacketConn, error) {
-	c, err := net.ListenPacket(network, address)
-	return ciph.PacketConn(c), err
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go
deleted file mode 100644
index 5c773cd..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go
+++ /dev/null
@@ -1,23 +0,0 @@
-package core
-
-import "net"
-
-type listener struct {
-	net.Listener
-	StreamConnCipher
-}
-
-func Listen(network, address string, ciph StreamConnCipher) (net.Listener, error) {
-	l, err := net.Listen(network, address)
-	return &listener{l, ciph}, err
-}
-
-func (l *listener) Accept() (net.Conn, error) {
-	c, err := l.Listener.Accept()
-	return l.StreamConn(c), err
-}
-
-func Dial(network, address string, ciph StreamConnCipher) (net.Conn, error) {
-	c, err := net.Dial(network, address)
-	return ciph.StreamConn(c), err
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go
deleted file mode 100644
index 19410df..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go
+++ /dev/null
@@ -1,83 +0,0 @@
-package shadowaead
-
-import (
-	"crypto/aes"
-	"crypto/cipher"
-	"crypto/sha1"
-	"io"
-	"strconv"
-
-	"golang.org/x/crypto/chacha20poly1305"
-	"golang.org/x/crypto/hkdf"
-)
-
-type Cipher interface {
-	KeySize() int
-	SaltSize() int
-	Encrypter(salt []byte) (cipher.AEAD, error)
-	Decrypter(salt []byte) (cipher.AEAD, error)
-}
-
-type KeySizeError int
-
-func (e KeySizeError) Error() string {
-	return "key size error: need " + strconv.Itoa(int(e)) + " bytes"
-}
-
-func hkdfSHA1(secret, salt, info, outkey []byte) {
-	r := hkdf.New(sha1.New, secret, salt, info)
-	if _, err := io.ReadFull(r, outkey); err != nil {
-		panic(err) // should never happen
-	}
-}
-
-type metaCipher struct {
-	psk      []byte
-	makeAEAD func(key []byte) (cipher.AEAD, error)
-}
-
-func (a *metaCipher) KeySize() int { return len(a.psk) }
-func (a *metaCipher) SaltSize() int {
-	if ks := a.KeySize(); ks > 16 {
-		return ks
-	}
-	return 16
-}
-func (a *metaCipher) Encrypter(salt []byte) (cipher.AEAD, error) {
-	subkey := make([]byte, a.KeySize())
-	hkdfSHA1(a.psk, salt, []byte("ss-subkey"), subkey)
-	return a.makeAEAD(subkey)
-}
-func (a *metaCipher) Decrypter(salt []byte) (cipher.AEAD, error) {
-	subkey := make([]byte, a.KeySize())
-	hkdfSHA1(a.psk, salt, []byte("ss-subkey"), subkey)
-	return a.makeAEAD(subkey)
-}
-
-func aesGCM(key []byte) (cipher.AEAD, error) {
-	blk, err := aes.NewCipher(key)
-	if err != nil {
-		return nil, err
-	}
-	return cipher.NewGCM(blk)
-}
-
-// AESGCM creates a new Cipher with a pre-shared key. len(psk) must be
-// one of 16, 24, or 32 to select AES-128/196/256-GCM.
-func AESGCM(psk []byte) (Cipher, error) {
-	switch l := len(psk); l {
-	case 16, 24, 32: // AES 128/196/256
-	default:
-		return nil, aes.KeySizeError(l)
-	}
-	return &metaCipher{psk: psk, makeAEAD: aesGCM}, nil
-}
-
-// Chacha20Poly1305 creates a new Cipher with a pre-shared key. len(psk)
-// must be 32.
-func Chacha20Poly1305(psk []byte) (Cipher, error) {
-	if len(psk) != chacha20poly1305.KeySize {
-		return nil, KeySizeError(chacha20poly1305.KeySize)
-	}
-	return &metaCipher{psk: psk, makeAEAD: chacha20poly1305.New}, nil
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go
deleted file mode 100644
index 8d1e286..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-Package shadowaead implements a simple AEAD-protected secure protocol.
-
-In general, there are two types of connections: stream-oriented and packet-oriented.
-Stream-oriented connections (e.g. TCP) assume reliable and orderly delivery of bytes.
-Packet-oriented connections (e.g. UDP) assume unreliable and out-of-order delivery of packets,
-where each packet is either delivered intact or lost.
-
-An encrypted stream starts with a random salt to derive a session key, followed by any number of
-encrypted records. Each encrypted record has the following structure:
-
-    [encrypted payload length]
-    [payload length tag]
-    [encrypted payload]
-    [payload tag]
-
-Payload length is 2-byte unsigned big-endian integer capped at 0x3FFF (16383).
-The higher 2 bits are reserved and must be set to zero. The first AEAD encrypt/decrypt
-operation uses a counting nonce starting from 0. After each encrypt/decrypt operation,
-the nonce is incremented by one as if it were an unsigned little-endian integer.
-
-
-Each encrypted packet transmitted on a packet-oriented connection has the following structure:
-
-    [random salt]
-    [encrypted payload]
-    [payload tag]
-
-The salt is used to derive a subkey to initiate an AEAD. Packets are encrypted/decrypted independently
-using zero nonce.
-
-In both stream-oriented and packet-oriented connections, length of nonce and tag varies
-depending on which AEAD is used. Salt should be at least 16-byte long.
-*/
-package shadowaead
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go
deleted file mode 100644
index d8f20a4..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go
+++ /dev/null
@@ -1,93 +0,0 @@
-package shadowaead
-
-import (
-	"crypto/rand"
-	"errors"
-	"io"
-	"net"
-	"sync"
-)
-
-// ErrShortPacket means that the packet is too short for a valid encrypted packet.
-var ErrShortPacket = errors.New("short packet")
-
-var _zerononce [128]byte // read-only. 128 bytes is more than enough.
-
-// Pack encrypts plaintext using Cipher with a randomly generated salt and
-// returns a slice of dst containing the encrypted packet and any error occurred.
-// Ensure len(dst) >= ciph.SaltSize() + len(plaintext) + aead.Overhead().
-func Pack(dst, plaintext []byte, ciph Cipher) ([]byte, error) {
-	saltSize := ciph.SaltSize()
-	salt := dst[:saltSize]
-	if _, err := io.ReadFull(rand.Reader, salt); err != nil {
-		return nil, err
-	}
-
-	aead, err := ciph.Encrypter(salt)
-	if err != nil {
-		return nil, err
-	}
-
-	if len(dst) < saltSize+len(plaintext)+aead.Overhead() {
-		return nil, io.ErrShortBuffer
-	}
-	b := aead.Seal(dst[saltSize:saltSize], _zerononce[:aead.NonceSize()], plaintext, nil)
-	return dst[:saltSize+len(b)], nil
-}
-
-// Unpack decrypts pkt using Cipher and returns a slice of dst containing the decrypted payload and any error occurred.
-// Ensure len(dst) >= len(pkt) - aead.SaltSize() - aead.Overhead().
-func Unpack(dst, pkt []byte, ciph Cipher) ([]byte, error) {
-	saltSize := ciph.SaltSize()
-	if len(pkt) < saltSize {
-		return nil, ErrShortPacket
-	}
-	salt := pkt[:saltSize]
-	aead, err := ciph.Decrypter(salt)
-	if err != nil {
-		return nil, err
-	}
-	if len(pkt) < saltSize+aead.Overhead() {
-		return nil, ErrShortPacket
-	}
-	if saltSize+len(dst)+aead.Overhead() < len(pkt) {
-		return nil, io.ErrShortBuffer
-	}
-	b, err := aead.Open(dst[:0], _zerononce[:aead.NonceSize()], pkt[saltSize:], nil)
-	return b, err
-}
-
-type packetConn struct {
-	net.PacketConn
-	Cipher
-	sync.Mutex
-	buf []byte // write lock
-}
-
-// NewPacketConn wraps a net.PacketConn with cipher
-func NewPacketConn(c net.PacketConn, ciph Cipher) net.PacketConn {
-	const maxPacketSize = 64 * 1024
-	return &packetConn{PacketConn: c, Cipher: ciph, buf: make([]byte, maxPacketSize)}
-}
-
-// WriteTo encrypts b and write to addr using the embedded PacketConn.
-func (c *packetConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	c.Lock()
-	defer c.Unlock()
-	buf, err := Pack(c.buf, b, c)
-	if err != nil {
-		return 0, err
-	}
-	_, err = c.PacketConn.WriteTo(buf, addr)
-	return len(b), err
-}
-
-// ReadFrom reads from the embedded PacketConn and decrypts into b.
-func (c *packetConn) ReadFrom(b []byte) (int, net.Addr, error) {
-	n, addr, err := c.PacketConn.ReadFrom(b)
-	if err != nil {
-		return n, addr, err
-	}
-	b, err = Unpack(b, b[:n], c)
-	return len(b), addr, err
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go
deleted file mode 100644
index 5f499a2..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go
+++ /dev/null
@@ -1,270 +0,0 @@
-package shadowaead
-
-import (
-	"bytes"
-	"crypto/cipher"
-	"crypto/rand"
-	"io"
-	"net"
-)
-
-// payloadSizeMask is the maximum size of payload in bytes.
-const payloadSizeMask = 0x3FFF // 16*1024 - 1
-
-type writer struct {
-	io.Writer
-	cipher.AEAD
-	nonce []byte
-	buf   []byte
-}
-
-// NewWriter wraps an io.Writer with AEAD encryption.
-func NewWriter(w io.Writer, aead cipher.AEAD) io.Writer { return newWriter(w, aead) }
-
-func newWriter(w io.Writer, aead cipher.AEAD) *writer {
-	return &writer{
-		Writer: w,
-		AEAD:   aead,
-		buf:    make([]byte, 2+aead.Overhead()+payloadSizeMask+aead.Overhead()),
-		nonce:  make([]byte, aead.NonceSize()),
-	}
-}
-
-// Write encrypts b and writes to the embedded io.Writer.
-func (w *writer) Write(b []byte) (int, error) {
-	n, err := w.ReadFrom(bytes.NewBuffer(b))
-	return int(n), err
-}
-
-// ReadFrom reads from the given io.Reader until EOF or error, encrypts and
-// writes to the embedded io.Writer. Returns number of bytes read from r and
-// any error encountered.
-func (w *writer) ReadFrom(r io.Reader) (n int64, err error) {
-	for {
-		buf := w.buf
-		payloadBuf := buf[2+w.Overhead() : 2+w.Overhead()+payloadSizeMask]
-		nr, er := r.Read(payloadBuf)
-
-		if nr > 0 {
-			n += int64(nr)
-			buf = buf[:2+w.Overhead()+nr+w.Overhead()]
-			payloadBuf = payloadBuf[:nr]
-			buf[0], buf[1] = byte(nr>>8), byte(nr) // big-endian payload size
-			w.Seal(buf[:0], w.nonce, buf[:2], nil)
-			increment(w.nonce)
-
-			w.Seal(payloadBuf[:0], w.nonce, payloadBuf, nil)
-			increment(w.nonce)
-
-			_, ew := w.Writer.Write(buf)
-			if ew != nil {
-				err = ew
-				break
-			}
-		}
-
-		if er != nil {
-			if er != io.EOF { // ignore EOF as per io.ReaderFrom contract
-				err = er
-			}
-			break
-		}
-	}
-
-	return n, err
-}
-
-type reader struct {
-	io.Reader
-	cipher.AEAD
-	nonce    []byte
-	buf      []byte
-	leftover []byte
-}
-
-// NewReader wraps an io.Reader with AEAD decryption.
-func NewReader(r io.Reader, aead cipher.AEAD) io.Reader { return newReader(r, aead) }
-
-func newReader(r io.Reader, aead cipher.AEAD) *reader {
-	return &reader{
-		Reader: r,
-		AEAD:   aead,
-		buf:    make([]byte, payloadSizeMask+aead.Overhead()),
-		nonce:  make([]byte, aead.NonceSize()),
-	}
-}
-
-// read and decrypt a record into the internal buffer. Return decrypted payload length and any error encountered.
-func (r *reader) read() (int, error) {
-	// decrypt payload size
-	buf := r.buf[:2+r.Overhead()]
-	_, err := io.ReadFull(r.Reader, buf)
-	if err != nil {
-		return 0, err
-	}
-
-	_, err = r.Open(buf[:0], r.nonce, buf, nil)
-	increment(r.nonce)
-	if err != nil {
-		return 0, err
-	}
-
-	size := (int(buf[0])<<8 + int(buf[1])) & payloadSizeMask
-
-	// decrypt payload
-	buf = r.buf[:size+r.Overhead()]
-	_, err = io.ReadFull(r.Reader, buf)
-	if err != nil {
-		return 0, err
-	}
-
-	_, err = r.Open(buf[:0], r.nonce, buf, nil)
-	increment(r.nonce)
-	if err != nil {
-		return 0, err
-	}
-
-	return size, nil
-}
-
-// Read reads from the embedded io.Reader, decrypts and writes to b.
-func (r *reader) Read(b []byte) (int, error) {
-	// copy decrypted bytes (if any) from previous record first
-	if len(r.leftover) > 0 {
-		n := copy(b, r.leftover)
-		r.leftover = r.leftover[n:]
-		return n, nil
-	}
-
-	n, err := r.read()
-	m := copy(b, r.buf[:n])
-	if m < n { // insufficient len(b), keep leftover for next read
-		r.leftover = r.buf[m:n]
-	}
-	return m, err
-}
-
-// WriteTo reads from the embedded io.Reader, decrypts and writes to w until
-// there's no more data to write or when an error occurs. Return number of
-// bytes written to w and any error encountered.
-func (r *reader) WriteTo(w io.Writer) (n int64, err error) {
-	// write decrypted bytes left over from previous record
-	for len(r.leftover) > 0 {
-		nw, ew := w.Write(r.leftover)
-		r.leftover = r.leftover[nw:]
-		n += int64(nw)
-		if ew != nil {
-			return n, ew
-		}
-	}
-
-	for {
-		nr, er := r.read()
-		if nr > 0 {
-			nw, ew := w.Write(r.buf[:nr])
-			n += int64(nw)
-
-			if ew != nil {
-				err = ew
-				break
-			}
-		}
-
-		if er != nil {
-			if er != io.EOF { // ignore EOF as per io.Copy contract (using src.WriteTo shortcut)
-				err = er
-			}
-			break
-		}
-	}
-
-	return n, err
-}
-
-// increment little-endian encoded unsigned integer b. Wrap around on overflow.
-func increment(b []byte) {
-	for i := range b {
-		b[i]++
-		if b[i] != 0 {
-			return
-		}
-	}
-}
-
-type streamConn struct {
-	net.Conn
-	Cipher
-	r *reader
-	w *writer
-}
-
-func (c *streamConn) initReader() error {
-	salt := make([]byte, c.SaltSize())
-	if _, err := io.ReadFull(c.Conn, salt); err != nil {
-		return err
-	}
-
-	aead, err := c.Decrypter(salt)
-	if err != nil {
-		return err
-	}
-
-	c.r = newReader(c.Conn, aead)
-	return nil
-}
-
-func (c *streamConn) Read(b []byte) (int, error) {
-	if c.r == nil {
-		if err := c.initReader(); err != nil {
-			return 0, err
-		}
-	}
-	return c.r.Read(b)
-}
-
-func (c *streamConn) WriteTo(w io.Writer) (int64, error) {
-	if c.r == nil {
-		if err := c.initReader(); err != nil {
-			return 0, err
-		}
-	}
-	return c.r.WriteTo(w)
-}
-
-func (c *streamConn) initWriter() error {
-	salt := make([]byte, c.SaltSize())
-	if _, err := io.ReadFull(rand.Reader, salt); err != nil {
-		return err
-	}
-	aead, err := c.Encrypter(salt)
-	if err != nil {
-		return err
-	}
-	_, err = c.Conn.Write(salt)
-	if err != nil {
-		return err
-	}
-	c.w = newWriter(c.Conn, aead)
-	return nil
-}
-
-func (c *streamConn) Write(b []byte) (int, error) {
-	if c.w == nil {
-		if err := c.initWriter(); err != nil {
-			return 0, err
-		}
-	}
-	return c.w.Write(b)
-}
-
-func (c *streamConn) ReadFrom(r io.Reader) (int64, error) {
-	if c.w == nil {
-		if err := c.initWriter(); err != nil {
-			return 0, err
-		}
-	}
-	return c.w.ReadFrom(r)
-}
-
-// NewConn wraps a stream-oriented net.Conn with cipher.
-func NewConn(c net.Conn, ciph Cipher) net.Conn { return &streamConn{Conn: c, Cipher: ciph} }
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go
deleted file mode 100644
index dea233e..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go
+++ /dev/null
@@ -1,91 +0,0 @@
-package shadowstream
-
-import (
-	"crypto/aes"
-	"crypto/cipher"
-	"strconv"
-
-	"github.com/Yawning/chacha20"
-)
-
-// Cipher generates a pair of stream ciphers for encryption and decryption.
-type Cipher interface {
-	IVSize() int
-	Encrypter(iv []byte) cipher.Stream
-	Decrypter(iv []byte) cipher.Stream
-}
-
-type KeySizeError int
-
-func (e KeySizeError) Error() string {
-	return "key size error: need " + strconv.Itoa(int(e)) + " bytes"
-}
-
-// CTR mode
-type ctrStream struct{ cipher.Block }
-
-func (b *ctrStream) IVSize() int                       { return b.BlockSize() }
-func (b *ctrStream) Decrypter(iv []byte) cipher.Stream { return b.Encrypter(iv) }
-func (b *ctrStream) Encrypter(iv []byte) cipher.Stream { return cipher.NewCTR(b, iv) }
-
-func AESCTR(key []byte) (Cipher, error) {
-	blk, err := aes.NewCipher(key)
-	if err != nil {
-		return nil, err
-	}
-	return &ctrStream{blk}, nil
-}
-
-// CFB mode
-type cfbStream struct{ cipher.Block }
-
-func (b *cfbStream) IVSize() int                       { return b.BlockSize() }
-func (b *cfbStream) Decrypter(iv []byte) cipher.Stream { return cipher.NewCFBDecrypter(b, iv) }
-func (b *cfbStream) Encrypter(iv []byte) cipher.Stream { return cipher.NewCFBEncrypter(b, iv) }
-
-func AESCFB(key []byte) (Cipher, error) {
-	blk, err := aes.NewCipher(key)
-	if err != nil {
-		return nil, err
-	}
-	return &cfbStream{blk}, nil
-}
-
-// IETF-variant of chacha20
-type chacha20ietfkey []byte
-
-func (k chacha20ietfkey) IVSize() int                       { return chacha20.INonceSize }
-func (k chacha20ietfkey) Decrypter(iv []byte) cipher.Stream { return k.Encrypter(iv) }
-func (k chacha20ietfkey) Encrypter(iv []byte) cipher.Stream {
-	ciph, err := chacha20.NewCipher(k, iv)
-	if err != nil {
-		panic(err) // should never happen
-	}
-	return ciph
-}
-
-func Chacha20IETF(key []byte) (Cipher, error) {
-	if len(key) != chacha20.KeySize {
-		return nil, KeySizeError(chacha20.KeySize)
-	}
-	return chacha20ietfkey(key), nil
-}
-
-type xchacha20key []byte
-
-func (k xchacha20key) IVSize() int                       { return chacha20.XNonceSize }
-func (k xchacha20key) Decrypter(iv []byte) cipher.Stream { return k.Encrypter(iv) }
-func (k xchacha20key) Encrypter(iv []byte) cipher.Stream {
-	ciph, err := chacha20.NewCipher(k, iv)
-	if err != nil {
-		panic(err) // should never happen
-	}
-	return ciph
-}
-
-func Xchacha20(key []byte) (Cipher, error) {
-	if len(key) != chacha20.KeySize {
-		return nil, KeySizeError(chacha20.KeySize)
-	}
-	return xchacha20key(key), nil
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go
deleted file mode 100644
index 4c0897a..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go
+++ /dev/null
@@ -1,2 +0,0 @@
-// Package shadowstream implements the original Shadowsocks protocol protected by stream cipher.
-package shadowstream
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go
deleted file mode 100644
index 8bbb27b..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go
+++ /dev/null
@@ -1,76 +0,0 @@
-package shadowstream
-
-import (
-	"crypto/rand"
-	"errors"
-	"io"
-	"net"
-	"sync"
-)
-
-// ErrShortPacket means the packet is too short to be a valid encrypted packet.
-var ErrShortPacket = errors.New("short packet")
-
-// Pack encrypts plaintext using stream cipher s and a random IV.
-// Returns a slice of dst containing random IV and ciphertext.
-// Ensure len(dst) >= s.IVSize() + len(plaintext).
-func Pack(dst, plaintext []byte, s Cipher) ([]byte, error) {
-	if len(dst) < s.IVSize()+len(plaintext) {
-		return nil, io.ErrShortBuffer
-	}
-	iv := dst[:s.IVSize()]
-	_, err := io.ReadFull(rand.Reader, iv)
-	if err != nil {
-		return nil, err
-	}
-
-	s.Encrypter(iv).XORKeyStream(dst[len(iv):], plaintext)
-	return dst[:len(iv)+len(plaintext)], nil
-}
-
-// Unpack decrypts pkt using stream cipher s.
-// Returns a slice of dst containing decrypted plaintext.
-func Unpack(dst, pkt []byte, s Cipher) ([]byte, error) {
-	if len(pkt) < s.IVSize() {
-		return nil, ErrShortPacket
-	}
-
-	if len(dst) < len(pkt)-s.IVSize() {
-		return nil, io.ErrShortBuffer
-	}
-	iv := pkt[:s.IVSize()]
-	s.Decrypter(iv).XORKeyStream(dst, pkt[len(iv):])
-	return dst[:len(pkt)-len(iv)], nil
-}
-
-type packetConn struct {
-	net.PacketConn
-	Cipher
-	buf        []byte
-	sync.Mutex // write lock
-}
-
-// NewPacketConn wraps a net.PacketConn with stream cipher encryption/decryption.
-func NewPacketConn(c net.PacketConn, ciph Cipher) net.PacketConn {
-	return &packetConn{PacketConn: c, Cipher: ciph, buf: make([]byte, 64*1024)}
-}
-
-func (c *packetConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	c.Lock()
-	defer c.Unlock()
-	buf, err := Pack(c.buf, b, c.Cipher)
-	if err != nil {
-		return 0, err
-	}
-	_, err = c.PacketConn.WriteTo(buf, addr)
-	return len(b), err
-}
-
-func (c *packetConn) ReadFrom(b []byte) (int, net.Addr, error) {
-	n, addr, err := c.PacketConn.ReadFrom(b)
-	if err != nil {
-		return n, addr, err
-	}
-	b, err = Unpack(b, b[:n], c.Cipher)
-	return len(b), addr, err
-}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go
deleted file mode 100644
index eb4d967..0000000
--- a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go
+++ /dev/null
@@ -1,171 +0,0 @@
-package shadowstream
-
-import (
-	"bytes"
-	"crypto/cipher"
-	"crypto/rand"
-	"io"
-	"net"
-)
-
-const bufSize = 32 * 1024
-
-type writer struct {
-	io.Writer
-	cipher.Stream
-	buf []byte
-}
-
-// NewWriter wraps an io.Writer with stream cipher encryption.
-func NewWriter(w io.Writer, s cipher.Stream) io.Writer {
-	return &writer{Writer: w, Stream: s, buf: make([]byte, bufSize)}
-}
-
-func (w *writer) ReadFrom(r io.Reader) (n int64, err error) {
-	for {
-		buf := w.buf
-		nr, er := r.Read(buf)
-		if nr > 0 {
-			n += int64(nr)
-			buf = buf[:nr]
-			w.XORKeyStream(buf, buf)
-			_, ew := w.Writer.Write(buf)
-			if ew != nil {
-				err = ew
-				return
-			}
-		}
-
-		if er != nil {
-			if er != io.EOF { // ignore EOF as per io.ReaderFrom contract
-				err = er
-			}
-			return
-		}
-	}
-}
-
-func (w *writer) Write(b []byte) (int, error) {
-	n, err := w.ReadFrom(bytes.NewBuffer(b))
-	return int(n), err
-}
-
-type reader struct {
-	io.Reader
-	cipher.Stream
-	buf []byte
-}
-
-// NewReader wraps an io.Reader with stream cipher decryption.
-func NewReader(r io.Reader, s cipher.Stream) io.Reader {
-	return &reader{Reader: r, Stream: s, buf: make([]byte, bufSize)}
-}
-
-func (r *reader) Read(b []byte) (int, error) {
-
-	n, err := r.Reader.Read(b)
-	if err != nil {
-		return 0, err
-	}
-	b = b[:n]
-	r.XORKeyStream(b, b)
-	return n, nil
-}
-
-func (r *reader) WriteTo(w io.Writer) (n int64, err error) {
-	for {
-		buf := r.buf
-		nr, er := r.Read(buf)
-		if nr > 0 {
-			nw, ew := w.Write(buf[:nr])
-			n += int64(nw)
-
-			if ew != nil {
-				err = ew
-				return
-			}
-		}
-
-		if er != nil {
-			if er != io.EOF { // ignore EOF as per io.Copy contract (using src.WriteTo shortcut)
-				err = er
-			}
-			return
-		}
-	}
-}
-
-type conn struct {
-	net.Conn
-	Cipher
-	r *reader
-	w *writer
-}
-
-// NewConn wraps a stream-oriented net.Conn with stream cipher encryption/decryption.
-func NewConn(c net.Conn, ciph Cipher) net.Conn {
-	return &conn{Conn: c, Cipher: ciph}
-}
-
-func (c *conn) initReader() error {
-	if c.r == nil {
-		buf := make([]byte, bufSize)
-		iv := buf[:c.IVSize()]
-		if _, err := io.ReadFull(c.Conn, iv); err != nil {
-			return err
-		}
-		c.r = &reader{Reader: c.Conn, Stream: c.Decrypter(iv), buf: buf}
-	}
-	return nil
-}
-
-func (c *conn) Read(b []byte) (int, error) {
-	if c.r == nil {
-		if err := c.initReader(); err != nil {
-			return 0, err
-		}
-	}
-	return c.r.Read(b)
-}
-
-func (c *conn) WriteTo(w io.Writer) (int64, error) {
-	if c.r == nil {
-		if err := c.initReader(); err != nil {
-			return 0, err
-		}
-	}
-	return c.r.WriteTo(w)
-}
-
-func (c *conn) initWriter() error {
-	if c.w == nil {
-		buf := make([]byte, bufSize)
-		iv := buf[:c.IVSize()]
-		if _, err := io.ReadFull(rand.Reader, iv); err != nil {
-			return err
-		}
-		if _, err := c.Conn.Write(iv); err != nil {
-			return err
-		}
-		c.w = &writer{Writer: c.Conn, Stream: c.Encrypter(iv), buf: buf}
-	}
-	return nil
-}
-
-func (c *conn) Write(b []byte) (int, error) {
-	if c.w == nil {
-		if err := c.initWriter(); err != nil {
-			return 0, err
-		}
-	}
-	return c.w.Write(b)
-}
-
-func (c *conn) ReadFrom(r io.Reader) (int64, error) {
-	if c.w == nil {
-		if err := c.initWriter(); err != nil {
-			return 0, err
-		}
-	}
-	return c.w.ReadFrom(r)
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/LICENSE b/vendor/github.com/sun8911879/shadowsocksR/LICENSE
deleted file mode 100644
index bed05e0..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2018 YanXin Sun
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/vendor/github.com/sun8911879/shadowsocksR/README.md b/vendor/github.com/sun8911879/shadowsocksR/README.md
deleted file mode 100644
index 93938b4..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/README.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# shadowsocksR
-
-[shadowsocksR](https://github.com/sun8911879/shadowsocksR) is a shadowsocksR for Go library
-
-* shadowsocksR is based on [avege](https://github.com/avege/avege) and [shadowsocksR for Python](https://github.com/shadowsocksr-backup/shadowsocksr) changes. 
-* Repair avege SSR communication BUG and streamline version. Is a normal use version.
-
-#### Use
-
-```go
-bi := &BackendInfo{
-	Address: "www.domain.com:445",
-	Type:    "ssr",
-	SSInfo: SSInfo{
-		EncryptMethod:   "aes-128-cfb",
-		EncryptPassword: "password",
-		SSRInfo: SSRInfo{
-			Protocol:      "auth_aes128_sha1",
-			ProtocolParam: "",
-			Obfs:          "tls1.2_ticket_auth",
-			ObfsParam:     "",
-		},
-	},
-}
-dst, err := bi.DialSSRConn(rawaddr)
-bi.Pipe(src, dst)
-bi.Pipe(dst, src)
-```
-
-See 'example/client.go' for detailed usage.
-
-#### SS Encrypting algorithm
-
-* aes-128-cfb
-* aes-192-cfb
-* aes-256-cfb
-* aes-128-ctr
-* aes-192-ctr
-* aes-256-ctr
-* aes-128-ofb
-* aes-192-ofb
-* aes-256-ofb
-* des-cfb
-* bf-cfb
-* cast5-cfb
-* rc4-md5
-* chacha20
-* chacha20-ietf
-* salsa20
-* camellia-128-cfb
-* camellia-192-cfb
-* camellia-256-cfb
-* idea-cfb
-* rc2-cfb
-* seed-cfb
-
-#### SSR Obfs
-
-* plain
-* http_simple
-* http_post
-* random_head
-* tls1.2_ticket_auth
-
-#### SSR Protocol
-
-* origin
-* verify_sha1 aka. one time auth(OTA)
-* auth_sha1_v4
-* auth_aes128_md5
-* auth_aes128_sha1
-
-## Todo (help wanted)
-
-* Optimize performance
-
-### Thanks avege project
-* [avege](https://github.com/avege/avege)
-
-### Reference
-* [avege](https://github.com/avege/avege)
-* [shadowsocks-go](https://github.com/shadowsocks/shadowsocks-go)
-* [go-shadowsocks2](https://github.com/shadowsocks/go-shadowsocks2)
-* [ShadowsocksR](https://github.com/shadowsocksr-backup/shadowsocksr)
\ No newline at end of file
diff --git a/vendor/github.com/sun8911879/shadowsocksR/client.go b/vendor/github.com/sun8911879/shadowsocksR/client.go
deleted file mode 100644
index 2c3d7ad..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/client.go
+++ /dev/null
@@ -1,61 +0,0 @@
-package shadowsocksr
-
-import (
-	"errors"
-	"net"
-	"net/url"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/sun8911879/shadowsocksR/obfs"
-	"github.com/sun8911879/shadowsocksR/protocol"
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-func NewSSRClient(u *url.URL) (*SSTCPConn, error) {
-	query := u.Query()
-	encryptMethod := query.Get("encrypt-method")
-	encryptKey := query.Get("encrypt-key")
-	cipher, err := NewStreamCipher(encryptMethod, encryptKey)
-	if err != nil {
-		return nil, err
-	}
-
-	dialer := net.Dialer{
-		Timeout:   time.Millisecond * 500,
-		DualStack: true,
-	}
-	conn, err := dialer.Dial("tcp", u.Host)
-	if err != nil {
-		return nil, err
-	}
-
-	ssconn := NewSSTCPConn(conn, cipher)
-	if ssconn.Conn == nil || ssconn.RemoteAddr() == nil {
-		return nil, errors.New("nil connection")
-	}
-
-	// should initialize obfs/protocol now
-	rs := strings.Split(ssconn.RemoteAddr().String(), ":")
-	port, _ := strconv.Atoi(rs[1])
-
-	ssconn.IObfs = obfs.NewObfs(query.Get("obfs"))
-	obfsServerInfo := &ssr.ServerInfoForObfs{
-		Host:   rs[0],
-		Port:   uint16(port),
-		TcpMss: 1460,
-		Param:  query.Get("obfs-param"),
-	}
-	ssconn.IObfs.SetServerInfo(obfsServerInfo)
-	ssconn.IProtocol = protocol.NewProtocol(query.Get("protocol"))
-	protocolServerInfo := &ssr.ServerInfoForObfs{
-		Host:   rs[0],
-		Port:   uint16(port),
-		TcpMss: 1460,
-		Param:  query.Get("protocol-param"),
-	}
-	ssconn.IProtocol.SetServerInfo(protocolServerInfo)
-
-	return ssconn, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/encrypt.go b/vendor/github.com/sun8911879/shadowsocksR/encrypt.go
deleted file mode 100644
index 3f5e6b9..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/encrypt.go
+++ /dev/null
@@ -1,296 +0,0 @@
-package shadowsocksr
-
-import (
-	"crypto/aes"
-	"crypto/cipher"
-	"crypto/des"
-	"crypto/md5"
-	"crypto/rand"
-	"crypto/rc4"
-	"encoding/binary"
-	"errors"
-
-	"github.com/sun8911879/shadowsocksR/tools"
-	"github.com/sun8911879/shadowsocksR/tools/leakybuf"
-
-	"github.com/Yawning/chacha20"
-	"github.com/dgryski/go-camellia"
-	"github.com/dgryski/go-idea"
-	"github.com/dgryski/go-rc2"
-	"golang.org/x/crypto/blowfish"
-	"golang.org/x/crypto/cast5"
-	"golang.org/x/crypto/salsa20/salsa"
-)
-
-var errEmptyPassword = errors.New("empty key")
-
-type DecOrEnc int
-
-const (
-	Decrypt DecOrEnc = iota
-	Encrypt
-)
-
-func newCTRStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	if err != nil {
-		return nil, err
-	}
-	return cipher.NewCTR(block, iv), nil
-}
-
-func newAESCTRStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := aes.NewCipher(key)
-	return newCTRStream(block, err, key, iv, doe)
-}
-
-func newOFBStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	if err != nil {
-		return nil, err
-	}
-	return cipher.NewCTR(block, iv), nil
-}
-
-func newAESOFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := aes.NewCipher(key)
-	return newOFBStream(block, err, key, iv, doe)
-}
-
-func newCFBStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	if err != nil {
-		return nil, err
-	}
-	if doe == Encrypt {
-		return cipher.NewCFBEncrypter(block, iv), nil
-	} else {
-		return cipher.NewCFBDecrypter(block, iv), nil
-	}
-}
-
-func newAESCFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := aes.NewCipher(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newDESStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := des.NewCipher(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newBlowFishStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := blowfish.NewCipher(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newCast5Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := cast5.NewCipher(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newRC4MD5Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
-	h := md5.New()
-	h.Write(key)
-	h.Write(iv)
-	rc4key := h.Sum(nil)
-
-	return rc4.NewCipher(rc4key)
-}
-
-func newChaCha20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
-	return chacha20.NewCipher(key, iv)
-}
-
-func newChacha20IETFStream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
-	return chacha20.NewCipher(key, iv)
-}
-
-type salsaStreamCipher struct {
-	nonce   [8]byte
-	key     [32]byte
-	counter int
-}
-
-func (c *salsaStreamCipher) XORKeyStream(dst, src []byte) {
-	var buf []byte
-	padLen := c.counter % 64
-	dataSize := len(src) + padLen
-	if cap(dst) >= dataSize {
-		buf = dst[:dataSize]
-	} else if leakybuf.GlobalLeakyBufSize >= dataSize {
-		buf = leakybuf.GlobalLeakyBuf.Get()
-		defer leakybuf.GlobalLeakyBuf.Put(buf)
-		buf = buf[:dataSize]
-	} else {
-		buf = make([]byte, dataSize)
-	}
-
-	var subNonce [16]byte
-	copy(subNonce[:], c.nonce[:])
-	binary.LittleEndian.PutUint64(subNonce[len(c.nonce):], uint64(c.counter/64))
-
-	// It's difficult to avoid data copy here. src or dst maybe slice from
-	// Conn.Read/Write, which can't have padding.
-	copy(buf[padLen:], src[:])
-	salsa.XORKeyStream(buf, buf, &subNonce, &c.key)
-	copy(dst, buf[padLen:])
-
-	c.counter += len(src)
-}
-
-func newSalsa20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
-	var c salsaStreamCipher
-	copy(c.nonce[:], iv[:8])
-	copy(c.key[:], key[:32])
-	return &c, nil
-}
-
-func newCamelliaStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := camellia.New(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newIdeaStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := idea.NewCipher(key)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newRC2Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	block, err := rc2.New(key, 16)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-func newSeedStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
-	// TODO: SEED block cipher implementation is required
-	block, err := rc2.New(key, 16)
-	return newCFBStream(block, err, key, iv, doe)
-}
-
-type cipherInfo struct {
-	keyLen    int
-	ivLen     int
-	newStream func(key, iv []byte, doe DecOrEnc) (cipher.Stream, error)
-}
-
-var streamCipherMethod = map[string]*cipherInfo{
-	"aes-128-cfb":      {16, 16, newAESCFBStream},
-	"aes-192-cfb":      {24, 16, newAESCFBStream},
-	"aes-256-cfb":      {32, 16, newAESCFBStream},
-	"aes-128-ctr":      {16, 16, newAESCTRStream},
-	"aes-192-ctr":      {24, 16, newAESCTRStream},
-	"aes-256-ctr":      {32, 16, newAESCTRStream},
-	"aes-128-ofb":      {16, 16, newAESOFBStream},
-	"aes-192-ofb":      {24, 16, newAESOFBStream},
-	"aes-256-ofb":      {32, 16, newAESOFBStream},
-	"des-cfb":          {8, 8, newDESStream},
-	"bf-cfb":           {16, 8, newBlowFishStream},
-	"cast5-cfb":        {16, 8, newCast5Stream},
-	"rc4-md5":          {16, 16, newRC4MD5Stream},
-	"rc4-md5-6":        {16, 6, newRC4MD5Stream},
-	"chacha20":         {32, 8, newChaCha20Stream},
-	"chacha20-ietf":    {32, 12, newChacha20IETFStream},
-	"salsa20":          {32, 8, newSalsa20Stream},
-	"camellia-128-cfb": {16, 16, newCamelliaStream},
-	"camellia-192-cfb": {24, 16, newCamelliaStream},
-	"camellia-256-cfb": {32, 16, newCamelliaStream},
-	"idea-cfb":         {16, 8, newIdeaStream},
-	"rc2-cfb":          {16, 8, newRC2Stream},
-	"seed-cfb":         {16, 8, newSeedStream},
-}
-
-func CheckCipherMethod(method string) error {
-	if method == "" {
-		method = "rc4-md5"
-	}
-	_, ok := streamCipherMethod[method]
-	if !ok {
-		return errors.New("Unsupported encryption method: " + method)
-	}
-	return nil
-}
-
-type StreamCipher struct {
-	enc  cipher.Stream
-	dec  cipher.Stream
-	key  []byte
-	info *cipherInfo
-	iv   []byte
-}
-
-// NewStreamCipher creates a cipher that can be used in Dial() etc.
-// Use cipher.Copy() to create a new cipher with the same method and password
-// to avoid the cost of repeated cipher initialization.
-func NewStreamCipher(method, password string) (c *StreamCipher, err error) {
-	if password == "" {
-		return nil, errEmptyPassword
-	}
-	if method == "" {
-		method = "rc4-md5"
-	}
-	mi, ok := streamCipherMethod[method]
-	if !ok {
-		return nil, errors.New("Unsupported encryption method: " + method)
-	}
-
-	key := tools.EVPBytesToKey(password, mi.keyLen)
-
-	c = &StreamCipher{key: key, info: mi}
-
-	if err != nil {
-		return nil, err
-	}
-	return c, nil
-}
-
-// Initializes the block cipher with CFB mode, returns IV.
-func (c *StreamCipher) initEncrypt() (iv []byte, err error) {
-	if c.iv == nil {
-		iv = make([]byte, c.info.ivLen)
-		rand.Read(iv)
-		c.iv = iv
-	} else {
-		iv = c.iv
-	}
-	c.enc, err = c.info.newStream(c.key, iv, Encrypt)
-	return
-}
-
-func (c *StreamCipher) initDecrypt(iv []byte) (err error) {
-	c.dec, err = c.info.newStream(c.key, iv, Decrypt)
-	return
-}
-
-func (c *StreamCipher) encrypt(dst, src []byte) {
-	c.enc.XORKeyStream(dst, src)
-}
-
-func (c *StreamCipher) decrypt(dst, src []byte) {
-	c.dec.XORKeyStream(dst, src)
-}
-
-// Copy creates a new cipher at it's initial state.
-func (c *StreamCipher) Copy() *StreamCipher {
-	// This optimization maybe not necessary. But without this function, we
-	// need to maintain a table cache for newTableCipher and use lock to
-	// protect concurrent access to that cache.
-
-	// AES and DES ciphers does not return specific types, so it's difficult
-	// to create copy. But their initialization time is less than 4000ns on my
-	// 2.26 GHz Intel Core 2 Duo processor. So no need to worry.
-
-	// Currently, blow-fish and cast5 initialization cost is an order of
-	// magnitude slower than other ciphers. (I'm not sure whether this is
-	// because the current implementation is not highly optimized, or this is
-	// the nature of the algorithm.)
-
-	nc := *c
-	nc.enc = nil
-	nc.dec = nil
-	return &nc
-}
-
-func (c *StreamCipher) Key() (key []byte, keyLen int) {
-	return c.key, c.info.keyLen
-}
-
-func (c *StreamCipher) IV() ([]byte, int) {
-	return c.iv, c.info.ivLen
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go
deleted file mode 100644
index a57732e..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go
+++ /dev/null
@@ -1,35 +0,0 @@
-package obfs
-
-import (
-	"strings"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-type creator func() IObfs
-
-var (
-	creatorMap = make(map[string]creator)
-)
-
-type IObfs interface {
-	SetServerInfo(s *ssr.ServerInfoForObfs)
-	GetServerInfo() (s *ssr.ServerInfoForObfs)
-	Encode(data []byte) ([]byte, error)
-	Decode(data []byte) ([]byte, uint64, error)
-	SetData(data interface{})
-	GetData() interface{}
-}
-
-func register(name string, c creator) {
-	creatorMap[name] = c
-}
-
-// NewObfs create an obfs object by name and return as an IObfs interface
-func NewObfs(name string) IObfs {
-	c, ok := creatorMap[strings.ToLower(name)]
-	if ok {
-		return c()
-	}
-	return nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go
deleted file mode 100644
index 46513d7..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package obfs
-
-import (
-	"math/rand"
-)
-
-func init() {
-	register("http_post", newHttpPost)
-}
-
-// newHttpPost create a http_post object
-func newHttpPost() IObfs {
-	// newHttpSimple create a http_simple object
-	t := &httpSimplePost{
-		userAgentIndex: rand.Intn(len(requestUserAgent)),
-		getOrPost:      false,
-	}
-	return t
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go
deleted file mode 100644
index 521a3e9..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go
+++ /dev/null
@@ -1,178 +0,0 @@
-package obfs
-
-import (
-	"bytes"
-	"encoding/hex"
-	"fmt"
-	"math/rand"
-	"strings"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-var (
-	requestPath = []string{
-		"", "",
-		"login.php?redir=", "",
-		"register.php?code=", "",
-		"s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&bar=&wd=", "&rn=",
-		"post.php?id=", "&goto=view.php",
-	}
-	requestUserAgent = []string{
-		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
-		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/44.0",
-		"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
-		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
-		"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
-		"Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)",
-		"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-		"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C)",
-		"Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
-		"Mozilla/5.0 (Linux; Android 4.4; Nexus 5 Build/BuildID) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
-		"Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
-		"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
-	}
-)
-
-// HttpSimple http_simple obfs encapsulate
-type httpSimplePost struct {
-	ssr.ServerInfoForObfs
-	rawTransSent     bool
-	rawTransReceived bool
-	userAgentIndex   int
-	getOrPost        bool // true for get, false for post
-}
-
-func init() {
-	register("http_simple", newHttpSimple)
-}
-
-// newHttpSimple create a http_simple object
-func newHttpSimple() IObfs {
-	t := &httpSimplePost{
-		rawTransSent:     false,
-		rawTransReceived: false,
-		userAgentIndex:   rand.Intn(len(requestUserAgent)),
-		getOrPost:        true,
-	}
-	return t
-}
-
-func (t *httpSimplePost) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	t.ServerInfoForObfs = *s
-}
-
-func (t *httpSimplePost) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &t.ServerInfoForObfs
-}
-
-func (t *httpSimplePost) SetData(data interface{}) {
-
-}
-
-func (t *httpSimplePost) GetData() interface{} {
-	return nil
-}
-
-func (t *httpSimplePost) boundary() (ret string) {
-	set := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
-	for i := 0; i < 32; i++ {
-		ret = fmt.Sprintf("%s%c", ret, set[rand.Intn(len(set))])
-	}
-	return
-}
-
-func (t *httpSimplePost) data2URLEncode(data []byte) (ret string) {
-	for i := 0; i < len(data); i++ {
-		ret = fmt.Sprintf("%s%%%s", ret, hex.EncodeToString([]byte{data[i]}))
-	}
-	return
-}
-
-func (t *httpSimplePost) Encode(data []byte) (encodedData []byte, err error) {
-	if t.rawTransSent {
-		return data, nil
-	}
-
-	dataLength := len(data)
-	var headData []byte
-	if headSize := t.IVLen + t.HeadLen; dataLength-headSize > 64 {
-		headData = make([]byte, headSize+rand.Intn(64))
-	} else {
-		headData = make([]byte, dataLength)
-	}
-	copy(headData, data[0:len(headData)])
-	requestPathIndex := rand.Intn(len(requestPath)/2) * 2
-	host := t.Host
-	var customHead string
-
-	if len(t.Param) > 0 {
-		customHeads := strings.Split(t.Param, "#")
-		if len(customHeads) > 2 {
-			customHeads = customHeads[0:2]
-		}
-		param := t.Param
-		if len(customHeads) > 1 {
-			customHead = customHeads[1]
-			param = customHeads[0]
-		}
-		hosts := strings.Split(param, ",")
-		if len(hosts) > 0 {
-			host = strings.TrimSpace(hosts[rand.Intn(len(hosts))])
-		}
-	}
-	method := "GET /"
-	if !t.getOrPost {
-		method = "POST /"
-	}
-	httpBuf := fmt.Sprintf("%s%s%s%s HTTP/1.1\r\nHost: %s:%d\r\n",
-		method,
-		requestPath[requestPathIndex],
-		t.data2URLEncode(headData),
-		requestPath[requestPathIndex+1],
-		host,
-		t.Port)
-	if len(customHead) > 0 {
-		httpBuf = httpBuf + strings.Replace(customHead, "\\n", "\r\n", -1) + "\r\n\r\n"
-	} else {
-		var contentType string
-		if !t.getOrPost {
-			contentType = "Content-Type: multipart/form-data; boundary=" + t.boundary() + "\r\n"
-		}
-		httpBuf = httpBuf +
-			"User-Agent: " + requestUserAgent[t.userAgentIndex] + "\r\n" +
-			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" +
-			"Accept-Language: en-US,en;q=0.8\r\n" +
-			"Accept-Encoding: gzip, deflate\r\n" +
-			contentType +
-			"DNT: 1\r\n" +
-			"Connection: keep-alive\r\n" +
-			"\r\n"
-	}
-
-	if len(headData) < dataLength {
-		encodedData = make([]byte, len(httpBuf)+(dataLength-len(headData)))
-		copy(encodedData, []byte(httpBuf))
-		copy(encodedData[len(httpBuf):], data[len(headData):])
-	} else {
-		encodedData = []byte(httpBuf)
-	}
-	t.rawTransSent = true
-
-	return
-}
-
-func (t *httpSimplePost) Decode(data []byte) ([]byte, uint64, error) {
-	if t.rawTransReceived {
-		return data, 0, nil
-	}
-
-	pos := bytes.Index(data, []byte("\r\n\r\n"))
-	if pos > 0 {
-		decodedData := make([]byte, len(data)-pos-4)
-		copy(decodedData, data[pos+4:])
-		t.rawTransReceived = true
-		return decodedData, 0, nil
-	}
-	return nil, 0, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go
deleted file mode 100644
index 4ee2682..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go
+++ /dev/null
@@ -1,42 +0,0 @@
-package obfs
-
-import (
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-func init() {
-	register("plain", newPlainObfs)
-}
-
-type plain struct {
-	ssr.ServerInfoForObfs
-}
-
-func newPlainObfs() IObfs {
-	p := &plain{}
-	return p
-}
-
-func (p *plain) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	p.ServerInfoForObfs = *s
-}
-
-func (p *plain) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &p.ServerInfoForObfs
-}
-
-func (p *plain) Encode(data []byte) (encodedData []byte, err error) {
-	return data, nil
-}
-
-func (p *plain) Decode(data []byte) ([]byte, uint64, error) {
-	return data, 0, nil
-}
-
-func (p *plain) SetData(data interface{}) {
-
-}
-
-func (p *plain) GetData() interface{} {
-	return nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go
deleted file mode 100644
index 6e0366f..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go
+++ /dev/null
@@ -1,79 +0,0 @@
-package obfs
-
-import (
-	"math/rand"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-type randomHead struct {
-	ssr.ServerInfoForObfs
-	rawTransSent     bool
-	rawTransReceived bool
-	hasSentHeader    bool
-	dataBuffer       []byte
-}
-
-func init() {
-	register("random_head", newRandomHead)
-}
-
-func newRandomHead() IObfs {
-	p := &randomHead{}
-	return p
-}
-
-func (r *randomHead) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	r.ServerInfoForObfs = *s
-}
-
-func (r *randomHead) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &r.ServerInfoForObfs
-}
-
-func (r *randomHead) SetData(data interface{}) {
-
-}
-
-func (r *randomHead) GetData() interface{} {
-	return nil
-}
-
-func (r *randomHead) Encode(data []byte) (encodedData []byte, err error) {
-	if r.rawTransSent {
-		return data, nil
-	}
-
-	dataLength := len(data)
-	if r.hasSentHeader {
-		if dataLength > 0 {
-			d := make([]byte, len(r.dataBuffer)+dataLength)
-			copy(d, r.dataBuffer)
-			copy(d[len(r.dataBuffer):], data)
-			r.dataBuffer = d
-		} else {
-			encodedData = r.dataBuffer
-			r.dataBuffer = nil
-			r.rawTransSent = true
-		}
-	} else {
-		size := rand.Intn(96) + 8
-		encodedData = make([]byte, size)
-		rand.Read(encodedData)
-		ssr.SetCRC32(encodedData, size)
-
-		d := make([]byte, dataLength)
-		copy(d, data)
-		r.dataBuffer = d
-	}
-	r.hasSentHeader = true
-	return
-}
-
-func (r *randomHead) Decode(data []byte) ([]byte, uint64, error) {
-	if r.rawTransReceived {
-		return data, 0, nil
-	}
-	r.rawTransReceived = true
-	return data, 0, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go
deleted file mode 100644
index 847c2e6..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go
+++ /dev/null
@@ -1,279 +0,0 @@
-package obfs
-
-import (
-	"crypto/hmac"
-	"encoding/binary"
-	"fmt"
-	"math/rand"
-	"strings"
-	"time"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-	"github.com/sun8911879/shadowsocksR/tools"
-)
-
-func init() {
-	register("tls1.2_ticket_auth", newTLS12TicketAuth)
-}
-
-type tlsAuthData struct {
-	localClientID [32]byte
-}
-
-// tls12TicketAuth tls1.2_ticket_auth obfs encapsulate
-type tls12TicketAuth struct {
-	ssr.ServerInfoForObfs
-	data            *tlsAuthData
-	sendID          int
-	handshakeStatus int
-	sendBuffer      []byte
-}
-
-// newTLS12TicketAuth create a tlv1.2_ticket_auth object
-func newTLS12TicketAuth() IObfs {
-	return &tls12TicketAuth{}
-}
-
-func (t *tls12TicketAuth) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	t.ServerInfoForObfs = *s
-}
-
-func (t *tls12TicketAuth) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &t.ServerInfoForObfs
-}
-
-func (t *tls12TicketAuth) SetData(data interface{}) {
-	if auth, ok := data.(*tlsAuthData); ok {
-		t.data = auth
-	}
-}
-
-func (t *tls12TicketAuth) GetData() interface{} {
-	if t.data == nil {
-		t.data = &tlsAuthData{}
-		b := make([]byte, 32)
-		rand.Read(b)
-		copy(t.data.localClientID[:], b)
-	}
-	return t.data
-}
-
-func (t *tls12TicketAuth) getHost() string {
-	host := t.Host
-	if len(t.Param) > 0 {
-		hosts := strings.Split(t.Param, ",")
-		if len(hosts) > 0 {
-			host = hosts[rand.Intn(len(hosts))]
-			host = strings.TrimSpace(host)
-		}
-	}
-	if len(host) > 0 && host[len(host)-1] >= byte('0') && host[len(host)-1] <= byte('9') && len(t.Param) == 0 {
-		host = ""
-	}
-	return host
-}
-
-func (t *tls12TicketAuth) Encode(data []byte) (encodedData []byte, err error) {
-	if t.handshakeStatus == -1 {
-		return data, nil
-	}
-	dataLength := len(data)
-
-	if t.handshakeStatus == 8 {
-		encodedData = make([]byte, dataLength+4096)
-		start := 0
-		outLength := 0
-
-		for t.sendID <= 4 && dataLength-start > 256 {
-			length := rand.Intn(512) + 64
-			if length > dataLength-start {
-				length = dataLength - start
-			}
-			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
-			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
-			copy(encodedData[outLength+5:], data[start:start+length])
-			start += length
-			outLength += length + 5
-			t.sendID++
-		}
-		for dataLength-start > 2048 {
-			length := rand.Intn(3990) + 100
-			if length > dataLength-start {
-				length = dataLength - start
-			}
-			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
-			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
-			copy(encodedData[outLength+5:], data[start:start+length])
-			start += length
-			outLength += length + 5
-			t.sendID++
-		}
-		if dataLength-start > 0 {
-			length := dataLength - start
-			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
-			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
-			copy(encodedData[outLength+5:], data[start:start+length])
-			// not necessary to update variable *start* any more
-			outLength += length + 5
-			t.sendID++
-		}
-		encodedData = encodedData[:outLength]
-		return
-	}
-
-	if t.handshakeStatus == 1 {
-		//outLength := 0
-		if dataLength > 0 {
-			b := make([]byte, len(t.sendBuffer)+dataLength+5)
-			copy(b, t.sendBuffer)
-			copy(b[len(t.sendBuffer):], []byte{0x17, 0x3, 0x3})
-			binary.BigEndian.PutUint16(b[len(t.sendBuffer)+3:], uint16(dataLength&0xFFFF))
-			copy(b[len(t.sendBuffer)+5:], data)
-			t.sendBuffer = b
-			return []byte{}, nil
-		}
-
-		hmacData := make([]byte, 43)
-		rnd := make([]byte, 22)
-		rand.Read(rnd)
-
-		handshakeFinish := []byte("\x14\x03\x03\x00\x01\x01\x16\x03\x03\x00\x20")
-		copy(hmacData, handshakeFinish)
-		copy(hmacData[len(handshakeFinish):], rnd)
-
-		h := t.hmacSHA1(hmacData[:33])
-		copy(hmacData[33:], h)
-
-		encodedData = make([]byte, len(hmacData)+len(t.sendBuffer))
-		copy(encodedData, hmacData)
-		copy(encodedData[len(hmacData):], t.sendBuffer)
-		t.sendBuffer = nil
-		t.handshakeStatus = 8
-
-		return
-	}
-
-	rnd := t.packAuthData()
-
-	tlsData0 := []byte("\x00\x1c\xc0\x2b\xc0\x2f\xcc\xa9\xcc\xa8\xcc\x14\xcc\x13\xc0\x0a\xc0\x14\xc0\x09\xc0\x13\x00\x9c\x00\x35\x00\x2f\x00\x0a\x01\x00")
-	tlsData1 := []byte("\xff\x01\x00\x01\x00")
-	tlsData2 := []byte("\x00\x17\x00\x00\x00\x23\x00\xd0")
-	tlsData3 := []byte("\x00\x0d\x00\x16\x00\x14\x06\x01\x06\x03\x05\x01\x05\x03\x04\x01\x04\x03\x03\x01\x03\x03\x02\x01\x02\x03\x00\x05\x00\x05\x01\x00\x00\x00\x00\x00\x12\x00\x00\x75\x50\x00\x00\x00\x0b\x00\x02\x01\x00\x00\x0a\x00\x06\x00\x04\x00\x17\x00\x18" +
-		"\x00\x15\x00\x66\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00")
-
-	var sslBuf []byte
-	sslBuf = append(sslBuf, rnd...)
-	sslBuf = append(sslBuf, byte(32))
-	sslBuf = append(sslBuf, t.data.localClientID[:]...)
-	sslBuf = append(sslBuf, tlsData0...)
-
-	var extBuf []byte
-	extBuf = append(extBuf, tlsData1...)
-
-	host := t.getHost()
-
-	extBuf = append(extBuf, t.sni(host)...)
-	extBuf = append(extBuf, tlsData2...)
-	ticket := make([]byte, 208)
-	rand.Read(ticket)
-	extBuf = append(extBuf, ticket...)
-	extBuf = append(extBuf, tlsData3...)
-	extBuf = append([]byte{byte(len(extBuf) / 256), byte(len(extBuf) % 256)}, extBuf...)
-
-	sslBuf = append(sslBuf, extBuf...)
-	// client version
-	sslBuf = append([]byte{3, 3}, sslBuf...)
-	// length
-	sslBuf = append([]byte{1, 0, byte(len(sslBuf) / 256), byte(len(sslBuf) % 256)}, sslBuf...)
-	// length
-	sslBuf = append([]byte{byte(len(sslBuf) / 256), byte(len(sslBuf) % 256)}, sslBuf...)
-	// version
-	sslBuf = append([]byte{0x16, 3, 1}, sslBuf...)
-
-	encodedData = sslBuf
-
-	d := make([]byte, dataLength+5)
-	copy(d[0:], []byte{0x17, 0x3, 0x3})
-	binary.BigEndian.PutUint16(d[3:], uint16(dataLength&0xFFFF))
-	copy(d[5:], data)
-	b := make([]byte, len(t.sendBuffer)+len(d))
-	copy(b, t.sendBuffer)
-	copy(b[len(t.sendBuffer):], d)
-	t.sendBuffer = b
-
-	t.handshakeStatus = 1
-
-	return
-}
-
-func (t *tls12TicketAuth) Decode(data []byte) ([]byte, uint64, error) {
-	if t.handshakeStatus == -1 {
-		return data, 0, nil
-	}
-	dataLength := len(data)
-
-	if t.handshakeStatus == 8 {
-		if dataLength < 5 {
-			return nil, 5, fmt.Errorf("data need minimum length: 5 ,data only length: %d", dataLength)
-		}
-		if data[0] != 0x17 {
-			return nil, 0, ssr.ErrTLS12TicketAuthIncorrectMagicNumber
-		}
-		size := int(binary.BigEndian.Uint16(data[3:5]))
-		if size+5 > dataLength {
-			return nil, uint64(size + 5), fmt.Errorf("unexpected data length: %d ,data only length: %d", size+5, dataLength)
-		}
-		if dataLength == size+5 {
-			return data[5:], 0, nil
-		}
-		return data[5 : 5+size], uint64(size + 5), nil
-	}
-
-	if dataLength < 11+32+1+32 {
-		return nil, 0, ssr.ErrTLS12TicketAuthTooShortData
-	}
-
-	hash := t.hmacSHA1(data[11 : 11+22])
-
-	if !hmac.Equal(data[33:33+ssr.ObfsHMACSHA1Len], hash) {
-		return nil, 0, ssr.ErrTLS12TicketAuthHMACError
-	}
-	return nil, 1, nil
-}
-
-func (t *tls12TicketAuth) packAuthData() (outData []byte) {
-	outSize := 32
-	outData = make([]byte, outSize)
-
-	now := time.Now().Unix()
-	binary.BigEndian.PutUint32(outData[0:4], uint32(now))
-
-	rand.Read(outData[4 : 4+18])
-
-	hash := t.hmacSHA1(outData[:outSize-ssr.ObfsHMACSHA1Len])
-	copy(outData[outSize-ssr.ObfsHMACSHA1Len:], hash)
-
-	return
-}
-
-func (t *tls12TicketAuth) hmacSHA1(data []byte) []byte {
-	key := make([]byte, t.KeyLen+32)
-	copy(key, t.Key)
-	copy(key[t.KeyLen:], t.data.localClientID[:])
-
-	sha1Data := tools.HmacSHA1(key, data)
-	return sha1Data[:ssr.ObfsHMACSHA1Len]
-}
-
-func (t *tls12TicketAuth) sni(u string) []byte {
-	bURL := []byte(u)
-	length := len(bURL)
-	ret := make([]byte, length+9)
-	copy(ret[9:9+length], bURL)
-	binary.BigEndian.PutUint16(ret[7:], uint16(length&0xFFFF))
-	length += 3
-	binary.BigEndian.PutUint16(ret[4:], uint16(length&0xFFFF))
-	length += 2
-	binary.BigEndian.PutUint16(ret[2:], uint16(length&0xFFFF))
-	return ret
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go
deleted file mode 100644
index 818b78d..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go
+++ /dev/null
@@ -1,282 +0,0 @@
-package protocol
-
-import (
-	"bytes"
-	"crypto/aes"
-	"crypto/cipher"
-	"encoding/base64"
-	"encoding/binary"
-	"math/rand"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-	"github.com/sun8911879/shadowsocksR/tools"
-)
-
-type hmacMethod func(key []byte, data []byte) []byte
-type hashDigestMethod func(data []byte) []byte
-
-func init() {
-	register("auth_aes128_md5", NewAuthAES128MD5)
-}
-
-func NewAuthAES128MD5() IProtocol {
-	a := &authAES128{
-		salt:       "auth_aes128_md5",
-		hmac:       tools.HmacMD5,
-		hashDigest: tools.MD5Sum,
-		packID:     1,
-		recvInfo: recvInfo{
-			recvID: 1,
-			buffer: bytes.NewBuffer(nil),
-		},
-		data: &authData{
-			connectionID: 0xFF000001,
-		},
-	}
-	return a
-}
-
-type recvInfo struct {
-	recvID uint32
-	buffer *bytes.Buffer
-}
-
-type authAES128 struct {
-	ssr.ServerInfoForObfs
-	recvInfo
-	data          *authData
-	hasSentHeader bool
-	packID        uint32
-	userKey       []byte
-	salt          string
-	hmac          hmacMethod
-	hashDigest    hashDigestMethod
-}
-
-func (a *authAES128) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	a.ServerInfoForObfs = *s
-}
-
-func (a *authAES128) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &a.ServerInfoForObfs
-}
-
-func (a *authAES128) SetData(data interface{}) {
-	if auth, ok := data.(*authData); ok {
-		a.data = auth
-	}
-}
-
-func (a *authAES128) GetData() interface{} {
-	if a.data == nil {
-		a.data = &authData{}
-	}
-	return a.data
-}
-
-func (a *authAES128) packData(data []byte) (outData []byte) {
-	dataLength := len(data)
-	randLength := 1
-	if dataLength <= 1200 {
-		if a.packID > 4 {
-			randLength += rand.Intn(32)
-		} else {
-			if dataLength > 900 {
-				randLength += rand.Intn(128)
-			} else {
-				randLength += rand.Intn(512)
-			}
-		}
-	}
-
-	outLength := randLength + dataLength + 8
-	outData = make([]byte, outLength)
-	// 0~1, out length
-	binary.LittleEndian.PutUint16(outData[0:], uint16(outLength&0xFFFF))
-	// 2~3, hmac
-	key := make([]byte, len(a.userKey)+4)
-	copy(key, a.userKey)
-	binary.LittleEndian.PutUint32(key[len(key)-4:], a.packID)
-	h := a.hmac(key, outData[0:2])
-	copy(outData[2:4], h[:2])
-	// 4~rand length+4, rand number
-	rand.Read(outData[4 : 4+randLength])
-	// 4, rand length
-	if randLength < 128 {
-		outData[4] = byte(randLength & 0xFF)
-	} else {
-		// 4, magic number 0xFF
-		outData[4] = 0xFF
-		// 5~6, rand length
-		binary.LittleEndian.PutUint16(outData[5:], uint16(randLength&0xFFFF))
-	}
-	// rand length+4~out length-4, data
-	if dataLength > 0 {
-		copy(outData[randLength+4:], data)
-	}
-	a.packID++
-	h = a.hmac(key, outData[:outLength-4])
-	copy(outData[outLength-4:], h[:4])
-	return
-}
-
-func (a *authAES128) packAuthData(data []byte) (outData []byte) {
-	dataLength := len(data)
-	var randLength int
-	if dataLength > 400 {
-		randLength = rand.Intn(512)
-	} else {
-		randLength = rand.Intn(1024)
-	}
-
-	dataOffset := randLength + 16 + 4 + 4 + 7
-	outLength := dataOffset + dataLength + 4
-	outData = make([]byte, outLength)
-	encrypt := make([]byte, 24)
-	key := make([]byte, a.IVLen+a.KeyLen)
-	copy(key, a.IV)
-	copy(key[a.IVLen:], a.Key)
-
-	rand.Read(outData[dataOffset-randLength:])
-
-	if a.data.connectionID > 0xFF000000 {
-		a.data.clientID = nil
-	}
-	if len(a.data.clientID) == 0 {
-		a.data.clientID = make([]byte, 4)
-		rand.Read(a.data.clientID)
-		b := make([]byte, 4)
-		rand.Read(b)
-		a.data.connectionID = binary.LittleEndian.Uint32(b) & 0xFFFFFF
-	}
-	a.data.connectionID++
-	copy(encrypt[4:], a.data.clientID)
-	binary.LittleEndian.PutUint32(encrypt[8:], a.data.connectionID)
-
-	now := time.Now().Unix()
-	binary.LittleEndian.PutUint32(encrypt[0:4], uint32(now))
-
-	binary.LittleEndian.PutUint16(encrypt[12:], uint16(outLength&0xFFFF))
-	binary.LittleEndian.PutUint16(encrypt[14:], uint16(randLength&0xFFFF))
-
-	params := strings.Split(a.Param, ":")
-	uid := make([]byte, 4)
-	if len(params) >= 2 {
-		if userID, err := strconv.ParseUint(params[0], 10, 32); err != nil {
-			rand.Read(uid)
-		} else {
-			binary.LittleEndian.PutUint32(uid, uint32(userID))
-			a.userKey = a.hashDigest([]byte(params[1]))
-		}
-	} else {
-		rand.Read(uid)
-	}
-
-	if a.userKey == nil {
-		a.userKey = make([]byte, a.KeyLen)
-		copy(a.userKey, a.Key)
-	}
-
-	encryptKey := make([]byte, len(a.userKey))
-	copy(encryptKey, a.userKey)
-
-	aesCipherKey := tools.EVPBytesToKey(base64.StdEncoding.EncodeToString(encryptKey)+a.salt, 16)
-	block, err := aes.NewCipher(aesCipherKey)
-	if err != nil {
-		return nil
-	}
-
-	encryptData := make([]byte, 16)
-	iv := make([]byte, aes.BlockSize)
-	cbc := cipher.NewCBCEncrypter(block, iv)
-	cbc.CryptBlocks(encryptData, encrypt[0:16])
-	copy(encrypt[4:4+16], encryptData)
-	copy(encrypt[0:4], uid)
-
-	h := a.hmac(key, encrypt[0:20])
-	copy(encrypt[20:], h[:4])
-
-	rand.Read(outData[0:1])
-	h = a.hmac(key, outData[0:1])
-	copy(outData[1:], h[0:7-1])
-
-	copy(outData[7:], encrypt)
-	copy(outData[dataOffset:], data)
-
-	h = a.hmac(a.userKey, outData[0:outLength-4])
-	copy(outData[outLength-4:], h[:4])
-
-	return
-}
-
-func (a *authAES128) PreEncrypt(plainData []byte) (outData []byte, err error) {
-	dataLength := len(plainData)
-	offset := 0
-	if !a.hasSentHeader {
-		authLength := dataLength
-		if authLength > 1200 {
-			authLength = 1200
-		}
-		packData := a.packAuthData(plainData[:authLength])
-		a.hasSentHeader = true
-		outData = append(outData, packData...)
-		dataLength -= authLength
-		offset += authLength
-	}
-	const blockSize = 4096
-	for dataLength > blockSize {
-		packData := a.packData(plainData[offset : offset+blockSize])
-		outData = append(outData, packData...)
-		dataLength -= blockSize
-		offset += blockSize
-	}
-	if dataLength > 0 {
-		packData := a.packData(plainData[offset:])
-		outData = append(outData, packData...)
-	}
-
-	return
-}
-
-func (a *authAES128) PostDecrypt(plainData []byte) ([]byte, int, error) {
-	a.buffer.Reset()
-	plainLength := len(plainData)
-	datalength := plainLength
-	readlenth := 0
-	key := make([]byte, len(a.userKey)+4)
-	copy(key, a.userKey)
-	for plainLength > 4 {
-		binary.LittleEndian.PutUint32(key[len(key)-4:], a.recvID)
-
-		h := a.hmac(key, plainData[0:2])
-		if h[0] != plainData[2] || h[1] != plainData[3] {
-			return nil, 0, ssr.ErrAuthAES128HMACError
-		}
-		length := int(binary.LittleEndian.Uint16(plainData[0:2]))
-		if length >= 8192 || length < 8 {
-			return nil, 0, ssr.ErrAuthAES128DataLengthError
-		}
-		if length > plainLength {
-			break
-		}
-		a.recvID++
-		pos := int(plainData[4])
-		if pos < 255 {
-			pos += 4
-		} else {
-			pos = int(binary.LittleEndian.Uint16(plainData[5:7])) + 4
-		}
-
-		a.buffer.Write(plainData[pos : length-4])
-		plainData = plainData[length:]
-		plainLength -= length
-		readlenth += length
-	}
-	if datalength == readlenth {
-		readlenth = -1
-	}
-	return a.buffer.Bytes(), readlenth, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go
deleted file mode 100644
index c6c2b0e..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go
+++ /dev/null
@@ -1,28 +0,0 @@
-package protocol
-
-import (
-	"bytes"
-
-	"github.com/sun8911879/shadowsocksR/tools"
-)
-
-func init() {
-	register("auth_aes128_sha1", NewAuthAES128SHA1)
-}
-
-func NewAuthAES128SHA1() IProtocol {
-	a := &authAES128{
-		salt:       "auth_aes128_sha1",
-		hmac:       tools.HmacSHA1,
-		hashDigest: tools.SHA1Sum,
-		packID:     1,
-		recvInfo: recvInfo{
-			recvID: 1,
-			buffer: bytes.NewBuffer(nil),
-		},
-		data: &authData{
-			connectionID: 0xFF000001,
-		},
-	}
-	return a
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go
deleted file mode 100644
index ffec01b..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go
+++ /dev/null
@@ -1,232 +0,0 @@
-package protocol
-
-import (
-	"encoding/binary"
-	"math/rand"
-	"time"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-	"github.com/sun8911879/shadowsocksR/tools"
-)
-
-func init() {
-	register("auth_sha1_v4", NewAuthSHA1v4)
-}
-
-type authSHA1v4 struct {
-	ssr.ServerInfoForObfs
-	data             *authData
-	hasSentHeader    bool
-	recvBuffer       []byte
-	recvBufferLength int
-}
-
-func NewAuthSHA1v4() IProtocol {
-	a := &authSHA1v4{}
-	return a
-}
-
-func (a *authSHA1v4) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	a.ServerInfoForObfs = *s
-}
-
-func (a *authSHA1v4) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &a.ServerInfoForObfs
-}
-
-func (a *authSHA1v4) SetData(data interface{}) {
-	if auth, ok := data.(*authData); ok {
-		a.data = auth
-	}
-}
-
-func (a *authSHA1v4) GetData() interface{} {
-	if a.data == nil {
-		a.data = &authData{}
-	}
-	return a.data
-}
-
-func (a *authSHA1v4) packData(data []byte) (outData []byte) {
-	dataLength := len(data)
-	randLength := 1
-	if dataLength <= 1300 {
-		if dataLength > 400 {
-			randLength += rand.Intn(128)
-		} else {
-			randLength += rand.Intn(1024)
-		}
-	}
-
-	outLength := randLength + dataLength + 8
-	outData = make([]byte, outLength)
-	// 0~1, out length
-	binary.BigEndian.PutUint16(outData[0:2], uint16(outLength&0xFFFF))
-	// 2~3, crc of out length
-	crc32 := ssr.CalcCRC32(outData, 2, 0xFFFFFFFF)
-	binary.LittleEndian.PutUint16(outData[2:4], uint16(crc32&0xFFFF))
-	// 4~rand length+4, rand number
-	rand.Read(outData[4 : 4+randLength])
-	// 4, rand length
-	if randLength < 128 {
-		outData[4] = byte(randLength & 0xFF)
-	} else {
-		// 4, magic number 0xFF
-		outData[4] = 0xFF
-		// 5~6, rand length
-		binary.BigEndian.PutUint16(outData[5:], uint16(randLength&0xFFFF))
-	}
-	// rand length+4~out length-4, data
-	if dataLength > 0 {
-		copy(outData[randLength+4:], data)
-	}
-	// out length-4~end, adler32 of full data
-	adler := ssr.CalcAdler32(outData[:outLength-4])
-	binary.LittleEndian.PutUint32(outData[outLength-4:], adler)
-
-	return outData
-}
-
-func (a *authSHA1v4) packAuthData(data []byte) (outData []byte) {
-	dataLength := len(data)
-	randLength := 1
-	if dataLength <= 1300 {
-		if dataLength > 400 {
-			randLength += rand.Intn(128)
-		} else {
-			randLength += rand.Intn(1024)
-		}
-	}
-	dataOffset := randLength + 4 + 2
-	outLength := dataOffset + dataLength + 12 + ssr.ObfsHMACSHA1Len
-	outData = make([]byte, outLength)
-
-	a.data.connectionID++
-	if a.data.connectionID > 0xFF000000 {
-		a.data.clientID = nil
-	}
-	if len(a.data.clientID) == 0 {
-		a.data.clientID = make([]byte, 8)
-		rand.Read(a.data.clientID)
-		b := make([]byte, 4)
-		rand.Read(b)
-		a.data.connectionID = binary.LittleEndian.Uint32(b) & 0xFFFFFF
-	}
-	// 0-1, out length
-	binary.BigEndian.PutUint16(outData[0:], uint16(outLength&0xFFFF))
-
-	// 2~6, crc of out length+salt+key
-	salt := []byte("auth_sha1_v4")
-	crcData := make([]byte, len(salt)+a.KeyLen+2)
-	copy(crcData[0:2], outData[0:2])
-	copy(crcData[2:], salt)
-	copy(crcData[2+len(salt):], a.Key)
-	crc32 := ssr.CalcCRC32(crcData, len(crcData), 0xFFFFFFFF)
-	// 2~6, crc of out length+salt+key
-	binary.LittleEndian.PutUint32(outData[2:], crc32)
-	// 6~rand length+6, rand numbers
-	rand.Read(outData[dataOffset-randLength : dataOffset])
-	// 6, rand length
-	if randLength < 128 {
-		outData[6] = byte(randLength & 0xFF)
-	} else {
-		// 6, magic number 0xFF
-		outData[6] = 0xFF
-		// 7-8, rand length
-		binary.BigEndian.PutUint16(outData[7:], uint16(randLength&0xFFFF))
-	}
-	// rand length+6~rand length+10, time stamp
-	now := time.Now().Unix()
-	binary.LittleEndian.PutUint32(outData[dataOffset:dataOffset+4], uint32(now))
-	// rand length+10~rand length+14, client ID
-	copy(outData[dataOffset+4:dataOffset+4+4], a.data.clientID[0:4])
-	// rand length+14~rand length+18, connection ID
-	binary.LittleEndian.PutUint32(outData[dataOffset+8:dataOffset+8+4], a.data.connectionID)
-	// rand length+18~rand length+18+data length, data
-	copy(outData[dataOffset+12:], data)
-
-	key := make([]byte, a.IVLen+a.KeyLen)
-	copy(key, a.IV)
-	copy(key[a.IVLen:], a.Key)
-
-	h := tools.HmacSHA1(key, outData[:outLength-ssr.ObfsHMACSHA1Len])
-	// out length-10~out length/rand length+18+data length~end, hmac
-	copy(outData[outLength-ssr.ObfsHMACSHA1Len:], h[0:ssr.ObfsHMACSHA1Len])
-
-	return outData
-}
-
-func (a *authSHA1v4) PreEncrypt(plainData []byte) (outData []byte, err error) {
-	dataLength := len(plainData)
-	offset := 0
-	if !a.hasSentHeader && dataLength > 0 {
-		authLength := dataLength
-		if headSize := ssr.GetHeadSize(plainData, 30); headSize <= dataLength {
-			authLength = headSize
-		}
-		packData := a.packAuthData(plainData[:authLength])
-		a.hasSentHeader = true
-		outData = append(outData, packData...)
-		dataLength -= authLength
-		offset += authLength
-	}
-	const blockSize = 4096
-	for dataLength > blockSize {
-		packData := a.packData(plainData[offset : offset+blockSize])
-		outData = append(outData, packData...)
-		dataLength -= blockSize
-		offset += blockSize
-	}
-	if dataLength > 0 {
-		packData := a.packData(plainData[offset:])
-		outData = append(outData, packData...)
-	}
-
-	return
-}
-
-func (a *authSHA1v4) PostDecrypt(plainData []byte) ([]byte, int, error) {
-	var outData []byte
-	dataLength := len(plainData)
-	b := make([]byte, len(a.recvBuffer)+dataLength)
-	copy(b, a.recvBuffer)
-	copy(b[len(a.recvBuffer):], plainData)
-	a.recvBuffer = b
-	a.recvBufferLength = len(b)
-	for a.recvBufferLength > 4 {
-		crc32 := ssr.CalcCRC32(a.recvBuffer, 2, 0xFFFFFFFF)
-		if binary.LittleEndian.Uint16(a.recvBuffer[2:4]) != uint16(crc32&0xFFFF) {
-			return nil, 0, ssr.ErrAuthSHA1v4CRC32Error
-		}
-		length := int(binary.BigEndian.Uint16(a.recvBuffer[0:2]))
-		if length >= 8192 || length < 8 {
-			a.recvBufferLength = 0
-			a.recvBuffer = nil
-			return nil, 0, ssr.ErrAuthSHA1v4DataLengthError
-		}
-		if length > a.recvBufferLength {
-			break
-		}
-
-		if ssr.CheckAdler32(a.recvBuffer, length) {
-			pos := int(a.recvBuffer[4])
-			if pos != 0xFF {
-				pos += 4
-			} else {
-				pos = int(binary.BigEndian.Uint16(a.recvBuffer[5:5+2])) + 4
-			}
-			outLength := length - pos - 4
-			b = make([]byte, len(outData)+outLength)
-			copy(b, outData)
-			copy(b[len(outData):], a.recvBuffer[pos:pos+outLength])
-			outData = b
-			a.recvBufferLength -= length
-			a.recvBuffer = a.recvBuffer[length:]
-		} else {
-			a.recvBufferLength = 0
-			a.recvBuffer = nil
-			return nil, 0, ssr.ErrAuthSHA1v4IncorrectChecksum
-		}
-	}
-	return outData, 0, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go
deleted file mode 100644
index 4fff0aa..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go
+++ /dev/null
@@ -1,39 +0,0 @@
-package protocol
-
-import (
-	"strings"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-type creator func() IProtocol
-
-var (
-	creatorMap = make(map[string]creator)
-)
-
-type IProtocol interface {
-	SetServerInfo(s *ssr.ServerInfoForObfs)
-	GetServerInfo() *ssr.ServerInfoForObfs
-	PreEncrypt(data []byte) ([]byte, error)
-	PostDecrypt(data []byte) ([]byte, int, error)
-	SetData(data interface{})
-	GetData() interface{}
-}
-
-type authData struct {
-	clientID     []byte
-	connectionID uint32
-}
-
-func register(name string, c creator) {
-	creatorMap[name] = c
-}
-
-func NewProtocol(name string) IProtocol {
-	c, ok := creatorMap[strings.ToLower(name)]
-	if ok {
-		return c()
-	}
-	return nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go
deleted file mode 100644
index 8dd1851..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go
+++ /dev/null
@@ -1,42 +0,0 @@
-package protocol
-
-import (
-	"github.com/sun8911879/shadowsocksR/ssr"
-)
-
-func init() {
-	register("origin", NewOrigin)
-}
-
-type origin struct {
-	ssr.ServerInfoForObfs
-}
-
-func NewOrigin() IProtocol {
-	a := &origin{}
-	return a
-}
-
-func (o *origin) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	o.ServerInfoForObfs = *s
-}
-
-func (o *origin) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &o.ServerInfoForObfs
-}
-
-func (o *origin) PreEncrypt(data []byte) (encryptedData []byte, err error) {
-	return data, nil
-}
-
-func (o *origin) PostDecrypt(data []byte) ([]byte, int, error) {
-	return data, 0, nil
-}
-
-func (o *origin) SetData(data interface{}) {
-
-}
-
-func (o *origin) GetData() interface{} {
-	return nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go
deleted file mode 100644
index 42543e5..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go
+++ /dev/null
@@ -1,101 +0,0 @@
-package protocol
-
-import (
-	"bytes"
-	"encoding/binary"
-
-	"github.com/sun8911879/shadowsocksR/ssr"
-	"github.com/sun8911879/shadowsocksR/tools"
-)
-
-func init() {
-	register("verify_sha1", NewVerifySHA1)
-	register("ota", NewVerifySHA1)
-}
-
-type verifySHA1 struct {
-	ssr.ServerInfoForObfs
-	hasSentHeader bool
-	chunkId       uint32
-}
-
-const (
-	oneTimeAuthMask byte = 0x10
-)
-
-func NewVerifySHA1() IProtocol {
-	a := &verifySHA1{}
-	return a
-}
-
-func (v *verifySHA1) otaConnectAuth(data []byte) []byte {
-	return append(data, tools.HmacSHA1(append(v.IV, v.Key...), data)...)
-}
-
-func (v *verifySHA1) otaReqChunkAuth(chunkId uint32, data []byte) []byte {
-	nb := make([]byte, 2)
-	binary.BigEndian.PutUint16(nb, uint16(len(data)))
-	chunkIdBytes := make([]byte, 4)
-	binary.BigEndian.PutUint32(chunkIdBytes, chunkId)
-	header := append(nb, tools.HmacSHA1(append(v.IV, chunkIdBytes...), data)...)
-	return append(header, data...)
-}
-
-func (v *verifySHA1) otaVerifyAuth(iv []byte, chunkId uint32, data []byte, expectedHmacSha1 []byte) bool {
-	chunkIdBytes := make([]byte, 4)
-	binary.BigEndian.PutUint32(chunkIdBytes, chunkId)
-	actualHmacSha1 := tools.HmacSHA1(append(iv, chunkIdBytes...), data)
-	return bytes.Equal(expectedHmacSha1, actualHmacSha1)
-}
-
-func (v *verifySHA1) getAndIncreaseChunkId() (chunkId uint32) {
-	chunkId = v.chunkId
-	v.chunkId += 1
-	return
-}
-
-func (v *verifySHA1) SetServerInfo(s *ssr.ServerInfoForObfs) {
-	v.ServerInfoForObfs = *s
-}
-
-func (v *verifySHA1) GetServerInfo() (s *ssr.ServerInfoForObfs) {
-	return &v.ServerInfoForObfs
-}
-
-func (v *verifySHA1) SetData(data interface{}) {
-
-}
-
-func (v *verifySHA1) GetData() interface{} {
-	return nil
-}
-
-func (v *verifySHA1) PreEncrypt(data []byte) (encryptedData []byte, err error) {
-	dataLength := len(data)
-	offset := 0
-	if !v.hasSentHeader {
-		data[0] |= oneTimeAuthMask
-		encryptedData = v.otaConnectAuth(data[:v.HeadLen])
-		v.hasSentHeader = true
-		dataLength -= v.HeadLen
-		offset += v.HeadLen
-	}
-	const blockSize = 4096
-	for dataLength > blockSize {
-		chunkId := v.getAndIncreaseChunkId()
-		b := v.otaReqChunkAuth(chunkId, data[offset:offset+blockSize])
-		encryptedData = append(encryptedData, b...)
-		dataLength -= blockSize
-		offset += blockSize
-	}
-	if dataLength > 0 {
-		chunkId := v.getAndIncreaseChunkId()
-		b := v.otaReqChunkAuth(chunkId, data[offset:])
-		encryptedData = append(encryptedData, b...)
-	}
-	return
-}
-
-func (v *verifySHA1) PostDecrypt(data []byte) ([]byte, int, error) {
-	return data, 0, nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go
deleted file mode 100644
index 6bda937..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go
+++ /dev/null
@@ -1,31 +0,0 @@
-package ssr
-
-import "encoding/binary"
-
-func calcShortAdler32(input []byte, a, b uint32) (uint32, uint32) {
-	for _, i := range input {
-		a += uint32(i)
-		b += a
-	}
-	a %= 65521
-	b %= 65521
-	return a, b
-}
-
-func CalcAdler32(input []byte) uint32 {
-	var a uint32 = 1
-	var b uint32 = 0
-	const nMax = 5552
-	for length := len(input); length > nMax; length -= nMax {
-		a, b = calcShortAdler32(input[:nMax], a, b)
-		input = input[nMax:]
-	}
-	a, b = calcShortAdler32(input, a, b)
-	return (b << 16) + a
-}
-
-func CheckAdler32(input []byte, l int) bool {
-	adler32 := CalcAdler32(input[:l-4])
-	checksum := binary.LittleEndian.Uint32(input[l-4:])
-	return adler32 == checksum
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go
deleted file mode 100644
index 9cf6cc6..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go
+++ /dev/null
@@ -1,52 +0,0 @@
-package ssr
-
-import "encoding/binary"
-
-var (
-	crc32Table = make([]uint32, 256)
-)
-
-func init() {
-	createCRC32Table()
-}
-
-func createCRC32Table() {
-	for i := 0; i < 256; i++ {
-		crc := uint32(i)
-		for j := 8; j > 0; j-- {
-			if crc&1 == 1 {
-				crc = (crc >> 1) ^ 0xEDB88320
-			} else {
-				crc >>= 1
-			}
-		}
-		crc32Table[i] = crc
-	}
-}
-
-func CalcCRC32(input []byte, length int, value uint32) uint32 {
-	value = 0xFFFFFFFF
-	return DoCalcCRC32(input, 0, length, value)
-}
-
-func DoCalcCRC32(input []byte, index int, length int, value uint32) uint32 {
-	buffer := input
-	for i := index; i < length; i++ {
-		value = (value >> 8) ^ crc32Table[byte(value&0xFF)^buffer[i]]
-	}
-	return value ^ 0xFFFFFFFF
-}
-
-func DoSetCRC32(buffer []byte, index int, length int) {
-	crc := CalcCRC32(buffer[:length-4], length-4, 0xFFFFFFFF)
-	binary.LittleEndian.PutUint32(buffer[length-4:], crc^0xFFFFFFFF)
-}
-
-func SetCRC32(buffer []byte, length int) {
-	DoSetCRC32(buffer, 0, length)
-}
-
-func CheckCRC32(buffer []byte, length int) bool {
-	crc := CalcCRC32(buffer, length, 0xFFFFFFFF)
-	return crc == 0xFFFFFFFF
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go
deleted file mode 100644
index cee94a2..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go
+++ /dev/null
@@ -1,55 +0,0 @@
-package ssr
-
-import "errors"
-
-const ObfsHMACSHA1Len = 10
-
-var (
-	ErrAuthSHA1v4CRC32Error                = errors.New("auth_sha1_v4 post decrypt data crc32 error")
-	ErrAuthSHA1v4DataLengthError           = errors.New("auth_sha1_v4 post decrypt data length error")
-	ErrAuthSHA1v4IncorrectChecksum         = errors.New("auth_sha1_v4 post decrypt incorrect checksum")
-	ErrAuthAES128HMACError                 = errors.New("auth_aes128_* post decrypt hmac error")
-	ErrAuthAES128DataLengthError           = errors.New("auth_aes128_* post decrypt length mismatch")
-	ErrAuthAES128IncorrectChecksum         = errors.New("auth_aes128_* post decrypt incorrect checksum")
-	ErrTLS12TicketAuthTooShortData         = errors.New("tls1.2_ticket_auth too short data")
-	ErrTLS12TicketAuthHMACError            = errors.New("tls1.2_ticket_auth hmac verifying failed")
-	ErrTLS12TicketAuthIncorrectMagicNumber = errors.New("tls1.2_ticket_auth incorrect magic number")
-)
-
-type ServerInfoForObfs struct {
-	Host      string
-	Port      uint16
-	Param     string
-	IV        []byte
-	IVLen     int
-	RecvIV    []byte
-	RecvIVLen int
-	Key       []byte
-	KeyLen    int
-	HeadLen   int
-	TcpMss    int
-}
-
-func GetHeadSize(data []byte, defaultValue int) int {
-	if data == nil || len(data) < 2 {
-		return defaultValue
-	}
-	headType := data[0] & 0x07
-	switch headType {
-	case 1:
-		// IPv4 1+4+2
-		return 7
-	case 4:
-		// IPv6 1+16+2
-		return 19
-	case 3:
-		// domain name, variant length
-		return 4 + int(data[1])
-	}
-
-	return defaultValue
-}
-
-func (s *ServerInfoForObfs) SetHeadLen(data []byte, defaultValue int) {
-	s.HeadLen = GetHeadSize(data, defaultValue)
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/tcp.go b/vendor/github.com/sun8911879/shadowsocksR/tcp.go
deleted file mode 100644
index 4e86d98..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/tcp.go
+++ /dev/null
@@ -1,251 +0,0 @@
-package shadowsocksr
-
-import (
-	"bytes"
-	"fmt"
-	"net"
-	"sync"
-
-	"github.com/sun8911879/shadowsocksR/obfs"
-	"github.com/sun8911879/shadowsocksR/protocol"
-	"github.com/sun8911879/shadowsocksR/tools/leakybuf"
-)
-
-// SSTCPConn the struct that override the net.Conn methods
-type SSTCPConn struct {
-	net.Conn
-	sync.RWMutex
-	*StreamCipher
-	IObfs          obfs.IObfs
-	IProtocol      protocol.IProtocol
-	readBuf        []byte
-	readDecodeBuf  *bytes.Buffer
-	readIObfsBuf   *bytes.Buffer
-	readEncryptBuf *bytes.Buffer
-	readIndex      uint64
-	readUserBuf    *bytes.Buffer
-	writeBuf       []byte
-	lastReadError  error
-}
-
-func NewSSTCPConn(c net.Conn, cipher *StreamCipher) *SSTCPConn {
-	return &SSTCPConn{
-		Conn:           c,
-		StreamCipher:   cipher,
-		readBuf:        leakybuf.GlobalLeakyBuf.Get(),
-		readDecodeBuf:  bytes.NewBuffer(nil),
-		readIObfsBuf:   bytes.NewBuffer(nil),
-		readUserBuf:    bytes.NewBuffer(nil),
-		readEncryptBuf: bytes.NewBuffer(nil),
-		writeBuf:       leakybuf.GlobalLeakyBuf.Get(),
-	}
-}
-
-func (c *SSTCPConn) Close() error {
-	leakybuf.GlobalLeakyBuf.Put(c.readBuf)
-	leakybuf.GlobalLeakyBuf.Put(c.writeBuf)
-	return c.Conn.Close()
-}
-
-func (c *SSTCPConn) GetIv() (iv []byte) {
-	iv = make([]byte, len(c.iv))
-	copy(iv, c.iv)
-	return
-}
-
-func (c *SSTCPConn) GetKey() (key []byte) {
-	key = make([]byte, len(c.key))
-	copy(key, c.key)
-	return
-}
-
-func (c *SSTCPConn) initEncryptor(b []byte) (iv []byte, err error) {
-	if c.enc == nil {
-		iv, err = c.initEncrypt()
-		if err != nil {
-			return nil, err
-		}
-
-		// should initialize obfs/protocol now, because iv is ready now
-		obfsServerInfo := c.IObfs.GetServerInfo()
-		obfsServerInfo.SetHeadLen(b, 30)
-		obfsServerInfo.IV, obfsServerInfo.IVLen = c.IV()
-		obfsServerInfo.Key, obfsServerInfo.KeyLen = c.Key()
-		c.IObfs.SetServerInfo(obfsServerInfo)
-
-		protocolServerInfo := c.IProtocol.GetServerInfo()
-		protocolServerInfo.SetHeadLen(b, 30)
-		protocolServerInfo.IV, protocolServerInfo.IVLen = c.IV()
-		protocolServerInfo.Key, protocolServerInfo.KeyLen = c.Key()
-		c.IProtocol.SetServerInfo(protocolServerInfo)
-	}
-	return
-}
-
-func (c *SSTCPConn) Read(b []byte) (n int, err error) {
-	for {
-		n, err = c.doRead(b)
-		if b == nil || n != 0 || err != nil {
-			return n, err
-		}
-	}
-}
-
-func (c *SSTCPConn) doRead(b []byte) (n int, err error) {
-	//先吐出已经解密后数据
-	if c.readUserBuf.Len() > 0 {
-		return c.readUserBuf.Read(b)
-	}
-	//未读取够长度继续读取并解码
-	decodelength := c.readDecodeBuf.Len()
-	if (decodelength == 0 || c.readEncryptBuf.Len() > 0 || (c.readIndex != 0 && c.readIndex > uint64(decodelength))) && c.lastReadError == nil {
-		c.readIndex = 0
-		n, c.lastReadError = c.Conn.Read(c.readBuf)
-		//写入decode 缓存
-		c.readDecodeBuf.Write(c.readBuf[0:n])
-	}
-	//无缓冲数据返回错误
-	if c.lastReadError != nil && (decodelength == 0 || uint64(decodelength) < c.readIndex) {
-		return 0, c.lastReadError
-	}
-	decodelength = c.readDecodeBuf.Len()
-	decodebytes := c.readDecodeBuf.Bytes()
-	c.readDecodeBuf.Reset()
-
-	for {
-
-		decodedData, length, err := c.IObfs.Decode(decodebytes)
-		if length == 0 && err != nil {
-			return 0, err
-		}
-
-		//do send back
-		if length == 1 {
-			c.Write(make([]byte, 0))
-			return 0, nil
-		}
-
-		//数据不够长度
-		if err != nil {
-			if uint64(decodelength) >= length {
-				return 0, fmt.Errorf("data length: %d,decode data length: %d unknown panic", decodelength, length)
-			}
-			c.readIndex = length
-			c.readDecodeBuf.Write(decodebytes)
-			if c.readIObfsBuf.Len() == 0 {
-				return 0, nil
-			}
-			break
-		}
-
-		if length >= 1 {
-			//读出数据 但是有多余的数据 返回已经读取数值
-			c.readIObfsBuf.Write(decodedData)
-			decodebytes = decodebytes[length:]
-			decodelength = len(decodebytes)
-			continue
-		}
-
-		//完全读取数据 --	length == 0
-		c.readIObfsBuf.Write(decodedData)
-		break
-	}
-
-	decodedData := c.readIObfsBuf.Bytes()
-	decodelength = c.readIObfsBuf.Len()
-	c.readIObfsBuf.Reset()
-
-	if c.dec == nil {
-		iv := decodedData[0:c.info.ivLen]
-		if err = c.initDecrypt(iv); err != nil {
-			return 0, err
-		}
-
-		if len(c.iv) == 0 {
-			c.iv = iv
-		}
-		decodelength -= c.info.ivLen
-		if decodelength <= 0 {
-			return 0, nil
-		}
-		decodedData = decodedData[c.info.ivLen:]
-	}
-
-	buf := make([]byte, decodelength)
-	c.decrypt(buf, decodedData)
-
-	c.readEncryptBuf.Write(buf)
-	encryptbuf := c.readEncryptBuf.Bytes()
-	c.readEncryptBuf.Reset()
-	postDecryptedData, length, err := c.IProtocol.PostDecrypt(encryptbuf)
-	if err != nil {
-		return 0, err
-	}
-	if length == 0 {
-		c.readEncryptBuf.Write(encryptbuf)
-		return 0, nil
-	}
-
-	if length > 0 {
-		c.readEncryptBuf.Write(encryptbuf[length:])
-	}
-
-	postDecryptedlength := len(postDecryptedData)
-	blength := len(b)
-	copy(b, postDecryptedData)
-	if blength > postDecryptedlength {
-		return postDecryptedlength, nil
-	}
-	c.readUserBuf.Write(postDecryptedData[len(b):])
-	return blength, nil
-}
-
-func (c *SSTCPConn) preWrite(b []byte) (outData []byte, err error) {
-	var iv []byte
-	if iv, err = c.initEncryptor(b); err != nil {
-		return
-	}
-
-	var preEncryptedData []byte
-	preEncryptedData, err = c.IProtocol.PreEncrypt(b)
-	if err != nil {
-		return
-	}
-	preEncryptedDataLen := len(preEncryptedData)
-	//c.encrypt(cipherData[len(iv):], b)
-	encryptedData := make([]byte, preEncryptedDataLen)
-	//! \attention here the expected output buffer length MUST be accurate, it is preEncryptedDataLen now!
-	c.encrypt(encryptedData[0:preEncryptedDataLen], preEncryptedData)
-
-	//common.Info("len(b)=", len(b), ", b:", b,
-	//	", pre encrypted data length:", preEncryptedDataLen,
-	//	", pre encrypted data:", preEncryptedData,
-	//	", encrypted data length:", preEncryptedDataLen)
-
-	cipherData := c.writeBuf
-	dataSize := len(encryptedData) + len(iv)
-	if dataSize > len(cipherData) {
-		cipherData = make([]byte, dataSize)
-	} else {
-		cipherData = cipherData[:dataSize]
-	}
-
-	if iv != nil {
-		// Put initialization vector in buffer before be encoded
-		copy(cipherData, iv)
-	}
-	copy(cipherData[len(iv):], encryptedData)
-
-	return c.IObfs.Encode(cipherData)
-}
-
-func (c *SSTCPConn) Write(b []byte) (n int, err error) {
-	outData, err := c.preWrite(b)
-	if err == nil {
-		n, err = c.Conn.Write(outData)
-		if err != nil {
-			return n, err
-		}
-	}
-	return len(b), nil
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go b/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go
deleted file mode 100644
index 5ecb3b9..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go
+++ /dev/null
@@ -1,51 +0,0 @@
-package tools
-
-import (
-	"crypto/hmac"
-	"crypto/md5"
-	"crypto/sha1"
-)
-
-func HmacMD5(key []byte, data []byte) []byte {
-	hmacMD5 := hmac.New(md5.New, key)
-	hmacMD5.Write(data)
-	return hmacMD5.Sum(nil)[:10]
-}
-
-func HmacSHA1(key []byte, data []byte) []byte {
-	hmacSHA1 := hmac.New(sha1.New, key)
-	hmacSHA1.Write(data)
-	return hmacSHA1.Sum(nil)[:10]
-}
-
-func MD5Sum(d []byte) []byte {
-	h := md5.New()
-	h.Write(d)
-	return h.Sum(nil)
-}
-
-func SHA1Sum(d []byte) []byte {
-	h := sha1.New()
-	h.Write(d)
-	return h.Sum(nil)
-}
-
-func EVPBytesToKey(password string, keyLen int) (key []byte) {
-	const md5Len = 16
-
-	cnt := (keyLen-1)/md5Len + 1
-	m := make([]byte, cnt*md5Len)
-	copy(m, MD5Sum([]byte(password)))
-
-	// Repeatedly call md5 until bytes generated is enough.
-	// Each call to md5 uses data: prev md5 sum + password.
-	d := make([]byte, md5Len+len(password))
-	start := 0
-	for i := 1; i < cnt; i++ {
-		start += md5Len
-		copy(d, m[start-md5Len:start])
-		copy(d[md5Len:], password)
-		copy(m[start:], MD5Sum(d))
-	}
-	return m[:keyLen]
-}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go b/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go
deleted file mode 100644
index 096c9cb..0000000
--- a/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Provides leaky buffer, based on the example in Effective Go.
-package leakybuf
-
-type LeakyBuf struct {
-	bufSize  int // size of each buffer
-	freeList chan []byte
-}
-
-// NewLeakyBuf creates a leaky buffer which can hold at most n buffer, each
-// with bufSize bytes.
-func NewLeakyBuf(n, bufSize int) *LeakyBuf {
-	return &LeakyBuf{
-		bufSize:  bufSize,
-		freeList: make(chan []byte, n),
-	}
-}
-
-// Get returns a buffer from the leaky buffer or create a new buffer.
-func (lb *LeakyBuf) Get() (b []byte) {
-	select {
-	case b = <-lb.freeList:
-	default:
-		b = make([]byte, lb.bufSize)
-	}
-	return
-}
-
-// Put add the buffer into the free buffer pool for reuse. Panic if the buffer
-// size is not the same with the leaky buffer's. This is intended to expose
-// error usage of leaky buffer.
-func (lb *LeakyBuf) Put(b []byte) {
-	if len(b) != lb.bufSize {
-		panic("invalid buffer size that's put into leaky buffer")
-	}
-	select {
-	case lb.freeList <- b:
-	default:
-	}
-	return
-}
-
-const (
-	GlobalLeakyBufSize = 32 * 1024 // data.len(2) + hmacsha1(10) + data(4096)
-	maxNBuf            = 8192
-)
-
-var GlobalLeakyBuf = NewLeakyBuf(maxNBuf, GlobalLeakyBufSize)
diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE
deleted file mode 100644
index 6a66aea..0000000
--- a/vendor/golang.org/x/crypto/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/golang.org/x/crypto/PATENTS b/vendor/golang.org/x/crypto/PATENTS
deleted file mode 100644
index 7330990..0000000
--- a/vendor/golang.org/x/crypto/PATENTS
+++ /dev/null
@@ -1,22 +0,0 @@
-Additional IP Rights Grant (Patents)
-
-"This implementation" means the copyrightable works distributed by
-Google as part of the Go project.
-
-Google hereby grants to You a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer and otherwise run, modify and propagate the contents of this
-implementation of Go, where such license applies only to those patent
-claims, both currently owned or controlled by Google and acquired in
-the future, licensable by Google that are necessarily infringed by this
-implementation of Go.  This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation.  If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of Go or any code incorporated within this
-implementation of Go constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of Go
-shall terminate as of the date such litigation is filed.
diff --git a/vendor/golang.org/x/crypto/blowfish/block.go b/vendor/golang.org/x/crypto/blowfish/block.go
deleted file mode 100644
index 9d80f19..0000000
--- a/vendor/golang.org/x/crypto/blowfish/block.go
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package blowfish
-
-// getNextWord returns the next big-endian uint32 value from the byte slice
-// at the given position in a circular manner, updating the position.
-func getNextWord(b []byte, pos *int) uint32 {
-	var w uint32
-	j := *pos
-	for i := 0; i < 4; i++ {
-		w = w<<8 | uint32(b[j])
-		j++
-		if j >= len(b) {
-			j = 0
-		}
-	}
-	*pos = j
-	return w
-}
-
-// ExpandKey performs a key expansion on the given *Cipher. Specifically, it
-// performs the Blowfish algorithm's key schedule which sets up the *Cipher's
-// pi and substitution tables for calls to Encrypt. This is used, primarily,
-// by the bcrypt package to reuse the Blowfish key schedule during its
-// set up. It's unlikely that you need to use this directly.
-func ExpandKey(key []byte, c *Cipher) {
-	j := 0
-	for i := 0; i < 18; i++ {
-		// Using inlined getNextWord for performance.
-		var d uint32
-		for k := 0; k < 4; k++ {
-			d = d<<8 | uint32(key[j])
-			j++
-			if j >= len(key) {
-				j = 0
-			}
-		}
-		c.p[i] ^= d
-	}
-
-	var l, r uint32
-	for i := 0; i < 18; i += 2 {
-		l, r = encryptBlock(l, r, c)
-		c.p[i], c.p[i+1] = l, r
-	}
-
-	for i := 0; i < 256; i += 2 {
-		l, r = encryptBlock(l, r, c)
-		c.s0[i], c.s0[i+1] = l, r
-	}
-	for i := 0; i < 256; i += 2 {
-		l, r = encryptBlock(l, r, c)
-		c.s1[i], c.s1[i+1] = l, r
-	}
-	for i := 0; i < 256; i += 2 {
-		l, r = encryptBlock(l, r, c)
-		c.s2[i], c.s2[i+1] = l, r
-	}
-	for i := 0; i < 256; i += 2 {
-		l, r = encryptBlock(l, r, c)
-		c.s3[i], c.s3[i+1] = l, r
-	}
-}
-
-// This is similar to ExpandKey, but folds the salt during the key
-// schedule. While ExpandKey is essentially expandKeyWithSalt with an all-zero
-// salt passed in, reusing ExpandKey turns out to be a place of inefficiency
-// and specializing it here is useful.
-func expandKeyWithSalt(key []byte, salt []byte, c *Cipher) {
-	j := 0
-	for i := 0; i < 18; i++ {
-		c.p[i] ^= getNextWord(key, &j)
-	}
-
-	j = 0
-	var l, r uint32
-	for i := 0; i < 18; i += 2 {
-		l ^= getNextWord(salt, &j)
-		r ^= getNextWord(salt, &j)
-		l, r = encryptBlock(l, r, c)
-		c.p[i], c.p[i+1] = l, r
-	}
-
-	for i := 0; i < 256; i += 2 {
-		l ^= getNextWord(salt, &j)
-		r ^= getNextWord(salt, &j)
-		l, r = encryptBlock(l, r, c)
-		c.s0[i], c.s0[i+1] = l, r
-	}
-
-	for i := 0; i < 256; i += 2 {
-		l ^= getNextWord(salt, &j)
-		r ^= getNextWord(salt, &j)
-		l, r = encryptBlock(l, r, c)
-		c.s1[i], c.s1[i+1] = l, r
-	}
-
-	for i := 0; i < 256; i += 2 {
-		l ^= getNextWord(salt, &j)
-		r ^= getNextWord(salt, &j)
-		l, r = encryptBlock(l, r, c)
-		c.s2[i], c.s2[i+1] = l, r
-	}
-
-	for i := 0; i < 256; i += 2 {
-		l ^= getNextWord(salt, &j)
-		r ^= getNextWord(salt, &j)
-		l, r = encryptBlock(l, r, c)
-		c.s3[i], c.s3[i+1] = l, r
-	}
-}
-
-func encryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
-	xl, xr := l, r
-	xl ^= c.p[0]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[1]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[2]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[3]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[4]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[5]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[6]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[7]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[8]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[9]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[10]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[11]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[12]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[13]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[14]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[15]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[16]
-	xr ^= c.p[17]
-	return xr, xl
-}
-
-func decryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
-	xl, xr := l, r
-	xl ^= c.p[17]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[16]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[15]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[14]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[13]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[12]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[11]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[10]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[9]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[8]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[7]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[6]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[5]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[4]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[3]
-	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[2]
-	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[1]
-	xr ^= c.p[0]
-	return xr, xl
-}
diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go
deleted file mode 100644
index 2641dad..0000000
--- a/vendor/golang.org/x/crypto/blowfish/cipher.go
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package blowfish implements Bruce Schneier's Blowfish encryption algorithm.
-package blowfish // import "golang.org/x/crypto/blowfish"
-
-// The code is a port of Bruce Schneier's C implementation.
-// See https://www.schneier.com/blowfish.html.
-
-import "strconv"
-
-// The Blowfish block size in bytes.
-const BlockSize = 8
-
-// A Cipher is an instance of Blowfish encryption using a particular key.
-type Cipher struct {
-	p              [18]uint32
-	s0, s1, s2, s3 [256]uint32
-}
-
-type KeySizeError int
-
-func (k KeySizeError) Error() string {
-	return "crypto/blowfish: invalid key size " + strconv.Itoa(int(k))
-}
-
-// NewCipher creates and returns a Cipher.
-// The key argument should be the Blowfish key, from 1 to 56 bytes.
-func NewCipher(key []byte) (*Cipher, error) {
-	var result Cipher
-	if k := len(key); k < 1 || k > 56 {
-		return nil, KeySizeError(k)
-	}
-	initCipher(&result)
-	ExpandKey(key, &result)
-	return &result, nil
-}
-
-// NewSaltedCipher creates a returns a Cipher that folds a salt into its key
-// schedule. For most purposes, NewCipher, instead of NewSaltedCipher, is
-// sufficient and desirable. For bcrypt compatibility, the key can be over 56
-// bytes.
-func NewSaltedCipher(key, salt []byte) (*Cipher, error) {
-	if len(salt) == 0 {
-		return NewCipher(key)
-	}
-	var result Cipher
-	if k := len(key); k < 1 {
-		return nil, KeySizeError(k)
-	}
-	initCipher(&result)
-	expandKeyWithSalt(key, salt, &result)
-	return &result, nil
-}
-
-// BlockSize returns the Blowfish block size, 8 bytes.
-// It is necessary to satisfy the Block interface in the
-// package "crypto/cipher".
-func (c *Cipher) BlockSize() int { return BlockSize }
-
-// Encrypt encrypts the 8-byte buffer src using the key k
-// and stores the result in dst.
-// Note that for amounts of data larger than a block,
-// it is not safe to just call Encrypt on successive blocks;
-// instead, use an encryption mode like CBC (see crypto/cipher/cbc.go).
-func (c *Cipher) Encrypt(dst, src []byte) {
-	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
-	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
-	l, r = encryptBlock(l, r, c)
-	dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
-	dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
-}
-
-// Decrypt decrypts the 8-byte buffer src using the key k
-// and stores the result in dst.
-func (c *Cipher) Decrypt(dst, src []byte) {
-	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
-	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
-	l, r = decryptBlock(l, r, c)
-	dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
-	dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
-}
-
-func initCipher(c *Cipher) {
-	copy(c.p[0:], p[0:])
-	copy(c.s0[0:], s0[0:])
-	copy(c.s1[0:], s1[0:])
-	copy(c.s2[0:], s2[0:])
-	copy(c.s3[0:], s3[0:])
-}
diff --git a/vendor/golang.org/x/crypto/blowfish/const.go b/vendor/golang.org/x/crypto/blowfish/const.go
deleted file mode 100644
index d040775..0000000
--- a/vendor/golang.org/x/crypto/blowfish/const.go
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// The startup permutation array and substitution boxes.
-// They are the hexadecimal digits of PI; see:
-// https://www.schneier.com/code/constants.txt.
-
-package blowfish
-
-var s0 = [256]uint32{
-	0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96,
-	0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16,
-	0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658,
-	0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013,
-	0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e,
-	0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60,
-	0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6,
-	0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a,
-	0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c,
-	0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193,
-	0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1,
-	0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239,
-	0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a,
-	0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3,
-	0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176,
-	0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe,
-	0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706,
-	0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b,
-	0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b,
-	0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463,
-	0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c,
-	0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3,
-	0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a,
-	0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8,
-	0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760,
-	0x53317b48, 0x3e00df82, 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db,
-	0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8,
-	0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b,
-	0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33,
-	0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4,
-	0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0,
-	0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c,
-	0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777,
-	0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299,
-	0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705,
-	0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf,
-	0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e,
-	0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa,
-	0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9,
-	0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915,
-	0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f,
-	0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664,
-	0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a,
-}
-
-var s1 = [256]uint32{
-	0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d,
-	0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1,
-	0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65,
-	0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1,
-	0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9,
-	0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737,
-	0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d,
-	0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd,
-	0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc,
-	0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41,
-	0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908,
-	0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af,
-	0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124,
-	0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c,
-	0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908,
-	0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd,
-	0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b,
-	0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e,
-	0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa,
-	0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a,
-	0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d,
-	0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66,
-	0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5,
-	0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84,
-	0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96,
-	0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14,
-	0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca,
-	0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7,
-	0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77,
-	0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99,
-	0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054,
-	0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73,
-	0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea,
-	0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105,
-	0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646,
-	0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285,
-	0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea,
-	0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb,
-	0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e,
-	0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc,
-	0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd,
-	0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20,
-	0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7,
-}
-
-var s2 = [256]uint32{
-	0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7,
-	0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af,
-	0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af,
-	0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504,
-	0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4,
-	0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee,
-	0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec,
-	0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b,
-	0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332,
-	0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527,
-	0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58,
-	0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c,
-	0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22,
-	0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17,
-	0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60,
-	0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115,
-	0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99,
-	0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0,
-	0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74,
-	0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d,
-	0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3,
-	0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3,
-	0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979,
-	0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c,
-	0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa,
-	0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a,
-	0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086,
-	0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc,
-	0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24,
-	0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2,
-	0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84,
-	0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c,
-	0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09,
-	0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10,
-	0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe,
-	0xa1e2ce9b, 0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027,
-	0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0,
-	0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634,
-	0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188,
-	0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc,
-	0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8,
-	0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837,
-	0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0,
-}
-
-var s3 = [256]uint32{
-	0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742,
-	0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b,
-	0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79,
-	0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6,
-	0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a,
-	0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4,
-	0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1,
-	0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59,
-	0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797,
-	0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28,
-	0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6,
-	0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28,
-	0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba,
-	0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a,
-	0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5,
-	0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f,
-	0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce,
-	0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680,
-	0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd,
-	0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb,
-	0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb,
-	0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370,
-	0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc,
-	0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048,
-	0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc,
-	0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9,
-	0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a,
-	0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f,
-	0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a,
-	0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1,
-	0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b,
-	0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e,
-	0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e,
-	0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f,
-	0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623,
-	0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc,
-	0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a,
-	0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6,
-	0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3,
-	0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060,
-	0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c,
-	0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f,
-	0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6,
-}
-
-var p = [18]uint32{
-	0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0,
-	0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
-	0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b,
-}
diff --git a/vendor/golang.org/x/crypto/cast5/cast5.go b/vendor/golang.org/x/crypto/cast5/cast5.go
deleted file mode 100644
index 0b4af37..0000000
--- a/vendor/golang.org/x/crypto/cast5/cast5.go
+++ /dev/null
@@ -1,526 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package cast5 implements CAST5, as defined in RFC 2144. CAST5 is a common
-// OpenPGP cipher.
-package cast5 // import "golang.org/x/crypto/cast5"
-
-import "errors"
-
-const BlockSize = 8
-const KeySize = 16
-
-type Cipher struct {
-	masking [16]uint32
-	rotate  [16]uint8
-}
-
-func NewCipher(key []byte) (c *Cipher, err error) {
-	if len(key) != KeySize {
-		return nil, errors.New("CAST5: keys must be 16 bytes")
-	}
-
-	c = new(Cipher)
-	c.keySchedule(key)
-	return
-}
-
-func (c *Cipher) BlockSize() int {
-	return BlockSize
-}
-
-func (c *Cipher) Encrypt(dst, src []byte) {
-	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
-	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
-
-	l, r = r, l^f1(r, c.masking[0], c.rotate[0])
-	l, r = r, l^f2(r, c.masking[1], c.rotate[1])
-	l, r = r, l^f3(r, c.masking[2], c.rotate[2])
-	l, r = r, l^f1(r, c.masking[3], c.rotate[3])
-
-	l, r = r, l^f2(r, c.masking[4], c.rotate[4])
-	l, r = r, l^f3(r, c.masking[5], c.rotate[5])
-	l, r = r, l^f1(r, c.masking[6], c.rotate[6])
-	l, r = r, l^f2(r, c.masking[7], c.rotate[7])
-
-	l, r = r, l^f3(r, c.masking[8], c.rotate[8])
-	l, r = r, l^f1(r, c.masking[9], c.rotate[9])
-	l, r = r, l^f2(r, c.masking[10], c.rotate[10])
-	l, r = r, l^f3(r, c.masking[11], c.rotate[11])
-
-	l, r = r, l^f1(r, c.masking[12], c.rotate[12])
-	l, r = r, l^f2(r, c.masking[13], c.rotate[13])
-	l, r = r, l^f3(r, c.masking[14], c.rotate[14])
-	l, r = r, l^f1(r, c.masking[15], c.rotate[15])
-
-	dst[0] = uint8(r >> 24)
-	dst[1] = uint8(r >> 16)
-	dst[2] = uint8(r >> 8)
-	dst[3] = uint8(r)
-	dst[4] = uint8(l >> 24)
-	dst[5] = uint8(l >> 16)
-	dst[6] = uint8(l >> 8)
-	dst[7] = uint8(l)
-}
-
-func (c *Cipher) Decrypt(dst, src []byte) {
-	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
-	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
-
-	l, r = r, l^f1(r, c.masking[15], c.rotate[15])
-	l, r = r, l^f3(r, c.masking[14], c.rotate[14])
-	l, r = r, l^f2(r, c.masking[13], c.rotate[13])
-	l, r = r, l^f1(r, c.masking[12], c.rotate[12])
-
-	l, r = r, l^f3(r, c.masking[11], c.rotate[11])
-	l, r = r, l^f2(r, c.masking[10], c.rotate[10])
-	l, r = r, l^f1(r, c.masking[9], c.rotate[9])
-	l, r = r, l^f3(r, c.masking[8], c.rotate[8])
-
-	l, r = r, l^f2(r, c.masking[7], c.rotate[7])
-	l, r = r, l^f1(r, c.masking[6], c.rotate[6])
-	l, r = r, l^f3(r, c.masking[5], c.rotate[5])
-	l, r = r, l^f2(r, c.masking[4], c.rotate[4])
-
-	l, r = r, l^f1(r, c.masking[3], c.rotate[3])
-	l, r = r, l^f3(r, c.masking[2], c.rotate[2])
-	l, r = r, l^f2(r, c.masking[1], c.rotate[1])
-	l, r = r, l^f1(r, c.masking[0], c.rotate[0])
-
-	dst[0] = uint8(r >> 24)
-	dst[1] = uint8(r >> 16)
-	dst[2] = uint8(r >> 8)
-	dst[3] = uint8(r)
-	dst[4] = uint8(l >> 24)
-	dst[5] = uint8(l >> 16)
-	dst[6] = uint8(l >> 8)
-	dst[7] = uint8(l)
-}
-
-type keyScheduleA [4][7]uint8
-type keyScheduleB [4][5]uint8
-
-// keyScheduleRound contains the magic values for a round of the key schedule.
-// The keyScheduleA deals with the lines like:
-//   z0z1z2z3 = x0x1x2x3 ^ S5[xD] ^ S6[xF] ^ S7[xC] ^ S8[xE] ^ S7[x8]
-// Conceptually, both x and z are in the same array, x first. The first
-// element describes which word of this array gets written to and the
-// second, which word gets read. So, for the line above, it's "4, 0", because
-// it's writing to the first word of z, which, being after x, is word 4, and
-// reading from the first word of x: word 0.
-//
-// Next are the indexes into the S-boxes. Now the array is treated as bytes. So
-// "xD" is 0xd. The first byte of z is written as "16 + 0", just to be clear
-// that it's z that we're indexing.
-//
-// keyScheduleB deals with lines like:
-//   K1 = S5[z8] ^ S6[z9] ^ S7[z7] ^ S8[z6] ^ S5[z2]
-// "K1" is ignored because key words are always written in order. So the five
-// elements are the S-box indexes. They use the same form as in keyScheduleA,
-// above.
-
-type keyScheduleRound struct{}
-type keySchedule []keyScheduleRound
-
-var schedule = []struct {
-	a keyScheduleA
-	b keyScheduleB
-}{
-	{
-		keyScheduleA{
-			{4, 0, 0xd, 0xf, 0xc, 0xe, 0x8},
-			{5, 2, 16 + 0, 16 + 2, 16 + 1, 16 + 3, 0xa},
-			{6, 3, 16 + 7, 16 + 6, 16 + 5, 16 + 4, 9},
-			{7, 1, 16 + 0xa, 16 + 9, 16 + 0xb, 16 + 8, 0xb},
-		},
-		keyScheduleB{
-			{16 + 8, 16 + 9, 16 + 7, 16 + 6, 16 + 2},
-			{16 + 0xa, 16 + 0xb, 16 + 5, 16 + 4, 16 + 6},
-			{16 + 0xc, 16 + 0xd, 16 + 3, 16 + 2, 16 + 9},
-			{16 + 0xe, 16 + 0xf, 16 + 1, 16 + 0, 16 + 0xc},
-		},
-	},
-	{
-		keyScheduleA{
-			{0, 6, 16 + 5, 16 + 7, 16 + 4, 16 + 6, 16 + 0},
-			{1, 4, 0, 2, 1, 3, 16 + 2},
-			{2, 5, 7, 6, 5, 4, 16 + 1},
-			{3, 7, 0xa, 9, 0xb, 8, 16 + 3},
-		},
-		keyScheduleB{
-			{3, 2, 0xc, 0xd, 8},
-			{1, 0, 0xe, 0xf, 0xd},
-			{7, 6, 8, 9, 3},
-			{5, 4, 0xa, 0xb, 7},
-		},
-	},
-	{
-		keyScheduleA{
-			{4, 0, 0xd, 0xf, 0xc, 0xe, 8},
-			{5, 2, 16 + 0, 16 + 2, 16 + 1, 16 + 3, 0xa},
-			{6, 3, 16 + 7, 16 + 6, 16 + 5, 16 + 4, 9},
-			{7, 1, 16 + 0xa, 16 + 9, 16 + 0xb, 16 + 8, 0xb},
-		},
-		keyScheduleB{
-			{16 + 3, 16 + 2, 16 + 0xc, 16 + 0xd, 16 + 9},
-			{16 + 1, 16 + 0, 16 + 0xe, 16 + 0xf, 16 + 0xc},
-			{16 + 7, 16 + 6, 16 + 8, 16 + 9, 16 + 2},
-			{16 + 5, 16 + 4, 16 + 0xa, 16 + 0xb, 16 + 6},
-		},
-	},
-	{
-		keyScheduleA{
-			{0, 6, 16 + 5, 16 + 7, 16 + 4, 16 + 6, 16 + 0},
-			{1, 4, 0, 2, 1, 3, 16 + 2},
-			{2, 5, 7, 6, 5, 4, 16 + 1},
-			{3, 7, 0xa, 9, 0xb, 8, 16 + 3},
-		},
-		keyScheduleB{
-			{8, 9, 7, 6, 3},
-			{0xa, 0xb, 5, 4, 7},
-			{0xc, 0xd, 3, 2, 8},
-			{0xe, 0xf, 1, 0, 0xd},
-		},
-	},
-}
-
-func (c *Cipher) keySchedule(in []byte) {
-	var t [8]uint32
-	var k [32]uint32
-
-	for i := 0; i < 4; i++ {
-		j := i * 4
-		t[i] = uint32(in[j])<<24 | uint32(in[j+1])<<16 | uint32(in[j+2])<<8 | uint32(in[j+3])
-	}
-
-	x := []byte{6, 7, 4, 5}
-	ki := 0
-
-	for half := 0; half < 2; half++ {
-		for _, round := range schedule {
-			for j := 0; j < 4; j++ {
-				var a [7]uint8
-				copy(a[:], round.a[j][:])
-				w := t[a[1]]
-				w ^= sBox[4][(t[a[2]>>2]>>(24-8*(a[2]&3)))&0xff]
-				w ^= sBox[5][(t[a[3]>>2]>>(24-8*(a[3]&3)))&0xff]
-				w ^= sBox[6][(t[a[4]>>2]>>(24-8*(a[4]&3)))&0xff]
-				w ^= sBox[7][(t[a[5]>>2]>>(24-8*(a[5]&3)))&0xff]
-				w ^= sBox[x[j]][(t[a[6]>>2]>>(24-8*(a[6]&3)))&0xff]
-				t[a[0]] = w
-			}
-
-			for j := 0; j < 4; j++ {
-				var b [5]uint8
-				copy(b[:], round.b[j][:])
-				w := sBox[4][(t[b[0]>>2]>>(24-8*(b[0]&3)))&0xff]
-				w ^= sBox[5][(t[b[1]>>2]>>(24-8*(b[1]&3)))&0xff]
-				w ^= sBox[6][(t[b[2]>>2]>>(24-8*(b[2]&3)))&0xff]
-				w ^= sBox[7][(t[b[3]>>2]>>(24-8*(b[3]&3)))&0xff]
-				w ^= sBox[4+j][(t[b[4]>>2]>>(24-8*(b[4]&3)))&0xff]
-				k[ki] = w
-				ki++
-			}
-		}
-	}
-
-	for i := 0; i < 16; i++ {
-		c.masking[i] = k[i]
-		c.rotate[i] = uint8(k[16+i] & 0x1f)
-	}
-}
-
-// These are the three 'f' functions. See RFC 2144, section 2.2.
-func f1(d, m uint32, r uint8) uint32 {
-	t := m + d
-	I := (t << r) | (t >> (32 - r))
-	return ((sBox[0][I>>24] ^ sBox[1][(I>>16)&0xff]) - sBox[2][(I>>8)&0xff]) + sBox[3][I&0xff]
-}
-
-func f2(d, m uint32, r uint8) uint32 {
-	t := m ^ d
-	I := (t << r) | (t >> (32 - r))
-	return ((sBox[0][I>>24] - sBox[1][(I>>16)&0xff]) + sBox[2][(I>>8)&0xff]) ^ sBox[3][I&0xff]
-}
-
-func f3(d, m uint32, r uint8) uint32 {
-	t := m - d
-	I := (t << r) | (t >> (32 - r))
-	return ((sBox[0][I>>24] + sBox[1][(I>>16)&0xff]) ^ sBox[2][(I>>8)&0xff]) - sBox[3][I&0xff]
-}
-
-var sBox = [8][256]uint32{
-	{
-		0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949,
-		0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
-		0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
-		0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1, 0xaa54166b, 0x22568e3a, 0xa2d341d0,
-		0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac, 0x4a97c1d8, 0x527644b7, 0xb5f437a7,
-		0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0, 0x90ecf52e, 0x22b0c054, 0xbc8e5935,
-		0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290, 0xe93b159f, 0xb48ee411, 0x4bff345d,
-		0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad, 0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50,
-		0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f, 0xc59c5319, 0xb949e354, 0xb04669fe,
-		0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5, 0x6a390493, 0xe63d37e0, 0x2a54f6b3,
-		0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5, 0xf61b1891, 0xbb72275e, 0xaa508167,
-		0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427, 0xa2d1936b, 0x2ad286af, 0xaa56d291,
-		0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d, 0x73e2bb14, 0xa0bebc3c, 0x54623779,
-		0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e, 0x89fe78e6, 0x3fab0950, 0x325ff6c2,
-		0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf, 0x380782d5, 0xc7fa5cf6, 0x8ac31511,
-		0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241, 0x051ef495, 0xaa573b04, 0x4a805d8d,
-		0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b, 0x50afd341, 0xa7c13275, 0x915a0bf5,
-		0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265, 0xab85c5f3, 0x1b55db94, 0xaad4e324,
-		0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3, 0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c,
-		0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6, 0x22513f1e, 0xaa51a79b, 0x2ad344cc,
-		0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6, 0x032268d4, 0xc9600acc, 0xce387e6d,
-		0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da, 0x4736f464, 0x5ad328d8, 0xb347cc96,
-		0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc, 0xbfc5fe4a, 0xa70aec10, 0xac39570a,
-		0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f, 0x1cacd68d, 0x2ad37c96, 0x0175cb9d,
-		0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4, 0xb11c3274, 0xdd24cb9e, 0x7e1c54bd,
-		0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af, 0x51c85f4d, 0x56907596, 0xa5bb15e6,
-		0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a, 0x3526ffa0, 0xc37b4d09, 0xbc306ed9,
-		0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf, 0x700b45e1, 0xd5ea50f1, 0x85a92872,
-		0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198, 0x0cd0ede7, 0x26470db8, 0xf881814c,
-		0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e,
-		0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9,
-		0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf,
-	},
-	{
-		0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651,
-		0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
-		0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb,
-		0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb, 0xe4e7ef5b, 0x25a1ff41, 0xe180f806,
-		0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f, 0x77e83f4e, 0x79929269, 0x24fa9f7b,
-		0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154, 0x0d554b63, 0x5d681121, 0xc866c359,
-		0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181, 0x39f7627f, 0x361e3084, 0xe4eb573b,
-		0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c, 0x99847ab4, 0xa0e3df79, 0xba6cf38c,
-		0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a, 0x8f458c74, 0xd9e0a227, 0x4ec73a34,
-		0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c, 0x1d804366, 0x721d9bfd, 0xa58684bb,
-		0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1, 0x27e19ba5, 0xd5a6c252, 0xe49754bd,
-		0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9, 0xe0b56714, 0x21f043b7, 0xe5d05860,
-		0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf, 0x68561be6, 0x83ca6b94, 0x2d6ed23b,
-		0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c, 0x397bc8d6, 0x5ee22b95, 0x5f0e5304,
-		0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122, 0xb96726d1, 0x8049a7e8, 0x22b7da7b,
-		0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402, 0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf,
-		0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53, 0xe3214517, 0xb4542835, 0x9f63293c,
-		0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6, 0x30a22c95, 0x31a70850, 0x60930f13,
-		0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6, 0xa02b1741, 0x7cbad9a2, 0x2180036f,
-		0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676, 0x25a75e7b, 0xe4e6d1fc, 0x20c710e6,
-		0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb, 0x846a3bae, 0x8ff77888, 0xee5d60f6,
-		0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54, 0x157fd7fa, 0xef8579cc, 0xd152de58,
-		0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5, 0xc242fa0f, 0xa7e3ebb0, 0xc68e4906,
-		0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8, 0xbec0c560, 0x61a3c9e8, 0xbca8f54d,
-		0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc, 0x301e16e6, 0x273be979, 0xb0ffeaa6,
-		0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a, 0xf7e19798, 0x7619b72f, 0x8f1c9ba4,
-		0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e, 0x1a513742, 0xef6828bc, 0x520365d6,
-		0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb, 0x5eea29cb, 0x145892f5, 0x91584f7f,
-		0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4, 0x0d23e0f9, 0x6c387e8a, 0x0ae6d249,
-		0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
-		0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9,
-		0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1,
-	},
-	{
-		0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90,
-		0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5,
-		0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e,
-		0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee, 0x825b1bfd, 0x9255c5ed, 0x1257a240,
-		0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf, 0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5,
-		0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1, 0x1fb78dfc, 0x8e6bd2c1, 0x437be59b,
-		0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c, 0x4a012d6e, 0xc5884a28, 0xccc36f71,
-		0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850, 0xd7c07f7e, 0x02507fbf, 0x5afb9a04,
-		0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e, 0x727cc3c4, 0x0a0fb402, 0x0f7fef82,
-		0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0, 0x1eac5790, 0x796fb449, 0x8252dc15,
-		0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403, 0xe83ec305, 0x4f91751a, 0x925669c2,
-		0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574, 0x927985b2, 0x8276dbcb, 0x02778176,
-		0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83, 0x340ce5c8, 0x96bbb682, 0x93b4b148,
-		0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20, 0x8437aa88, 0x7d29dc96, 0x2756d3dc,
-		0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e, 0x3cf8209d, 0x6094d1e3, 0xcd9ca341,
-		0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9, 0xbda8229c, 0x127dadaa, 0x438a074e,
-		0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff, 0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51,
-		0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a, 0x76a2e214, 0xb9a40368, 0x925d958f,
-		0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623, 0x193cbcfa, 0x27627545, 0x825cf47a,
-		0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7, 0x8272a972, 0x9270c4a8, 0x127de50b,
-		0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb, 0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b,
-		0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11, 0x236a5cae, 0x12deca4d, 0x2c3f8cc5,
-		0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c, 0xb9b6a80c, 0x5c8f82bc, 0x89d36b45,
-		0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40, 0x7c34671c, 0x02717ef6, 0x4feb5536,
-		0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1, 0x006e1888, 0xa2e53f55, 0xb9e6d4bc,
-		0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33, 0xabcc4f33, 0x7688c55d, 0x7b00a6b0,
-		0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff, 0x856302e0, 0x72dbd92b, 0xee971b69,
-		0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2, 0x61efc8c2, 0xf1ac2571, 0xcc8239c2,
-		0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38, 0x0ff0443d, 0x606e6dc6, 0x60543a49,
-		0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d,
-		0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a,
-		0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783,
-	},
-	{
-		0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1,
-		0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf,
-		0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15,
-		0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f, 0x0c13fefe, 0x081b08ca, 0x05170121,
-		0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f, 0x06df4261, 0xbb9e9b8a, 0x7293ea25,
-		0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400, 0x547eebe6, 0x446d4ca0, 0x6cf3d6f5,
-		0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061, 0x11b638e1, 0x72500e03, 0xf80eb2bb,
-		0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400, 0x6920318f, 0x081dbb99, 0xffc304a5,
-		0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea, 0x9f926f91, 0x9f46222f, 0x3991467d,
-		0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8, 0x3fb6180c, 0x18f8931e, 0x281658e6,
-		0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25, 0x79098b02, 0xe4eabb81, 0x28123b23,
-		0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9, 0x0014377b, 0x041e8ac8, 0x09114003,
-		0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de, 0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6,
-		0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0, 0x56c8c391, 0x6b65811c, 0x5e146119,
-		0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d, 0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24,
-		0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a, 0xeca1d7c7, 0x041afa32, 0x1d16625a,
-		0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb, 0xc70b8b46, 0xd9e66a48, 0x56e55a79,
-		0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3, 0xedda04eb, 0x17a9be04, 0x2c18f4df,
-		0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254, 0xe5b6a035, 0x213d42f6, 0x2c1c7c26,
-		0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2, 0x0418f2c8, 0x001a96a6, 0x0d1526ab,
-		0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86, 0x311170a7, 0x3e9b640c, 0xcc3e10d7,
-		0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1, 0x1f9af36e, 0xcfcbd12f, 0xc1de8417,
-		0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca, 0xb4be31cd, 0xd8782806, 0x12a3a4e2,
-		0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5, 0x9711aac5, 0x001d7b95, 0x82e5e7d2,
-		0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415, 0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a,
-		0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7, 0x0ce454a9, 0xd60acd86, 0x015f1919,
-		0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe, 0x8b75e387, 0xb3c50651, 0xb8a5c3ef,
-		0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb, 0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876,
-		0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8, 0x296b299e, 0x492fc295, 0x9266beab,
-		0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04,
-		0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282,
-		0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2,
-	},
-	{
-		0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f,
-		0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a,
-		0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, 0x15b0a848, 0xe68b18cb, 0x4caadeff,
-		0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, 0x248eb6fb, 0x8dba1cfe, 0x41a99b02,
-		0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, 0x97a5980a, 0xc539b9aa, 0x4d79fe6a,
-		0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, 0x8709e6b0, 0xd7e07156, 0x4e29fea7,
-		0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, 0x578535f2, 0x2261be02, 0xd642a0c9,
-		0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, 0xc8adedb3, 0x28a87fc9, 0x3d959981,
-		0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, 0x4fb96976, 0x90c79505, 0xb0a8a774,
-		0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, 0x0ec50966, 0xdfdd55bc, 0x29de0655,
-		0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, 0x524755f4, 0x03b63cc9, 0x0cc844b2,
-		0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, 0x64ee2d7e, 0xcddbb1da, 0x01c94910,
-		0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, 0x50f5b616, 0xf24766e3, 0x8eca36c1,
-		0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, 0x3063fcdf, 0xb6f589de, 0xec2941da,
-		0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, 0xc1bacb7f, 0xe5ff550f, 0xb6083049,
-		0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, 0x9e0885f9, 0x68cb3e47, 0x086c010f,
-		0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, 0xcbb3d550, 0x1793084d, 0xb0d70eba,
-		0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, 0x0f5755d1, 0xe0e1e56e, 0x6184b5be,
-		0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, 0x05687715, 0x646c6bd7, 0x44904db3,
-		0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, 0x2cb6356a, 0x85808573, 0x4991f840,
-		0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, 0xc1092910, 0x8bc95fc6, 0x7d869cf4,
-		0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, 0x7d161bba, 0x9cad9010, 0xaf462ba2,
-		0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, 0x176d486f, 0x097c13ea, 0x631da5c7,
-		0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, 0x6e5dd2f3, 0x20936079, 0x459b80a5,
-		0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, 0xf6721b2c, 0x1ad2fff3, 0x8c25404e,
-		0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, 0x75922283, 0x784d6b17, 0x58ebb16e,
-		0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, 0xaaf47556, 0x5f46b02a, 0x2b092801,
-		0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, 0x95055110, 0x1b5ad7a8, 0xf61ed5ad,
-		0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0,
-		0x5ce96c28, 0xe176eda3, 0x6bac307f, 0x376829d2, 0x85360fa9, 0x17e3fe2a, 0x24b79767, 0xf5a96b20,
-		0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, 0xeeb9491d, 0x34010718, 0xbb30cab8,
-		0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, 0xb1534546, 0x6d47de08, 0xefe9e7d4,
-	},
-	{
-		0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, 0x016843b4, 0xeced5cbc, 0x325553ac,
-		0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, 0xde5ebe39, 0xf38ff732, 0x8989b138,
-		0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, 0x4e23e33c, 0x79cbd7cc, 0x48a14367,
-		0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, 0x09a8486f, 0xa888614a, 0x2900af98,
-		0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, 0xcf0fec14, 0xf7ca07d2, 0xd0a82072,
-		0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, 0x4c7f4448, 0xdab5d440, 0x6dba0ec3,
-		0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, 0x64bdb941, 0x2c0e636a, 0xba7dd9cd,
-		0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, 0xb88153e2, 0x08a19866, 0x1ae2eac8,
-		0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, 0x9aea3906, 0xefe8c36e, 0xf890cdd9,
-		0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, 0x221db3a6, 0x9a69a02f, 0x68818a54,
-		0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, 0xcf222ebf, 0x25ac6f48, 0xa9a99387,
-		0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, 0xe8a11be9, 0x4980740d, 0xc8087dfc,
-		0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, 0x9528cd89, 0xfd339fed, 0xb87834bf,
-		0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, 0x57f55ec5, 0xe2220abe, 0xd2916ebf,
-		0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, 0xa8dc8af0, 0x7345c106, 0xf41e232f,
-		0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, 0x692573e4, 0xe9a9d848, 0xf3160289,
-		0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, 0x4576698d, 0xb6fad407, 0x592af950,
-		0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, 0xc50dfe5d, 0xfcd707ab, 0x0921c42f,
-		0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, 0x48b9d585, 0xdc049441, 0xc8098f9b,
-		0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, 0x890072d6, 0x28207682, 0xa9a9f7be,
-		0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, 0x1f8fb214, 0xd372cf08, 0xcc3c4a13,
-		0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, 0xb6c85283, 0x3cc2acfb, 0x3fc06976,
-		0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, 0x513021a5, 0x6c5b68b7, 0x822f8aa0,
-		0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, 0x0c5ec241, 0x8809286c, 0xf592d891,
-		0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, 0xb173ecc0, 0xbc60b42a, 0x953498da,
-		0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, 0x257f0c3d, 0x9348af49, 0x361400bc,
-		0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, 0xda41e7f9, 0xc25ad33a, 0x54f4a084,
-		0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, 0xb6f6deaf, 0x3a479c3a, 0x5302da25,
-		0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, 0x44136c76, 0x0404a8c8, 0xb8e5a121,
-		0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5,
-		0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, 0xf544edeb, 0xb0e93524, 0xbebb8fbd,
-		0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, 0xa65b1db8, 0x851c97bd, 0xd675cf2f,
-	},
-	{
-		0x85e04019, 0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, 0xab9bc912, 0xde6008a1, 0x2028da1f,
-		0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, 0xb232e75c, 0x4b3695f2, 0xb28707de,
-		0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43,
-		0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, 0xbaeeadf4, 0x1286becf, 0xb6eacb19,
-		0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, 0x28136086, 0x0bd8dfa8, 0x356d1cf2,
-		0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, 0xeb12ff82, 0xe3486911, 0xd34d7516,
-		0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88,
-		0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, 0x4437f107, 0xb6e79962, 0x42d2d816,
-		0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, 0xf9583745, 0xcf19df58, 0xbec3f756,
-		0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, 0x38bc46e9, 0xc6e6fa14, 0xbae8584a,
-		0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, 0xaff60ff4, 0xea2c4e6d, 0x16e39264,
-		0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, 0xb2856e6e, 0x1aec3ca9, 0xbe838688,
-		0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, 0x61fe033c, 0x16746233, 0x3c034c28,
-		0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, 0x1626a49f, 0xeed82b29, 0x1d382fe3,
-		0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, 0xd45230c7, 0x2bd1408b, 0x60c03eb7,
-		0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06,
-		0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033,
-		0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, 0x79d34217, 0x021a718d, 0x9ac6336a,
-		0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, 0x4eeb8476, 0x488dcf25, 0x36c9d566,
-		0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509,
-		0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, 0x2b9f4fd5, 0x625aba82, 0x6a017962,
-		0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, 0xe32dbf9a, 0x058745b9, 0x3453dc1e,
-		0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, 0x19de7eae, 0x053e561a, 0x15ad6f8c,
-		0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, 0x58d4f2ae, 0x9ea294fb, 0x52cf564c,
-		0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, 0xa1e7160e, 0xe4f2dfa6, 0x693ed285,
-		0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, 0x3d321c5d, 0xc3f5e194, 0x4b269301,
-		0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, 0x296693f4, 0x3d1fce6f, 0xc61e45be,
-		0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, 0xb5229301, 0xcfd2a87f, 0x60aeb767,
-		0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, 0x589dd390, 0x5479f8e6, 0x1cb8d647,
-		0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, 0x462e1b78, 0x6580f87e, 0xf3817914,
-		0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, 0x3d40f021, 0xc3c0bdae, 0x4958c24c,
-		0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, 0x94e01be8, 0x90716f4b, 0x954b8aa3,
-	},
-	{
-		0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, 0xe6c1121b, 0x0e241600, 0x052ce8b5,
-		0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, 0x76e38111, 0xb12def3a, 0x37ddddfc,
-		0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, 0xb4d137cf, 0xb44e79f0, 0x049eedfd,
-		0x0b15a15d, 0x480d3168, 0x8bbbde5a, 0x669ded42, 0xc7ece831, 0x3f8f95e7, 0x72df191b, 0x7580330d,
-		0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, 0x02e7d1ca, 0x53571dae, 0x7a3182a2,
-		0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, 0xce949ad4, 0xb84769ad, 0x965bd862,
-		0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, 0xc28ec4b8, 0x57e8726e, 0x647a78fc,
-		0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, 0xae63aff2, 0x7e8bd632, 0x70108c0c,
-		0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, 0x06918548, 0x58cb7e07, 0x3b74ef2e,
-		0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, 0x19b47a38, 0x424f7618, 0x35856039,
-		0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, 0xc18910b1, 0xe11dbf7b, 0x06cd1af8,
-		0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, 0x3dd00db3, 0x708f8f34, 0x77d51b42,
-		0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, 0x3e378160, 0x7895cda5, 0x859c15a5,
-		0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, 0x31842e7b, 0x24259fd7, 0xf8bef472,
-		0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, 0xe2506d3d, 0x4f9b12ea, 0xf215f225,
-		0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, 0xea7a6e98, 0x7cd16efc, 0x1436876c,
-		0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, 0x92ecbae6, 0xdd67016d, 0x151682eb,
-		0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, 0xe139673b, 0xefa63fb8, 0x71873054,
-		0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, 0x844a1be5, 0xbae7dfdc, 0x42cbda70,
-		0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, 0x79d130a4, 0x3486ebfb, 0x33d3cddc,
-		0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, 0xc5c8b37e, 0x0d809ea2, 0x398feb7c,
-		0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, 0x37df932b, 0xc4248289, 0xacf3ebc3,
-		0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, 0x5e410fab, 0xb48a2465, 0x2eda7fa4,
-		0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, 0xdb485694, 0x38d7e5b2, 0x57720101,
-		0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, 0x7523d24a, 0xe0779695, 0xf9c17a8f,
-		0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, 0xad1163ed, 0xea7b5965, 0x1a00726e,
-		0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, 0x9eedc364, 0x22ebe6a8, 0xcee7d28a,
-		0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, 0x8951570f, 0xdf09822b, 0xbd691a6c,
-		0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, 0x0d771c2b, 0x67cdb156, 0x350d8384,
-		0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, 0x8360d87b, 0x1fa98b0c, 0x1149382c,
-		0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, 0x0d2059d1, 0xa466bb1e, 0xf8da0a82,
-		0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, 0xeaee6801, 0x8db2a283, 0xea8bf59e,
-	},
-}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
deleted file mode 100644
index e28f49d..0000000
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD as specified in RFC 7539.
-package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305"
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"errors"
-)
-
-const (
-	// KeySize is the size of the key used by this AEAD, in bytes.
-	KeySize = 32
-	// NonceSize is the size of the nonce used with this AEAD, in bytes.
-	NonceSize = 12
-)
-
-type chacha20poly1305 struct {
-	key [8]uint32
-}
-
-// New returns a ChaCha20-Poly1305 AEAD that uses the given, 256-bit key.
-func New(key []byte) (cipher.AEAD, error) {
-	if len(key) != KeySize {
-		return nil, errors.New("chacha20poly1305: bad key length")
-	}
-	ret := new(chacha20poly1305)
-	ret.key[0] = binary.LittleEndian.Uint32(key[0:4])
-	ret.key[1] = binary.LittleEndian.Uint32(key[4:8])
-	ret.key[2] = binary.LittleEndian.Uint32(key[8:12])
-	ret.key[3] = binary.LittleEndian.Uint32(key[12:16])
-	ret.key[4] = binary.LittleEndian.Uint32(key[16:20])
-	ret.key[5] = binary.LittleEndian.Uint32(key[20:24])
-	ret.key[6] = binary.LittleEndian.Uint32(key[24:28])
-	ret.key[7] = binary.LittleEndian.Uint32(key[28:32])
-	return ret, nil
-}
-
-func (c *chacha20poly1305) NonceSize() int {
-	return NonceSize
-}
-
-func (c *chacha20poly1305) Overhead() int {
-	return 16
-}
-
-func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
-	if len(nonce) != NonceSize {
-		panic("chacha20poly1305: bad nonce length passed to Seal")
-	}
-
-	if uint64(len(plaintext)) > (1<<38)-64 {
-		panic("chacha20poly1305: plaintext too large")
-	}
-
-	return c.seal(dst, nonce, plaintext, additionalData)
-}
-
-var errOpen = errors.New("chacha20poly1305: message authentication failed")
-
-func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
-	if len(nonce) != NonceSize {
-		panic("chacha20poly1305: bad nonce length passed to Open")
-	}
-	if len(ciphertext) < 16 {
-		return nil, errOpen
-	}
-	if uint64(len(ciphertext)) > (1<<38)-48 {
-		panic("chacha20poly1305: ciphertext too large")
-	}
-
-	return c.open(dst, nonce, ciphertext, additionalData)
-}
-
-// sliceForAppend takes a slice and a requested number of bytes. It returns a
-// slice with the contents of the given slice followed by that many bytes and a
-// second slice that aliases into it and contains only the extra bytes. If the
-// original slice has sufficient capacity then no allocation is performed.
-func sliceForAppend(in []byte, n int) (head, tail []byte) {
-	if total := len(in) + n; cap(in) >= total {
-		head = in[:total]
-	} else {
-		head = make([]byte, total)
-		copy(head, in)
-	}
-	tail = head[len(in):]
-	return
-}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go
deleted file mode 100644
index 07d18a3..0000000
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.7,amd64,!gccgo,!appengine
-
-package chacha20poly1305
-
-import (
-	"encoding/binary"
-
-	"golang.org/x/sys/cpu"
-)
-
-//go:noescape
-func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool
-
-//go:noescape
-func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte)
-
-var (
-	useASM  = cpu.X86.HasSSSE3
-	useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
-)
-
-// setupState writes a ChaCha20 input matrix to state. See
-// https://tools.ietf.org/html/rfc7539#section-2.3.
-func setupState(state *[16]uint32, key *[8]uint32, nonce []byte) {
-	state[0] = 0x61707865
-	state[1] = 0x3320646e
-	state[2] = 0x79622d32
-	state[3] = 0x6b206574
-
-	state[4] = key[0]
-	state[5] = key[1]
-	state[6] = key[2]
-	state[7] = key[3]
-	state[8] = key[4]
-	state[9] = key[5]
-	state[10] = key[6]
-	state[11] = key[7]
-
-	state[12] = 0
-	state[13] = binary.LittleEndian.Uint32(nonce[:4])
-	state[14] = binary.LittleEndian.Uint32(nonce[4:8])
-	state[15] = binary.LittleEndian.Uint32(nonce[8:12])
-}
-
-func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
-	if !useASM {
-		return c.sealGeneric(dst, nonce, plaintext, additionalData)
-	}
-
-	var state [16]uint32
-	setupState(&state, &c.key, nonce)
-
-	ret, out := sliceForAppend(dst, len(plaintext)+16)
-	chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData)
-	return ret
-}
-
-func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
-	if !useASM {
-		return c.openGeneric(dst, nonce, ciphertext, additionalData)
-	}
-
-	var state [16]uint32
-	setupState(&state, &c.key, nonce)
-
-	ciphertext = ciphertext[:len(ciphertext)-16]
-	ret, out := sliceForAppend(dst, len(ciphertext))
-	if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) {
-		for i := range out {
-			out[i] = 0
-		}
-		return nil, errOpen
-	}
-
-	return ret, nil
-}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
deleted file mode 100644
index af76bbc..0000000
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ /dev/null
@@ -1,2695 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
-
-// +build go1.7,amd64,!gccgo,!appengine
-
-#include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-// Some macros
-#define chachaQR(A, B, C, D, T) \
-	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
-	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
-TEXT polyHashADInternal<>(SB), NOSPLIT, $0
-	// adp points to beginning of additional data
-	// itr2 holds ad length
-	XORQ acc0, acc0
-	XORQ acc1, acc1
-	XORQ acc2, acc2
-	CMPQ itr2, $13
-	JNE  hashADLoop
-
-openFastTLSAD:
-	// Special treatment for the TLS case of 13 bytes
-	MOVQ (adp), acc0
-	MOVQ 5(adp), acc1
-	SHRQ $24, acc1
-	MOVQ $1, acc2
-	polyMul
-	RET
-
-hashADLoop:
-	// Hash in 16 byte chunks
-	CMPQ itr2, $16
-	JB   hashADTail
-	polyAdd(0(adp))
-	LEAQ (1*16)(adp), adp
-	SUBQ $16, itr2
-	polyMul
-	JMP  hashADLoop
-
-hashADTail:
-	CMPQ itr2, $0
-	JE   hashADDone
-
-	// Hash last < 16 byte tail
-	XORQ t0, t0
-	XORQ t1, t1
-	XORQ t2, t2
-	ADDQ itr2, adp
-
-hashADTailLoop:
-	SHLQ $8, t1:t0
-	SHLQ $8, t0
-	MOVB -1(adp), t2
-	XORQ t2, t0
-	DECQ adp
-	DECQ itr2
-	JNE  hashADTailLoop
-
-hashADTailFinish:
-	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	// Finished AD
-hashADDone:
-	RET
-
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
-	// For aligned stack access
-	MOVQ SP, BP
-	ADDQ $32, BP
-	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
-
-	// Check for AVX2 support
-	CMPB ·useAVX2(SB), $1
-	JE   chacha20Poly1305Open_AVX2
-
-	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  openSSE128 // About 16% faster
-
-	// For long buffers, prepare the poly key first
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
-	MOVO  D0, T1
-
-	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
-	MOVO D0, ctr3Store
-	MOVQ $10, itr2
-
-openSSEPreparePolyKey:
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	DECQ          itr2
-	JNE           openSSEPreparePolyKey
-
-	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
-
-	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore; MOVO B0, sStore
-
-	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-openSSEMainLoop:
-	CMPQ inl, $256
-	JB   openSSEMainLoopDone
-
-	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
-
-	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-
-	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
-	MOVQ $4, itr1
-	MOVQ inp, itr2
-
-openSSEInternalLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(itr2))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(itr2), itr2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr1
-	JGE           openSSEInternalLoop
-
-	polyAdd(0(itr2))
-	polyMul
-	LEAQ (2*8)(itr2), itr2
-
-	CMPQ itr1, $-6
-	JG   openSSEInternalLoop
-
-	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-
-	// Load - xor - store
-	MOVO  D3, tmpStore
-	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
-	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
-	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
-	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
-	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
-	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
-	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
-	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
-	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
-	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
-	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
-	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
-	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
-	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
-	LEAQ  256(inp), inp
-	LEAQ  256(oup), oup
-	SUBQ  $256, inl
-	JMP   openSSEMainLoop
-
-openSSEMainLoopDone:
-	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
-	JE    openSSEFinalize
-	CMPQ  inl, $64
-	JBE   openSSETail64
-	CMPQ  inl, $128
-	JBE   openSSETail128
-	CMPQ  inl, $192
-	JBE   openSSETail192
-	JMP   openSSETail256
-
-openSSEFinalize:
-	// Hash in the PT, AAD lengths
-	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
-	polyMul
-
-	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
-
-	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
-
-	// Finally, constant time compare to the tag at the end of the message
-	XORQ    AX, AX
-	MOVQ    $1, DX
-	XORQ    (0*8)(inp), acc0
-	XORQ    (1*8)(inp), acc1
-	ORQ     acc1, acc0
-	CMOVQEQ DX, AX
-
-	// Return true iff tags are equal
-	MOVB AX, ret+96(FP)
-	RET
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
-openSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
-
-openSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           openSSE128InnerCipherLoop
-
-	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
-
-	// Clamp and store the key
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore; MOVOU B0, sStore
-
-	// Hash
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-openSSE128Open:
-	CMPQ inl, $16
-	JB   openSSETail16
-	SUBQ $16, inl
-
-	// Load for hashing
-	polyAdd(0(inp))
-
-	// Load for decryption
-	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
-	polyMul
-
-	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
-	JMP  openSSE128Open
-
-openSSETail16:
-	TESTQ inl, inl
-	JE    openSSEFinalize
-
-	// We can safely load the CT from the end, because it is padded with the MAC
-	MOVQ   inl, itr2
-	SHLQ   $4, itr2
-	LEAQ   ·andMask<>(SB), t0
-	MOVOU  (inp), T0
-	ADDQ   inl, inp
-	PAND   -16(t0)(itr2*1), T0
-	MOVO   T0, 0+tmpStore
-	MOVQ   T0, t0
-	MOVQ   8+tmpStore, t1
-	PXOR   A1, T0
-
-	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
-openSSETail16Store:
-	MOVQ T0, t3
-	MOVB t3, (oup)
-	PSRLDQ $1, T0
-	INCQ   oup
-	DECQ   inl
-	JNE    openSSETail16Store
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-	JMP    openSSEFinalize
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
-openSSETail64:
-	// Need to decrypt up to 64 bytes - prepare single block
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	CMPQ itr1, $16
-	JB   openSSETail64LoopB
-
-openSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	SUBQ $16, itr1
-
-openSSETail64LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-
-	CMPQ itr1, $16
-	JAE  openSSETail64LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail64LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
-
-openSSETail64DecLoop:
-	CMPQ  inl, $16
-	JB    openSSETail64DecLoopDone
-	SUBQ  $16, inl
-	MOVOU (inp), T0
-	PXOR  T0, A0
-	MOVOU A0, (oup)
-	LEAQ  16(inp), inp
-	LEAQ  16(oup), oup
-	MOVO  B0, A0
-	MOVO  C0, B0
-	MOVO  D0, C0
-	JMP   openSSETail64DecLoop
-
-openSSETail64DecLoopDone:
-	MOVO A0, A1
-	JMP  openSSETail16
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
-openSSETail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	ANDQ $-16, itr1
-
-openSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-
-openSSETail128LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-
-	CMPQ itr2, itr1
-	JB   openSSETail128LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-	LEAQ 64(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
-openSSETail192:
-	// Need to decrypt up to 192 bytes - prepare three blocks
-	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
-	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
-	MOVQ    inl, itr1
-	MOVQ    $160, itr2
-	CMPQ    itr1, $160
-	CMOVQGT itr2, itr1
-	ANDQ    $-16, itr1
-	XORQ    itr2, itr2
-
-openSSLTail192LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-
-openSSLTail192LoopB:
-	ADDQ         $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	CMPQ itr2, itr1
-	JB   openSSLTail192LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSLTail192LoopB
-
-	CMPQ inl, $176
-	JB   openSSLTail192Store
-
-	polyAdd(160(inp))
-	polyMul
-
-	CMPQ inl, $192
-	JB   openSSLTail192Store
-
-	polyAdd(176(inp))
-	polyMul
-
-openSSLTail192Store:
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
-	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-	LEAQ 128(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
-openSSETail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
-
-	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	XORQ itr2, itr2
-
-openSSETail256Loop:
-	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
-	polyAdd(0(inp)(itr2*1))
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulStage3
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	ADDQ          $2*8, itr2
-	CMPQ          itr2, $160
-	JB            openSSETail256Loop
-	MOVQ          inl, itr1
-	ANDQ          $-16, itr1
-
-openSSETail256HashLoop:
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	ADDQ $2*8, itr2
-	CMPQ itr2, itr1
-	JB   openSSETail256HashLoop
-
-	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
-
-	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	LEAQ  192(inp), inp
-	LEAQ  192(oup), oup
-	SUBQ  $192, inl
-	MOVO  A3, A0
-	MOVO  B3, B0
-	MOVO  C3, C0
-	MOVO  tmpStore, D0
-
-	JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
-chacha20Poly1305Open_AVX2:
-	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
-
-	// Special optimization, for very short buffers
-	CMPQ inl, $192
-	JBE  openAVX2192
-	CMPQ inl, $320
-	JBE  openAVX2320
-
-	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, state2StoreAVX2
-	VMOVDQA DD0, ctr3StoreAVX2
-	MOVQ    $10, itr2
-
-openAVX2PreparePolyKey:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr2
-	JNE      openAVX2PreparePolyKey
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0
-	VPADDD state1StoreAVX2, BB0, BB0
-	VPADDD state2StoreAVX2, CC0, CC0
-	VPADDD ctr3StoreAVX2, DD0, DD0
-
-	VPERM2I128 $0x02, AA0, BB0, TT0
-
-	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
-
-	// Stream for the first 64 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-
-	// Hash AD + first 64 bytes
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
-
-openAVX2InitialHash64:
-	polyAdd(0(inp)(itr1*1))
-	polyMulAVX2
-	ADDQ $16, itr1
-	CMPQ itr1, $64
-	JNE  openAVX2InitialHash64
-
-	// Decrypt the first 64 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), BB0, BB0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU BB0, (1*32)(oup)
-	LEAQ    (2*32)(inp), inp
-	LEAQ    (2*32)(oup), oup
-	SUBQ    $64, inl
-
-openAVX2MainLoop:
-	CMPQ inl, $512
-	JB   openAVX2MainLoopDone
-
-	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
-
-openAVX2InternalLoop:
-	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
-	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
-	polyAdd(0*8(inp)(itr1*1))
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(inp)(itr1*1))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(inp)(itr1*1))
-	LEAQ     (6*8)(itr1), itr1
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	CMPQ     itr1, $480
-	JNE      openAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
-
-	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(480(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
-	// and here
-	polyAdd(496(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	LEAQ       (32*16)(oup), oup
-	SUBQ       $(32*16), inl
-	JMP        openAVX2MainLoop
-
-openAVX2MainLoopDone:
-	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
-	JE    openSSEFinalize
-	CMPQ  inl, $128
-	JBE   openAVX2Tail128
-	CMPQ  inl, $256
-	JBE   openAVX2Tail256
-	CMPQ  inl, $384
-	JBE   openAVX2Tail384
-	JMP   openAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
-openAVX2192:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
-
-openAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
-	JNE        openAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
-
-	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
-
-	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-
-openAVX2ShortOpen:
-	// Hash
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-openAVX2ShortOpenLoop:
-	CMPQ inl, $32
-	JB   openAVX2ShortTail32
-	SUBQ $32, inl
-
-	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
-	polyAdd(2*8(inp))
-	polyMulAVX2
-
-	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
-
-	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
-	JMP     openAVX2ShortOpenLoop
-
-openAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
-	JB      openAVX2ShortDone
-
-	SUBQ $16, inl
-
-	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
-
-	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
-
-openAVX2ShortDone:
-	VZEROUPPER
-	JMP openSSETail16
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
-openAVX2320:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
-
-openAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
-	JNE      openAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
-
-	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
-
-	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
-	JMP        openAVX2ShortOpen
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
-openAVX2Tail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
-	VMOVDQA DD1, DD0
-
-	XORQ  itr2, itr2
-	MOVQ  inl, itr1
-	ANDQ  $-16, itr1
-	TESTQ itr1, itr1
-	JE    openAVX2Tail128LoopB
-
-openAVX2Tail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMulAVX2
-
-openAVX2Tail128LoopB:
-	ADDQ     $16, itr2
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
-	JB       openAVX2Tail128LoopA
-	CMPQ     itr2, $160
-	JNE      openAVX2Tail128LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC1, CC1
-	VPADDD     DD0, DD1, DD1
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
-openAVX2TailLoop:
-	CMPQ inl, $32
-	JB   openAVX2Tail
-	SUBQ $32, inl
-
-	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	JMP     openAVX2TailLoop
-
-openAVX2Tail:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
-	JB      openAVX2TailDone
-	SUBQ    $16, inl
-
-	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
-
-openAVX2TailDone:
-	VZEROUPPER
-	JMP openSSETail16
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
-openAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
-
-	// Compute the number of iterations that will hash data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $128, itr1
-	SHRQ    $4, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
-
-openAVX2Tail256LoopA:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
-
-	// Perform ChaCha rounds, while hashing the remaining input
-openAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
-	JB       openAVX2Tail256LoopA
-
-	CMPQ itr2, $10
-	JNE  openAVX2Tail256LoopB
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
-
-	// Hash the remainder of data (if any)
-openAVX2Tail256Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail256HashEnd
-	polyAdd (0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail256Hash
-
-// Store 128 bytes safely, then go to store loop
-openAVX2Tail256HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
-	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
-	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
-	LEAQ    (4*32)(inp), inp
-	LEAQ    (4*32)(oup), oup
-	SUBQ    $4*32, inl
-
-	JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
-openAVX2Tail384:
-	// Need to decrypt up to 384 bytes - prepare six blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, ctr0StoreAVX2
-	VMOVDQA DD1, ctr1StoreAVX2
-	VMOVDQA DD2, ctr2StoreAVX2
-
-	// Compute the number of iterations that will hash two blocks of data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $256, itr1
-	SHRQ    $4, itr1
-	ADDQ    $6, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
-
-	// Perform ChaCha rounds, while hashing the remaining input
-openAVX2Tail384LoopB:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
-
-openAVX2Tail384LoopA:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ     16(inl), inl
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
-	CMPQ itr2, itr1
-	JB   openAVX2Tail384LoopB
-
-	CMPQ itr2, $10
-	JNE  openAVX2Tail384LoopA
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
-
-openAVX2Tail384Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail384HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail384Hash
-
-// Store 256 bytes safely, then go to store loop
-openAVX2Tail384HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	LEAQ       (8*32)(inp), inp
-	LEAQ       (8*32)(oup), oup
-	SUBQ       $8*32, inl
-	JMP        openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
-openAVX2Tail512:
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
-	MOVQ    inp, itr2
-
-openAVX2Tail512LoopB:
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ (2*8)(itr2), itr2
-
-openAVX2Tail512LoopA:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(itr2))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(itr2))
-	polyMulAVX2
-	LEAQ     (4*8)(itr2), itr2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	INCQ     itr1
-	CMPQ     itr1, $4
-	JLT      openAVX2Tail512LoopB
-
-	CMPQ itr1, $10
-	JNE  openAVX2Tail512LoopA
-
-	MOVQ inl, itr1
-	SUBQ $384, itr1
-	ANDQ $-16, itr1
-
-openAVX2Tail512HashLoop:
-	TESTQ itr1, itr1
-	JE    openAVX2Tail512HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ  16(itr2), itr2
-	SUBQ  $16, itr1
-	JMP   openAVX2Tail512HashLoop
-
-openAVX2Tail512HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
-	LEAQ (12*32)(inp), inp
-	LEAQ (12*32)(oup), oup
-	SUBQ $12*32, inl
-
-	JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
-	// For aligned stack access
-	MOVQ SP, BP
-	ADDQ $32, BP
-	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
-
-	CMPB ·useAVX2(SB), $1
-	JE   chacha20Poly1305Seal_AVX2
-
-	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  sealSSE128 // About 15% faster
-
-	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
-
-	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
-
-	// Load state, increment counter blocks
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
-
-	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	MOVQ $10, itr2
-
-sealSSEIntroLoop:
-	MOVO         C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO         tmpStore, C3
-	MOVO         C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO         tmpStore, C1
-	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JNE           sealSSEIntroLoop
-
-	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-
-	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore
-	MOVO B0, sStore
-
-	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
-	MOVQ $128, itr1
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-
-	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
-	CMPQ inl, $64
-	JBE  sealSSE128SealHash
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
-	ADDQ $64, itr1
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-
-	MOVQ $2, itr1
-	MOVQ $8, itr2
-
-	CMPQ inl, $64
-	JBE  sealSSETail64
-	CMPQ inl, $128
-	JBE  sealSSETail128
-	CMPQ inl, $192
-	JBE  sealSSETail192
-
-sealSSEMainLoop:
-	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
-
-	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-
-sealSSEInnerLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(oup))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(oup), oup
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JGE           sealSSEInnerLoop
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          (2*8)(oup), oup
-	DECQ          itr1
-	JG            sealSSEInnerLoop
-
-	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
-
-	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVO  tmpStore, D3
-
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	ADDQ  $192, inp
-	MOVQ  $192, itr1
-	SUBQ  $192, inl
-	MOVO  A3, A1
-	MOVO  B3, B1
-	MOVO  C3, C1
-	MOVO  D3, D1
-	CMPQ  inl, $64
-	JBE   sealSSE128SealHash
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
-	LEAQ  64(inp), inp
-	SUBQ  $64, inl
-	MOVQ  $6, itr1
-	MOVQ  $4, itr2
-	CMPQ  inl, $192
-	JG    sealSSEMainLoop
-
-	MOVQ  inl, itr1
-	TESTQ inl, inl
-	JE    sealSSE128SealHash
-	MOVQ  $6, itr1
-	CMPQ  inl, $64
-	JBE   sealSSETail64
-	CMPQ  inl, $128
-	JBE   sealSSETail128
-	JMP   sealSSETail192
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
-sealSSETail64:
-	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
-	MOVO  ·chacha20Constants<>(SB), A1
-	MOVO  state1Store, B1
-	MOVO  state2Store, C1
-	MOVO  ctr3Store, D1
-	PADDL ·sseIncMask<>(SB), D1
-	MOVO  D1, ctr0Store
-
-sealSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealSSETail64LoopB:
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Right; shiftC1Right; shiftD1Right
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-
-	DECQ itr1
-	JG   sealSSETail64LoopA
-
-	DECQ  itr2
-	JGE   sealSSETail64LoopB
-	PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B1
-	PADDL state2Store, C1
-	PADDL ctr0Store, D1
-
-	JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
-sealSSETail128:
-	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-
-sealSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealSSETail128LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-
-	DECQ itr1
-	JG   sealSSETail128LoopA
-
-	DECQ itr2
-	JGE  sealSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
-	MOVQ $64, itr1
-	LEAQ 64(inp), inp
-	SUBQ $64, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
-sealSSETail192:
-	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
-
-sealSSETail192LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealSSETail192LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
-
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	DECQ itr1
-	JG   sealSSETail192LoopA
-
-	DECQ itr2
-	JGE  sealSSETail192LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	MOVO A2, A1
-	MOVO B2, B1
-	MOVO C2, C1
-	MOVO D2, D1
-	MOVQ $128, itr1
-	LEAQ 128(inp), inp
-	SUBQ $128, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
-sealSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
-
-sealSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           sealSSE128InnerCipherLoop
-
-	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore
-	MOVOU B0, sStore
-
-	// Hash
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
-
-sealSSE128SealHash:
-	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealSSE128Seal
-	polyAdd(0(oup))
-	polyMul
-
-	SUBQ $16, itr1
-	ADDQ $16, oup
-
-	JMP sealSSE128SealHash
-
-sealSSE128Seal:
-	CMPQ inl, $16
-	JB   sealSSETail
-	SUBQ $16, inl
-
-	// Load for decryption
-	MOVOU (inp), T0
-	PXOR  T0, A1
-	MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
-
-	// Extract for hashing
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
-	JMP  sealSSE128Seal
-
-sealSSETail:
-	TESTQ inl, inl
-	JE    sealSSEFinalize
-
-	// We can only load the PT one byte at a time to avoid read after end of buffer
-	MOVQ inl, itr2
-	SHLQ $4, itr2
-	LEAQ ·andMask<>(SB), t0
-	MOVQ inl, itr1
-	LEAQ -1(inp)(inl*1), inp
-	XORQ t2, t2
-	XORQ t3, t3
-	XORQ AX, AX
-
-sealSSETailLoadLoop:
-	SHLQ $8, t2, t3
-	SHLQ $8, t2
-	MOVB (inp), AX
-	XORQ AX, t2
-	LEAQ   -1(inp), inp
-	DECQ   itr1
-	JNE    sealSSETailLoadLoop
-	MOVQ t2, 0+tmpStore
-	MOVQ t3, 8+tmpStore
-	PXOR 0+tmpStore, A1
-	MOVOU  A1, (oup)
-	MOVOU  -16(t0)(itr2*1), T0
-	PAND   T0, A1
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ   A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	ADDQ inl, oup
-
-sealSSEFinalize:
-	// Hash in the buffer lengths
-	ADDQ ad_len+80(FP), acc0
-	ADCQ src_len+56(FP), acc1
-	ADCQ $1, acc2
-	polyMul
-
-	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
-
-	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
-
-	// Finally store the tag at the end of the message
-	MOVQ acc0, (0*8)(oup)
-	MOVQ acc1, (1*8)(oup)
-	RET
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
-chacha20Poly1305Seal_AVX2:
-	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
-
-	// Special optimizations, for very short buffers
-	CMPQ inl, $192
-	JBE  seal192AVX2 // 33% faster
-	CMPQ inl, $320
-	JBE  seal320AVX2 // 17% faster
-
-	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
-	VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr2
-
-sealAVX2IntroLoop:
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr2
-	JNE      sealAVX2IntroLoop
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
-	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
-	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
-	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
-
-	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), DD0, DD0
-	VMOVDQA DD0, rsStoreAVX2
-
-	// Hash AD
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-	// Can store at least 320 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), CC0, CC0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU CC0, (1*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
-	MOVQ $320, itr1
-	SUBQ $320, inl
-	LEAQ 320(inp), inp
-
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
-	CMPQ       inl, $128
-	JBE        sealAVX2SealHash
-
-	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
-	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
-	SUBQ    $128, inl
-	LEAQ    128(inp), inp
-
-	MOVQ $8, itr1
-	MOVQ $2, itr2
-
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	CMPQ inl, $512
-	JBE  sealAVX2Tail512
-
-	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-
-	SUBQ $16, oup                  // Adjust the pointer
-	MOVQ $9, itr1
-	JMP  sealAVX2InternalLoopStart
-
-sealAVX2MainLoop:
-	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr1
-
-sealAVX2InternalLoop:
-	polyAdd(0*8(oup))
-	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA CC3, tmpStoreAVX2
-	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA tmpStoreAVX2, CC3
-	polyMulReduceStage
-
-sealAVX2InternalLoopStart:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(oup))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(oup))
-	LEAQ     (6*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr1
-	JNE      sealAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
-
-	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (4*8)(oup), oup
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
-	// and here
-	polyAdd(-2*8(oup))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	SUBQ       $(32*16), inl
-	CMPQ       inl, $512
-	JG         sealAVX2MainLoop
-
-	// Tail can only hash 480 bytes
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ 32(oup), oup
-
-	MOVQ $10, itr1
-	MOVQ $0, itr2
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	JMP  sealAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
-seal192AVX2:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
-
-sealAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
-	JNE        sealAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
-
-	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
-
-	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-
-sealAVX2ShortSeal:
-	// Hash aad
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
-
-sealAVX2SealHash:
-	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealAVX2ShortSealLoop
-	polyAdd(0(oup))
-	polyMul
-	SUBQ $16, itr1
-	ADDQ $16, oup
-	JMP  sealAVX2SealHash
-
-sealAVX2ShortSealLoop:
-	CMPQ inl, $32
-	JB   sealAVX2ShortTail32
-	SUBQ $32, inl
-
-	// Load for encryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-
-	// Now can hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ (1*32)(oup), oup
-
-	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
-	JMP     sealAVX2ShortSealLoop
-
-sealAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
-	JB      sealAVX2ShortDone
-
-	SUBQ $16, inl
-
-	// Load for encryption
-	VPXOR   (inp), A0, T0
-	VMOVDQU T0, (oup)
-	LEAQ    (1*16)(inp), inp
-
-	// Hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
-
-sealAVX2ShortDone:
-	VZEROUPPER
-	JMP sealSSETail
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
-seal320AVX2:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
-
-sealAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
-	JNE      sealAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
-
-	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
-
-	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
-	JMP        sealAVX2ShortSeal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
-sealAVX2Tail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0
-	VMOVDQA state1StoreAVX2, BB0
-	VMOVDQA state2StoreAVX2, CC0
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VMOVDQA DD0, DD1
-
-sealAVX2Tail128LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealAVX2Tail128LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr1
-	JG       sealAVX2Tail128LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail128LoopB
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA1
-	VPADDD state1StoreAVX2, BB0, BB1
-	VPADDD state2StoreAVX2, CC0, CC1
-	VPADDD DD1, DD0, DD1
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-	JMP        sealAVX2ShortSealLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
-sealAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
-
-sealAVX2Tail256LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ     itr1
-	JG       sealAVX2Tail256LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail256LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	MOVQ       $128, itr1
-	LEAQ       128(inp), inp
-	SUBQ       $128, inl
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
-sealAVX2Tail384:
-	// Need to decrypt up to 384 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
-
-sealAVX2Tail384LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealAVX2Tail384LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr1
-	JG       sealAVX2Tail384LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail384LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0
-	VPERM2I128 $0x02, CC1, DD1, TT1
-	VPERM2I128 $0x13, AA1, BB1, TT2
-	VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	MOVQ       $256, itr1
-	LEAQ       256(inp), inp
-	SUBQ       $256, inl
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
-sealAVX2Tail512:
-	// Need to decrypt up to 512 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
-sealAVX2Tail512LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-sealAVX2Tail512LoopB:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ     (4*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
-	DECQ itr1
-	JG   sealAVX2Tail512LoopA
-	DECQ itr2
-	JGE  sealAVX2Tail512LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3
-	VPXOR      (0*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (0*32)(oup)
-	VPERM2I128 $0x02, CC0, DD0, CC3
-	VPXOR      (1*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (1*32)(oup)
-	VPERM2I128 $0x13, AA0, BB0, CC3
-	VPXOR      (2*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (2*32)(oup)
-	VPERM2I128 $0x13, CC0, DD0, CC3
-	VPXOR      (3*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (3*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
-	MOVQ       $384, itr1
-	LEAQ       384(inp), inp
-	SUBQ       $384, inl
-	VPERM2I128 $0x02, AA3, BB3, AA0
-	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
-	VPERM2I128 $0x13, AA3, BB3, CC0
-	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
-	JMP sealAVX2SealHash
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
deleted file mode 100644
index 8d28ce2..0000000
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package chacha20poly1305
-
-import (
-	"encoding/binary"
-
-	"golang.org/x/crypto/internal/chacha20"
-	"golang.org/x/crypto/poly1305"
-)
-
-func roundTo16(n int) int {
-	return 16 * ((n + 15) / 16)
-}
-
-func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
-	ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
-
-	var polyKey [32]byte
-	s := chacha20.New(c.key, [3]uint32{
-		binary.LittleEndian.Uint32(nonce[0:4]),
-		binary.LittleEndian.Uint32(nonce[4:8]),
-		binary.LittleEndian.Uint32(nonce[8:12]),
-	})
-	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.Advance() // skip the next 32 bytes
-	s.XORKeyStream(out, plaintext)
-
-	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8)
-	copy(polyInput, additionalData)
-	copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)])
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext)))
-
-	var tag [poly1305.TagSize]byte
-	poly1305.Sum(&tag, polyInput, &polyKey)
-	copy(out[len(plaintext):], tag[:])
-
-	return ret
-}
-
-func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
-	var tag [poly1305.TagSize]byte
-	copy(tag[:], ciphertext[len(ciphertext)-16:])
-	ciphertext = ciphertext[:len(ciphertext)-16]
-
-	var polyKey [32]byte
-	s := chacha20.New(c.key, [3]uint32{
-		binary.LittleEndian.Uint32(nonce[0:4]),
-		binary.LittleEndian.Uint32(nonce[4:8]),
-		binary.LittleEndian.Uint32(nonce[8:12]),
-	})
-	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.Advance() // skip the next 32 bytes
-
-	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8)
-	copy(polyInput, additionalData)
-	copy(polyInput[roundTo16(len(additionalData)):], ciphertext)
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext)))
-
-	ret, out := sliceForAppend(dst, len(ciphertext))
-	if !poly1305.Verify(&tag, polyInput, &polyKey) {
-		for i := range out {
-			out[i] = 0
-		}
-		return nil, errOpen
-	}
-
-	s.XORKeyStream(out, ciphertext)
-	return ret, nil
-}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go
deleted file mode 100644
index 4c2eb70..0000000
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64 !go1.7 gccgo appengine
-
-package chacha20poly1305
-
-func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
-	return c.sealGeneric(dst, nonce, plaintext, additionalData)
-}
-
-func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
-	return c.openGeneric(dst, nonce, ciphertext, additionalData)
-}
diff --git a/vendor/golang.org/x/crypto/hkdf/hkdf.go b/vendor/golang.org/x/crypto/hkdf/hkdf.go
deleted file mode 100644
index 5bc2463..0000000
--- a/vendor/golang.org/x/crypto/hkdf/hkdf.go
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package hkdf implements the HMAC-based Extract-and-Expand Key Derivation
-// Function (HKDF) as defined in RFC 5869.
-//
-// HKDF is a cryptographic key derivation function (KDF) with the goal of
-// expanding limited input keying material into one or more cryptographically
-// strong secret keys.
-//
-// RFC 5869: https://tools.ietf.org/html/rfc5869
-package hkdf // import "golang.org/x/crypto/hkdf"
-
-import (
-	"crypto/hmac"
-	"errors"
-	"hash"
-	"io"
-)
-
-type hkdf struct {
-	expander hash.Hash
-	size     int
-
-	info    []byte
-	counter byte
-
-	prev  []byte
-	cache []byte
-}
-
-func (f *hkdf) Read(p []byte) (int, error) {
-	// Check whether enough data can be generated
-	need := len(p)
-	remains := len(f.cache) + int(255-f.counter+1)*f.size
-	if remains < need {
-		return 0, errors.New("hkdf: entropy limit reached")
-	}
-	// Read from the cache, if enough data is present
-	n := copy(p, f.cache)
-	p = p[n:]
-
-	// Fill the buffer
-	for len(p) > 0 {
-		f.expander.Reset()
-		f.expander.Write(f.prev)
-		f.expander.Write(f.info)
-		f.expander.Write([]byte{f.counter})
-		f.prev = f.expander.Sum(f.prev[:0])
-		f.counter++
-
-		// Copy the new batch into p
-		f.cache = f.prev
-		n = copy(p, f.cache)
-		p = p[n:]
-	}
-	// Save leftovers for next run
-	f.cache = f.cache[n:]
-
-	return need, nil
-}
-
-// New returns a new HKDF using the given hash, the secret keying material to expand
-// and optional salt and info fields.
-func New(hash func() hash.Hash, secret, salt, info []byte) io.Reader {
-	if salt == nil {
-		salt = make([]byte, hash().Size())
-	}
-	extractor := hmac.New(hash, salt)
-	extractor.Write(secret)
-	prk := extractor.Sum(nil)
-
-	return &hkdf{hmac.New(hash, prk), extractor.Size(), info, 1, nil, nil}
-}
diff --git a/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s b/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s
deleted file mode 100644
index 98427c5..0000000
--- a/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s
+++ /dev/null
@@ -1,283 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!gccgo,!appengine
-
-#include "go_asm.h"
-#include "textflag.h"
-
-// This is an implementation of the ChaCha20 encryption algorithm as
-// specified in RFC 7539. It uses vector instructions to compute
-// 4 keystream blocks in parallel (256 bytes) which are then XORed
-// with the bytes in the input slice.
-
-GLOBL ·constants<>(SB), RODATA|NOPTR, $32
-// BSWAP: swap bytes in each 4-byte element
-DATA ·constants<>+0x00(SB)/4, $0x03020100
-DATA ·constants<>+0x04(SB)/4, $0x07060504
-DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
-DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
-// J0: [j0, j1, j2, j3]
-DATA ·constants<>+0x10(SB)/4, $0x61707865
-DATA ·constants<>+0x14(SB)/4, $0x3320646e
-DATA ·constants<>+0x18(SB)/4, $0x79622d32
-DATA ·constants<>+0x1c(SB)/4, $0x6b206574
-
-// EXRL targets:
-TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
-	MVC $1, (R1), (R8)
-	RET
-
-TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
-	MVC $1, (R8), (R9)
-	RET
-
-#define BSWAP V5
-#define J0    V6
-#define KEY0  V7
-#define KEY1  V8
-#define NONCE V9
-#define CTR   V10
-#define M0    V11
-#define M1    V12
-#define M2    V13
-#define M3    V14
-#define INC   V15
-#define X0    V16
-#define X1    V17
-#define X2    V18
-#define X3    V19
-#define X4    V20
-#define X5    V21
-#define X6    V22
-#define X7    V23
-#define X8    V24
-#define X9    V25
-#define X10   V26
-#define X11   V27
-#define X12   V28
-#define X13   V29
-#define X14   V30
-#define X15   V31
-
-#define NUM_ROUNDS 20
-
-#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
-	VAF    a1, a0, a0  \
-	VAF    b1, b0, b0  \
-	VAF    c1, c0, c0  \
-	VAF    d1, d0, d0  \
-	VX     a0, a2, a2  \
-	VX     b0, b2, b2  \
-	VX     c0, c2, c2  \
-	VX     d0, d2, d2  \
-	VERLLF $16, a2, a2 \
-	VERLLF $16, b2, b2 \
-	VERLLF $16, c2, c2 \
-	VERLLF $16, d2, d2 \
-	VAF    a2, a3, a3  \
-	VAF    b2, b3, b3  \
-	VAF    c2, c3, c3  \
-	VAF    d2, d3, d3  \
-	VX     a3, a1, a1  \
-	VX     b3, b1, b1  \
-	VX     c3, c1, c1  \
-	VX     d3, d1, d1  \
-	VERLLF $12, a1, a1 \
-	VERLLF $12, b1, b1 \
-	VERLLF $12, c1, c1 \
-	VERLLF $12, d1, d1 \
-	VAF    a1, a0, a0  \
-	VAF    b1, b0, b0  \
-	VAF    c1, c0, c0  \
-	VAF    d1, d0, d0  \
-	VX     a0, a2, a2  \
-	VX     b0, b2, b2  \
-	VX     c0, c2, c2  \
-	VX     d0, d2, d2  \
-	VERLLF $8, a2, a2  \
-	VERLLF $8, b2, b2  \
-	VERLLF $8, c2, c2  \
-	VERLLF $8, d2, d2  \
-	VAF    a2, a3, a3  \
-	VAF    b2, b3, b3  \
-	VAF    c2, c3, c3  \
-	VAF    d2, d3, d3  \
-	VX     a3, a1, a1  \
-	VX     b3, b1, b1  \
-	VX     c3, c1, c1  \
-	VX     d3, d1, d1  \
-	VERLLF $7, a1, a1  \
-	VERLLF $7, b1, b1  \
-	VERLLF $7, c1, c1  \
-	VERLLF $7, d1, d1
-
-#define PERMUTE(mask, v0, v1, v2, v3) \
-	VPERM v0, v0, mask, v0 \
-	VPERM v1, v1, mask, v1 \
-	VPERM v2, v2, mask, v2 \
-	VPERM v3, v3, mask, v3
-
-#define ADDV(x, v0, v1, v2, v3) \
-	VAF x, v0, v0 \
-	VAF x, v1, v1 \
-	VAF x, v2, v2 \
-	VAF x, v3, v3
-
-#define XORV(off, dst, src, v0, v1, v2, v3) \
-	VLM  off(src), M0, M3          \
-	PERMUTE(BSWAP, v0, v1, v2, v3) \
-	VX   v0, M0, M0                \
-	VX   v1, M1, M1                \
-	VX   v2, M2, M2                \
-	VX   v3, M3, M3                \
-	VSTM M0, M3, off(dst)
-
-#define SHUFFLE(a, b, c, d, t, u, v, w) \
-	VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
-	VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
-	VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
-	VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
-	VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
-	VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
-	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
-	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
-
-// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
-TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
-	MOVD $·constants<>(SB), R1
-	MOVD dst+0(FP), R2         // R2=&dst[0]
-	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
-	MOVD key+48(FP), R5        // R5=key
-	MOVD nonce+56(FP), R6      // R6=nonce
-	MOVD counter+64(FP), R7    // R7=counter
-	MOVD buf+72(FP), R8        // R8=buf
-	MOVD len+80(FP), R9        // R9=len
-
-	// load BSWAP and J0
-	VLM (R1), BSWAP, J0
-
-	// set up tail buffer
-	ADD     $-1, R4, R12
-	MOVBZ   R12, R12
-	CMPUBEQ R12, $255, aligned
-	MOVD    R4, R1
-	AND     $~255, R1
-	MOVD    $(R3)(R1*1), R1
-	EXRL    $·mvcSrcToBuf(SB), R12
-	MOVD    $255, R0
-	SUB     R12, R0
-	MOVD    R0, (R9)               // update len
-
-aligned:
-	// setup
-	MOVD  $95, R0
-	VLM   (R5), KEY0, KEY1
-	VLL   R0, (R6), NONCE
-	VZERO M0
-	VLEIB $7, $32, M0
-	VSRLB M0, NONCE, NONCE
-
-	// initialize counter values
-	VLREPF (R7), CTR
-	VZERO  INC
-	VLEIF  $1, $1, INC
-	VLEIF  $2, $2, INC
-	VLEIF  $3, $3, INC
-	VAF    INC, CTR, CTR
-	VREPIF $4, INC
-
-chacha:
-	VREPF $0, J0, X0
-	VREPF $1, J0, X1
-	VREPF $2, J0, X2
-	VREPF $3, J0, X3
-	VREPF $0, KEY0, X4
-	VREPF $1, KEY0, X5
-	VREPF $2, KEY0, X6
-	VREPF $3, KEY0, X7
-	VREPF $0, KEY1, X8
-	VREPF $1, KEY1, X9
-	VREPF $2, KEY1, X10
-	VREPF $3, KEY1, X11
-	VLR   CTR, X12
-	VREPF $1, NONCE, X13
-	VREPF $2, NONCE, X14
-	VREPF $3, NONCE, X15
-
-	MOVD $(NUM_ROUNDS/2), R1
-
-loop:
-	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
-	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)
-
-	ADD $-1, R1
-	BNE loop
-
-	// decrement length
-	ADD $-256, R4
-	BLT tail
-
-continue:
-	// rearrange vectors
-	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
-	ADDV(J0, X0, X1, X2, X3)
-	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
-	ADDV(KEY0, X4, X5, X6, X7)
-	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
-	ADDV(KEY1, X8, X9, X10, X11)
-	VAF CTR, X12, X12
-	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
-	ADDV(NONCE, X12, X13, X14, X15)
-
-	// increment counters
-	VAF INC, CTR, CTR
-
-	// xor keystream with plaintext
-	XORV(0*64, R2, R3, X0, X4,  X8, X12)
-	XORV(1*64, R2, R3, X1, X5,  X9, X13)
-	XORV(2*64, R2, R3, X2, X6, X10, X14)
-	XORV(3*64, R2, R3, X3, X7, X11, X15)
-
-	// increment pointers
-	MOVD $256(R2), R2
-	MOVD $256(R3), R3
-
-	CMPBNE  R4, $0, chacha
-	CMPUBEQ R12, $255, return
-	EXRL    $·mvcBufToDst(SB), R12 // len was updated during setup
-
-return:
-	VSTEF $0, CTR, (R7)
-	RET
-
-tail:
-	MOVD R2, R9
-	MOVD R8, R2
-	MOVD R8, R3
-	MOVD $0, R4
-	JMP  continue
-
-// func hasVectorFacility() bool
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
-	MOVD  $x-24(SP), R1
-	XC    $24, 0(R1), 0(R1) // clear the storage
-	MOVD  $2, R0            // R0 is the number of double words stored -1
-	WORD  $0xB2B01000       // STFLE 0(R1)
-	XOR   R0, R0            // reset the value of R0
-	MOVBZ z-8(SP), R1
-	AND   $0x40, R1
-	BEQ   novector
-
-vectorinstalled:
-	// check if the vector instruction has been enabled
-	VLEIB  $0, $0xF, V16
-	VLGVB  $0, V16, R1
-	CMPBNE R1, $0xF, novector
-	MOVB   $1, ret+0(FP)      // have vx
-	RET
-
-novector:
-	MOVB $0, ret+0(FP) // no vx
-	RET
diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go
deleted file mode 100644
index 7ed1cd9..0000000
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go
+++ /dev/null
@@ -1,227 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package ChaCha20 implements the core ChaCha20 function as specified
-// in https://tools.ietf.org/html/rfc7539#section-2.3.
-package chacha20
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-)
-
-// assert that *Cipher implements cipher.Stream
-var _ cipher.Stream = (*Cipher)(nil)
-
-// Cipher is a stateful instance of ChaCha20 using a particular key
-// and nonce. A *Cipher implements the cipher.Stream interface.
-type Cipher struct {
-	key     [8]uint32
-	counter uint32 // incremented after each block
-	nonce   [3]uint32
-	buf     [bufSize]byte // buffer for unused keystream bytes
-	len     int           // number of unused keystream bytes at end of buf
-}
-
-// New creates a new ChaCha20 stream cipher with the given key and nonce.
-// The initial counter value is set to 0.
-func New(key [8]uint32, nonce [3]uint32) *Cipher {
-	return &Cipher{key: key, nonce: nonce}
-}
-
-// XORKeyStream XORs each byte in the given slice with a byte from the
-// cipher's key stream. Dst and src must overlap entirely or not at all.
-//
-// If len(dst) < len(src), XORKeyStream will panic. It is acceptable
-// to pass a dst bigger than src, and in that case, XORKeyStream will
-// only update dst[:len(src)] and will not touch the rest of dst.
-//
-// Multiple calls to XORKeyStream behave as if the concatenation of
-// the src buffers was passed in a single run. That is, Cipher
-// maintains state and does not reset at each XORKeyStream call.
-func (s *Cipher) XORKeyStream(dst, src []byte) {
-	// xor src with buffered keystream first
-	if s.len != 0 {
-		buf := s.buf[len(s.buf)-s.len:]
-		if len(src) < len(buf) {
-			buf = buf[:len(src)]
-		}
-		td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint
-		for i, b := range buf {
-			td[i] = ts[i] ^ b
-		}
-		s.len -= len(buf)
-		if s.len != 0 {
-			return
-		}
-		s.buf = [len(s.buf)]byte{} // zero the empty buffer
-		src = src[len(buf):]
-		dst = dst[len(buf):]
-	}
-
-	if len(src) == 0 {
-		return
-	}
-	if haveAsm {
-		s.xorKeyStreamAsm(dst, src)
-		return
-	}
-
-	// set up a 64-byte buffer to pad out the final block if needed
-	// (hoisted out of the main loop to avoid spills)
-	rem := len(src) % 64  // length of final block
-	fin := len(src) - rem // index of final block
-	if rem > 0 {
-		copy(s.buf[len(s.buf)-64:], src[fin:])
-	}
-
-	// qr calculates a quarter round
-	qr := func(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
-		a += b
-		d ^= a
-		d = (d << 16) | (d >> 16)
-		c += d
-		b ^= c
-		b = (b << 12) | (b >> 20)
-		a += b
-		d ^= a
-		d = (d << 8) | (d >> 24)
-		c += d
-		b ^= c
-		b = (b << 7) | (b >> 25)
-		return a, b, c, d
-	}
-
-	// ChaCha20 constants
-	const (
-		j0 = 0x61707865
-		j1 = 0x3320646e
-		j2 = 0x79622d32
-		j3 = 0x6b206574
-	)
-
-	// pre-calculate most of the first round
-	s1, s5, s9, s13 := qr(j1, s.key[1], s.key[5], s.nonce[0])
-	s2, s6, s10, s14 := qr(j2, s.key[2], s.key[6], s.nonce[1])
-	s3, s7, s11, s15 := qr(j3, s.key[3], s.key[7], s.nonce[2])
-
-	n := len(src)
-	src, dst = src[:n:n], dst[:n:n] // BCE hint
-	for i := 0; i < n; i += 64 {
-		// calculate the remainder of the first round
-		s0, s4, s8, s12 := qr(j0, s.key[0], s.key[4], s.counter)
-
-		// execute the second round
-		x0, x5, x10, x15 := qr(s0, s5, s10, s15)
-		x1, x6, x11, x12 := qr(s1, s6, s11, s12)
-		x2, x7, x8, x13 := qr(s2, s7, s8, s13)
-		x3, x4, x9, x14 := qr(s3, s4, s9, s14)
-
-		// execute the remaining 18 rounds
-		for i := 0; i < 9; i++ {
-			x0, x4, x8, x12 = qr(x0, x4, x8, x12)
-			x1, x5, x9, x13 = qr(x1, x5, x9, x13)
-			x2, x6, x10, x14 = qr(x2, x6, x10, x14)
-			x3, x7, x11, x15 = qr(x3, x7, x11, x15)
-
-			x0, x5, x10, x15 = qr(x0, x5, x10, x15)
-			x1, x6, x11, x12 = qr(x1, x6, x11, x12)
-			x2, x7, x8, x13 = qr(x2, x7, x8, x13)
-			x3, x4, x9, x14 = qr(x3, x4, x9, x14)
-		}
-
-		x0 += j0
-		x1 += j1
-		x2 += j2
-		x3 += j3
-
-		x4 += s.key[0]
-		x5 += s.key[1]
-		x6 += s.key[2]
-		x7 += s.key[3]
-		x8 += s.key[4]
-		x9 += s.key[5]
-		x10 += s.key[6]
-		x11 += s.key[7]
-
-		x12 += s.counter
-		x13 += s.nonce[0]
-		x14 += s.nonce[1]
-		x15 += s.nonce[2]
-
-		// increment the counter
-		s.counter += 1
-		if s.counter == 0 {
-			panic("chacha20: counter overflow")
-		}
-
-		// pad to 64 bytes if needed
-		in, out := src[i:], dst[i:]
-		if i == fin {
-			// src[fin:] has already been copied into s.buf before
-			// the main loop
-			in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:]
-		}
-		in, out = in[:64], out[:64] // BCE hint
-
-		// XOR the key stream with the source and write out the result
-		xor(out[0:], in[0:], x0)
-		xor(out[4:], in[4:], x1)
-		xor(out[8:], in[8:], x2)
-		xor(out[12:], in[12:], x3)
-		xor(out[16:], in[16:], x4)
-		xor(out[20:], in[20:], x5)
-		xor(out[24:], in[24:], x6)
-		xor(out[28:], in[28:], x7)
-		xor(out[32:], in[32:], x8)
-		xor(out[36:], in[36:], x9)
-		xor(out[40:], in[40:], x10)
-		xor(out[44:], in[44:], x11)
-		xor(out[48:], in[48:], x12)
-		xor(out[52:], in[52:], x13)
-		xor(out[56:], in[56:], x14)
-		xor(out[60:], in[60:], x15)
-	}
-	// copy any trailing bytes out of the buffer and into dst
-	if rem != 0 {
-		s.len = 64 - rem
-		copy(dst[fin:], s.buf[len(s.buf)-64:])
-	}
-}
-
-// Advance discards bytes in the key stream until the next 64 byte block
-// boundary is reached and updates the counter accordingly. If the key
-// stream is already at a block boundary no bytes will be discarded and
-// the counter will be unchanged.
-func (s *Cipher) Advance() {
-	s.len -= s.len % 64
-	if s.len == 0 {
-		s.buf = [len(s.buf)]byte{}
-	}
-}
-
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter contains the raw
-// ChaCha20 counter bytes (i.e. block counter followed by nonce).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
-	s := Cipher{
-		key: [8]uint32{
-			binary.LittleEndian.Uint32(key[0:4]),
-			binary.LittleEndian.Uint32(key[4:8]),
-			binary.LittleEndian.Uint32(key[8:12]),
-			binary.LittleEndian.Uint32(key[12:16]),
-			binary.LittleEndian.Uint32(key[16:20]),
-			binary.LittleEndian.Uint32(key[20:24]),
-			binary.LittleEndian.Uint32(key[24:28]),
-			binary.LittleEndian.Uint32(key[28:32]),
-		},
-		nonce: [3]uint32{
-			binary.LittleEndian.Uint32(counter[4:8]),
-			binary.LittleEndian.Uint32(counter[8:12]),
-			binary.LittleEndian.Uint32(counter[12:16]),
-		},
-		counter: binary.LittleEndian.Uint32(counter[0:4]),
-	}
-	s.XORKeyStream(out, in)
-}
diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go
deleted file mode 100644
index 91520d1..0000000
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !s390x gccgo appengine
-
-package chacha20
-
-const (
-	bufSize = 64
-	haveAsm = false
-)
-
-func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
-	panic("not implemented")
-}
diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go
deleted file mode 100644
index 0c1c671..0000000
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!gccgo,!appengine
-
-package chacha20
-
-var haveAsm = hasVectorFacility()
-
-const bufSize = 256
-
-// hasVectorFacility reports whether the machine supports the vector
-// facility (vx).
-// Implementation in asm_s390x.s.
-func hasVectorFacility() bool
-
-// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
-// be called when the vector facility is available.
-// Implementation in asm_s390x.s.
-//go:noescape
-func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
-
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
-	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len)
-}
-
-// EXRL targets, DO NOT CALL!
-func mvcSrcToBuf()
-func mvcBufToDst()
diff --git a/vendor/golang.org/x/crypto/internal/chacha20/xor.go b/vendor/golang.org/x/crypto/internal/chacha20/xor.go
deleted file mode 100644
index 9c5ba0b..0000000
--- a/vendor/golang.org/x/crypto/internal/chacha20/xor.go
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found src the LICENSE file.
-
-package chacha20
-
-import (
-	"runtime"
-)
-
-// Platforms that have fast unaligned 32-bit little endian accesses.
-const unaligned = runtime.GOARCH == "386" ||
-	runtime.GOARCH == "amd64" ||
-	runtime.GOARCH == "arm64" ||
-	runtime.GOARCH == "ppc64le" ||
-	runtime.GOARCH == "s390x"
-
-// xor reads a little endian uint32 from src, XORs it with u and
-// places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
-	_, _ = src[3], dst[3] // eliminate bounds checks
-	if unaligned {
-		// The compiler should optimize this code into
-		// 32-bit unaligned little endian loads and stores.
-		// TODO: delete once the compiler does a reliably
-		// good job with the generic code below.
-		// See issue #25111 for more details.
-		v := uint32(src[0])
-		v |= uint32(src[1]) << 8
-		v |= uint32(src[2]) << 16
-		v |= uint32(src[3]) << 24
-		v ^= u
-		dst[0] = byte(v)
-		dst[1] = byte(v >> 8)
-		dst[2] = byte(v >> 16)
-		dst[3] = byte(v >> 24)
-	} else {
-		dst[0] = src[0] ^ byte(u)
-		dst[1] = src[1] ^ byte(u>>8)
-		dst[2] = src[2] ^ byte(u>>16)
-		dst[3] = src[3] ^ byte(u>>24)
-	}
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go
deleted file mode 100644
index f562fa5..0000000
--- a/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-/*
-Package poly1305 implements Poly1305 one-time message authentication code as
-specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
-
-Poly1305 is a fast, one-time authentication function. It is infeasible for an
-attacker to generate an authenticator for a message without the key. However, a
-key must only be used for a single message. Authenticating two different
-messages with the same key allows an attacker to forge authenticators for other
-messages with the same key.
-
-Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
-used with a fixed key in order to generate one-time keys from an nonce.
-However, in this package AES isn't used and the one-time key is specified
-directly.
-*/
-package poly1305 // import "golang.org/x/crypto/poly1305"
-
-import "crypto/subtle"
-
-// TagSize is the size, in bytes, of a poly1305 authenticator.
-const TagSize = 16
-
-// Verify returns true if mac is a valid authenticator for m with the given
-// key.
-func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
-	var tmp [16]byte
-	Sum(&tmp, m, key)
-	return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
deleted file mode 100644
index 4dd72fe..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64,!gccgo,!appengine
-
-package poly1305
-
-// This function is implemented in sum_amd64.s
-//go:noescape
-func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
-	var mPtr *byte
-	if len(m) > 0 {
-		mPtr = &m[0]
-	}
-	poly1305(out, mPtr, uint64(len(m)), key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
deleted file mode 100644
index 2edae63..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64,!gccgo,!appengine
-
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
-	ADDQ 0(msg), h0;  \
-	ADCQ 8(msg), h1;  \
-	ADCQ $1, h2;      \
-	LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
-	MOVQ  r0, AX;                  \
-	MULQ  h0;                      \
-	MOVQ  AX, t0;                  \
-	MOVQ  DX, t1;                  \
-	MOVQ  r0, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  r0, t2;                  \
-	IMULQ h2, t2;                  \
-	ADDQ  DX, t2;                  \
-	                               \
-	MOVQ  r1, AX;                  \
-	MULQ  h0;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  DX, h0;                  \
-	MOVQ  r1, t3;                  \
-	IMULQ h2, t3;                  \
-	MOVQ  r1, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t2;                  \
-	ADCQ  DX, t3;                  \
-	ADDQ  h0, t2;                  \
-	ADCQ  $0, t3;                  \
-	                               \
-	MOVQ  t0, h0;                  \
-	MOVQ  t1, h1;                  \
-	MOVQ  t2, h2;                  \
-	ANDQ  $3, h2;                  \
-	MOVQ  t2, t0;                  \
-	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
-	ADDQ  t0, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2;                  \
-	SHRQ  $2, t3, t2;              \
-	SHRQ  $2, t3;                  \
-	ADDQ  t2, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2
-
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
-// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305(SB), $0-32
-	MOVQ out+0(FP), DI
-	MOVQ m+8(FP), SI
-	MOVQ mlen+16(FP), R15
-	MOVQ key+24(FP), AX
-
-	MOVQ 0(AX), R11
-	MOVQ 8(AX), R12
-	ANDQ ·poly1305Mask<>(SB), R11   // r0
-	ANDQ ·poly1305Mask<>+8(SB), R12 // r1
-	XORQ R8, R8                    // h0
-	XORQ R9, R9                    // h1
-	XORQ R10, R10                  // h2
-
-	CMPQ R15, $16
-	JB   bytes_between_0_and_15
-
-loop:
-	POLY1305_ADD(SI, R8, R9, R10)
-
-multiply:
-	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
-	SUBQ $16, R15
-	CMPQ R15, $16
-	JAE  loop
-
-bytes_between_0_and_15:
-	TESTQ R15, R15
-	JZ    done
-	MOVQ  $1, BX
-	XORQ  CX, CX
-	XORQ  R13, R13
-	ADDQ  R15, SI
-
-flush_buffer:
-	SHLQ $8, BX, CX
-	SHLQ $8, BX
-	MOVB -1(SI), R13
-	XORQ R13, BX
-	DECQ SI
-	DECQ R15
-	JNZ  flush_buffer
-
-	ADDQ BX, R8
-	ADCQ CX, R9
-	ADCQ $0, R10
-	MOVQ $16, R15
-	JMP  multiply
-
-done:
-	MOVQ    R8, AX
-	MOVQ    R9, BX
-	SUBQ    $0xFFFFFFFFFFFFFFFB, AX
-	SBBQ    $0xFFFFFFFFFFFFFFFF, BX
-	SBBQ    $3, R10
-	CMOVQCS R8, AX
-	CMOVQCS R9, BX
-	MOVQ    key+24(FP), R8
-	ADDQ    16(R8), AX
-	ADCQ    24(R8), BX
-
-	MOVQ AX, 0(DI)
-	MOVQ BX, 8(DI)
-	RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.go b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
deleted file mode 100644
index 5dc321c..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.go
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build arm,!gccgo,!appengine,!nacl
-
-package poly1305
-
-// This function is implemented in sum_arm.s
-//go:noescape
-func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
-	var mPtr *byte
-	if len(m) > 0 {
-		mPtr = &m[0]
-	}
-	poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.s b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
deleted file mode 100644
index f70b4ac..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.s
+++ /dev/null
@@ -1,427 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build arm,!gccgo,!appengine,!nacl
-
-#include "textflag.h"
-
-// This code was translated into a form compatible with 5a from the public
-// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
-
-DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
-DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
-DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
-DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
-DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
-GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
-
-// Warning: the linker may use R11 to synthesize certain instructions. Please
-// take care and verify that no synthetic instructions use it.
-
-TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
-	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
-	// might look like it's only 60 bytes of space but the final four bytes
-	// will be written by another function.) We need to skip over four
-	// bytes of stack because that's saving the value of 'g'.
-	ADD       $4, R13, R8
-	MOVM.IB   [R4-R7], (R8)
-	MOVM.IA.W (R1), [R2-R5]
-	MOVW      $·poly1305_init_constants_armv6<>(SB), R7
-	MOVW      R2, R8
-	MOVW      R2>>26, R9
-	MOVW      R3>>20, g
-	MOVW      R4>>14, R11
-	MOVW      R5>>8, R12
-	ORR       R3<<6, R9, R9
-	ORR       R4<<12, g, g
-	ORR       R5<<18, R11, R11
-	MOVM.IA   (R7), [R2-R6]
-	AND       R8, R2, R2
-	AND       R9, R3, R3
-	AND       g, R4, R4
-	AND       R11, R5, R5
-	AND       R12, R6, R6
-	MOVM.IA.W [R2-R6], (R0)
-	EOR       R2, R2, R2
-	EOR       R3, R3, R3
-	EOR       R4, R4, R4
-	EOR       R5, R5, R5
-	EOR       R6, R6, R6
-	MOVM.IA.W [R2-R6], (R0)
-	MOVM.IA.W (R1), [R2-R5]
-	MOVM.IA   [R2-R6], (R0)
-	ADD       $20, R13, R0
-	MOVM.DA   (R0), [R4-R7]
-	RET
-
-#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
-	MOVBU (offset+0)(Rsrc), Rtmp; \
-	MOVBU Rtmp, (offset+0)(Rdst); \
-	MOVBU (offset+1)(Rsrc), Rtmp; \
-	MOVBU Rtmp, (offset+1)(Rdst); \
-	MOVBU (offset+2)(Rsrc), Rtmp; \
-	MOVBU Rtmp, (offset+2)(Rdst); \
-	MOVBU (offset+3)(Rsrc), Rtmp; \
-	MOVBU Rtmp, (offset+3)(Rdst)
-
-TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
-	// Needs 24 bytes of stack for saved registers and then 88 bytes of
-	// scratch space after that. We assume that 24 bytes at (R13) have
-	// already been used: four bytes for the link register saved in the
-	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
-	// in that function and 16 bytes of scratch space used around
-	// poly1305_finish_ext_armv6_skip1.
-	ADD     $24, R13, R12
-	MOVM.IB [R4-R8, R14], (R12)
-	MOVW    R0, 88(R13)
-	MOVW    R1, 92(R13)
-	MOVW    R2, 96(R13)
-	MOVW    R1, R14
-	MOVW    R2, R12
-	MOVW    56(R0), R8
-	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
-	EOR     R6, R6, R6
-	MOVW.EQ $(1<<24), R6
-	MOVW    R6, 84(R13)
-	ADD     $116, R13, g
-	MOVM.IA (R0), [R0-R9]
-	MOVM.IA [R0-R4], (g)
-	CMP     $16, R12
-	BLO     poly1305_blocks_armv6_done
-
-poly1305_blocks_armv6_mainloop:
-	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
-	BEQ     poly1305_blocks_armv6_mainloop_aligned
-	ADD     $100, R13, g
-	MOVW_UNALIGNED(R14, g, R0, 0)
-	MOVW_UNALIGNED(R14, g, R0, 4)
-	MOVW_UNALIGNED(R14, g, R0, 8)
-	MOVW_UNALIGNED(R14, g, R0, 12)
-	MOVM.IA (g), [R0-R3]
-	ADD     $16, R14
-	B       poly1305_blocks_armv6_mainloop_loaded
-
-poly1305_blocks_armv6_mainloop_aligned:
-	MOVM.IA.W (R14), [R0-R3]
-
-poly1305_blocks_armv6_mainloop_loaded:
-	MOVW    R0>>26, g
-	MOVW    R1>>20, R11
-	MOVW    R2>>14, R12
-	MOVW    R14, 92(R13)
-	MOVW    R3>>8, R4
-	ORR     R1<<6, g, g
-	ORR     R2<<12, R11, R11
-	ORR     R3<<18, R12, R12
-	BIC     $0xfc000000, R0, R0
-	BIC     $0xfc000000, g, g
-	MOVW    84(R13), R3
-	BIC     $0xfc000000, R11, R11
-	BIC     $0xfc000000, R12, R12
-	ADD     R0, R5, R5
-	ADD     g, R6, R6
-	ORR     R3, R4, R4
-	ADD     R11, R7, R7
-	ADD     $116, R13, R14
-	ADD     R12, R8, R8
-	ADD     R4, R9, R9
-	MOVM.IA (R14), [R0-R4]
-	MULLU   R4, R5, (R11, g)
-	MULLU   R3, R5, (R14, R12)
-	MULALU  R3, R6, (R11, g)
-	MULALU  R2, R6, (R14, R12)
-	MULALU  R2, R7, (R11, g)
-	MULALU  R1, R7, (R14, R12)
-	ADD     R4<<2, R4, R4
-	ADD     R3<<2, R3, R3
-	MULALU  R1, R8, (R11, g)
-	MULALU  R0, R8, (R14, R12)
-	MULALU  R0, R9, (R11, g)
-	MULALU  R4, R9, (R14, R12)
-	MOVW    g, 76(R13)
-	MOVW    R11, 80(R13)
-	MOVW    R12, 68(R13)
-	MOVW    R14, 72(R13)
-	MULLU   R2, R5, (R11, g)
-	MULLU   R1, R5, (R14, R12)
-	MULALU  R1, R6, (R11, g)
-	MULALU  R0, R6, (R14, R12)
-	MULALU  R0, R7, (R11, g)
-	MULALU  R4, R7, (R14, R12)
-	ADD     R2<<2, R2, R2
-	ADD     R1<<2, R1, R1
-	MULALU  R4, R8, (R11, g)
-	MULALU  R3, R8, (R14, R12)
-	MULALU  R3, R9, (R11, g)
-	MULALU  R2, R9, (R14, R12)
-	MOVW    g, 60(R13)
-	MOVW    R11, 64(R13)
-	MOVW    R12, 52(R13)
-	MOVW    R14, 56(R13)
-	MULLU   R0, R5, (R11, g)
-	MULALU  R4, R6, (R11, g)
-	MULALU  R3, R7, (R11, g)
-	MULALU  R2, R8, (R11, g)
-	MULALU  R1, R9, (R11, g)
-	ADD     $52, R13, R0
-	MOVM.IA (R0), [R0-R7]
-	MOVW    g>>26, R12
-	MOVW    R4>>26, R14
-	ORR     R11<<6, R12, R12
-	ORR     R5<<6, R14, R14
-	BIC     $0xfc000000, g, g
-	BIC     $0xfc000000, R4, R4
-	ADD.S   R12, R0, R0
-	ADC     $0, R1, R1
-	ADD.S   R14, R6, R6
-	ADC     $0, R7, R7
-	MOVW    R0>>26, R12
-	MOVW    R6>>26, R14
-	ORR     R1<<6, R12, R12
-	ORR     R7<<6, R14, R14
-	BIC     $0xfc000000, R0, R0
-	BIC     $0xfc000000, R6, R6
-	ADD     R14<<2, R14, R14
-	ADD.S   R12, R2, R2
-	ADC     $0, R3, R3
-	ADD     R14, g, g
-	MOVW    R2>>26, R12
-	MOVW    g>>26, R14
-	ORR     R3<<6, R12, R12
-	BIC     $0xfc000000, g, R5
-	BIC     $0xfc000000, R2, R7
-	ADD     R12, R4, R4
-	ADD     R14, R0, R0
-	MOVW    R4>>26, R12
-	BIC     $0xfc000000, R4, R8
-	ADD     R12, R6, R9
-	MOVW    96(R13), R12
-	MOVW    92(R13), R14
-	MOVW    R0, R6
-	CMP     $32, R12
-	SUB     $16, R12, R12
-	MOVW    R12, 96(R13)
-	BHS     poly1305_blocks_armv6_mainloop
-
-poly1305_blocks_armv6_done:
-	MOVW    88(R13), R12
-	MOVW    R5, 20(R12)
-	MOVW    R6, 24(R12)
-	MOVW    R7, 28(R12)
-	MOVW    R8, 32(R12)
-	MOVW    R9, 36(R12)
-	ADD     $48, R13, R0
-	MOVM.DA (R0), [R4-R8, R14]
-	RET
-
-#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
-	MOVBU.P 1(Rsrc), Rtmp; \
-	MOVBU.P Rtmp, 1(Rdst); \
-	MOVBU.P 1(Rsrc), Rtmp; \
-	MOVBU.P Rtmp, 1(Rdst)
-
-#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
-	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
-	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
-
-// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
-TEXT ·poly1305_auth_armv6(SB), $196-16
-	// The value 196, just above, is the sum of 64 (the size of the context
-	// structure) and 132 (the amount of stack needed).
-	//
-	// At this point, the stack pointer (R13) has been moved down. It
-	// points to the saved link register and there's 196 bytes of free
-	// space above it.
-	//
-	// The stack for this function looks like:
-	//
-	// +---------------------
-	// |
-	// | 64 bytes of context structure
-	// |
-	// +---------------------
-	// |
-	// | 112 bytes for poly1305_blocks_armv6
-	// |
-	// +---------------------
-	// | 16 bytes of final block, constructed at
-	// | poly1305_finish_ext_armv6_skip8
-	// +---------------------
-	// | four bytes of saved 'g'
-	// +---------------------
-	// | lr, saved by prelude    <- R13 points here
-	// +---------------------
-	MOVW g, 4(R13)
-
-	MOVW out+0(FP), R4
-	MOVW m+4(FP), R5
-	MOVW mlen+8(FP), R6
-	MOVW key+12(FP), R7
-
-	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
-	MOVW R7, R1
-
-	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
-	// that's ok because none of the other values have been written yet.
-	BL    poly1305_init_ext_armv6<>(SB)
-	BIC.S $15, R6, R2
-	BEQ   poly1305_auth_armv6_noblocks
-	ADD   $136, R13, R0
-	MOVW  R5, R1
-	ADD   R2, R5, R5
-	SUB   R2, R6, R6
-	BL    poly1305_blocks_armv6<>(SB)
-
-poly1305_auth_armv6_noblocks:
-	ADD  $136, R13, R0
-	MOVW R5, R1
-	MOVW R6, R2
-	MOVW R4, R3
-
-	MOVW  R0, R5
-	MOVW  R1, R6
-	MOVW  R2, R7
-	MOVW  R3, R8
-	AND.S R2, R2, R2
-	BEQ   poly1305_finish_ext_armv6_noremaining
-	EOR   R0, R0
-	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
-	MOVW  R0, (R9)
-	MOVW  R0, 4(R9)
-	MOVW  R0, 8(R9)
-	MOVW  R0, 12(R9)
-	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
-	BEQ   poly1305_finish_ext_armv6_aligned
-	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
-	BEQ   poly1305_finish_ext_armv6_skip8
-	MOVWP_UNALIGNED(R1, R9, g)
-	MOVWP_UNALIGNED(R1, R9, g)
-
-poly1305_finish_ext_armv6_skip8:
-	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
-	BEQ  poly1305_finish_ext_armv6_skip4
-	MOVWP_UNALIGNED(R1, R9, g)
-
-poly1305_finish_ext_armv6_skip4:
-	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
-	BEQ  poly1305_finish_ext_armv6_skip2
-	MOVHUP_UNALIGNED(R1, R9, g)
-	B    poly1305_finish_ext_armv6_skip2
-
-poly1305_finish_ext_armv6_aligned:
-	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
-	BEQ       poly1305_finish_ext_armv6_skip8_aligned
-	MOVM.IA.W (R1), [g-R11]
-	MOVM.IA.W [g-R11], (R9)
-
-poly1305_finish_ext_armv6_skip8_aligned:
-	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
-	BEQ    poly1305_finish_ext_armv6_skip4_aligned
-	MOVW.P 4(R1), g
-	MOVW.P g, 4(R9)
-
-poly1305_finish_ext_armv6_skip4_aligned:
-	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
-	BEQ     poly1305_finish_ext_armv6_skip2
-	MOVHU.P 2(R1), g
-	MOVH.P  g, 2(R9)
-
-poly1305_finish_ext_armv6_skip2:
-	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
-	BEQ     poly1305_finish_ext_armv6_skip1
-	MOVBU.P 1(R1), g
-	MOVBU.P g, 1(R9)
-
-poly1305_finish_ext_armv6_skip1:
-	MOVW  $1, R11
-	MOVBU R11, 0(R9)
-	MOVW  R11, 56(R5)
-	MOVW  R5, R0
-	ADD   $8, R13, R1
-	MOVW  $16, R2
-	BL    poly1305_blocks_armv6<>(SB)
-
-poly1305_finish_ext_armv6_noremaining:
-	MOVW      20(R5), R0
-	MOVW      24(R5), R1
-	MOVW      28(R5), R2
-	MOVW      32(R5), R3
-	MOVW      36(R5), R4
-	MOVW      R4>>26, R12
-	BIC       $0xfc000000, R4, R4
-	ADD       R12<<2, R12, R12
-	ADD       R12, R0, R0
-	MOVW      R0>>26, R12
-	BIC       $0xfc000000, R0, R0
-	ADD       R12, R1, R1
-	MOVW      R1>>26, R12
-	BIC       $0xfc000000, R1, R1
-	ADD       R12, R2, R2
-	MOVW      R2>>26, R12
-	BIC       $0xfc000000, R2, R2
-	ADD       R12, R3, R3
-	MOVW      R3>>26, R12
-	BIC       $0xfc000000, R3, R3
-	ADD       R12, R4, R4
-	ADD       $5, R0, R6
-	MOVW      R6>>26, R12
-	BIC       $0xfc000000, R6, R6
-	ADD       R12, R1, R7
-	MOVW      R7>>26, R12
-	BIC       $0xfc000000, R7, R7
-	ADD       R12, R2, g
-	MOVW      g>>26, R12
-	BIC       $0xfc000000, g, g
-	ADD       R12, R3, R11
-	MOVW      $-(1<<26), R12
-	ADD       R11>>26, R12, R12
-	BIC       $0xfc000000, R11, R11
-	ADD       R12, R4, R9
-	MOVW      R9>>31, R12
-	SUB       $1, R12
-	AND       R12, R6, R6
-	AND       R12, R7, R7
-	AND       R12, g, g
-	AND       R12, R11, R11
-	AND       R12, R9, R9
-	MVN       R12, R12
-	AND       R12, R0, R0
-	AND       R12, R1, R1
-	AND       R12, R2, R2
-	AND       R12, R3, R3
-	AND       R12, R4, R4
-	ORR       R6, R0, R0
-	ORR       R7, R1, R1
-	ORR       g, R2, R2
-	ORR       R11, R3, R3
-	ORR       R9, R4, R4
-	ORR       R1<<26, R0, R0
-	MOVW      R1>>6, R1
-	ORR       R2<<20, R1, R1
-	MOVW      R2>>12, R2
-	ORR       R3<<14, R2, R2
-	MOVW      R3>>18, R3
-	ORR       R4<<8, R3, R3
-	MOVW      40(R5), R6
-	MOVW      44(R5), R7
-	MOVW      48(R5), g
-	MOVW      52(R5), R11
-	ADD.S     R6, R0, R0
-	ADC.S     R7, R1, R1
-	ADC.S     g, R2, R2
-	ADC.S     R11, R3, R3
-	MOVM.IA   [R0-R3], (R8)
-	MOVW      R5, R12
-	EOR       R0, R0, R0
-	EOR       R1, R1, R1
-	EOR       R2, R2, R2
-	EOR       R3, R3, R3
-	EOR       R4, R4, R4
-	EOR       R5, R5, R5
-	EOR       R6, R6, R6
-	EOR       R7, R7, R7
-	MOVM.IA.W [R0-R7], (R12)
-	MOVM.IA   [R0-R7], (R12)
-	MOVW      4(R13), g
-	RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
deleted file mode 100644
index 751eec5..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl
-
-package poly1305
-
-// Sum generates an authenticator for msg using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
-	sumGeneric(out, msg, key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ref.go b/vendor/golang.org/x/crypto/poly1305/sum_ref.go
deleted file mode 100644
index c4d59bd..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_ref.go
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package poly1305
-
-import "encoding/binary"
-
-// sumGeneric generates an authenticator for msg using a one-time key and
-// puts the 16-byte result into out. This is the generic implementation of
-// Sum and should be called if no assembly implementation is available.
-func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
-	var (
-		h0, h1, h2, h3, h4 uint32 // the hash accumulators
-		r0, r1, r2, r3, r4 uint64 // the r part of the key
-	)
-
-	r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff)
-	r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03)
-	r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff)
-	r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff)
-	r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff)
-
-	R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5
-
-	for len(msg) >= TagSize {
-		// h += msg
-		h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff
-		h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff
-		h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff
-		h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff
-		h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24)
-
-		// h *= r
-		d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
-		d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
-		d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
-		d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
-		d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
-
-		// h %= p
-		h0 = uint32(d0) & 0x3ffffff
-		h1 = uint32(d1) & 0x3ffffff
-		h2 = uint32(d2) & 0x3ffffff
-		h3 = uint32(d3) & 0x3ffffff
-		h4 = uint32(d4) & 0x3ffffff
-
-		h0 += uint32(d4>>26) * 5
-		h1 += h0 >> 26
-		h0 = h0 & 0x3ffffff
-
-		msg = msg[TagSize:]
-	}
-
-	if len(msg) > 0 {
-		var block [TagSize]byte
-		off := copy(block[:], msg)
-		block[off] = 0x01
-
-		// h += msg
-		h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff
-		h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff
-		h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff
-		h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff
-		h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8)
-
-		// h *= r
-		d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
-		d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
-		d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
-		d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
-		d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
-
-		// h %= p
-		h0 = uint32(d0) & 0x3ffffff
-		h1 = uint32(d1) & 0x3ffffff
-		h2 = uint32(d2) & 0x3ffffff
-		h3 = uint32(d3) & 0x3ffffff
-		h4 = uint32(d4) & 0x3ffffff
-
-		h0 += uint32(d4>>26) * 5
-		h1 += h0 >> 26
-		h0 = h0 & 0x3ffffff
-	}
-
-	// h %= p reduction
-	h2 += h1 >> 26
-	h1 &= 0x3ffffff
-	h3 += h2 >> 26
-	h2 &= 0x3ffffff
-	h4 += h3 >> 26
-	h3 &= 0x3ffffff
-	h0 += 5 * (h4 >> 26)
-	h4 &= 0x3ffffff
-	h1 += h0 >> 26
-	h0 &= 0x3ffffff
-
-	// h - p
-	t0 := h0 + 5
-	t1 := h1 + (t0 >> 26)
-	t2 := h2 + (t1 >> 26)
-	t3 := h3 + (t2 >> 26)
-	t4 := h4 + (t3 >> 26) - (1 << 26)
-	t0 &= 0x3ffffff
-	t1 &= 0x3ffffff
-	t2 &= 0x3ffffff
-	t3 &= 0x3ffffff
-
-	// select h if h < p else h - p
-	t_mask := (t4 >> 31) - 1
-	h_mask := ^t_mask
-	h0 = (h0 & h_mask) | (t0 & t_mask)
-	h1 = (h1 & h_mask) | (t1 & t_mask)
-	h2 = (h2 & h_mask) | (t2 & t_mask)
-	h3 = (h3 & h_mask) | (t3 & t_mask)
-	h4 = (h4 & h_mask) | (t4 & t_mask)
-
-	// h %= 2^128
-	h0 |= h1 << 26
-	h1 = ((h1 >> 6) | (h2 << 20))
-	h2 = ((h2 >> 12) | (h3 << 14))
-	h3 = ((h3 >> 18) | (h4 << 8))
-
-	// s: the s part of the key
-	// tag = (h + s) % (2^128)
-	t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:]))
-	h0 = uint32(t)
-	t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32)
-	h1 = uint32(t)
-	t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32)
-	h2 = uint32(t)
-	t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32)
-	h3 = uint32(t)
-
-	binary.LittleEndian.PutUint32(out[0:], h0)
-	binary.LittleEndian.PutUint32(out[4:], h1)
-	binary.LittleEndian.PutUint32(out[8:], h2)
-	binary.LittleEndian.PutUint32(out[12:], h3)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
deleted file mode 100644
index 7a266ce..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-package poly1305
-
-// hasVectorFacility reports whether the machine supports
-// the vector facility (vx).
-func hasVectorFacility() bool
-
-// hasVMSLFacility reports whether the machine supports
-// Vector Multiply Sum Logical (VMSL).
-func hasVMSLFacility() bool
-
-var hasVX = hasVectorFacility()
-var hasVMSL = hasVMSLFacility()
-
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
-// instructions. It must only be called if the vector facility (vx) is
-// available.
-//go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
-// available and if VMSL is supported.
-//go:noescape
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
-	if hasVX {
-		var mPtr *byte
-		if len(m) > 0 {
-			mPtr = &m[0]
-		}
-		if hasVMSL && len(m) > 256 {
-			poly1305vmsl(out, mPtr, uint64(len(m)), key)
-		} else {
-			poly1305vx(out, mPtr, uint64(len(m)), key)
-		}
-	} else {
-		sumGeneric(out, m, key)
-	}
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
deleted file mode 100644
index 356c07a..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ /dev/null
@@ -1,400 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx).
-
-// constants
-#define MOD26 V0
-#define EX0   V1
-#define EX1   V2
-#define EX2   V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-
-// key (r)
-#define R_0  V9
-#define R_1  V10
-#define R_2  V11
-#define R_3  V12
-#define R_4  V13
-#define R5_1 V14
-#define R5_2 V15
-#define R5_3 V16
-#define R5_4 V17
-#define RSAVE_0 R5
-#define RSAVE_1 R6
-#define RSAVE_2 R7
-#define RSAVE_3 R8
-#define RSAVE_4 R9
-#define R5SAVE_1 V28
-#define R5SAVE_2 V29
-#define R5SAVE_3 V30
-#define R5SAVE_4 V31
-
-// message block
-#define F_0 V18
-#define F_1 V19
-#define F_2 V20
-#define F_3 V21
-#define F_4 V22
-
-// accumulator
-#define H_0 V23
-#define H_1 V24
-#define H_2 V25
-#define H_3 V26
-#define H_4 V27
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $64
-// MOD26
-DATA ·constants<>+0(SB)/8, $0x3ffffff
-DATA ·constants<>+8(SB)/8, $0x3ffffff
-// EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
-// EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
-// EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
-
-// h = (f*g) % (2**130-5) [partial reduction]
-#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
-	VMLOF  f0, g0, h0        \
-	VMLOF  f0, g1, h1        \
-	VMLOF  f0, g2, h2        \
-	VMLOF  f0, g3, h3        \
-	VMLOF  f0, g4, h4        \
-	VMLOF  f1, g54, T_0      \
-	VMLOF  f1, g0, T_1       \
-	VMLOF  f1, g1, T_2       \
-	VMLOF  f1, g2, T_3       \
-	VMLOF  f1, g3, T_4       \
-	VMALOF f2, g53, h0, h0   \
-	VMALOF f2, g54, h1, h1   \
-	VMALOF f2, g0, h2, h2    \
-	VMALOF f2, g1, h3, h3    \
-	VMALOF f2, g2, h4, h4    \
-	VMALOF f3, g52, T_0, T_0 \
-	VMALOF f3, g53, T_1, T_1 \
-	VMALOF f3, g54, T_2, T_2 \
-	VMALOF f3, g0, T_3, T_3  \
-	VMALOF f3, g1, T_4, T_4  \
-	VMALOF f4, g51, h0, h0   \
-	VMALOF f4, g52, h1, h1   \
-	VMALOF f4, g53, h2, h2   \
-	VMALOF f4, g54, h3, h3   \
-	VMALOF f4, g0, h4, h4    \
-	VAG    T_0, h0, h0       \
-	VAG    T_1, h1, h1       \
-	VAG    T_2, h2, h2       \
-	VAG    T_3, h3, h3       \
-	VAG    T_4, h4, h4
-
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
-#define REDUCE(h0, h1, h2, h3, h4) \
-	VESRLG $26, h0, T_0  \
-	VESRLG $26, h3, T_1  \
-	VN     MOD26, h0, h0 \
-	VN     MOD26, h3, h3 \
-	VAG    T_0, h1, h1   \
-	VAG    T_1, h4, h4   \
-	VESRLG $26, h1, T_2  \
-	VESRLG $26, h4, T_3  \
-	VN     MOD26, h1, h1 \
-	VN     MOD26, h4, h4 \
-	VESLG  $2, T_3, T_4  \
-	VAG    T_3, T_4, T_4 \
-	VAG    T_2, h2, h2   \
-	VAG    T_4, h0, h0   \
-	VESRLG $26, h2, T_0  \
-	VESRLG $26, h0, T_1  \
-	VN     MOD26, h2, h2 \
-	VN     MOD26, h0, h0 \
-	VAG    T_0, h3, h3   \
-	VAG    T_1, h1, h1   \
-	VESRLG $26, h3, T_2  \
-	VN     MOD26, h3, h3 \
-	VAG    T_2, h4, h4
-
-// expand in0 into d[0] and in1 into d[1]
-#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
-	VGBM   $0x0707, d1       \ // d1=tmp
-	VPERM  in0, in1, EX2, d4 \
-	VPERM  in0, in1, EX0, d0 \
-	VPERM  in0, in1, EX1, d2 \
-	VN     d1, d4, d4        \
-	VESRLG $26, d0, d1       \
-	VESRLG $30, d2, d3       \
-	VESRLG $4, d2, d2        \
-	VN     MOD26, d0, d0     \
-	VN     MOD26, d1, d1     \
-	VN     MOD26, d2, d2     \
-	VN     MOD26, d3, d3
-
-// pack h4:h0 into h1:h0 (no carry)
-#define PACK(h0, h1, h2, h3, h4) \
-	VESLG $26, h1, h1  \
-	VESLG $26, h3, h3  \
-	VO    h0, h1, h0   \
-	VO    h2, h3, h2   \
-	VESLG $4, h2, h2   \
-	VLEIB $7, $48, h1  \
-	VSLB  h1, h2, h2   \
-	VO    h0, h2, h0   \
-	VLEIB $7, $104, h1 \
-	VSLB  h1, h4, h3   \
-	VO    h3, h0, h0   \
-	VLEIB $7, $24, h1  \
-	VSRLB h1, h4, h1
-
-// if h > 2**130-5 then h -= 2**130-5
-#define MOD(h0, h1, t0, t1, t2) \
-	VZERO t0          \
-	VLEIG $1, $5, t0  \
-	VACCQ h0, t0, t1  \
-	VAQ   h0, t0, t0  \
-	VONE  t2          \
-	VLEIG $1, $-4, t2 \
-	VAQ   t2, t1, t1  \
-	VACCQ h1, t1, t1  \
-	VONE  t2          \
-	VAQ   t2, t1, t1  \
-	VN    h0, t1, t2  \
-	VNC   t0, t1, t1  \
-	VO    t1, t2, h0
-
-// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vx(SB), $0-32
-	// This code processes up to 2 blocks (32 bytes) per iteration
-	// using the algorithm described in:
-	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
-	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
-	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
-
-	// load MOD26, EX0, EX1 and EX2
-	MOVD $·constants<>(SB), R5
-	VLM  (R5), MOD26, EX2
-
-	// setup r
-	VL   (R4), T_0
-	MOVD $·keyMask<>(SB), R6
-	VL   (R6), T_1
-	VN   T_0, T_1, T_0
-	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-
-	// setup r*5
-	VLEIG $0, $5, T_0
-	VLEIG $1, $5, T_0
-
-	// store r (for final block)
-	VMLOF T_0, R_1, R5SAVE_1
-	VMLOF T_0, R_2, R5SAVE_2
-	VMLOF T_0, R_3, R5SAVE_3
-	VMLOF T_0, R_4, R5SAVE_4
-	VLGVG $0, R_0, RSAVE_0
-	VLGVG $0, R_1, RSAVE_1
-	VLGVG $0, R_2, RSAVE_2
-	VLGVG $0, R_3, RSAVE_3
-	VLGVG $0, R_4, RSAVE_4
-
-	// skip r**2 calculation
-	CMPBLE R3, $16, skip
-
-	// calculate r**2
-	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
-	REDUCE(H_0, H_1, H_2, H_3, H_4)
-	VLEIG $0, $5, T_0
-	VLEIG $1, $5, T_0
-	VMLOF T_0, H_1, R5_1
-	VMLOF T_0, H_2, R5_2
-	VMLOF T_0, H_3, R5_3
-	VMLOF T_0, H_4, R5_4
-	VLR   H_0, R_0
-	VLR   H_1, R_1
-	VLR   H_2, R_2
-	VLR   H_3, R_3
-	VLR   H_4, R_4
-
-	// initialize h
-	VZERO H_0
-	VZERO H_1
-	VZERO H_2
-	VZERO H_3
-	VZERO H_4
-
-loop:
-	CMPBLE R3, $32, b2
-	VLM    (R2), T_0, T_1
-	SUB    $32, R3
-	MOVD   $32(R2), R2
-	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
-	VLEIB  $4, $1, F_4
-	VLEIB  $12, $1, F_4
-
-multiply:
-	VAG    H_0, F_0, F_0
-	VAG    H_1, F_1, F_1
-	VAG    H_2, F_2, F_2
-	VAG    H_3, F_3, F_3
-	VAG    H_4, F_4, F_4
-	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
-	REDUCE(H_0, H_1, H_2, H_3, H_4)
-	CMPBNE R3, $0, loop
-
-finish:
-	// sum vectors
-	VZERO  T_0
-	VSUMQG H_0, T_0, H_0
-	VSUMQG H_1, T_0, H_1
-	VSUMQG H_2, T_0, H_2
-	VSUMQG H_3, T_0, H_3
-	VSUMQG H_4, T_0, H_4
-
-	// h may be >= 2*(2**130-5) so we need to reduce it again
-	REDUCE(H_0, H_1, H_2, H_3, H_4)
-
-	// carry h1->h4
-	VESRLG $26, H_1, T_1
-	VN     MOD26, H_1, H_1
-	VAQ    T_1, H_2, H_2
-	VESRLG $26, H_2, T_2
-	VN     MOD26, H_2, H_2
-	VAQ    T_2, H_3, H_3
-	VESRLG $26, H_3, T_3
-	VN     MOD26, H_3, H_3
-	VAQ    T_3, H_4, H_4
-
-	// h is now < 2*(2**130-5)
-	// pack h into h1 (hi) and h0 (lo)
-	PACK(H_0, H_1, H_2, H_3, H_4)
-
-	// if h > 2**130-5 then h -= 2**130-5
-	MOD(H_0, H_1, T_0, T_1, T_2)
-
-	// h += s
-	MOVD  $·bswapMask<>(SB), R5
-	VL    (R5), T_1
-	VL    16(R4), T_0
-	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
-	VAQ   T_0, H_0, H_0
-	VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
-	VST   H_0, (R1)
-
-	RET
-
-b2:
-	CMPBLE R3, $16, b1
-
-	// 2 blocks remaining
-	SUB    $17, R3
-	VL     (R2), T_0
-	VLL    R3, 16(R2), T_1
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, T_1
-	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $12, $1, F_4
-	VLEIB  $4, $1, F_4
-
-	// setup [r²,r]
-	VLVGG $1, RSAVE_0, R_0
-	VLVGG $1, RSAVE_1, R_1
-	VLVGG $1, RSAVE_2, R_2
-	VLVGG $1, RSAVE_3, R_3
-	VLVGG $1, RSAVE_4, R_4
-	VPDI  $0, R5_1, R5SAVE_1, R5_1
-	VPDI  $0, R5_2, R5SAVE_2, R5_2
-	VPDI  $0, R5_3, R5SAVE_3, R5_3
-	VPDI  $0, R5_4, R5SAVE_4, R5_4
-
-	MOVD $0, R3
-	BR   multiply
-
-skip:
-	VZERO H_0
-	VZERO H_1
-	VZERO H_2
-	VZERO H_3
-	VZERO H_4
-
-	CMPBEQ R3, $0, finish
-
-b1:
-	// 1 block remaining
-	SUB    $1, R3
-	VLL    R3, (R2), T_0
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, T_0
-	VZERO  T_1
-	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $4, $1, F_4
-	VLEIG  $1, $1, R_0
-	VZERO  R_1
-	VZERO  R_2
-	VZERO  R_3
-	VZERO  R_4
-	VZERO  R5_1
-	VZERO  R5_2
-	VZERO  R5_3
-	VZERO  R5_4
-
-	// setup [r, 1]
-	VLVGG $0, RSAVE_0, R_0
-	VLVGG $0, RSAVE_1, R_1
-	VLVGG $0, RSAVE_2, R_2
-	VLVGG $0, RSAVE_3, R_3
-	VLVGG $0, RSAVE_4, R_4
-	VPDI  $0, R5SAVE_1, R5_1, R5_1
-	VPDI  $0, R5SAVE_2, R5_2, R5_2
-	VPDI  $0, R5SAVE_3, R5_3, R5_3
-	VPDI  $0, R5SAVE_4, R5_4, R5_4
-
-	MOVD $0, R3
-	BR   multiply
-
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
-	MOVD  $x-24(SP), R1
-	XC    $24, 0(R1), 0(R1) // clear the storage
-	MOVD  $2, R0            // R0 is the number of double words stored -1
-	WORD  $0xB2B01000       // STFLE 0(R1)
-	XOR   R0, R0            // reset the value of R0
-	MOVBZ z-8(SP), R1
-	AND   $0x40, R1
-	BEQ   novector
-
-vectorinstalled:
-	// check if the vector instruction has been enabled
-	VLEIB  $0, $0xF, V16
-	VLGVB  $0, V16, R1
-	CMPBNE R1, $0xF, novector
-	MOVB   $1, ret+0(FP)      // have vx
-	RET
-
-novector:
-	MOVB $0, ret+0(FP) // no vx
-	RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
deleted file mode 100644
index e548020..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ /dev/null
@@ -1,931 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
-
-// constants
-#define EX0   V1
-#define EX1   V2
-#define EX2   V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-#define T_5 V9
-#define T_6 V10
-#define T_7 V11
-#define T_8 V12
-#define T_9 V13
-#define T_10 V14
-
-// r**2 & r**4
-#define R_0  V15
-#define R_1  V16
-#define R_2  V17
-#define R5_1 V18
-#define R5_2 V19
-// key (r)
-#define RSAVE_0 R7
-#define RSAVE_1 R8
-#define RSAVE_2 R9
-#define R5SAVE_1 R10
-#define R5SAVE_2 R11
-
-// message block
-#define M0 V20
-#define M1 V21
-#define M2 V22
-#define M3 V23
-#define M4 V24
-#define M5 V25
-
-// accumulator
-#define H0_0 V26
-#define H1_0 V27
-#define H2_0 V28
-#define H0_1 V29
-#define H1_1 V30
-#define H2_1 V31
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $48
-// EX0
-DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+8(SB)/8, $0x0000050403020100
-// EX1
-DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+24(SB)/8, $0x00000a0908070605
-// EX2
-DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
-
-GLOBL ·c<>(SB), RODATA, $48
-// EX0
-DATA ·c<>+0(SB)/8, $0x0000050403020100
-DATA ·c<>+8(SB)/8, $0x0000151413121110
-// EX1
-DATA ·c<>+16(SB)/8, $0x00000a0908070605
-DATA ·c<>+24(SB)/8, $0x00001a1918171615
-// EX2
-DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
-DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
-
-GLOBL ·reduce<>(SB), RODATA, $32
-// 44 bit
-DATA ·reduce<>+0(SB)/8, $0x0
-DATA ·reduce<>+8(SB)/8, $0xfffffffffff
-// 42 bit
-DATA ·reduce<>+16(SB)/8, $0x0
-DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
-
-// h = (f*g) % (2**130-5) [partial reduction]
-// uses T_0...T_9 temporary registers
-// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
-// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
-#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
-	\ // Eliminate the dependency for the last 2 VMSLs
-	VMSLG m02_0, r_2, m4_2, m4_2                       \
-	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
-	VMSLG m02_0, r_0, m4_0, m4_0                       \
-	VMSLG m02_1, r5_2, V0, T_0                         \
-	VMSLG m02_0, r_1, m4_1, m4_1                       \
-	VMSLG m02_1, r_0, V0, T_1                          \
-	VMSLG m02_1, r_1, V0, T_2                          \
-	VMSLG m02_2, r5_1, V0, T_3                         \
-	VMSLG m02_2, r5_2, V0, T_4                         \
-	VMSLG m13_0, r_0, m5_0, m5_0                       \
-	VMSLG m13_1, r5_2, V0, T_5                         \
-	VMSLG m13_0, r_1, m5_1, m5_1                       \
-	VMSLG m13_1, r_0, V0, T_6                          \
-	VMSLG m13_1, r_1, V0, T_7                          \
-	VMSLG m13_2, r5_1, V0, T_8                         \
-	VMSLG m13_2, r5_2, V0, T_9                         \
-	VMSLG m02_2, r_0, m4_2, m4_2                       \
-	VMSLG m13_2, r_0, m5_2, m5_2                       \
-	VAQ   m4_0, T_0, m02_0                             \
-	VAQ   m4_1, T_1, m02_1                             \
-	VAQ   m5_0, T_5, m13_0                             \
-	VAQ   m5_1, T_6, m13_1                             \
-	VAQ   m02_0, T_3, m02_0                            \
-	VAQ   m02_1, T_4, m02_1                            \
-	VAQ   m13_0, T_8, m13_0                            \
-	VAQ   m13_1, T_9, m13_1                            \
-	VAQ   m4_2, T_2, m02_2                             \
-	VAQ   m5_2, T_7, m13_2                             \
-
-// SQUARE uses three limbs of r and r_2*5 to output square of r
-// uses T_1, T_5 and T_7 temporary registers
-// input: r_0, r_1, r_2, r5_2
-// temp: TEMP0, TEMP1, TEMP2
-// output: p0, p1, p2
-#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
-	VMSLG r_0, r_0, p0, p0     \
-	VMSLG r_1, r5_2, V0, TEMP0 \
-	VMSLG r_2, r5_2, p1, p1    \
-	VMSLG r_0, r_1, V0, TEMP1  \
-	VMSLG r_1, r_1, p2, p2     \
-	VMSLG r_0, r_2, V0, TEMP2  \
-	VAQ   TEMP0, p0, p0        \
-	VAQ   TEMP1, p1, p1        \
-	VAQ   TEMP2, p2, p2        \
-	VAQ   TEMP0, p0, p0        \
-	VAQ   TEMP1, p1, p1        \
-	VAQ   TEMP2, p2, p2        \
-
-// carry h0->h1->h2->h0 || h3->h4->h5->h3
-// uses T_2, T_4, T_5, T_7, T_8, T_9
-//       t6,  t7,  t8,  t9, t10, t11
-// input: h0, h1, h2, h3, h4, h5
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
-// output: h0, h1, h2, h3, h4, h5
-#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
-	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
-	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
-	VREPIB $4, t8         \ // 4 bit shift mask
-	VREPIB $2, t11        \ // 2 bit shift mask
-	VSRLB  t10, h0, t0    \ // h0 byte shift
-	VSRLB  t10, h1, t1    \ // h1 byte shift
-	VSRLB  t10, h2, t2    \ // h2 byte shift
-	VSRLB  t10, h3, t3    \ // h3 byte shift
-	VSRLB  t10, h4, t4    \ // h4 byte shift
-	VSRLB  t10, h5, t5    \ // h5 byte shift
-	VSRL   t8, t0, t0     \ // h0 bit shift
-	VSRL   t8, t1, t1     \ // h2 bit shift
-	VSRL   t11, t2, t2    \ // h2 bit shift
-	VSRL   t8, t3, t3     \ // h3 bit shift
-	VSRL   t8, t4, t4     \ // h4 bit shift
-	VESLG  $2, t2, t9     \ // h2 carry x5
-	VSRL   t11, t5, t5    \ // h5 bit shift
-	VN     t6, h0, h0     \ // h0 clear carry
-	VAQ    t2, t9, t2     \ // h2 carry x5
-	VESLG  $2, t5, t9     \ // h5 carry x5
-	VN     t6, h1, h1     \ // h1 clear carry
-	VN     t7, h2, h2     \ // h2 clear carry
-	VAQ    t5, t9, t5     \ // h5 carry x5
-	VN     t6, h3, h3     \ // h3 clear carry
-	VN     t6, h4, h4     \ // h4 clear carry
-	VN     t7, h5, h5     \ // h5 clear carry
-	VAQ    t0, h1, h1     \ // h0->h1
-	VAQ    t3, h4, h4     \ // h3->h4
-	VAQ    t1, h2, h2     \ // h1->h2
-	VAQ    t4, h5, h5     \ // h4->h5
-	VAQ    t2, h0, h0     \ // h2->h0
-	VAQ    t5, h3, h3     \ // h5->h3
-	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
-	VREPG  $1, t7, t7     \
-	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
-	VSLDB  $8, h1, h1, h1 \
-	VSLDB  $8, h2, h2, h2 \
-	VO     h0, h3, h3     \
-	VO     h1, h4, h4     \
-	VO     h2, h5, h5     \
-	VESRLG $44, h3, t0    \ // 44 bit shift right
-	VESRLG $44, h4, t1    \
-	VESRLG $42, h5, t2    \
-	VN     t6, h3, h3     \ // clear carry bits
-	VN     t6, h4, h4     \
-	VN     t7, h5, h5     \
-	VESLG  $2, t2, t9     \ // multiply carry by 5
-	VAQ    t9, t2, t2     \
-	VAQ    t0, h4, h4     \
-	VAQ    t1, h5, h5     \
-	VAQ    t2, h3, h3     \
-
-// carry h0->h1->h2->h0
-// input: h0, h1, h2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
-// output: h0, h1, h2
-#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
-	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
-	VREPIB $4, t4        \ // 4 bit shift mask
-	VREPIB $2, t7        \ // 2 bit shift mask
-	VGBM   $0x003F, t5   \ // mask to clear carry bits
-	VSRLB  t3, h0, t0    \
-	VSRLB  t3, h1, t1    \
-	VSRLB  t3, h2, t2    \
-	VESRLG $4, t5, t5    \ // 44 bit clear mask
-	VSRL   t4, t0, t0    \
-	VSRL   t4, t1, t1    \
-	VSRL   t7, t2, t2    \
-	VESRLG $2, t5, t6    \ // 42 bit clear mask
-	VESLG  $2, t2, t8    \
-	VAQ    t8, t2, t2    \
-	VN     t5, h0, h0    \
-	VN     t5, h1, h1    \
-	VN     t6, h2, h2    \
-	VAQ    t0, h1, h1    \
-	VAQ    t1, h2, h2    \
-	VAQ    t2, h0, h0    \
-	VSRLB  t3, h0, t0    \
-	VSRLB  t3, h1, t1    \
-	VSRLB  t3, h2, t2    \
-	VSRL   t4, t0, t0    \
-	VSRL   t4, t1, t1    \
-	VSRL   t7, t2, t2    \
-	VN     t5, h0, h0    \
-	VN     t5, h1, h1    \
-	VESLG  $2, t2, t8    \
-	VN     t6, h2, h2    \
-	VAQ    t0, h1, h1    \
-	VAQ    t8, t2, t2    \
-	VAQ    t1, h2, h2    \
-	VAQ    t2, h0, h0    \
-
-// expands two message blocks into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in1, in2, d0, d1, d2, d3, d4, d5
-// temp: TEMP0, TEMP1, TEMP2, TEMP3
-// output: d0, d1, d2, d3, d4, d5
-#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
-	VGBM   $0xff3f, TEMP0      \
-	VGBM   $0xff1f, TEMP1      \
-	VESLG  $4, d1, TEMP2       \
-	VESLG  $4, d4, TEMP3       \
-	VESRLG $4, TEMP0, TEMP0    \
-	VPERM  in1, d0, EX0, d0    \
-	VPERM  in2, d3, EX0, d3    \
-	VPERM  in1, d2, EX2, d2    \
-	VPERM  in2, d5, EX2, d5    \
-	VPERM  in1, TEMP2, EX1, d1 \
-	VPERM  in2, TEMP3, EX1, d4 \
-	VN     TEMP0, d0, d0       \
-	VN     TEMP0, d3, d3       \
-	VESRLG $4, d1, d1          \
-	VESRLG $4, d4, d4          \
-	VN     TEMP1, d2, d2       \
-	VN     TEMP1, d5, d5       \
-	VN     TEMP0, d1, d1       \
-	VN     TEMP0, d4, d4       \
-
-// expands one message block into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in, d0, d1, d2
-// temp: TEMP0, TEMP1, TEMP2
-// output: d0, d1, d2
-#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
-	VGBM   $0xff3f, TEMP0     \
-	VESLG  $4, d1, TEMP2      \
-	VGBM   $0xff1f, TEMP1     \
-	VPERM  in, d0, EX0, d0    \
-	VESRLG $4, TEMP0, TEMP0   \
-	VPERM  in, d2, EX2, d2    \
-	VPERM  in, TEMP2, EX1, d1 \
-	VN     TEMP0, d0, d0      \
-	VN     TEMP1, d2, d2      \
-	VESRLG $4, d1, d1         \
-	VN     TEMP0, d1, d1      \
-
-// pack h2:h0 into h1:h0 (no carry)
-// input: h0, h1, h2
-// output: h0, h1, h2
-#define PACK(h0, h1, h2) \
-	VMRLG  h1, h2, h2  \ // copy h1 to upper half h2
-	VESLG  $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
-	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
-	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
-	VLEIG  $1, $0, h1  \ // clear h2 stuff from lower half of h1
-	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
-	VLEIG  $0, $0, h2  \ // clear upper half of h2
-	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
-	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
-	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
-	VO     h0, h2, h0  \ // combine h0 with 20 bits from limb 1
-	VLEIG  $0, $0, h1  \ // clear upper half of h1
-
-// if h > 2**130-5 then h -= 2**130-5
-// input: h0, h1
-// temp: t0, t1, t2
-// output: h0
-#define MOD(h0, h1, t0, t1, t2) \
-	VZERO t0          \
-	VLEIG $1, $5, t0  \
-	VACCQ h0, t0, t1  \
-	VAQ   h0, t0, t0  \
-	VONE  t2          \
-	VLEIG $1, $-4, t2 \
-	VAQ   t2, t1, t1  \
-	VACCQ h1, t1, t1  \
-	VONE  t2          \
-	VAQ   t2, t1, t1  \
-	VN    h0, t1, t2  \
-	VNC   t0, t1, t1  \
-	VO    t1, t2, h0  \
-
-// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vmsl(SB), $0-32
-	// This code processes 6 + up to 4 blocks (32 bytes) per iteration
-	// using the algorithm described in:
-	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
-	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
-	// And as moddified for VMSL as described in
-	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
-	// O'Farrell et al, CASCON 2017, p48-55
-	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
-
-	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
-	VZERO V0                // c
-
-	// load EX0, EX1 and EX2
-	MOVD $·constants<>(SB), R5
-	VLM  (R5), EX0, EX2        // c
-
-	// setup r
-	VL    (R4), T_0
-	MOVD  $·keyMask<>(SB), R6
-	VL    (R6), T_1
-	VN    T_0, T_1, T_0
-	VZERO T_2                 // limbs for r
-	VZERO T_3
-	VZERO T_4
-	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
-
-	// T_2, T_3, T_4: [0, r]
-
-	// setup r*20
-	VLEIG $0, $0, T_0
-	VLEIG $1, $20, T_0       // T_0: [0, 20]
-	VZERO T_5
-	VZERO T_6
-	VMSLG T_0, T_3, T_5, T_5
-	VMSLG T_0, T_4, T_6, T_6
-
-	// store r for final block in GR
-	VLGVG $1, T_2, RSAVE_0  // c
-	VLGVG $1, T_3, RSAVE_1  // c
-	VLGVG $1, T_4, RSAVE_2  // c
-	VLGVG $1, T_5, R5SAVE_1 // c
-	VLGVG $1, T_6, R5SAVE_2 // c
-
-	// initialize h
-	VZERO H0_0
-	VZERO H1_0
-	VZERO H2_0
-	VZERO H0_1
-	VZERO H1_1
-	VZERO H2_1
-
-	// initialize pointer for reduce constants
-	MOVD $·reduce<>(SB), R12
-
-	// calculate r**2 and 20*(r**2)
-	VZERO R_0
-	VZERO R_1
-	VZERO R_2
-	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
-	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
-	VZERO R5_1
-	VZERO R5_2
-	VMSLG T_0, R_1, R5_1, R5_1
-	VMSLG T_0, R_2, R5_2, R5_2
-
-	// skip r**4 calculation if 3 blocks or less
-	CMPBLE R3, $48, b4
-
-	// calculate r**4 and 20*(r**4)
-	VZERO T_8
-	VZERO T_9
-	VZERO T_10
-	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
-	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
-	VZERO T_2
-	VZERO T_3
-	VMSLG T_0, T_9, T_2, T_2
-	VMSLG T_0, T_10, T_3, T_3
-
-	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
-	VSLDB $8, T_8, T_8, T_8
-	VSLDB $8, T_9, T_9, T_9
-	VSLDB $8, T_10, T_10, T_10
-	VSLDB $8, T_2, T_2, T_2
-	VSLDB $8, T_3, T_3, T_3
-
-	VO T_8, R_0, R_0
-	VO T_9, R_1, R_1
-	VO T_10, R_2, R_2
-	VO T_2, R5_1, R5_1
-	VO T_3, R5_2, R5_2
-
-	CMPBLE R3, $80, load // less than or equal to 5 blocks in message
-
-	// 6(or 5+1) blocks
-	SUB    $81, R3
-	VLM    (R2), M0, M4
-	VLL    R3, 80(R2), M5
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBGE R3, $16, 2(PC)
-	VLVGB  R3, R0, M5
-	MOVD   $96(R2), R2
-	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
-	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
-	VLEIB  $2, $1, H2_0
-	VLEIB  $2, $1, H2_1
-	VLEIB  $10, $1, H2_0
-	VLEIB  $10, $1, H2_1
-
-	VZERO  M0
-	VZERO  M1
-	VZERO  M2
-	VZERO  M3
-	VZERO  T_4
-	VZERO  T_10
-	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
-	VLR    T_4, M4
-	VLEIB  $10, $1, M2
-	CMPBLT R3, $16, 2(PC)
-	VLEIB  $10, $1, T_10
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
-	VMRHG  V0, H0_1, H0_0
-	VMRHG  V0, H1_1, H1_0
-	VMRHG  V0, H2_1, H2_0
-	VMRLG  V0, H0_1, H0_1
-	VMRLG  V0, H1_1, H1_1
-	VMRLG  V0, H2_1, H2_1
-
-	SUB    $16, R3
-	CMPBLE R3, $0, square
-
-load:
-	// load EX0, EX1 and EX2
-	MOVD $·c<>(SB), R5
-	VLM  (R5), EX0, EX2
-
-loop:
-	CMPBLE R3, $64, add // b4	// last 4 or less blocks left
-
-	// next 4 full blocks
-	VLM  (R2), M2, M5
-	SUB  $64, R3
-	MOVD $64(R2), R2
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
-
-	// expacc in-lined to create [m2, m3] limbs
-	VGBM   $0x3f3f, T_0     // 44 bit clear mask
-	VGBM   $0x1f1f, T_1     // 40 bit clear mask
-	VPERM  M2, M3, EX0, T_3
-	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
-	VPERM  M2, M3, EX1, T_4
-	VPERM  M2, M3, EX2, T_5
-	VN     T_0, T_3, T_3
-	VESRLG $4, T_4, T_4
-	VN     T_1, T_5, T_5
-	VN     T_0, T_4, T_4
-	VMRHG  H0_1, T_3, H0_0
-	VMRHG  H1_1, T_4, H1_0
-	VMRHG  H2_1, T_5, H2_0
-	VMRLG  H0_1, T_3, H0_1
-	VMRLG  H1_1, T_4, H1_1
-	VMRLG  H2_1, T_5, H2_1
-	VLEIB  $10, $1, H2_0
-	VLEIB  $10, $1, H2_1
-	VPERM  M4, M5, EX0, T_3
-	VPERM  M4, M5, EX1, T_4
-	VPERM  M4, M5, EX2, T_5
-	VN     T_0, T_3, T_3
-	VESRLG $4, T_4, T_4
-	VN     T_1, T_5, T_5
-	VN     T_0, T_4, T_4
-	VMRHG  V0, T_3, M0
-	VMRHG  V0, T_4, M1
-	VMRHG  V0, T_5, M2
-	VMRLG  V0, T_3, M3
-	VMRLG  V0, T_4, M4
-	VMRLG  V0, T_5, M5
-	VLEIB  $10, $1, M2
-	VLEIB  $10, $1, M5
-
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	CMPBNE R3, $0, loop
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
-	VMRHG  V0, H0_1, H0_0
-	VMRHG  V0, H1_1, H1_0
-	VMRHG  V0, H2_1, H2_0
-	VMRLG  V0, H0_1, H0_1
-	VMRLG  V0, H1_1, H1_1
-	VMRLG  V0, H2_1, H2_1
-
-	// load EX0, EX1, EX2
-	MOVD $·constants<>(SB), R5
-	VLM  (R5), EX0, EX2
-
-	// sum vectors
-	VAQ H0_0, H0_1, H0_0
-	VAQ H1_0, H1_1, H1_0
-	VAQ H2_0, H2_1, H2_0
-
-	// h may be >= 2*(2**130-5) so we need to reduce it again
-	// M0...M4 are used as temps here
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-next:  // carry h1->h2
-	VLEIB  $7, $0x28, T_1
-	VREPIB $4, T_2
-	VGBM   $0x003F, T_3
-	VESRLG $4, T_3
-
-	// byte shift
-	VSRLB T_1, H1_0, T_4
-
-	// bit shift
-	VSRL T_2, T_4, T_4
-
-	// clear h1 carry bits
-	VN T_3, H1_0, H1_0
-
-	// add carry
-	VAQ T_4, H2_0, H2_0
-
-	// h is now < 2*(2**130-5)
-	// pack h into h1 (hi) and h0 (lo)
-	PACK(H0_0, H1_0, H2_0)
-
-	// if h > 2**130-5 then h -= 2**130-5
-	MOD(H0_0, H1_0, T_0, T_1, T_2)
-
-	// h += s
-	MOVD  $·bswapMask<>(SB), R5
-	VL    (R5), T_1
-	VL    16(R4), T_0
-	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
-	VAQ   T_0, H0_0, H0_0
-	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
-	VST   H0_0, (R1)
-	RET
-
-add:
-	// load EX0, EX1, EX2
-	MOVD $·constants<>(SB), R5
-	VLM  (R5), EX0, EX2
-
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
-	VMRHG  V0, H0_1, H0_0
-	VMRHG  V0, H1_1, H1_0
-	VMRHG  V0, H2_1, H2_0
-	VMRLG  V0, H0_1, H0_1
-	VMRLG  V0, H1_1, H1_1
-	VMRLG  V0, H2_1, H2_1
-	CMPBLE R3, $64, b4
-
-b4:
-	CMPBLE R3, $48, b3 // 3 blocks or less
-
-	// 4(3+1) blocks remaining
-	SUB    $49, R3
-	VLM    (R2), M0, M2
-	VLL    R3, 48(R2), M3
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, M3
-	MOVD   $64(R2), R2
-	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
-	VLEIB  $10, $1, H2_0
-	VLEIB  $10, $1, H2_1
-	VZERO  M0
-	VZERO  M1
-	VZERO  M4
-	VZERO  M5
-	VZERO  T_4
-	VZERO  T_10
-	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
-	VLR    T_4, M2
-	VLEIB  $10, $1, M4
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $10, $1, T_10
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
-	VMRHG  V0, H0_1, H0_0
-	VMRHG  V0, H1_1, H1_0
-	VMRHG  V0, H2_1, H2_0
-	VMRLG  V0, H0_1, H0_1
-	VMRLG  V0, H1_1, H1_1
-	VMRLG  V0, H2_1, H2_1
-	SUB    $16, R3
-	CMPBLE R3, $0, square // this condition must always hold true!
-
-b3:
-	CMPBLE R3, $32, b2
-
-	// 3 blocks remaining
-
-	// setup [r²,r]
-	VSLDB $8, R_0, R_0, R_0
-	VSLDB $8, R_1, R_1, R_1
-	VSLDB $8, R_2, R_2, R_2
-	VSLDB $8, R5_1, R5_1, R5_1
-	VSLDB $8, R5_2, R5_2, R5_2
-
-	VLVGG $1, RSAVE_0, R_0
-	VLVGG $1, RSAVE_1, R_1
-	VLVGG $1, RSAVE_2, R_2
-	VLVGG $1, R5SAVE_1, R5_1
-	VLVGG $1, R5SAVE_2, R5_2
-
-	// setup [h0, h1]
-	VSLDB $8, H0_0, H0_0, H0_0
-	VSLDB $8, H1_0, H1_0, H1_0
-	VSLDB $8, H2_0, H2_0, H2_0
-	VO    H0_1, H0_0, H0_0
-	VO    H1_1, H1_0, H1_0
-	VO    H2_1, H2_0, H2_0
-	VZERO H0_1
-	VZERO H1_1
-	VZERO H2_1
-
-	VZERO M0
-	VZERO M1
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-
-	// H*[r**2, r]
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
-
-	SUB    $33, R3
-	VLM    (R2), M0, M1
-	VLL    R3, 32(R2), M2
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, M2
-
-	// H += m0
-	VZERO T_1
-	VZERO T_2
-	VZERO T_3
-	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
-	VLEIB $10, $1, T_3
-	VAG   H0_0, T_1, H0_0
-	VAG   H1_0, T_2, H1_0
-	VAG   H2_0, T_3, H2_0
-
-	VZERO M0
-	VZERO M3
-	VZERO M4
-	VZERO M5
-	VZERO T_10
-
-	// (H+m0)*r
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
-
-	// H += m1
-	VZERO V0
-	VZERO T_1
-	VZERO T_2
-	VZERO T_3
-	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
-	VLEIB $10, $1, T_3
-	VAQ   H0_0, T_1, H0_0
-	VAQ   H1_0, T_2, H1_0
-	VAQ   H2_0, T_3, H2_0
-	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
-
-	// [H, m2] * [r**2, r]
-	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $10, $1, H2_0
-	VZERO  M0
-	VZERO  M1
-	VZERO  M2
-	VZERO  M3
-	VZERO  M4
-	VZERO  M5
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
-	SUB    $16, R3
-	CMPBLE R3, $0, next   // this condition must always hold true!
-
-b2:
-	CMPBLE R3, $16, b1
-
-	// 2 blocks remaining
-
-	// setup [r²,r]
-	VSLDB $8, R_0, R_0, R_0
-	VSLDB $8, R_1, R_1, R_1
-	VSLDB $8, R_2, R_2, R_2
-	VSLDB $8, R5_1, R5_1, R5_1
-	VSLDB $8, R5_2, R5_2, R5_2
-
-	VLVGG $1, RSAVE_0, R_0
-	VLVGG $1, RSAVE_1, R_1
-	VLVGG $1, RSAVE_2, R_2
-	VLVGG $1, R5SAVE_1, R5_1
-	VLVGG $1, R5SAVE_2, R5_2
-
-	// setup [h0, h1]
-	VSLDB $8, H0_0, H0_0, H0_0
-	VSLDB $8, H1_0, H1_0, H1_0
-	VSLDB $8, H2_0, H2_0, H2_0
-	VO    H0_1, H0_0, H0_0
-	VO    H1_1, H1_0, H1_0
-	VO    H2_1, H2_0, H2_0
-	VZERO H0_1
-	VZERO H1_1
-	VZERO H2_1
-
-	VZERO M0
-	VZERO M1
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-
-	// H*[r**2, r]
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
-	VMRHG V0, H0_1, H0_0
-	VMRHG V0, H1_1, H1_0
-	VMRHG V0, H2_1, H2_0
-	VMRLG V0, H0_1, H0_1
-	VMRLG V0, H1_1, H1_1
-	VMRLG V0, H2_1, H2_1
-
-	// move h to the left and 0s at the right
-	VSLDB $8, H0_0, H0_0, H0_0
-	VSLDB $8, H1_0, H1_0, H1_0
-	VSLDB $8, H2_0, H2_0, H2_0
-
-	// get message blocks and append 1 to start
-	SUB    $17, R3
-	VL     (R2), M0
-	VLL    R3, 16(R2), M1
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, M1
-	VZERO  T_6
-	VZERO  T_7
-	VZERO  T_8
-	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
-	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
-	VLEIB  $2, $1, T_8
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $10, $1, T_8
-
-	// add [m0, m1] to h
-	VAG H0_0, T_6, H0_0
-	VAG H1_0, T_7, H1_0
-	VAG H2_0, T_8, H2_0
-
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-	VZERO T_10
-	VZERO M0
-
-	// at this point R_0 .. R5_2 look like [r**2, r]
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
-	SUB    $16, R3, R3
-	CMPBLE R3, $0, next
-
-b1:
-	CMPBLE R3, $0, next
-
-	// 1 block remaining
-
-	// setup [r²,r]
-	VSLDB $8, R_0, R_0, R_0
-	VSLDB $8, R_1, R_1, R_1
-	VSLDB $8, R_2, R_2, R_2
-	VSLDB $8, R5_1, R5_1, R5_1
-	VSLDB $8, R5_2, R5_2, R5_2
-
-	VLVGG $1, RSAVE_0, R_0
-	VLVGG $1, RSAVE_1, R_1
-	VLVGG $1, RSAVE_2, R_2
-	VLVGG $1, R5SAVE_1, R5_1
-	VLVGG $1, R5SAVE_2, R5_2
-
-	// setup [h0, h1]
-	VSLDB $8, H0_0, H0_0, H0_0
-	VSLDB $8, H1_0, H1_0, H1_0
-	VSLDB $8, H2_0, H2_0, H2_0
-	VO    H0_1, H0_0, H0_0
-	VO    H1_1, H1_0, H1_0
-	VO    H2_1, H2_0, H2_0
-	VZERO H0_1
-	VZERO H1_1
-	VZERO H2_1
-
-	VZERO M0
-	VZERO M1
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-
-	// H*[r**2, r]
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-	// set up [0, m0] limbs
-	SUB    $1, R3
-	VLL    R3, (R2), M0
-	ADD    $1, R3
-	MOVBZ  $1, R0
-	CMPBEQ R3, $16, 2(PC)
-	VLVGB  R3, R0, M0
-	VZERO  T_1
-	VZERO  T_2
-	VZERO  T_3
-	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
-	CMPBNE R3, $16, 2(PC)
-	VLEIB  $10, $1, T_3
-
-	// h+m0
-	VAQ H0_0, T_1, H0_0
-	VAQ H1_0, T_2, H1_0
-	VAQ H2_0, T_3, H2_0
-
-	VZERO M0
-	VZERO M1
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-	BR next
-
-square:
-	// setup [r²,r]
-	VSLDB $8, R_0, R_0, R_0
-	VSLDB $8, R_1, R_1, R_1
-	VSLDB $8, R_2, R_2, R_2
-	VSLDB $8, R5_1, R5_1, R5_1
-	VSLDB $8, R5_2, R5_2, R5_2
-
-	VLVGG $1, RSAVE_0, R_0
-	VLVGG $1, RSAVE_1, R_1
-	VLVGG $1, RSAVE_2, R_2
-	VLVGG $1, R5SAVE_1, R5_1
-	VLVGG $1, R5SAVE_2, R5_2
-
-	// setup [h0, h1]
-	VSLDB $8, H0_0, H0_0, H0_0
-	VSLDB $8, H1_0, H1_0, H1_0
-	VSLDB $8, H2_0, H2_0, H2_0
-	VO    H0_1, H0_0, H0_0
-	VO    H1_1, H1_0, H1_0
-	VO    H2_1, H2_0, H2_0
-	VZERO H0_1
-	VZERO H1_1
-	VZERO H2_1
-
-	VZERO M0
-	VZERO M1
-	VZERO M2
-	VZERO M3
-	VZERO M4
-	VZERO M5
-
-	// (h0*r**2) + (h1*r)
-	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
-	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-	BR next
-
-TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
-	MOVD  $x-24(SP), R1
-	XC    $24, 0(R1), 0(R1) // clear the storage
-	MOVD  $2, R0            // R0 is the number of double words stored -1
-	WORD  $0xB2B01000       // STFLE 0(R1)
-	XOR   R0, R0            // reset the value of R0
-	MOVBZ z-8(SP), R1
-	AND   $0x01, R1
-	BEQ   novmsl
-
-vectorinstalled:
-	// check if the vector instruction has been enabled
-	VLEIB  $0, $0xF, V16
-	VLGVB  $0, V16, R1
-	CMPBNE R1, $0xF, novmsl
-	MOVB   $1, ret+0(FP)    // have vx
-	RET
-
-novmsl:
-	MOVB $0, ret+0(FP) // no vx
-	RET
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
deleted file mode 100644
index 4c96147..0000000
--- a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package salsa provides low-level access to functions in the Salsa family.
-package salsa // import "golang.org/x/crypto/salsa20/salsa"
-
-// Sigma is the Salsa20 constant for 256-bit keys.
-var Sigma = [16]byte{'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k'}
-
-// HSalsa20 applies the HSalsa20 core function to a 16-byte input in, 32-byte
-// key k, and 16-byte constant c, and puts the result into the 32-byte array
-// out.
-func HSalsa20(out *[32]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
-	x0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
-	x1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
-	x2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
-	x3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
-	x4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
-	x5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
-	x6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
-	x7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
-	x8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
-	x9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
-	x10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
-	x11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
-	x12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
-	x13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
-	x14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
-	x15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
-
-	for i := 0; i < 20; i += 2 {
-		u := x0 + x12
-		x4 ^= u<<7 | u>>(32-7)
-		u = x4 + x0
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x4
-		x12 ^= u<<13 | u>>(32-13)
-		u = x12 + x8
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x1
-		x9 ^= u<<7 | u>>(32-7)
-		u = x9 + x5
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x9
-		x1 ^= u<<13 | u>>(32-13)
-		u = x1 + x13
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x6
-		x14 ^= u<<7 | u>>(32-7)
-		u = x14 + x10
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x14
-		x6 ^= u<<13 | u>>(32-13)
-		u = x6 + x2
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x11
-		x3 ^= u<<7 | u>>(32-7)
-		u = x3 + x15
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x3
-		x11 ^= u<<13 | u>>(32-13)
-		u = x11 + x7
-		x15 ^= u<<18 | u>>(32-18)
-
-		u = x0 + x3
-		x1 ^= u<<7 | u>>(32-7)
-		u = x1 + x0
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x1
-		x3 ^= u<<13 | u>>(32-13)
-		u = x3 + x2
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x4
-		x6 ^= u<<7 | u>>(32-7)
-		u = x6 + x5
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x6
-		x4 ^= u<<13 | u>>(32-13)
-		u = x4 + x7
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x9
-		x11 ^= u<<7 | u>>(32-7)
-		u = x11 + x10
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x11
-		x9 ^= u<<13 | u>>(32-13)
-		u = x9 + x8
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x14
-		x12 ^= u<<7 | u>>(32-7)
-		u = x12 + x15
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x12
-		x14 ^= u<<13 | u>>(32-13)
-		u = x14 + x13
-		x15 ^= u<<18 | u>>(32-18)
-	}
-	out[0] = byte(x0)
-	out[1] = byte(x0 >> 8)
-	out[2] = byte(x0 >> 16)
-	out[3] = byte(x0 >> 24)
-
-	out[4] = byte(x5)
-	out[5] = byte(x5 >> 8)
-	out[6] = byte(x5 >> 16)
-	out[7] = byte(x5 >> 24)
-
-	out[8] = byte(x10)
-	out[9] = byte(x10 >> 8)
-	out[10] = byte(x10 >> 16)
-	out[11] = byte(x10 >> 24)
-
-	out[12] = byte(x15)
-	out[13] = byte(x15 >> 8)
-	out[14] = byte(x15 >> 16)
-	out[15] = byte(x15 >> 24)
-
-	out[16] = byte(x6)
-	out[17] = byte(x6 >> 8)
-	out[18] = byte(x6 >> 16)
-	out[19] = byte(x6 >> 24)
-
-	out[20] = byte(x7)
-	out[21] = byte(x7 >> 8)
-	out[22] = byte(x7 >> 16)
-	out[23] = byte(x7 >> 24)
-
-	out[24] = byte(x8)
-	out[25] = byte(x8 >> 8)
-	out[26] = byte(x8 >> 16)
-	out[27] = byte(x8 >> 24)
-
-	out[28] = byte(x9)
-	out[29] = byte(x9 >> 8)
-	out[30] = byte(x9 >> 16)
-	out[31] = byte(x9 >> 24)
-}
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s
deleted file mode 100644
index 22afbdc..0000000
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s
+++ /dev/null
@@ -1,889 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64,!appengine,!gccgo
-
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
-
-// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(SP); hence the non-obvious frame size.
-TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
-	MOVQ out+0(FP),DI
-	MOVQ in+8(FP),SI
-	MOVQ n+16(FP),DX
-	MOVQ nonce+24(FP),CX
-	MOVQ key+32(FP),R8
-
-	MOVQ SP,R12
-	MOVQ SP,R9
-	ADDQ $31, R9
-	ANDQ $~31, R9
-	MOVQ R9, SP
-
-	MOVQ DX,R9
-	MOVQ CX,DX
-	MOVQ R8,R10
-	CMPQ R9,$0
-	JBE DONE
-	START:
-	MOVL 20(R10),CX
-	MOVL 0(R10),R8
-	MOVL 0(DX),AX
-	MOVL 16(R10),R11
-	MOVL CX,0(SP)
-	MOVL R8, 4 (SP)
-	MOVL AX, 8 (SP)
-	MOVL R11, 12 (SP)
-	MOVL 8(DX),CX
-	MOVL 24(R10),R8
-	MOVL 4(R10),AX
-	MOVL 4(DX),R11
-	MOVL CX,16(SP)
-	MOVL R8, 20 (SP)
-	MOVL AX, 24 (SP)
-	MOVL R11, 28 (SP)
-	MOVL 12(DX),CX
-	MOVL 12(R10),DX
-	MOVL 28(R10),R8
-	MOVL 8(R10),AX
-	MOVL DX,32(SP)
-	MOVL CX, 36 (SP)
-	MOVL R8, 40 (SP)
-	MOVL AX, 44 (SP)
-	MOVQ $1634760805,DX
-	MOVQ $857760878,CX
-	MOVQ $2036477234,R8
-	MOVQ $1797285236,AX
-	MOVL DX,48(SP)
-	MOVL CX, 52 (SP)
-	MOVL R8, 56 (SP)
-	MOVL AX, 60 (SP)
-	CMPQ R9,$256
-	JB BYTESBETWEEN1AND255
-	MOVOA 48(SP),X0
-	PSHUFL $0X55,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X3
-	PSHUFL $0X00,X0,X0
-	MOVOA X1,64(SP)
-	MOVOA X2,80(SP)
-	MOVOA X3,96(SP)
-	MOVOA X0,112(SP)
-	MOVOA 0(SP),X0
-	PSHUFL $0XAA,X0,X1
-	PSHUFL $0XFF,X0,X2
-	PSHUFL $0X00,X0,X3
-	PSHUFL $0X55,X0,X0
-	MOVOA X1,128(SP)
-	MOVOA X2,144(SP)
-	MOVOA X3,160(SP)
-	MOVOA X0,176(SP)
-	MOVOA 16(SP),X0
-	PSHUFL $0XFF,X0,X1
-	PSHUFL $0X55,X0,X2
-	PSHUFL $0XAA,X0,X0
-	MOVOA X1,192(SP)
-	MOVOA X2,208(SP)
-	MOVOA X0,224(SP)
-	MOVOA 32(SP),X0
-	PSHUFL $0X00,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X0
-	MOVOA X1,240(SP)
-	MOVOA X2,256(SP)
-	MOVOA X0,272(SP)
-	BYTESATLEAST256:
-	MOVL 16(SP),DX
-	MOVL  36 (SP),CX
-	MOVL DX,288(SP)
-	MOVL CX,304(SP)
-	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 292 (SP)
-	MOVL CX, 308 (SP)
-	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 296 (SP)
-	MOVL CX, 312 (SP)
-	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 300 (SP)
-	MOVL CX, 316 (SP)
-	ADDQ $1,DX
-	SHLQ $32,CX
-	ADDQ CX,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX,16(SP)
-	MOVL CX, 36 (SP)
-	MOVQ R9,352(SP)
-	MOVQ $20,DX
-	MOVOA 64(SP),X0
-	MOVOA 80(SP),X1
-	MOVOA 96(SP),X2
-	MOVOA 256(SP),X3
-	MOVOA 272(SP),X4
-	MOVOA 128(SP),X5
-	MOVOA 144(SP),X6
-	MOVOA 176(SP),X7
-	MOVOA 192(SP),X8
-	MOVOA 208(SP),X9
-	MOVOA 224(SP),X10
-	MOVOA 304(SP),X11
-	MOVOA 112(SP),X12
-	MOVOA 160(SP),X13
-	MOVOA 240(SP),X14
-	MOVOA 288(SP),X15
-	MAINLOOP1:
-	MOVOA X1,320(SP)
-	MOVOA X2,336(SP)
-	MOVOA X13,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X14
-	PSRLL $25,X2
-	PXOR X2,X14
-	MOVOA X7,X1
-	PADDL X0,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X11
-	PSRLL $25,X2
-	PXOR X2,X11
-	MOVOA X12,X1
-	PADDL X14,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X15
-	PSRLL $23,X2
-	PXOR X2,X15
-	MOVOA X0,X1
-	PADDL X11,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X9
-	PSRLL $23,X2
-	PXOR X2,X9
-	MOVOA X14,X1
-	PADDL X15,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X13
-	PSRLL $19,X2
-	PXOR X2,X13
-	MOVOA X11,X1
-	PADDL X9,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X7
-	PSRLL $19,X2
-	PXOR X2,X7
-	MOVOA X15,X1
-	PADDL X13,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA 320(SP),X1
-	MOVOA X12,320(SP)
-	MOVOA X9,X2
-	PADDL X7,X2
-	MOVOA X2,X12
-	PSLLL $18,X2
-	PXOR X2,X0
-	PSRLL $14,X12
-	PXOR X12,X0
-	MOVOA X5,X2
-	PADDL X1,X2
-	MOVOA X2,X12
-	PSLLL $7,X2
-	PXOR X2,X3
-	PSRLL $25,X12
-	PXOR X12,X3
-	MOVOA 336(SP),X2
-	MOVOA X0,336(SP)
-	MOVOA X6,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X4
-	PSRLL $25,X12
-	PXOR X12,X4
-	MOVOA X1,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X10
-	PSRLL $23,X12
-	PXOR X12,X10
-	MOVOA X2,X0
-	PADDL X4,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X8
-	PSRLL $23,X12
-	PXOR X12,X8
-	MOVOA X3,X0
-	PADDL X10,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X5
-	PSRLL $19,X12
-	PXOR X12,X5
-	MOVOA X4,X0
-	PADDL X8,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X6
-	PSRLL $19,X12
-	PXOR X12,X6
-	MOVOA X10,X0
-	PADDL X5,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA 320(SP),X0
-	MOVOA X1,320(SP)
-	MOVOA X4,X1
-	PADDL X0,X1
-	MOVOA X1,X12
-	PSLLL $7,X1
-	PXOR X1,X7
-	PSRLL $25,X12
-	PXOR X12,X7
-	MOVOA X8,X1
-	PADDL X6,X1
-	MOVOA X1,X12
-	PSLLL $18,X1
-	PXOR X1,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 336(SP),X12
-	MOVOA X2,336(SP)
-	MOVOA X14,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X5
-	PSRLL $25,X2
-	PXOR X2,X5
-	MOVOA X0,X1
-	PADDL X7,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X10
-	PSRLL $23,X2
-	PXOR X2,X10
-	MOVOA X12,X1
-	PADDL X5,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X8
-	PSRLL $23,X2
-	PXOR X2,X8
-	MOVOA X7,X1
-	PADDL X10,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X4
-	PSRLL $19,X2
-	PXOR X2,X4
-	MOVOA X5,X1
-	PADDL X8,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X14
-	PSRLL $19,X2
-	PXOR X2,X14
-	MOVOA X10,X1
-	PADDL X4,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X0
-	PSRLL $14,X2
-	PXOR X2,X0
-	MOVOA 320(SP),X1
-	MOVOA X0,320(SP)
-	MOVOA X8,X0
-	PADDL X14,X0
-	MOVOA X0,X2
-	PSLLL $18,X0
-	PXOR X0,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA X11,X0
-	PADDL X1,X0
-	MOVOA X0,X2
-	PSLLL $7,X0
-	PXOR X0,X6
-	PSRLL $25,X2
-	PXOR X2,X6
-	MOVOA 336(SP),X2
-	MOVOA X12,336(SP)
-	MOVOA X3,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X13
-	PSRLL $25,X12
-	PXOR X12,X13
-	MOVOA X1,X0
-	PADDL X6,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X15
-	PSRLL $23,X12
-	PXOR X12,X15
-	MOVOA X2,X0
-	PADDL X13,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X9
-	PSRLL $23,X12
-	PXOR X12,X9
-	MOVOA X6,X0
-	PADDL X15,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X11
-	PSRLL $19,X12
-	PXOR X12,X11
-	MOVOA X13,X0
-	PADDL X9,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X3
-	PSRLL $19,X12
-	PXOR X12,X3
-	MOVOA X15,X0
-	PADDL X11,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA X9,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 320(SP),X12
-	MOVOA 336(SP),X0
-	SUBQ $2,DX
-	JA MAINLOOP1
-	PADDL 112(SP),X12
-	PADDL 176(SP),X7
-	PADDL 224(SP),X10
-	PADDL 272(SP),X4
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 0(SI),DX
-	XORL 4(SI),CX
-	XORL 8(SI),R8
-	XORL 12(SI),R9
-	MOVL DX,0(DI)
-	MOVL CX,4(DI)
-	MOVL R8,8(DI)
-	MOVL R9,12(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 64(SI),DX
-	XORL 68(SI),CX
-	XORL 72(SI),R8
-	XORL 76(SI),R9
-	MOVL DX,64(DI)
-	MOVL CX,68(DI)
-	MOVL R8,72(DI)
-	MOVL R9,76(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 128(SI),DX
-	XORL 132(SI),CX
-	XORL 136(SI),R8
-	XORL 140(SI),R9
-	MOVL DX,128(DI)
-	MOVL CX,132(DI)
-	MOVL R8,136(DI)
-	MOVL R9,140(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	XORL 192(SI),DX
-	XORL 196(SI),CX
-	XORL 200(SI),R8
-	XORL 204(SI),R9
-	MOVL DX,192(DI)
-	MOVL CX,196(DI)
-	MOVL R8,200(DI)
-	MOVL R9,204(DI)
-	PADDL 240(SP),X14
-	PADDL 64(SP),X0
-	PADDL 128(SP),X5
-	PADDL 192(SP),X8
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 16(SI),DX
-	XORL 20(SI),CX
-	XORL 24(SI),R8
-	XORL 28(SI),R9
-	MOVL DX,16(DI)
-	MOVL CX,20(DI)
-	MOVL R8,24(DI)
-	MOVL R9,28(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 80(SI),DX
-	XORL 84(SI),CX
-	XORL 88(SI),R8
-	XORL 92(SI),R9
-	MOVL DX,80(DI)
-	MOVL CX,84(DI)
-	MOVL R8,88(DI)
-	MOVL R9,92(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 144(SI),DX
-	XORL 148(SI),CX
-	XORL 152(SI),R8
-	XORL 156(SI),R9
-	MOVL DX,144(DI)
-	MOVL CX,148(DI)
-	MOVL R8,152(DI)
-	MOVL R9,156(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	XORL 208(SI),DX
-	XORL 212(SI),CX
-	XORL 216(SI),R8
-	XORL 220(SI),R9
-	MOVL DX,208(DI)
-	MOVL CX,212(DI)
-	MOVL R8,216(DI)
-	MOVL R9,220(DI)
-	PADDL 288(SP),X15
-	PADDL 304(SP),X11
-	PADDL 80(SP),X1
-	PADDL 144(SP),X6
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 32(SI),DX
-	XORL 36(SI),CX
-	XORL 40(SI),R8
-	XORL 44(SI),R9
-	MOVL DX,32(DI)
-	MOVL CX,36(DI)
-	MOVL R8,40(DI)
-	MOVL R9,44(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 96(SI),DX
-	XORL 100(SI),CX
-	XORL 104(SI),R8
-	XORL 108(SI),R9
-	MOVL DX,96(DI)
-	MOVL CX,100(DI)
-	MOVL R8,104(DI)
-	MOVL R9,108(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 160(SI),DX
-	XORL 164(SI),CX
-	XORL 168(SI),R8
-	XORL 172(SI),R9
-	MOVL DX,160(DI)
-	MOVL CX,164(DI)
-	MOVL R8,168(DI)
-	MOVL R9,172(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	XORL 224(SI),DX
-	XORL 228(SI),CX
-	XORL 232(SI),R8
-	XORL 236(SI),R9
-	MOVL DX,224(DI)
-	MOVL CX,228(DI)
-	MOVL R8,232(DI)
-	MOVL R9,236(DI)
-	PADDL 160(SP),X13
-	PADDL 208(SP),X9
-	PADDL 256(SP),X3
-	PADDL 96(SP),X2
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 48(SI),DX
-	XORL 52(SI),CX
-	XORL 56(SI),R8
-	XORL 60(SI),R9
-	MOVL DX,48(DI)
-	MOVL CX,52(DI)
-	MOVL R8,56(DI)
-	MOVL R9,60(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 112(SI),DX
-	XORL 116(SI),CX
-	XORL 120(SI),R8
-	XORL 124(SI),R9
-	MOVL DX,112(DI)
-	MOVL CX,116(DI)
-	MOVL R8,120(DI)
-	MOVL R9,124(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 176(SI),DX
-	XORL 180(SI),CX
-	XORL 184(SI),R8
-	XORL 188(SI),R9
-	MOVL DX,176(DI)
-	MOVL CX,180(DI)
-	MOVL R8,184(DI)
-	MOVL R9,188(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	XORL 240(SI),DX
-	XORL 244(SI),CX
-	XORL 248(SI),R8
-	XORL 252(SI),R9
-	MOVL DX,240(DI)
-	MOVL CX,244(DI)
-	MOVL R8,248(DI)
-	MOVL R9,252(DI)
-	MOVQ 352(SP),R9
-	SUBQ $256,R9
-	ADDQ $256,SI
-	ADDQ $256,DI
-	CMPQ R9,$256
-	JAE BYTESATLEAST256
-	CMPQ R9,$0
-	JBE DONE
-	BYTESBETWEEN1AND255:
-	CMPQ R9,$64
-	JAE NOCOPY
-	MOVQ DI,DX
-	LEAQ 360(SP),DI
-	MOVQ R9,CX
-	REP; MOVSB
-	LEAQ 360(SP),DI
-	LEAQ 360(SP),SI
-	NOCOPY:
-	MOVQ R9,352(SP)
-	MOVOA 48(SP),X0
-	MOVOA 0(SP),X1
-	MOVOA 16(SP),X2
-	MOVOA 32(SP),X3
-	MOVOA X1,X4
-	MOVQ $20,CX
-	MAINLOOP2:
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	SUBQ $4,CX
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PXOR X7,X7
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	JA MAINLOOP2
-	PADDL 48(SP),X0
-	PADDL 0(SP),X1
-	PADDL 16(SP),X2
-	PADDL 32(SP),X3
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 0(SI),CX
-	XORL 48(SI),R8
-	XORL 32(SI),R9
-	XORL 16(SI),AX
-	MOVL CX,0(DI)
-	MOVL R8,48(DI)
-	MOVL R9,32(DI)
-	MOVL AX,16(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 20(SI),CX
-	XORL 4(SI),R8
-	XORL 52(SI),R9
-	XORL 36(SI),AX
-	MOVL CX,20(DI)
-	MOVL R8,4(DI)
-	MOVL R9,52(DI)
-	MOVL AX,36(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 40(SI),CX
-	XORL 24(SI),R8
-	XORL 8(SI),R9
-	XORL 56(SI),AX
-	MOVL CX,40(DI)
-	MOVL R8,24(DI)
-	MOVL R9,8(DI)
-	MOVL AX,56(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	XORL 60(SI),CX
-	XORL 44(SI),R8
-	XORL 28(SI),R9
-	XORL 12(SI),AX
-	MOVL CX,60(DI)
-	MOVL R8,44(DI)
-	MOVL R9,28(DI)
-	MOVL AX,12(DI)
-	MOVQ 352(SP),R9
-	MOVL 16(SP),CX
-	MOVL  36 (SP),R8
-	ADDQ $1,CX
-	SHLQ $32,R8
-	ADDQ R8,CX
-	MOVQ CX,R8
-	SHRQ $32,R8
-	MOVL CX,16(SP)
-	MOVL R8, 36 (SP)
-	CMPQ R9,$64
-	JA BYTESATLEAST65
-	JAE BYTESATLEAST64
-	MOVQ DI,SI
-	MOVQ DX,DI
-	MOVQ R9,CX
-	REP; MOVSB
-	BYTESATLEAST64:
-	DONE:
-	MOVQ R12,SP
-	RET
-	BYTESATLEAST65:
-	SUBQ $64,R9
-	ADDQ $64,DI
-	ADDQ $64,SI
-	JMP BYTESBETWEEN1AND255
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go
deleted file mode 100644
index 9bfc092..0000000
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package salsa
-
-// Core208 applies the Salsa20/8 core function to the 64-byte array in and puts
-// the result into the 64-byte array out. The input and output may be the same array.
-func Core208(out *[64]byte, in *[64]byte) {
-	j0 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
-	j1 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
-	j2 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
-	j3 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
-	j4 := uint32(in[16]) | uint32(in[17])<<8 | uint32(in[18])<<16 | uint32(in[19])<<24
-	j5 := uint32(in[20]) | uint32(in[21])<<8 | uint32(in[22])<<16 | uint32(in[23])<<24
-	j6 := uint32(in[24]) | uint32(in[25])<<8 | uint32(in[26])<<16 | uint32(in[27])<<24
-	j7 := uint32(in[28]) | uint32(in[29])<<8 | uint32(in[30])<<16 | uint32(in[31])<<24
-	j8 := uint32(in[32]) | uint32(in[33])<<8 | uint32(in[34])<<16 | uint32(in[35])<<24
-	j9 := uint32(in[36]) | uint32(in[37])<<8 | uint32(in[38])<<16 | uint32(in[39])<<24
-	j10 := uint32(in[40]) | uint32(in[41])<<8 | uint32(in[42])<<16 | uint32(in[43])<<24
-	j11 := uint32(in[44]) | uint32(in[45])<<8 | uint32(in[46])<<16 | uint32(in[47])<<24
-	j12 := uint32(in[48]) | uint32(in[49])<<8 | uint32(in[50])<<16 | uint32(in[51])<<24
-	j13 := uint32(in[52]) | uint32(in[53])<<8 | uint32(in[54])<<16 | uint32(in[55])<<24
-	j14 := uint32(in[56]) | uint32(in[57])<<8 | uint32(in[58])<<16 | uint32(in[59])<<24
-	j15 := uint32(in[60]) | uint32(in[61])<<8 | uint32(in[62])<<16 | uint32(in[63])<<24
-
-	x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
-	x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
-
-	for i := 0; i < 8; i += 2 {
-		u := x0 + x12
-		x4 ^= u<<7 | u>>(32-7)
-		u = x4 + x0
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x4
-		x12 ^= u<<13 | u>>(32-13)
-		u = x12 + x8
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x1
-		x9 ^= u<<7 | u>>(32-7)
-		u = x9 + x5
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x9
-		x1 ^= u<<13 | u>>(32-13)
-		u = x1 + x13
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x6
-		x14 ^= u<<7 | u>>(32-7)
-		u = x14 + x10
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x14
-		x6 ^= u<<13 | u>>(32-13)
-		u = x6 + x2
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x11
-		x3 ^= u<<7 | u>>(32-7)
-		u = x3 + x15
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x3
-		x11 ^= u<<13 | u>>(32-13)
-		u = x11 + x7
-		x15 ^= u<<18 | u>>(32-18)
-
-		u = x0 + x3
-		x1 ^= u<<7 | u>>(32-7)
-		u = x1 + x0
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x1
-		x3 ^= u<<13 | u>>(32-13)
-		u = x3 + x2
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x4
-		x6 ^= u<<7 | u>>(32-7)
-		u = x6 + x5
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x6
-		x4 ^= u<<13 | u>>(32-13)
-		u = x4 + x7
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x9
-		x11 ^= u<<7 | u>>(32-7)
-		u = x11 + x10
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x11
-		x9 ^= u<<13 | u>>(32-13)
-		u = x9 + x8
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x14
-		x12 ^= u<<7 | u>>(32-7)
-		u = x12 + x15
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x12
-		x14 ^= u<<13 | u>>(32-13)
-		u = x14 + x13
-		x15 ^= u<<18 | u>>(32-18)
-	}
-	x0 += j0
-	x1 += j1
-	x2 += j2
-	x3 += j3
-	x4 += j4
-	x5 += j5
-	x6 += j6
-	x7 += j7
-	x8 += j8
-	x9 += j9
-	x10 += j10
-	x11 += j11
-	x12 += j12
-	x13 += j13
-	x14 += j14
-	x15 += j15
-
-	out[0] = byte(x0)
-	out[1] = byte(x0 >> 8)
-	out[2] = byte(x0 >> 16)
-	out[3] = byte(x0 >> 24)
-
-	out[4] = byte(x1)
-	out[5] = byte(x1 >> 8)
-	out[6] = byte(x1 >> 16)
-	out[7] = byte(x1 >> 24)
-
-	out[8] = byte(x2)
-	out[9] = byte(x2 >> 8)
-	out[10] = byte(x2 >> 16)
-	out[11] = byte(x2 >> 24)
-
-	out[12] = byte(x3)
-	out[13] = byte(x3 >> 8)
-	out[14] = byte(x3 >> 16)
-	out[15] = byte(x3 >> 24)
-
-	out[16] = byte(x4)
-	out[17] = byte(x4 >> 8)
-	out[18] = byte(x4 >> 16)
-	out[19] = byte(x4 >> 24)
-
-	out[20] = byte(x5)
-	out[21] = byte(x5 >> 8)
-	out[22] = byte(x5 >> 16)
-	out[23] = byte(x5 >> 24)
-
-	out[24] = byte(x6)
-	out[25] = byte(x6 >> 8)
-	out[26] = byte(x6 >> 16)
-	out[27] = byte(x6 >> 24)
-
-	out[28] = byte(x7)
-	out[29] = byte(x7 >> 8)
-	out[30] = byte(x7 >> 16)
-	out[31] = byte(x7 >> 24)
-
-	out[32] = byte(x8)
-	out[33] = byte(x8 >> 8)
-	out[34] = byte(x8 >> 16)
-	out[35] = byte(x8 >> 24)
-
-	out[36] = byte(x9)
-	out[37] = byte(x9 >> 8)
-	out[38] = byte(x9 >> 16)
-	out[39] = byte(x9 >> 24)
-
-	out[40] = byte(x10)
-	out[41] = byte(x10 >> 8)
-	out[42] = byte(x10 >> 16)
-	out[43] = byte(x10 >> 24)
-
-	out[44] = byte(x11)
-	out[45] = byte(x11 >> 8)
-	out[46] = byte(x11 >> 16)
-	out[47] = byte(x11 >> 24)
-
-	out[48] = byte(x12)
-	out[49] = byte(x12 >> 8)
-	out[50] = byte(x12 >> 16)
-	out[51] = byte(x12 >> 24)
-
-	out[52] = byte(x13)
-	out[53] = byte(x13 >> 8)
-	out[54] = byte(x13 >> 16)
-	out[55] = byte(x13 >> 24)
-
-	out[56] = byte(x14)
-	out[57] = byte(x14 >> 8)
-	out[58] = byte(x14 >> 16)
-	out[59] = byte(x14 >> 24)
-
-	out[60] = byte(x15)
-	out[61] = byte(x15 >> 8)
-	out[62] = byte(x15 >> 16)
-	out[63] = byte(x15 >> 24)
-}
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go
deleted file mode 100644
index f9269c3..0000000
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64,!appengine,!gccgo
-
-package salsa
-
-// This function is implemented in salsa2020_amd64.s.
-
-//go:noescape
-
-func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter
-// contains the raw salsa20 counter bytes (both nonce and block counter).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
-	if len(in) == 0 {
-		return
-	}
-	_ = out[len(in)-1]
-	salsa2020XORKeyStream(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0])
-}
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go
deleted file mode 100644
index 22126d1..0000000
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64 appengine gccgo
-
-package salsa
-
-const rounds = 20
-
-// core applies the Salsa20 core function to 16-byte input in, 32-byte key k,
-// and 16-byte constant c, and puts the result into 64-byte array out.
-func core(out *[64]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
-	j0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
-	j1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
-	j2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
-	j3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
-	j4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
-	j5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
-	j6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
-	j7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
-	j8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
-	j9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
-	j10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
-	j11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
-	j12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
-	j13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
-	j14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
-	j15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
-
-	x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
-	x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
-
-	for i := 0; i < rounds; i += 2 {
-		u := x0 + x12
-		x4 ^= u<<7 | u>>(32-7)
-		u = x4 + x0
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x4
-		x12 ^= u<<13 | u>>(32-13)
-		u = x12 + x8
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x1
-		x9 ^= u<<7 | u>>(32-7)
-		u = x9 + x5
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x9
-		x1 ^= u<<13 | u>>(32-13)
-		u = x1 + x13
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x6
-		x14 ^= u<<7 | u>>(32-7)
-		u = x14 + x10
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x14
-		x6 ^= u<<13 | u>>(32-13)
-		u = x6 + x2
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x11
-		x3 ^= u<<7 | u>>(32-7)
-		u = x3 + x15
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x3
-		x11 ^= u<<13 | u>>(32-13)
-		u = x11 + x7
-		x15 ^= u<<18 | u>>(32-18)
-
-		u = x0 + x3
-		x1 ^= u<<7 | u>>(32-7)
-		u = x1 + x0
-		x2 ^= u<<9 | u>>(32-9)
-		u = x2 + x1
-		x3 ^= u<<13 | u>>(32-13)
-		u = x3 + x2
-		x0 ^= u<<18 | u>>(32-18)
-
-		u = x5 + x4
-		x6 ^= u<<7 | u>>(32-7)
-		u = x6 + x5
-		x7 ^= u<<9 | u>>(32-9)
-		u = x7 + x6
-		x4 ^= u<<13 | u>>(32-13)
-		u = x4 + x7
-		x5 ^= u<<18 | u>>(32-18)
-
-		u = x10 + x9
-		x11 ^= u<<7 | u>>(32-7)
-		u = x11 + x10
-		x8 ^= u<<9 | u>>(32-9)
-		u = x8 + x11
-		x9 ^= u<<13 | u>>(32-13)
-		u = x9 + x8
-		x10 ^= u<<18 | u>>(32-18)
-
-		u = x15 + x14
-		x12 ^= u<<7 | u>>(32-7)
-		u = x12 + x15
-		x13 ^= u<<9 | u>>(32-9)
-		u = x13 + x12
-		x14 ^= u<<13 | u>>(32-13)
-		u = x14 + x13
-		x15 ^= u<<18 | u>>(32-18)
-	}
-	x0 += j0
-	x1 += j1
-	x2 += j2
-	x3 += j3
-	x4 += j4
-	x5 += j5
-	x6 += j6
-	x7 += j7
-	x8 += j8
-	x9 += j9
-	x10 += j10
-	x11 += j11
-	x12 += j12
-	x13 += j13
-	x14 += j14
-	x15 += j15
-
-	out[0] = byte(x0)
-	out[1] = byte(x0 >> 8)
-	out[2] = byte(x0 >> 16)
-	out[3] = byte(x0 >> 24)
-
-	out[4] = byte(x1)
-	out[5] = byte(x1 >> 8)
-	out[6] = byte(x1 >> 16)
-	out[7] = byte(x1 >> 24)
-
-	out[8] = byte(x2)
-	out[9] = byte(x2 >> 8)
-	out[10] = byte(x2 >> 16)
-	out[11] = byte(x2 >> 24)
-
-	out[12] = byte(x3)
-	out[13] = byte(x3 >> 8)
-	out[14] = byte(x3 >> 16)
-	out[15] = byte(x3 >> 24)
-
-	out[16] = byte(x4)
-	out[17] = byte(x4 >> 8)
-	out[18] = byte(x4 >> 16)
-	out[19] = byte(x4 >> 24)
-
-	out[20] = byte(x5)
-	out[21] = byte(x5 >> 8)
-	out[22] = byte(x5 >> 16)
-	out[23] = byte(x5 >> 24)
-
-	out[24] = byte(x6)
-	out[25] = byte(x6 >> 8)
-	out[26] = byte(x6 >> 16)
-	out[27] = byte(x6 >> 24)
-
-	out[28] = byte(x7)
-	out[29] = byte(x7 >> 8)
-	out[30] = byte(x7 >> 16)
-	out[31] = byte(x7 >> 24)
-
-	out[32] = byte(x8)
-	out[33] = byte(x8 >> 8)
-	out[34] = byte(x8 >> 16)
-	out[35] = byte(x8 >> 24)
-
-	out[36] = byte(x9)
-	out[37] = byte(x9 >> 8)
-	out[38] = byte(x9 >> 16)
-	out[39] = byte(x9 >> 24)
-
-	out[40] = byte(x10)
-	out[41] = byte(x10 >> 8)
-	out[42] = byte(x10 >> 16)
-	out[43] = byte(x10 >> 24)
-
-	out[44] = byte(x11)
-	out[45] = byte(x11 >> 8)
-	out[46] = byte(x11 >> 16)
-	out[47] = byte(x11 >> 24)
-
-	out[48] = byte(x12)
-	out[49] = byte(x12 >> 8)
-	out[50] = byte(x12 >> 16)
-	out[51] = byte(x12 >> 24)
-
-	out[52] = byte(x13)
-	out[53] = byte(x13 >> 8)
-	out[54] = byte(x13 >> 16)
-	out[55] = byte(x13 >> 24)
-
-	out[56] = byte(x14)
-	out[57] = byte(x14 >> 8)
-	out[58] = byte(x14 >> 16)
-	out[59] = byte(x14 >> 24)
-
-	out[60] = byte(x15)
-	out[61] = byte(x15 >> 8)
-	out[62] = byte(x15 >> 16)
-	out[63] = byte(x15 >> 24)
-}
-
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter
-// contains the raw salsa20 counter bytes (both nonce and block counter).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
-	var block [64]byte
-	var counterCopy [16]byte
-	copy(counterCopy[:], counter[:])
-
-	for len(in) >= 64 {
-		core(&block, &counterCopy, key, &Sigma)
-		for i, x := range block {
-			out[i] = in[i] ^ x
-		}
-		u := uint32(1)
-		for i := 8; i < 16; i++ {
-			u += uint32(counterCopy[i])
-			counterCopy[i] = byte(u)
-			u >>= 8
-		}
-		in = in[64:]
-		out = out[64:]
-	}
-
-	if len(in) > 0 {
-		core(&block, &counterCopy, key, &Sigma)
-		for i, v := range in {
-			out[i] = v ^ block[i]
-		}
-	}
-}
diff --git a/vendor/golang.org/x/sys/LICENSE b/vendor/golang.org/x/sys/LICENSE
deleted file mode 100644
index 6a66aea..0000000
--- a/vendor/golang.org/x/sys/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/golang.org/x/sys/PATENTS b/vendor/golang.org/x/sys/PATENTS
deleted file mode 100644
index 7330990..0000000
--- a/vendor/golang.org/x/sys/PATENTS
+++ /dev/null
@@ -1,22 +0,0 @@
-Additional IP Rights Grant (Patents)
-
-"This implementation" means the copyrightable works distributed by
-Google as part of the Go project.
-
-Google hereby grants to You a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer and otherwise run, modify and propagate the contents of this
-implementation of Go, where such license applies only to those patent
-claims, both currently owned or controlled by Google and acquired in
-the future, licensable by Google that are necessarily infringed by this
-implementation of Go.  This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation.  If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of Go or any code incorporated within this
-implementation of Go constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of Go
-shall terminate as of the date such litigation is filed.
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
deleted file mode 100644
index 3d88f86..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package cpu implements processor feature detection for
-// various CPU architectures.
-package cpu
-
-// CacheLinePad is used to pad structs to avoid false sharing.
-type CacheLinePad struct{ _ [cacheLineSize]byte }
-
-// X86 contains the supported CPU features of the
-// current X86/AMD64 platform. If the current platform
-// is not X86/AMD64 then all feature flags are false.
-//
-// X86 is padded to avoid false sharing. Further the HasAVX
-// and HasAVX2 are only set if the OS supports XMM and YMM
-// registers in addition to the CPUID feature bit being set.
-var X86 struct {
-	_            CacheLinePad
-	HasAES       bool // AES hardware implementation (AES NI)
-	HasADX       bool // Multi-precision add-carry instruction extensions
-	HasAVX       bool // Advanced vector extension
-	HasAVX2      bool // Advanced vector extension 2
-	HasBMI1      bool // Bit manipulation instruction set 1
-	HasBMI2      bool // Bit manipulation instruction set 2
-	HasERMS      bool // Enhanced REP for MOVSB and STOSB
-	HasFMA       bool // Fused-multiply-add instructions
-	HasOSXSAVE   bool // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
-	HasPCLMULQDQ bool // PCLMULQDQ instruction - most often used for AES-GCM
-	HasPOPCNT    bool // Hamming weight instruction POPCNT.
-	HasSSE2      bool // Streaming SIMD extension 2 (always available on amd64)
-	HasSSE3      bool // Streaming SIMD extension 3
-	HasSSSE3     bool // Supplemental streaming SIMD extension 3
-	HasSSE41     bool // Streaming SIMD extension 4 and 4.1
-	HasSSE42     bool // Streaming SIMD extension 4 and 4.2
-	_            CacheLinePad
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm.go b/vendor/golang.org/x/sys/cpu/cpu_arm.go
deleted file mode 100644
index d93036f..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_arm.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package cpu
-
-const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
deleted file mode 100644
index 1d2ab29..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_arm64.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package cpu
-
-const cacheLineSize = 64
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
deleted file mode 100644
index f7cb469..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 amd64 amd64p32
-// +build !gccgo
-
-package cpu
-
-// cpuid is implemented in cpu_x86.s for gc compiler
-// and in cpu_gccgo.c for gccgo.
-func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
-
-// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler
-// and in cpu_gccgo.c for gccgo.
-func xgetbv() (eax, edx uint32)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo.c b/vendor/golang.org/x/sys/cpu/cpu_gccgo.c
deleted file mode 100644
index e363c7d..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 amd64 amd64p32
-// +build gccgo
-
-#include <cpuid.h>
-#include <stdint.h>
-
-// Need to wrap __get_cpuid_count because it's declared as static.
-int
-gccgoGetCpuidCount(uint32_t leaf, uint32_t subleaf,
-                   uint32_t *eax, uint32_t *ebx,
-                   uint32_t *ecx, uint32_t *edx)
-{
-	return __get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx);
-}
-
-// xgetbv reads the contents of an XCR (Extended Control Register)
-// specified in the ECX register into registers EDX:EAX.
-// Currently, the only supported value for XCR is 0.
-//
-// TODO: Replace with a better alternative:
-//
-//     #include <xsaveintrin.h>
-//
-//     #pragma GCC target("xsave")
-//
-//     void gccgoXgetbv(uint32_t *eax, uint32_t *edx) {
-//       unsigned long long x = _xgetbv(0);
-//       *eax = x & 0xffffffff;
-//       *edx = (x >> 32) & 0xffffffff;
-//     }
-//
-// Note that _xgetbv is defined starting with GCC 8.
-void
-gccgoXgetbv(uint32_t *eax, uint32_t *edx)
-{
-	__asm("  xorl %%ecx, %%ecx\n"
-	      "  xgetbv"
-	    : "=a"(*eax), "=d"(*edx));
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo.go
deleted file mode 100644
index ba49b91..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo.go
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 amd64 amd64p32
-// +build gccgo
-
-package cpu
-
-//extern gccgoGetCpuidCount
-func gccgoGetCpuidCount(eaxArg, ecxArg uint32, eax, ebx, ecx, edx *uint32)
-
-func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) {
-	var a, b, c, d uint32
-	gccgoGetCpuidCount(eaxArg, ecxArg, &a, &b, &c, &d)
-	return a, b, c, d
-}
-
-//extern gccgoXgetbv
-func gccgoXgetbv(eax, edx *uint32)
-
-func xgetbv() (eax, edx uint32) {
-	var a, d uint32
-	gccgoXgetbv(&a, &d)
-	return a, d
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
deleted file mode 100644
index 6165f12..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips64 mips64le
-
-package cpu
-
-const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
deleted file mode 100644
index 1269eee..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips mipsle
-
-package cpu
-
-const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
deleted file mode 100644
index d10759a..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-
-package cpu
-
-const cacheLineSize = 128
diff --git a/vendor/golang.org/x/sys/cpu/cpu_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_s390x.go
deleted file mode 100644
index 684c4f0..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_s390x.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package cpu
-
-const cacheLineSize = 256
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go
deleted file mode 100644
index 71e288b..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.go
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 amd64 amd64p32
-
-package cpu
-
-const cacheLineSize = 64
-
-func init() {
-	maxID, _, _, _ := cpuid(0, 0)
-
-	if maxID < 1 {
-		return
-	}
-
-	_, _, ecx1, edx1 := cpuid(1, 0)
-	X86.HasSSE2 = isSet(26, edx1)
-
-	X86.HasSSE3 = isSet(0, ecx1)
-	X86.HasPCLMULQDQ = isSet(1, ecx1)
-	X86.HasSSSE3 = isSet(9, ecx1)
-	X86.HasFMA = isSet(12, ecx1)
-	X86.HasSSE41 = isSet(19, ecx1)
-	X86.HasSSE42 = isSet(20, ecx1)
-	X86.HasPOPCNT = isSet(23, ecx1)
-	X86.HasAES = isSet(25, ecx1)
-	X86.HasOSXSAVE = isSet(27, ecx1)
-
-	osSupportsAVX := false
-	// For XGETBV, OSXSAVE bit is required and sufficient.
-	if X86.HasOSXSAVE {
-		eax, _ := xgetbv()
-		// Check if XMM and YMM registers have OS support.
-		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
-	}
-
-	X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
-
-	if maxID < 7 {
-		return
-	}
-
-	_, ebx7, _, _ := cpuid(7, 0)
-	X86.HasBMI1 = isSet(3, ebx7)
-	X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
-	X86.HasBMI2 = isSet(8, ebx7)
-	X86.HasERMS = isSet(9, ebx7)
-	X86.HasADX = isSet(19, ebx7)
-}
-
-func isSet(bitpos uint, value uint32) bool {
-	return value&(1<<bitpos) != 0
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.s b/vendor/golang.org/x/sys/cpu/cpu_x86.s
deleted file mode 100644
index 47f0841..0000000
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.s
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 amd64 amd64p32
-// +build !gccgo
-
-#include "textflag.h"
-
-// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·cpuid(SB), NOSPLIT, $0-24
-	MOVL eaxArg+0(FP), AX
-	MOVL ecxArg+4(FP), CX
-	CPUID
-	MOVL AX, eax+8(FP)
-	MOVL BX, ebx+12(FP)
-	MOVL CX, ecx+16(FP)
-	MOVL DX, edx+20(FP)
-	RET
-
-// func xgetbv() (eax, edx uint32)
-TEXT ·xgetbv(SB),NOSPLIT,$0-8
-	MOVL $0, CX
-	XGETBV
-	MOVL AX, eax+0(FP)
-	MOVL DX, edx+4(FP)
-	RET
diff --git a/vendor/vendor.json b/vendor/vendor.json
deleted file mode 100644
index 34cddfd..0000000
--- a/vendor/vendor.json
+++ /dev/null
@@ -1,145 +0,0 @@
-{
-	"comment": "",
-	"ignore": "test",
-	"package": [
-		{
-			"checksumSHA1": "0RF8xZ7NSiFIkL3klNaK28xQwB4=",
-			"path": "github.com/Yawning/chacha20",
-			"revision": "e3b1f968fc6397b51d963fee8ec8711a47bc0ce8",
-			"revisionTime": "2017-09-04T08:51:04Z"
-		},
-		{
-			"checksumSHA1": "q6pAq51zm9fz4vA3THiNg2URqJc=",
-			"path": "github.com/dgryski/go-camellia",
-			"revision": "3be6b3054dd1c0f7191ea97b30721edba4f6d874",
-			"revisionTime": "2014-04-12T17:44:59Z"
-		},
-		{
-			"checksumSHA1": "bQ+Q97zPBOdpKLPzug5+CxL3z4U=",
-			"path": "github.com/dgryski/go-idea",
-			"revision": "d2fb45a411fb694e8400cf80e91a25146a2af86e",
-			"revisionTime": "2017-03-06T09:12:26Z"
-		},
-		{
-			"checksumSHA1": "yocEZGpHJf++i6qggDdXwJfXuF4=",
-			"path": "github.com/dgryski/go-rc2",
-			"revision": "8a9021637152186df738b1ec376caf2100fef194",
-			"revisionTime": "2015-06-21T09:53:37Z"
-		},
-		{
-			"checksumSHA1": "A+UhRkiCx/xhLhy7ieikM/Lq85o=",
-			"path": "github.com/nadoo/conflag",
-			"revision": "42f66d5216f887f3f59e7c28024f4e31ed19a36e",
-			"revisionTime": "2017-08-25T12:31:51Z"
-		},
-		{
-			"checksumSHA1": "DaSQ92dyMuaO3JwJ2OoQTcHffGo=",
-			"path": "github.com/shadowsocks/go-shadowsocks2/core",
-			"revision": "7358a5c14bd056cd67e5b9aedc0bfa588e752152",
-			"revisionTime": "2018-04-15T15:21:30Z"
-		},
-		{
-			"checksumSHA1": "RIUpyaP+TZHTg2A1j25l72Nmwh4=",
-			"path": "github.com/shadowsocks/go-shadowsocks2/shadowaead",
-			"revision": "7358a5c14bd056cd67e5b9aedc0bfa588e752152",
-			"revisionTime": "2018-04-15T15:21:30Z"
-		},
-		{
-			"checksumSHA1": "qqBwHsGHTgCv8jbva9nw7jFA20g=",
-			"path": "github.com/shadowsocks/go-shadowsocks2/shadowstream",
-			"revision": "7358a5c14bd056cd67e5b9aedc0bfa588e752152",
-			"revisionTime": "2018-04-15T15:21:30Z"
-		},
-		{
-			"checksumSHA1": "7utuujqVopLPMVeiWdellqo+CPY=",
-			"path": "github.com/sun8911879/shadowsocksR",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "Mbr7G18zxd8AyfFmbVTq65WLJFQ=",
-			"path": "github.com/sun8911879/shadowsocksR/obfs",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "F3vZU5pZc0nWJBw8IhMHYsE1mss=",
-			"path": "github.com/sun8911879/shadowsocksR/protocol",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "ktmUgflTDcfGas7paJC7UBTpudo=",
-			"path": "github.com/sun8911879/shadowsocksR/ssr",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "Atqh0gW7Lgxlcb0670QN4dH7tVE=",
-			"path": "github.com/sun8911879/shadowsocksR/tools",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "4+eltbDtzXNpwE4P2UxBPznMPNE=",
-			"path": "github.com/sun8911879/shadowsocksR/tools/leakybuf",
-			"revision": "da20fda4804f730414a28fdb6d6c59bf25d3b186",
-			"revisionTime": "2018-05-29T04:20:39Z"
-		},
-		{
-			"checksumSHA1": "oVPHWesOmZ02vLq2fglGvf+AMgk=",
-			"path": "golang.org/x/crypto/blowfish",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "TT1rac6kpQp2vz24m5yDGUNQ/QQ=",
-			"path": "golang.org/x/crypto/cast5",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "c4bxPizB4/jUUxpw9XAdbZsA0Aw=",
-			"path": "golang.org/x/crypto/chacha20poly1305",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "4D8hxMIaSDEW5pCQk22Xj4DcDh4=",
-			"path": "golang.org/x/crypto/hkdf",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "SEPNUEkZaGKt3dkO3B13pRAp6ho=",
-			"path": "golang.org/x/crypto/internal/chacha20",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "/U7f2gaH6DnEmLguVLDbipU6kXU=",
-			"path": "golang.org/x/crypto/internal/subtle",
-			"revision": "a49355c7e3f8fe157a85be2f77e6e269a0f89602",
-			"revisionTime": "2018-06-20T09:14:27Z"
-		},
-		{
-			"checksumSHA1": "vKbPb9fpjCdzuoOvajOJnYfHG2g=",
-			"path": "golang.org/x/crypto/poly1305",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "cRCpfAgTnlIDpdcfjivbiv+9YJU=",
-			"path": "golang.org/x/crypto/salsa20/salsa",
-			"revision": "e6b1200d11b062b0e525730044b7de555b7ed66d",
-			"revisionTime": "2018-06-13T20:00:12Z"
-		},
-		{
-			"checksumSHA1": "REkmyB368pIiip76LiqMLspgCRk=",
-			"path": "golang.org/x/sys/cpu",
-			"revision": "151529c776cdc58ddbe7963ba9af779f3577b419",
-			"revisionTime": "2018-04-04T02:59:38Z"
-		}
-	],
-	"rootPath": "github.com/nadoo/glider"
-}