diff --git a/proxy/vmess/addr.go b/proxy/vmess/addr.go index d89f963..dfc494d 100644 --- a/proxy/vmess/addr.go +++ b/proxy/vmess/addr.go @@ -5,15 +5,15 @@ import ( "strconv" ) -// AType is vmess addr type -type AType byte +// Atyp is vmess addr type +type Atyp byte // Atyp const ( - ATypeErr AType = 0 - ATypeIP4 AType = 1 - ATypeDomain AType = 2 - ATypeIP6 AType = 3 + AtypErr Atyp = 0 + AtypIP4 Atyp = 1 + AtypDomain Atyp = 2 + AtypIP6 Atyp = 3 ) // Addr is vmess addr @@ -22,9 +22,9 @@ type Addr []byte // Port is vmess addr port type Port uint16 -// ParseAddr parses the address in string s. return AType = 0 if error. -func ParseAddr(s string) (AType, Addr, Port, error) { - var atype AType +// ParseAddr parses the address in string s +func ParseAddr(s string) (Atyp, Addr, Port, error) { + var atyp Atyp var addr Addr host, port, err := net.SplitHostPort(s) @@ -35,11 +35,11 @@ func ParseAddr(s string) (AType, Addr, Port, error) { if ip := net.ParseIP(host); ip != nil { if ip4 := ip.To4(); ip4 != nil { addr = make([]byte, net.IPv4len) - atype = ATypeIP4 + atyp = AtypIP4 copy(addr[:], ip4) } else { addr = make([]byte, net.IPv6len) - atype = ATypeIP6 + atyp = AtypIP6 copy(addr[:], ip) } } else { @@ -47,7 +47,7 @@ func ParseAddr(s string) (AType, Addr, Port, error) { return 0, nil, 0, err } addr = make([]byte, 1+len(host)) - atype = ATypeDomain + atyp = AtypDomain addr[0] = byte(len(host)) copy(addr[1:], host) } @@ -57,5 +57,5 @@ func ParseAddr(s string) (AType, Addr, Port, error) { return 0, nil, 0, err } - return atype, addr, Port(portnum), err + return atyp, addr, Port(portnum), err } diff --git a/proxy/vmess/chunk.go b/proxy/vmess/chunk.go new file mode 100644 index 0000000..233540c --- /dev/null +++ b/proxy/vmess/chunk.go @@ -0,0 +1,3 @@ +package vmess + +// chunk: plain, AES-128-CFB, AES-128-GCM, ChaCha20-Poly1305 diff --git a/proxy/vmess/client.go b/proxy/vmess/client.go index 251df63..17e44e0 100644 --- a/proxy/vmess/client.go +++ b/proxy/vmess/client.go @@ -51,9 +51,9 @@ type Conn struct { user *User security byte - atype AType - addr Addr - port Port + atyp Atyp + addr Addr + port Port reqBodyIV [16]byte reqBodyKey [16]byte @@ -99,7 +99,7 @@ func (c *Client) NewConn(rc net.Conn, target string) (*Conn, error) { conn := &Conn{user: c.users[r], security: c.security} var err error - conn.atype, conn.addr, conn.port, err = ParseAddr(target) + conn.atyp, conn.addr, conn.port, err = ParseAddr(target) if err != nil { return nil, err } @@ -161,7 +161,7 @@ func (c *Conn) EncodeRequest() ([]byte, error) { // target binary.Write(buf, binary.BigEndian, uint16(c.port)) // port - buf.WriteByte(byte(c.atype)) // atype + buf.WriteByte(byte(c.atyp)) // atyp buf.Write(c.addr) // addr // padding @@ -211,7 +211,6 @@ func (c *Conn) DecodeRespHeader() error { c.connected = true return nil - } func (c *Conn) Read(b []byte) (n int, err error) { diff --git a/vendor/github.com/Yawning/chacha20/LICENSE b/vendor/github.com/Yawning/chacha20/LICENSE new file mode 100644 index 0000000..6ca207e --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/LICENSE @@ -0,0 +1,122 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. 
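(Briefly back to the proxy/vmess changes above before the rest of the vendored files: ParseAddr in addr.go and the "target" section of EncodeRequest in client.go together define the wire layout: a big-endian uint16 port, the atyp byte, then the raw address bytes, where IPv4 is 4 bytes, a domain is a length byte plus the name, and IPv6 is 16 bytes. A minimal stand-alone sketch of that layout follows; the encodeTarget helper is hypothetical and not part of the diff.)

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"net"
	"strconv"
)

const (
	atypIP4    = 1
	atypDomain = 2
	atypIP6    = 3
)

// encodeTarget mirrors ParseAddr plus the "target" section of
// EncodeRequest: port (big-endian uint16), atyp byte, raw address bytes.
func encodeTarget(s string) ([]byte, error) {
	host, portStr, err := net.SplitHostPort(s)
	if err != nil {
		return nil, err
	}
	portNum, err := strconv.ParseUint(portStr, 10, 16)
	if err != nil {
		return nil, err
	}

	var atyp byte
	var addr []byte
	if ip := net.ParseIP(host); ip != nil {
		if ip4 := ip.To4(); ip4 != nil {
			atyp, addr = atypIP4, ip4 // 4 raw bytes
		} else {
			atyp, addr = atypIP6, ip.To16() // 16 raw bytes
		}
	} else {
		if len(host) > 255 {
			return nil, fmt.Errorf("domain too long: %q", host)
		}
		atyp = atypDomain
		addr = append([]byte{byte(len(host))}, host...) // length-prefixed name
	}

	buf := new(bytes.Buffer)
	binary.Write(buf, binary.BigEndian, uint16(portNum)) // port
	buf.WriteByte(atyp)                                  // atyp
	buf.Write(addr)                                      // addr
	return buf.Bytes(), nil
}

func main() {
	b, _ := encodeTarget("example.com:443")
	fmt.Printf("% x\n", b) // 01 bb 02 0b 65 78 61 6d 70 6c 65 2e 63 6f 6d
}
```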
CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. 
+ Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + diff --git a/vendor/github.com/Yawning/chacha20/README.md b/vendor/github.com/Yawning/chacha20/README.md new file mode 100644 index 0000000..9080a84 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/README.md @@ -0,0 +1,14 @@ +### chacha20 - ChaCha20 +#### Yawning Angel (yawning at schwanenlied dot me) + +Yet another Go ChaCha20 implementation. Everything else I found was slow, +didn't support all the variants I need to use, or relied on cgo to go fast. + +Features: + + * 20 round, 256 bit key only. Everything else is pointless and stupid. + * IETF 96 bit nonce variant. + * XChaCha 24 byte nonce variant. + * SSE2 and AVX2 support on amd64 targets. + * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20. + diff --git a/vendor/github.com/Yawning/chacha20/chacha20.go b/vendor/github.com/Yawning/chacha20/chacha20.go new file mode 100644 index 0000000..07d5e4b --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20.go @@ -0,0 +1,273 @@ +// chacha20.go - A ChaCha stream cipher implementation. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" + "errors" + "math" + "runtime" +) + +const ( + // KeySize is the ChaCha20 key size in bytes. + KeySize = 32 + + // NonceSize is the ChaCha20 nonce size in bytes. + NonceSize = 8 + + // INonceSize is the IETF ChaCha20 nonce size in bytes. + INonceSize = 12 + + // XNonceSize is the XChaCha20 nonce size in bytes. + XNonceSize = 24 + + // HNonceSize is the HChaCha20 nonce size in bytes. + HNonceSize = 16 + + // BlockSize is the ChaCha20 block size in bytes. + BlockSize = 64 + + stateSize = 16 + chachaRounds = 20 + + // The constant "expand 32-byte k" as little endian uint32s. + sigma0 = uint32(0x61707865) + sigma1 = uint32(0x3320646e) + sigma2 = uint32(0x79622d32) + sigma3 = uint32(0x6b206574) +) + +var ( + // ErrInvalidKey is the error returned when the key is invalid. + ErrInvalidKey = errors.New("key length must be KeySize bytes") + + // ErrInvalidNonce is the error returned when the nonce is invalid. + ErrInvalidNonce = errors.New("nonce length must be NonceSize/INonceSize/XNonceSize bytes") + + // ErrInvalidCounter is the error returned when the counter is invalid. + ErrInvalidCounter = errors.New("block counter is invalid (out of range)") + + useUnsafe = false + usingVectors = false + blocksFn = blocksRef +) + +// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and +// nonce. +type Cipher struct { + state [stateSize]uint32 + + buf [BlockSize]byte + off int + ietf bool +} + +// Reset zeros the key data so that it will no longer appear in the process's +// memory. +func (c *Cipher) Reset() { + for i := range c.state { + c.state[i] = 0 + } + for i := range c.buf { + c.buf[i] = 0 + } +} + +// XORKeyStream sets dst to the result of XORing src with the key stream. Dst +// and src may be the same slice but otherwise should not overlap. 
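Before the XORKeyStream implementation below, a minimal usage sketch of this package's public API as declared in this file (NewCipher appears further down in the diff; the all-zero key and nonce are demo values only):

```go
package main

import (
	"fmt"

	"github.com/Yawning/chacha20"
)

func main() {
	key := make([]byte, chacha20.KeySize)      // 32-byte key (demo value)
	nonce := make([]byte, chacha20.INonceSize) // 12-byte IETF nonce (demo value)

	c, err := chacha20.NewCipher(key, nonce)
	if err != nil {
		panic(err)
	}
	defer c.Reset() // zero the key data when done

	msg := []byte("attack at dawn")
	ct := make([]byte, len(msg))
	c.XORKeyStream(ct, msg) // encrypt

	// Decryption is the same XOR, re-keyed to the start of the keystream.
	d, _ := chacha20.NewCipher(key, nonce)
	pt := make([]byte, len(ct))
	d.XORKeyStream(pt, ct)
	fmt.Printf("%s\n", pt) // attack at dawn
}
```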
+func (c *Cipher) XORKeyStream(dst, src []byte) { + if len(dst) < len(src) { + src = src[:len(dst)] + } + + for remaining := len(src); remaining > 0; { + // Process multiple blocks at once. + if c.off == BlockSize { + nrBlocks := remaining / BlockSize + directBytes := nrBlocks * BlockSize + if nrBlocks > 0 { + blocksFn(&c.state, src, dst, nrBlocks, c.ietf) + remaining -= directBytes + if remaining == 0 { + return + } + dst = dst[directBytes:] + src = src[directBytes:] + } + + // If there's a partial block, generate 1 block of keystream into + // the internal buffer. + blocksFn(&c.state, nil, c.buf[:], 1, c.ietf) + c.off = 0 + } + + // Process partial blocks from the buffered keystream. + toXor := BlockSize - c.off + if remaining < toXor { + toXor = remaining + } + if toXor > 0 { + for i, v := range src[:toXor] { + dst[i] = v ^ c.buf[c.off+i] + } + dst = dst[toXor:] + src = src[toXor:] + + remaining -= toXor + c.off += toXor + } + } +} + +// KeyStream sets dst to the raw keystream. +func (c *Cipher) KeyStream(dst []byte) { + for remaining := len(dst); remaining > 0; { + // Process multiple blocks at once. + if c.off == BlockSize { + nrBlocks := remaining / BlockSize + directBytes := nrBlocks * BlockSize + if nrBlocks > 0 { + blocksFn(&c.state, nil, dst, nrBlocks, c.ietf) + remaining -= directBytes + if remaining == 0 { + return + } + dst = dst[directBytes:] + } + + // If there's a partial block, generate 1 block of keystream into + // the internal buffer. + blocksFn(&c.state, nil, c.buf[:], 1, c.ietf) + c.off = 0 + } + + // Process partial blocks from the buffered keystream. + toCopy := BlockSize - c.off + if remaining < toCopy { + toCopy = remaining + } + if toCopy > 0 { + copy(dst[:toCopy], c.buf[c.off:c.off+toCopy]) + dst = dst[toCopy:] + remaining -= toCopy + c.off += toCopy + } + } +} + +// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key +// and nonce. +func (c *Cipher) ReKey(key, nonce []byte) error { + if len(key) != KeySize { + return ErrInvalidKey + } + + switch len(nonce) { + case NonceSize: + case INonceSize: + case XNonceSize: + var subkey [KeySize]byte + var subnonce [HNonceSize]byte + copy(subnonce[:], nonce[0:16]) + HChaCha(key, &subnonce, &subkey) + key = subkey[:] + nonce = nonce[16:24] + defer func() { + for i := range subkey { + subkey[i] = 0 + } + }() + default: + return ErrInvalidNonce + } + + c.Reset() + c.state[0] = sigma0 + c.state[1] = sigma1 + c.state[2] = sigma2 + c.state[3] = sigma3 + c.state[4] = binary.LittleEndian.Uint32(key[0:4]) + c.state[5] = binary.LittleEndian.Uint32(key[4:8]) + c.state[6] = binary.LittleEndian.Uint32(key[8:12]) + c.state[7] = binary.LittleEndian.Uint32(key[12:16]) + c.state[8] = binary.LittleEndian.Uint32(key[16:20]) + c.state[9] = binary.LittleEndian.Uint32(key[20:24]) + c.state[10] = binary.LittleEndian.Uint32(key[24:28]) + c.state[11] = binary.LittleEndian.Uint32(key[28:32]) + c.state[12] = 0 + if len(nonce) == INonceSize { + c.state[13] = binary.LittleEndian.Uint32(nonce[0:4]) + c.state[14] = binary.LittleEndian.Uint32(nonce[4:8]) + c.state[15] = binary.LittleEndian.Uint32(nonce[8:12]) + c.ietf = true + } else { + c.state[13] = 0 + c.state[14] = binary.LittleEndian.Uint32(nonce[0:4]) + c.state[15] = binary.LittleEndian.Uint32(nonce[4:8]) + c.ietf = false + } + c.off = BlockSize + return nil + +} + +// Seek sets the block counter to a given offset. 
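The Seek implementation follows immediately below. It is block-granular (the counter addresses whole 64-byte blocks), so byte-granular random access needs a small shim: seek to the containing block, then burn the intra-block remainder through KeyStream. A sketch, assuming a hypothetical chachautil package:

```go
package chachautil

import "github.com/Yawning/chacha20"

// seekToByte positions c at an arbitrary byte offset in the keystream.
// Hypothetical helper: Seek only addresses whole 64-byte blocks, so the
// remainder within the block is discarded via KeyStream.
func seekToByte(c *chacha20.Cipher, off uint64) error {
	if err := c.Seek(off / chacha20.BlockSize); err != nil {
		return err
	}
	if rem := off % chacha20.BlockSize; rem != 0 {
		var discard [chacha20.BlockSize]byte
		c.KeyStream(discard[:rem])
	}
	return nil
}
```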
+func (c *Cipher) Seek(blockCounter uint64) error { + if c.ietf { + if blockCounter > math.MaxUint32 { + return ErrInvalidCounter + } + c.state[12] = uint32(blockCounter) + } else { + c.state[12] = uint32(blockCounter) + c.state[13] = uint32(blockCounter >> 32) + } + c.off = BlockSize + return nil +} + +// NewCipher returns a new ChaCha20/XChaCha20 instance. +func NewCipher(key, nonce []byte) (*Cipher, error) { + c := new(Cipher) + if err := c.ReKey(key, nonce); err != nil { + return nil, err + } + return c, nil +} + +// HChaCha is the HChaCha20 hash function used to make XChaCha. +func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) { + var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded. + x[0] = binary.LittleEndian.Uint32(key[0:4]) + x[1] = binary.LittleEndian.Uint32(key[4:8]) + x[2] = binary.LittleEndian.Uint32(key[8:12]) + x[3] = binary.LittleEndian.Uint32(key[12:16]) + x[4] = binary.LittleEndian.Uint32(key[16:20]) + x[5] = binary.LittleEndian.Uint32(key[20:24]) + x[6] = binary.LittleEndian.Uint32(key[24:28]) + x[7] = binary.LittleEndian.Uint32(key[28:32]) + x[8] = binary.LittleEndian.Uint32(nonce[0:4]) + x[9] = binary.LittleEndian.Uint32(nonce[4:8]) + x[10] = binary.LittleEndian.Uint32(nonce[8:12]) + x[11] = binary.LittleEndian.Uint32(nonce[12:16]) + hChaChaRef(&x, out) +} + +func init() { + switch runtime.GOARCH { + case "386", "amd64": + // Abuse unsafe to skip calling binary.LittleEndian.PutUint32 + // in the critical path. This is a big boost on systems that are + // little endian and not overly picky about alignment. + useUnsafe = true + } +} + +var _ cipher.Stream = (*Cipher)(nil) diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.go b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go new file mode 100644 index 0000000..05adad1 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go @@ -0,0 +1,95 @@ +// chacha20_amd64.go - AMD64 optimized chacha20. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +// +build amd64,!gccgo,!appengine + +package chacha20 + +import ( + "math" +) + +var usingAVX2 = false + +func blocksAmd64SSE2(x *uint32, inp, outp *byte, nrBlocks uint) + +func blocksAmd64AVX2(x *uint32, inp, outp *byte, nrBlocks uint) + +func cpuidAmd64(cpuidParams *uint32) + +func xgetbv0Amd64(xcrVec *uint32) + +func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) { + // Probably unneeded, but stating this explicitly simplifies the assembly. + if nrBlocks == 0 { + return + } + + if isIetf { + var totalBlocks uint64 + totalBlocks = uint64(x[12]) + uint64(nrBlocks) + if totalBlocks > math.MaxUint32 { + panic("chacha20: Exceeded keystream per nonce limit") + } + } + + if in == nil { + for i := range out { + out[i] = 0 + } + in = out + } + + // Pointless to call the AVX2 code for just a single block, since half of + // the output gets discarded... + if usingAVX2 && nrBlocks > 1 { + blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks)) + } else { + blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks)) + } +} + +func supportsAVX2() bool { + // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + const ( + osXsaveBit = 1 << 27 + avx2Bit = 1 << 5 + ) + + // Check to see if CPUID actually supports the leaf that indicates AVX2. 
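+	// (The CPUID AVX2 bit alone is not enough: the OSXSAVE and XGETBV
+	// checks below confirm the OS actually saves and restores XMM/YMM
+	// state across context switches, since AVX2 instructions fault when
+	// that state management is disabled.)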
+ // CPUID.(EAX=0H, ECX=0H) >= 7 + regs := [4]uint32{0x00} + cpuidAmd64(®s[0]) + if regs[0] < 7 { + return false + } + + // Check to see if the OS knows how to save/restore XMM/YMM state. + // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 + regs = [4]uint32{0x01} + cpuidAmd64(®s[0]) + if regs[2]&osXsaveBit == 0 { + return false + } + xcrRegs := [2]uint32{} + xgetbv0Amd64(&xcrRegs[0]) + if xcrRegs[0]&6 != 6 { + return false + } + + // Check for AVX2 support. + // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 + regs = [4]uint32{0x07} + cpuidAmd64(®s[0]) + return regs[1]&avx2Bit != 0 +} + +func init() { + blocksFn = blocksAmd64 + usingVectors = true + usingAVX2 = supportsAVX2() +} diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.py b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py new file mode 100644 index 0000000..3bfebf4 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py @@ -0,0 +1,1295 @@ +#!/usr/bin/env python3 +# +# To the extent possible under law, Yawning Angel has waived all copyright +# and related or neighboring rights to chacha20, using the Creative +# Commons "CC0" public domain dedication. See LICENSE or +# for full details. + +# +# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics. +# The least terrible/retarded option is to use a Python code generator, so +# that's what I did. +# +# Code based on Ted Krovetz's vec128 C implementation, with corrections +# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and +# output pointers. +# +# Dependencies: https://github.com/Maratyszcza/PeachPy +# +# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py +# + +from peachpy import * +from peachpy.x86_64 import * + +x = Argument(ptr(uint32_t)) +inp = Argument(ptr(const_uint8_t)) +outp = Argument(ptr(uint8_t)) +nrBlocks = Argument(ptr(size_t)) + +# +# SSE2 helper functions. A temporary register is explicitly passed in because +# the main fast loop uses every single register (and even spills) so manual +# control is needed. +# +# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like +# in the C code, but the C code has the luxury of an optimizer reordering +# everything, while this does not. +# + +def ROTW16_sse2(tmp, d): + MOVDQA(tmp, d) + PSLLD(tmp, 16) + PSRLD(d, 16) + PXOR(d, tmp) + +def ROTW12_sse2(tmp, b): + MOVDQA(tmp, b) + PSLLD(tmp, 12) + PSRLD(b, 20) + PXOR(b, tmp) + +def ROTW8_sse2(tmp, d): + MOVDQA(tmp, d) + PSLLD(tmp, 8) + PSRLD(d, 24) + PXOR(d, tmp) + +def ROTW7_sse2(tmp, b): + MOVDQA(tmp, b) + PSLLD(tmp, 7) + PSRLD(b, 25) + PXOR(b, tmp) + +def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3): + MOVDQU(tmp, [inp+d]) + PXOR(tmp, v0) + MOVDQU([outp+d], tmp) + MOVDQU(tmp, [inp+d+16]) + PXOR(tmp, v1) + MOVDQU([outp+d+16], tmp) + MOVDQU(tmp, [inp+d+32]) + PXOR(tmp, v2) + MOVDQU([outp+d+32], tmp) + MOVDQU(tmp, [inp+d+48]) + PXOR(tmp, v3) + MOVDQU([outp+d+48], tmp) + +# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will +# process 4/2/1 blocks at a time. +with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)): + reg_x = GeneralPurposeRegister64() + reg_inp = GeneralPurposeRegister64() + reg_outp = GeneralPurposeRegister64() + reg_blocks = GeneralPurposeRegister64() + reg_sp_save = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_x, x) + LOAD.ARGUMENT(reg_inp, inp) + LOAD.ARGUMENT(reg_outp, outp) + LOAD.ARGUMENT(reg_blocks, nrBlocks) + + # Align the stack to a 32 byte boundary. 
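+    # (MOVDQA faults on unaligned operands and the incoming stack pointer
+    # carries no 32 byte alignment guarantee: the original rsp is saved in
+    # reg_sp_save, rsp is rounded down to a 32 byte boundary by the AND
+    # mask, and fresh space is reserved below it so the scratch slots
+    # cannot alias the caller's frame. reg_sp_save is restored just before
+    # RETURN.)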
+ MOV(reg_sp_save, registers.rsp) + AND(registers.rsp, 0xffffffffffffffe0) + SUB(registers.rsp, 0x20) + + # Build the counter increment vector on the stack, and allocate the scratch + # space + xmm_v0 = XMMRegister() + PXOR(xmm_v0, xmm_v0) + SUB(registers.rsp, 16+16) + MOVDQA([registers.rsp], xmm_v0) + reg_tmp = GeneralPurposeRegister32() + MOV(reg_tmp, 0x00000001) + MOV([registers.rsp], reg_tmp) + mem_one = [registers.rsp] # (Stack) Counter increment vector + mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space. + + mem_s0 = [reg_x] # (Memory) Cipher state [0..3] + mem_s1 = [reg_x+16] # (Memory) Cipher state [4..7] + mem_s2 = [reg_x+32] # (Memory) Cipher state [8..11] + mem_s3 = [reg_x+48] # (Memory) Cipher state [12..15] + + # xmm_v0 allocated above... + xmm_v1 = XMMRegister() + xmm_v2 = XMMRegister() + xmm_v3 = XMMRegister() + + xmm_v4 = XMMRegister() + xmm_v5 = XMMRegister() + xmm_v6 = XMMRegister() + xmm_v7 = XMMRegister() + + xmm_v8 = XMMRegister() + xmm_v9 = XMMRegister() + xmm_v10 = XMMRegister() + xmm_v11 = XMMRegister() + + xmm_v12 = XMMRegister() + xmm_v13 = XMMRegister() + xmm_v14 = XMMRegister() + xmm_v15 = XMMRegister() + + xmm_tmp = xmm_v12 + + # + # 4 blocks at a time. + # + + reg_rounds = GeneralPurposeRegister64() + + vector_loop4 = Loop() + SUB(reg_blocks, 4) + JB(vector_loop4.end) + with vector_loop4: + MOVDQU(xmm_v0, mem_s0) + MOVDQU(xmm_v1, mem_s1) + MOVDQU(xmm_v2, mem_s2) + MOVDQU(xmm_v3, mem_s3) + + MOVDQA(xmm_v4, xmm_v0) + MOVDQA(xmm_v5, xmm_v1) + MOVDQA(xmm_v6, xmm_v2) + MOVDQA(xmm_v7, xmm_v3) + PADDQ(xmm_v7, mem_one) + + MOVDQA(xmm_v8, xmm_v0) + MOVDQA(xmm_v9, xmm_v1) + MOVDQA(xmm_v10, xmm_v2) + MOVDQA(xmm_v11, xmm_v7) + PADDQ(xmm_v11, mem_one) + + MOVDQA(xmm_v12, xmm_v0) + MOVDQA(xmm_v13, xmm_v1) + MOVDQA(xmm_v14, xmm_v2) + MOVDQA(xmm_v15, xmm_v11) + PADDQ(xmm_v15, mem_one) + + MOV(reg_rounds, 20) + rounds_loop4 = Loop() + with rounds_loop4: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + ROTW16_sse2(xmm_tmp, xmm_v11) + ROTW16_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + ROTW12_sse2(xmm_tmp, xmm_v9) + ROTW12_sse2(xmm_tmp, xmm_v13) + + # a += b; d ^= a; d = ROTW8(d); + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + ROTW8_sse2(xmm_tmp, xmm_v11) + ROTW8_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + ROTW7_sse2(xmm_tmp, xmm_v9) + ROTW7_sse2(xmm_tmp, xmm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v5, xmm_v5, 0x39) + 
PSHUFD(xmm_v9, xmm_v9, 0x39) + PSHUFD(xmm_v13, xmm_v13, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v10, xmm_v10, 0x4e) + PSHUFD(xmm_v14, xmm_v14, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + PSHUFD(xmm_v7, xmm_v7, 0x93) + PSHUFD(xmm_v11, xmm_v11, 0x93) + PSHUFD(xmm_v15, xmm_v15, 0x93) + + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + ROTW16_sse2(xmm_tmp, xmm_v11) + ROTW16_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + ROTW12_sse2(xmm_tmp, xmm_v9) + ROTW12_sse2(xmm_tmp, xmm_v13) + + # a += b; d ^= a; d = ROTW8(d); + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + ROTW8_sse2(xmm_tmp, xmm_v11) + ROTW8_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + ROTW7_sse2(xmm_tmp, xmm_v9) + ROTW7_sse2(xmm_tmp, xmm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v5, xmm_v5, 0x93) + PSHUFD(xmm_v9, xmm_v9, 0x93) + PSHUFD(xmm_v13, xmm_v13, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v10, xmm_v10, 0x4e) + PSHUFD(xmm_v14, xmm_v14, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + PSHUFD(xmm_v7, xmm_v7, 0x39) + PSHUFD(xmm_v11, xmm_v11, 0x39) + PSHUFD(xmm_v15, xmm_v15, 0x39) + + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + SUB(reg_rounds, 2) + JNZ(rounds_loop4.begin) + + MOVDQA(mem_tmp0, xmm_tmp) + + PADDD(xmm_v0, mem_s0) + PADDD(xmm_v1, mem_s1) + PADDD(xmm_v2, mem_s2) + PADDD(xmm_v3, mem_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + MOVDQU(xmm_v3, mem_s3) + PADDQ(xmm_v3, mem_one) + + PADDD(xmm_v4, mem_s0) + PADDD(xmm_v5, mem_s1) + PADDD(xmm_v6, mem_s2) + PADDD(xmm_v7, xmm_v3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7) + PADDQ(xmm_v3, mem_one) + + PADDD(xmm_v8, mem_s0) + PADDD(xmm_v9, mem_s1) + PADDD(xmm_v10, mem_s2) + PADDD(xmm_v11, xmm_v3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11) + PADDQ(xmm_v3, mem_one) + + MOVDQA(xmm_tmp, mem_tmp0) + + PADDD(xmm_v12, mem_s0) + PADDD(xmm_v13, mem_s1) + PADDD(xmm_v14, mem_s2) + PADDD(xmm_v15, xmm_v3) + WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15) + PADDQ(xmm_v3, mem_one) + + MOVDQU(mem_s3, xmm_v3) + + ADD(reg_inp, 4 * 64) + ADD(reg_outp, 4 * 64) + + SUB(reg_blocks, 4) + JAE(vector_loop4.begin) + + ADD(reg_blocks, 4) + out = Label() + JZ(out) + + # Past this point, we no longer need to use every single 
register to hold + # the in progress state. + + xmm_s0 = xmm_v8 + xmm_s1 = xmm_v9 + xmm_s2 = xmm_v10 + xmm_s3 = xmm_v11 + xmm_one = xmm_v13 + MOVDQU(xmm_s0, mem_s0) + MOVDQU(xmm_s1, mem_s1) + MOVDQU(xmm_s2, mem_s2) + MOVDQU(xmm_s3, mem_s3) + MOVDQA(xmm_one, mem_one) + + # + # 2 blocks at a time. + # + + process_1_block = Label() + SUB(reg_blocks, 2) + JB(process_1_block) # < 2 blocks remaining. + + MOVDQA(xmm_v0, xmm_s0) + MOVDQA(xmm_v1, xmm_s1) + MOVDQA(xmm_v2, xmm_s2) + MOVDQA(xmm_v3, xmm_s3) + + MOVDQA(xmm_v4, xmm_v0) + MOVDQA(xmm_v5, xmm_v1) + MOVDQA(xmm_v6, xmm_v2) + MOVDQA(xmm_v7, xmm_v3) + PADDQ(xmm_v7, xmm_one) + + MOV(reg_rounds, 20) + rounds_loop2 = Loop() + with rounds_loop2: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v5, xmm_v5, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + PSHUFD(xmm_v7, xmm_v7, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v5, xmm_v5, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + PSHUFD(xmm_v7, xmm_v7, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop2.begin) + + PADDD(xmm_v0, xmm_s0) + PADDD(xmm_v1, xmm_s1) + PADDD(xmm_v2, xmm_s2) + PADDD(xmm_v3, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + PADDQ(xmm_s3, xmm_one) + + PADDD(xmm_v4, xmm_s0) + PADDD(xmm_v5, xmm_s1) + PADDD(xmm_v6, xmm_s2) + PADDD(xmm_v7, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7) + PADDQ(xmm_s3, xmm_one) + + ADD(reg_inp, 2 * 64) + ADD(reg_outp, 2 * 64) + SUB(reg_blocks, 2) + + LABEL(process_1_block) + ADD(reg_blocks, 2) + out_serial = Label() + JZ(out_serial) + + # + # 1 block at a time. Only executed once, because if there was > 1, + # the parallel code would have processed it already. 
+ # + + MOVDQA(xmm_v0, xmm_s0) + MOVDQA(xmm_v1, xmm_s1) + MOVDQA(xmm_v2, xmm_s2) + MOVDQA(xmm_v3, xmm_s3) + + MOV(reg_rounds, 20) + rounds_loop1 = Loop() + with rounds_loop1: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW16_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW12_sse2(xmm_tmp, xmm_v1) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW8_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW7_sse2(xmm_tmp, xmm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW16_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW12_sse2(xmm_tmp, xmm_v1) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW8_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW7_sse2(xmm_tmp, xmm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop1.begin) + + PADDD(xmm_v0, xmm_s0) + PADDD(xmm_v1, xmm_s1) + PADDD(xmm_v2, xmm_s2) + PADDD(xmm_v3, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + PADDQ(xmm_s3, xmm_one) + + LABEL(out_serial) + + # Write back the updated counter. Stoping at 2^70 bytes is the user's + # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks + # because the counter is incremented in memory while looping.) + MOVDQU(mem_s3, xmm_s3) + + LABEL(out) + + # Paranoia, cleanse the scratch space. + PXOR(xmm_v0, xmm_v0) + MOVDQA(mem_tmp0, xmm_v0) + + # Remove our stack allocation. + MOV(registers.rsp, reg_sp_save) + + RETURN() + +# +# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit, +# and more helpers are used to increase readability for destructive operations. +# +# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFFB. +# + +def ADD_avx2(dst, src): + VPADDD(dst, dst, src) + +def XOR_avx2(dst, src): + VPXOR(dst, dst, src) + +def ROTW16_avx2(tmp, d): + VPSLLD(tmp, d, 16) + VPSRLD(d, d, 16) + XOR_avx2(d, tmp) + +def ROTW12_avx2(tmp, b): + VPSLLD(tmp, b, 12) + VPSRLD(b, b, 20) + XOR_avx2(b, tmp) + +def ROTW8_avx2(tmp, d): + VPSLLD(tmp, d, 8) + VPSRLD(d, d, 24) + XOR_avx2(d, tmp) + +def ROTW7_avx2(tmp, b): + VPSLLD(tmp, b, 7) + VPSRLD(b, b, 25) + XOR_avx2(b, tmp) + +def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3): + # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20)); + VPERM2I128(tmp, v0, v1, 0x20) + VPXOR(tmp, tmp, [inp+d]) + VMOVDQU([outp+d], tmp) + + # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20)); + VPERM2I128(tmp, v2, v3, 0x20) + VPXOR(tmp, tmp, [inp+d+32]) + VMOVDQU([outp+d+32], tmp) + + # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31)); + VPERM2I128(tmp, v0, v1, 0x31) + VPXOR(tmp, tmp, [inp+d+64]) + VMOVDQU([outp+d+64], tmp) + + # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31)); + VPERM2I128(tmp, v2, v3, 0x31) + VPXOR(tmp, tmp, [inp+d+96]) + VMOVDQU([outp+d+96], tmp) + +# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, will process +# 8/4/2 blocks at a time. 
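+# (Each 256 bit ymm register holds one row of ChaCha state for two blocks
+# at once, one block per 128 bit lane, so the "8 blocks" path below is
+# really four two-block groups. The counter increment vector built on the
+# stack bumps both lanes by 2 per step, and WriteXor_avx2 uses VPERM2I128
+# to reassemble the lanes into consecutive 64 byte output blocks.)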
+with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell): + reg_x = GeneralPurposeRegister64() + reg_inp = GeneralPurposeRegister64() + reg_outp = GeneralPurposeRegister64() + reg_blocks = GeneralPurposeRegister64() + reg_sp_save = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_x, x) + LOAD.ARGUMENT(reg_inp, inp) + LOAD.ARGUMENT(reg_outp, outp) + LOAD.ARGUMENT(reg_blocks, nrBlocks) + + # Align the stack to a 32 byte boundary. + MOV(reg_sp_save, registers.rsp) + AND(registers.rsp, 0xffffffffffffffe0) + SUB(registers.rsp, 0x20) + + x_s0 = [reg_x] # (Memory) Cipher state [0..3] + x_s1 = [reg_x+16] # (Memory) Cipher state [4..7] + x_s2 = [reg_x+32] # (Memory) Cipher state [8..11] + x_s3 = [reg_x+48] # (Memory) Cipher state [12..15] + + ymm_v0 = YMMRegister() + ymm_v1 = YMMRegister() + ymm_v2 = YMMRegister() + ymm_v3 = YMMRegister() + + ymm_v4 = YMMRegister() + ymm_v5 = YMMRegister() + ymm_v6 = YMMRegister() + ymm_v7 = YMMRegister() + + ymm_v8 = YMMRegister() + ymm_v9 = YMMRegister() + ymm_v10 = YMMRegister() + ymm_v11 = YMMRegister() + + ymm_v12 = YMMRegister() + ymm_v13 = YMMRegister() + ymm_v14 = YMMRegister() + ymm_v15 = YMMRegister() + + ymm_tmp0 = ymm_v12 + + # Allocate the neccecary stack space for the counter vector and two ymm + # registers that we will spill. + SUB(registers.rsp, 96) + mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space. + mem_s3 = [registers.rsp+32] # (Stack) Working copy of s3. (8x) + mem_inc = [registers.rsp] # (Stack) Counter increment vector. + + # Increment the counter for one side of the state vector. + VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0) + VMOVDQU(mem_inc, ymm_tmp0) + reg_tmp = GeneralPurposeRegister32() + MOV(reg_tmp, 0x00000001) + MOV([registers.rsp+16], reg_tmp) + VBROADCASTI128(ymm_v3, x_s3) + VPADDQ(ymm_v3, ymm_v3, [registers.rsp]) + VMOVDQA(mem_s3, ymm_v3) + + # As we process 2xN blocks at a time, so the counter increment for both + # sides of the state vector is 2. + MOV(reg_tmp, 0x00000002) + MOV([registers.rsp], reg_tmp) + MOV([registers.rsp+16], reg_tmp) + + out_write_even = Label() + out_write_odd = Label() + + # + # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's + # a decent gain despite all the pain... 
+ # + + reg_rounds = GeneralPurposeRegister64() + + vector_loop8 = Loop() + SUB(reg_blocks, 8) + JB(vector_loop8.end) + with vector_loop8: + VBROADCASTI128(ymm_v0, x_s0) + VBROADCASTI128(ymm_v1, x_s1) + VBROADCASTI128(ymm_v2, x_s2) + VMOVDQA(ymm_v3, mem_s3) + + VMOVDQA(ymm_v4, ymm_v0) + VMOVDQA(ymm_v5, ymm_v1) + VMOVDQA(ymm_v6, ymm_v2) + VPADDQ(ymm_v7, ymm_v3, mem_inc) + + VMOVDQA(ymm_v8, ymm_v0) + VMOVDQA(ymm_v9, ymm_v1) + VMOVDQA(ymm_v10, ymm_v2) + VPADDQ(ymm_v11, ymm_v7, mem_inc) + + VMOVDQA(ymm_v12, ymm_v0) + VMOVDQA(ymm_v13, ymm_v1) + VMOVDQA(ymm_v14, ymm_v2) + VPADDQ(ymm_v15, ymm_v11, mem_inc) + + MOV(reg_rounds, 20) + rounds_loop8 = Loop() + with rounds_loop8: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + ROTW16_avx2(ymm_tmp0, ymm_v11) + ROTW16_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + ROTW12_avx2(ymm_tmp0, ymm_v9) + ROTW12_avx2(ymm_tmp0, ymm_v13) + + # a += b; d ^= a; d = ROTW8(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + ROTW8_avx2(ymm_tmp0, ymm_v11) + ROTW8_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + ROTW7_avx2(ymm_tmp0, ymm_v9) + ROTW7_avx2(ymm_tmp0, ymm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v5, ymm_v5, 0x39) + VPSHUFD(ymm_v9, ymm_v9, 0x39) + VPSHUFD(ymm_v13, ymm_v13, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v10, ymm_v10, 0x4e) + VPSHUFD(ymm_v14, ymm_v14, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + VPSHUFD(ymm_v7, ymm_v7, 0x93) + VPSHUFD(ymm_v11, ymm_v11, 0x93) + VPSHUFD(ymm_v15, ymm_v15, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + ROTW16_avx2(ymm_tmp0, ymm_v11) + ROTW16_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW12_avx2(ymm_tmp0, ymm_v1) + 
ROTW12_avx2(ymm_tmp0, ymm_v5) + ROTW12_avx2(ymm_tmp0, ymm_v9) + ROTW12_avx2(ymm_tmp0, ymm_v13) + + # a += b; d ^= a; d = ROTW8(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + ROTW8_avx2(ymm_tmp0, ymm_v11) + ROTW8_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + ROTW7_avx2(ymm_tmp0, ymm_v9) + ROTW7_avx2(ymm_tmp0, ymm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v5, ymm_v5, 0x93) + VPSHUFD(ymm_v9, ymm_v9, 0x93) + VPSHUFD(ymm_v13, ymm_v13, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v10, ymm_v10, 0x4e) + VPSHUFD(ymm_v14, ymm_v14, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + VPSHUFD(ymm_v7, ymm_v7, 0x39) + VPSHUFD(ymm_v11, ymm_v11, 0x39) + VPSHUFD(ymm_v15, ymm_v15, 0x39) + + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + SUB(reg_rounds, 2) + JNZ(rounds_loop8.begin) + + # ymm_v12 is in mem_tmp0 and is current.... + + # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA.... + VBROADCASTI128(ymm_tmp0, x_s0) + ADD_avx2(ymm_v0, ymm_tmp0) + ADD_avx2(ymm_v4, ymm_tmp0) + ADD_avx2(ymm_v8, ymm_tmp0) + ADD_avx2(ymm_tmp0, mem_tmp0) + VMOVDQA(mem_tmp0, ymm_tmp0) + + VBROADCASTI128(ymm_tmp0, x_s1) + ADD_avx2(ymm_v1, ymm_tmp0) + ADD_avx2(ymm_v5, ymm_tmp0) + ADD_avx2(ymm_v9, ymm_tmp0) + ADD_avx2(ymm_v13, ymm_tmp0) + + VBROADCASTI128(ymm_tmp0, x_s2) + ADD_avx2(ymm_v2, ymm_tmp0) + ADD_avx2(ymm_v6, ymm_tmp0) + ADD_avx2(ymm_v10, ymm_tmp0) + ADD_avx2(ymm_v14, ymm_tmp0) + + ADD_avx2(ymm_v3, mem_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3) + VMOVDQA(ymm_v3, mem_s3) + ADD_avx2(ymm_v3, mem_inc) + + ADD_avx2(ymm_v7, ymm_v3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7) + ADD_avx2(ymm_v3, mem_inc) + + ADD_avx2(ymm_v11, ymm_v3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11) + ADD_avx2(ymm_v3, mem_inc) + + VMOVDQA(ymm_v12, mem_tmp0) + ADD_avx2(ymm_v15, ymm_v3) + WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15) + ADD_avx2(ymm_v3, mem_inc) + + VMOVDQA(mem_s3, ymm_v3) + + ADD(reg_inp, 8 * 64) + ADD(reg_outp, 8 * 64) + + SUB(reg_blocks, 8) + JAE(vector_loop8.begin) + + # ymm_v3 contains a current copy of mem_s3 either from when it was built, + # or because the loop updates it. Copy this before we mess with the block + # counter in case we need to write it back and return. + ymm_s3 = ymm_v11 + VMOVDQA(ymm_s3, ymm_v3) + + ADD(reg_blocks, 8) + JZ(out_write_even) + + # We now actually can do everything in registers. + ymm_s0 = ymm_v8 + VBROADCASTI128(ymm_s0, x_s0) + ymm_s1 = ymm_v9 + VBROADCASTI128(ymm_s1, x_s1) + ymm_s2 = ymm_v10 + VBROADCASTI128(ymm_s2, x_s2) + ymm_inc = ymm_v14 + VMOVDQA(ymm_inc, mem_inc) + + # + # 4 blocks at a time. + # + + process_2_blocks = Label() + SUB(reg_blocks, 4) + JB(process_2_blocks) # < 4 blocks remaining. 
+ + VMOVDQA(ymm_v0, ymm_s0) + VMOVDQA(ymm_v1, ymm_s1) + VMOVDQA(ymm_v2, ymm_s2) + VMOVDQA(ymm_v3, ymm_s3) + + VMOVDQA(ymm_v4, ymm_v0) + VMOVDQA(ymm_v5, ymm_v1) + VMOVDQA(ymm_v6, ymm_v2) + VPADDQ(ymm_v7, ymm_v3, ymm_inc) + + MOV(reg_rounds, 20) + rounds_loop4 = Loop() + with rounds_loop4: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v5, ymm_v5, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + VPSHUFD(ymm_v7, ymm_v7, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v5, ymm_v5, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + VPSHUFD(ymm_v7, ymm_v7, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop4.begin) + + ADD_avx2(ymm_v0, ymm_s0) + ADD_avx2(ymm_v1, ymm_s1) + ADD_avx2(ymm_v2, ymm_s2) + ADD_avx2(ymm_v3, ymm_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3) + ADD_avx2(ymm_s3, ymm_inc) + + ADD_avx2(ymm_v4, ymm_s0) + ADD_avx2(ymm_v5, ymm_s1) + ADD_avx2(ymm_v6, ymm_s2) + ADD_avx2(ymm_v7, ymm_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7) + ADD_avx2(ymm_s3, ymm_inc) + + ADD(reg_inp, 4 * 64) + ADD(reg_outp, 4 * 64) + SUB(reg_blocks, 4) + + LABEL(process_2_blocks) + ADD(reg_blocks, 4) + JZ(out_write_even) # 0 blocks left. + + # + # 2/1 blocks at a time. The two codepaths are unified because + # with AVX2 we do 2 blocks at a time anyway, and this only gets called + # if 3/2/1 blocks are remaining, so the extra branches don't hurt that + # much. 
+ # + + vector_loop2 = Loop() + with vector_loop2: + VMOVDQA(ymm_v0, ymm_s0) + VMOVDQA(ymm_v1, ymm_s1) + VMOVDQA(ymm_v2, ymm_s2) + VMOVDQA(ymm_v3, ymm_s3) + + MOV(reg_rounds, 20) + rounds_loop2 = Loop() + with rounds_loop2: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW16_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW12_avx2(ymm_tmp0, ymm_v1) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW8_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW7_avx2(ymm_tmp0, ymm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW16_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW12_avx2(ymm_tmp0, ymm_v1) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW8_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW7_avx2(ymm_tmp0, ymm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop2.begin) + + ADD_avx2(ymm_v0, ymm_s0) + ADD_avx2(ymm_v1, ymm_s1) + ADD_avx2(ymm_v2, ymm_s2) + ADD_avx2(ymm_v3, ymm_s3) + + # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20)); + VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp]) + VMOVDQU([reg_outp], ymm_tmp0) + + # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20)); + VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32]) + VMOVDQU([reg_outp+32], ymm_tmp0) + + SUB(reg_blocks, 1) + JZ(out_write_odd) + + ADD_avx2(ymm_s3, ymm_inc) + + # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31)); + VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64]) + VMOVDQU([reg_outp+64], ymm_tmp0) + + # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31)); + VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96]) + VMOVDQU([reg_outp+96], ymm_tmp0) + + SUB(reg_blocks, 1) + JZ(out_write_even) + + ADD(reg_inp, 2 * 64) + ADD(reg_outp, 2 * 64) + JMP(vector_loop2.begin) + + LABEL(out_write_odd) + VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks. + + LABEL(out_write_even) + VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_v3 + + # Paranoia, cleanse the scratch space. + VPXOR(ymm_v0, ymm_v0, ymm_v0) + VMOVDQA(mem_tmp0, ymm_v0) + VMOVDQA(mem_s3, ymm_v0) + + # Clear all YMM (and XMM) registers. + VZEROALL() + + # Remove our stack allocation. 
+ MOV(registers.rsp, reg_sp_save) + + RETURN() + +# +# CPUID +# + +cpuidParams = Argument(ptr(uint32_t)) + +with Function("cpuidAmd64", (cpuidParams,)): + reg_params = registers.r15 + LOAD.ARGUMENT(reg_params, cpuidParams) + + MOV(registers.eax, [reg_params]) + MOV(registers.ecx, [reg_params+8]) + + CPUID() + + MOV([reg_params], registers.eax) + MOV([reg_params+4], registers.ebx) + MOV([reg_params+8], registers.ecx) + MOV([reg_params+12], registers.edx) + + RETURN() + +# +# XGETBV (ECX = 0) +# + +xcrVec = Argument(ptr(uint32_t)) + +with Function("xgetbv0Amd64", (xcrVec,)): + reg_vec = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_vec, xcrVec) + + XOR(registers.ecx, registers.ecx) + + XGETBV() + + MOV([reg_vec], registers.eax) + MOV([reg_vec+4], registers.edx) + + RETURN() diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.s b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s new file mode 100644 index 0000000..e3792af --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s @@ -0,0 +1,1180 @@ +// +build !noasm +// Generated by PeachPy 0.2.0 from chacha20_amd64.py + + +// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint) +TEXT ·blocksAmd64SSE2(SB),4,$0-32 + MOVQ x+0(FP), AX + MOVQ inp+8(FP), BX + MOVQ outp+16(FP), CX + MOVQ nrBlocks+24(FP), DX + MOVQ SP, DI + ANDQ $18446744073709551584, SP + SUBQ $32, SP + PXOR X0, X0 + SUBQ $32, SP + MOVO X0, 0(SP) + MOVL $1, SI + MOVL SI, 0(SP) + SUBQ $4, DX + JCS vector_loop4_end +vector_loop4_begin: + MOVOU 0(AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ 0(SP), X7 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X7, X11 + PADDQ 0(SP), X11 + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X11, X15 + PADDQ 0(SP), X15 + MOVQ $20, SI +rounds_loop4_begin: + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $16, X12 + PSRLL $16, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $16, X12 + PSRLL $16, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $12, X12 + PSRLL $20, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $12, X12 + PSRLL $20, X13 + PXOR X12, X13 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $8, X12 + PSRLL $24, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $8, X12 + PSRLL $24, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $7, X12 + PSRLL $25, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $7, X12 + PSRLL $25, X13 + PXOR X12, X13 + PSHUFL $57, X1, X1 + PSHUFL $57, X5, X5 + PSHUFL $57, X9, X9 + PSHUFL $57, X13, X13 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + 
PSHUFL $78, X10, X10 + PSHUFL $78, X14, X14 + PSHUFL $147, X3, X3 + PSHUFL $147, X7, X7 + PSHUFL $147, X11, X11 + PSHUFL $147, X15, X15 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $16, X12 + PSRLL $16, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $16, X12 + PSRLL $16, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $12, X12 + PSRLL $20, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $12, X12 + PSRLL $20, X13 + PXOR X12, X13 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $8, X12 + PSRLL $24, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $8, X12 + PSRLL $24, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $7, X12 + PSRLL $25, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $7, X12 + PSRLL $25, X13 + PXOR X12, X13 + PSHUFL $147, X1, X1 + PSHUFL $147, X5, X5 + PSHUFL $147, X9, X9 + PSHUFL $147, X13, X13 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $78, X10, X10 + PSHUFL $78, X14, X14 + PSHUFL $57, X3, X3 + PSHUFL $57, X7, X7 + PSHUFL $57, X11, X11 + PSHUFL $57, X15, X15 + MOVO 16(SP), X12 + SUBQ $2, SI + JNE rounds_loop4_begin + MOVO X12, 16(SP) + PADDL 0(AX), X0 + PADDL 16(AX), X1 + PADDL 32(AX), X2 + PADDL 48(AX), X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + MOVOU 48(AX), X3 + PADDQ 0(SP), X3 + PADDL 0(AX), X4 + PADDL 16(AX), X5 + PADDL 32(AX), X6 + PADDL X3, X7 + MOVOU 64(BX), X12 + PXOR X4, X12 + MOVOU X12, 64(CX) + MOVOU 80(BX), X12 + PXOR X5, X12 + MOVOU X12, 80(CX) + MOVOU 96(BX), X12 + PXOR X6, X12 + MOVOU X12, 96(CX) + MOVOU 112(BX), X12 + PXOR X7, X12 + MOVOU X12, 112(CX) + PADDQ 0(SP), X3 + PADDL 0(AX), X8 + PADDL 16(AX), X9 + PADDL 32(AX), X10 + PADDL X3, X11 + MOVOU 128(BX), X12 + PXOR X8, X12 + MOVOU X12, 128(CX) + MOVOU 144(BX), X12 + PXOR X9, X12 + MOVOU X12, 144(CX) + MOVOU 160(BX), X12 + PXOR X10, X12 + MOVOU X12, 160(CX) + MOVOU 176(BX), X12 + PXOR X11, X12 + MOVOU X12, 176(CX) + PADDQ 0(SP), X3 + MOVO 16(SP), X12 + PADDL 0(AX), X12 + PADDL 16(AX), X13 + PADDL 32(AX), X14 + PADDL X3, X15 + MOVOU 192(BX), X0 + PXOR X12, X0 + MOVOU X0, 192(CX) + MOVOU 208(BX), X0 + PXOR X13, X0 + MOVOU X0, 208(CX) + MOVOU 224(BX), X0 + PXOR X14, X0 + MOVOU X0, 224(CX) + MOVOU 240(BX), X0 + PXOR X15, X0 + MOVOU X0, 240(CX) + PADDQ 0(SP), X3 + MOVOU X3, 48(AX) + ADDQ $256, BX + ADDQ $256, CX + SUBQ $4, DX + JCC vector_loop4_begin +vector_loop4_end: + ADDQ $4, DX + JEQ out + MOVOU 0(AX), X8 + MOVOU 16(AX), X9 + MOVOU 32(AX), X10 + MOVOU 
48(AX), X11 + MOVO 0(SP), X13 + SUBQ $2, DX + JCS process_1_block + MOVO X8, X0 + MOVO X9, X1 + MOVO X10, X2 + MOVO X11, X3 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X13, X7 + MOVQ $20, SI +rounds_loop2_begin: + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + PSHUFL $57, X1, X1 + PSHUFL $57, X5, X5 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $147, X3, X3 + PSHUFL $147, X7, X7 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + PSHUFL $147, X1, X1 + PSHUFL $147, X5, X5 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $57, X3, X3 + PSHUFL $57, X7, X7 + SUBQ $2, SI + JNE rounds_loop2_begin + PADDL X8, X0 + PADDL X9, X1 + PADDL X10, X2 + PADDL X11, X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + PADDQ X13, X11 + PADDL X8, X4 + PADDL X9, X5 + PADDL X10, X6 + PADDL X11, X7 + MOVOU 64(BX), X12 + PXOR X4, X12 + MOVOU X12, 64(CX) + MOVOU 80(BX), X12 + PXOR X5, X12 + MOVOU X12, 80(CX) + MOVOU 96(BX), X12 + PXOR X6, X12 + MOVOU X12, 96(CX) + MOVOU 112(BX), X12 + PXOR X7, X12 + MOVOU X12, 112(CX) + PADDQ X13, X11 + ADDQ $128, BX + ADDQ $128, CX + SUBQ $2, DX +process_1_block: + ADDQ $2, DX + JEQ out_serial + MOVO X8, X0 + MOVO X9, X1 + MOVO X10, X2 + MOVO X11, X3 + MOVQ $20, SI +rounds_loop1_begin: + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + PSHUFL $57, X1, X1 + PSHUFL $78, X2, X2 + PSHUFL $147, X3, X3 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL 
$25, X1 + PXOR X12, X1 + PSHUFL $147, X1, X1 + PSHUFL $78, X2, X2 + PSHUFL $57, X3, X3 + SUBQ $2, SI + JNE rounds_loop1_begin + PADDL X8, X0 + PADDL X9, X1 + PADDL X10, X2 + PADDL X11, X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + PADDQ X13, X11 +out_serial: + MOVOU X11, 48(AX) +out: + PXOR X0, X0 + MOVO X0, 16(SP) + MOVQ DI, SP + RET + +// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint) +TEXT ·blocksAmd64AVX2(SB),4,$0-32 + MOVQ x+0(FP), AX + MOVQ inp+8(FP), BX + MOVQ outp+16(FP), CX + MOVQ nrBlocks+24(FP), DX + MOVQ SP, DI + ANDQ $18446744073709551584, SP + SUBQ $32, SP + SUBQ $96, SP + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0 + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0 + MOVL $1, SI + MOVL SI, 16(SP) + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x30 // VBROADCASTI128 ymm1, [rax + 48] + BYTE $0xC5; BYTE $0xF5; BYTE $0xD4; BYTE $0x0C; BYTE $0x24 // VPADDQ ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1 + MOVL $2, SI + MOVL SI, 0(SP) + MOVL SI, 16(SP) + SUBQ $8, DX + JCS vector_loop8_end +vector_loop8_begin: + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x10 // VBROADCASTI128 ymm2, [rax] + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x10 // VBROADCASTI128 ymm3, [rax + 16] + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x60; BYTE $0x20 // VBROADCASTI128 ymm4, [rax + 32] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4 + BYTE $0xC5; BYTE $0x75; BYTE $0xD4; BYTE $0x04; BYTE $0x24 // VPADDQ ymm8, ymm1, [rsp] + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCA // VMOVDQA ymm9, ymm2 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD3 // VMOVDQA ymm10, ymm3 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xDC // VMOVDQA ymm11, ymm4 + BYTE $0xC5; BYTE $0x3D; BYTE $0xD4; BYTE $0x24; BYTE $0x24 // VPADDQ ymm12, ymm8, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xC2 // VMOVDQA ymm0, ymm2 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xEB // VMOVDQA ymm13, ymm3 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF4 // VMOVDQA ymm14, ymm4 + BYTE $0xC5; BYTE $0x1D; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm12, [rsp] + MOVQ $20, SI +rounds_loop8_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE 
$0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE 
$0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x39 // VPSHUFD ymm10, ymm10, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD 
ymm14, ymm14, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x93 // VPSHUFD ymm12, ymm12, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE 
$0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 
// VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x93 // VPSHUFD ymm10, ymm10, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x39 // VPSHUFD ymm12, ymm12, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + SUBQ $2, SI + JNE rounds_loop8_begin + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm2, ymm2, ymm0 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm5, ymm5, ymm0 + BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC8 // VPADDD ymm9, ymm9, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0xFE; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VPADDD ymm0, ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x10 // VBROADCASTI128 ymm0, [rax + 16] + BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm10, ymm10, ymm0 + BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm13, ymm13, ymm0 + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x20 // VBROADCASTI128 ymm0, [rax + 32] + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0 + BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm0 + BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm11, ymm11, ymm0 + BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm14, ymm14, ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VPADDD ymm1, ymm1, [rsp + 32] + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE 
$0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32] + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm1 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm12, ymm12, ymm1 + BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x20 // VPERM2I128 ymm0, ymm9, ymm10, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x20 // VPERM2I128 ymm0, ymm11, ymm12, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE 
$0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x31 // VPERM2I128 ymm0, ymm9, ymm10, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x31 // VPERM2I128 ymm0, ymm11, ymm12, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xF9 // VPADDD ymm15, ymm15, ymm1 + BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x20 // VPERM2I128 ymm2, ymm0, ymm13, 32 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 384] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 384], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x20 // VPERM2I128 ymm2, ymm14, ymm15, 32 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 416] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 416], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x31 // VPERM2I128 ymm2, ymm0, ymm13, 49 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 448] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 448], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x31 // VPERM2I128 ymm2, ymm14, ymm15, 49 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 480] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 480], ymm2 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1 + ADDQ $512, BX + ADDQ $512, CX + SUBQ $8, DX + JCC vector_loop8_begin +vector_loop8_end: + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE1 // VMOVDQA ymm12, ymm1 + ADDQ $8, DX + JEQ out_write_even + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x08 // VBROADCASTI128 ymm9, [rax] + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x10 // VBROADCASTI128 ymm10, [rax + 16] + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x20 // VBROADCASTI128 ymm11, [rax + 32] + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x34; BYTE $0x24 // VMOVDQA ymm14, [rsp] + SUBQ $4, DX + JCS process_2_blocks + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12 + BYTE $0xC5; BYTE 
$0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4 + BYTE $0xC4; BYTE $0x41; BYTE $0x75; BYTE $0xD4; BYTE $0xC6 // VPADDQ ymm8, ymm1, ymm14 + MOVQ $20, SI +rounds_loop4_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR 
ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, 
ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57 + SUBQ $2, SI + JNE rounds_loop4_begin + BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11 + BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x4D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 
// VPERM2I128 ymm0, ymm5, ymm6, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + ADDQ $256, BX + ADDQ $256, CX + SUBQ $4, DX +process_2_blocks: + ADDQ $4, DX + JEQ out_write_even +vector_loop2_begin: + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12 + MOVQ $20, SI +rounds_loop2_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; 
BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + SUBQ $2, SI + JNE rounds_loop2_begin + BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11 + BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + SUBQ $1, DX + JEQ out_write_odd + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + SUBQ $1, DX + JEQ out_write_even + ADDQ $128, BX + ADDQ $128, CX + JMP vector_loop2_begin +out_write_odd: + BYTE $0xC4; BYTE $0x43; BYTE $0x1D; BYTE $0x46; BYTE $0xE4; BYTE $0x01 // VPERM2I128 ymm12, ymm12, ymm12, 1 +out_write_even: + BYTE $0xC5; BYTE $0x7A; BYTE $0x7F; BYTE $0x60; BYTE $0x30 // VMOVDQU [rax + 48], xmm12 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0xD2 // VPXOR ymm2, ymm2, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE 
$0x20 // VMOVDQA [rsp + 32], ymm2
+	BYTE $0xC5; BYTE $0xFC; BYTE $0x77 // VZEROALL
+	MOVQ DI, SP
+	RET
+
+// func cpuidAmd64(cpuidParams *uint32)
+TEXT ·cpuidAmd64(SB),4,$0-8
+	MOVQ cpuidParams+0(FP), R15
+	MOVL 0(R15), AX
+	MOVL 8(R15), CX
+	CPUID
+	MOVL AX, 0(R15)
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbv0Amd64(xcrVec *uint32)
+TEXT ·xgetbv0Amd64(SB),4,$0-8
+	MOVQ xcrVec+0(FP), BX
+	XORL CX, CX
+	BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV
+	MOVL AX, 0(BX)
+	MOVL DX, 4(BX)
+	RET
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref.go b/vendor/github.com/Yawning/chacha20/chacha20_ref.go
new file mode 100644
index 0000000..fcdc8c6
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_ref.go
@@ -0,0 +1,394 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// https://creativecommons.org/publicdomain/zero/1.0/ for full details.
+
+// +build !go1.9
+
+package chacha20
+
+import (
+	"encoding/binary"
+	"math"
+	"unsafe"
+)
+
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+	if isIetf {
+		var totalBlocks uint64
+		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+		if totalBlocks > math.MaxUint32 {
+			panic("chacha20: Exceeded keystream per nonce limit")
+		}
+	}
+
+	// This routine ignores x[0]...x[3] in favor of the const values since it's
+	// ever so slightly faster.
+
+	for n := 0; n < nrBlocks; n++ {
+		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		for i := chachaRounds; i > 0; i -= 2 {
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 16) | (x12 >> 16)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 12) | (x4 >> 20)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 8) | (x12 >> 24)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 7) | (x4 >> 25)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 16) | (x13 >> 16)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 12) | (x5 >> 20)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 8) | (x13 >> 24)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 16) | (x14 >> 16)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 12) | (x6 >> 20)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 8) | (x14 >> 24)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 16) | (x15 >> 16)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 12) | (x7 >> 20)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 8) | (x15 >> 24)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 16) | (x15 >> 16)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 12) | (x5 >> 20)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 8) | (x15 >> 24)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 16) | (x12 >> 16)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 12) | (x6 >> 20)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 8) | (x12 >> 24)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 16) | (x13 >> 16)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 12) | (x7 >> 20)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 8) | (x13 >> 24)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 16) | (x14 >> 16)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 12) | (x4 >> 20)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 8) | (x14 >> 24)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 7) | (x4 >> 25)
+		}
+
+		// On amd64 at least, this is a rather big boost.
+		if useUnsafe {
+			if in != nil {
+				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = inArr[0] ^ (x0 + sigma0)
+				outArr[1] = inArr[1] ^ (x1 + sigma1)
+				outArr[2] = inArr[2] ^ (x2 + sigma2)
+				outArr[3] = inArr[3] ^ (x3 + sigma3)
+				outArr[4] = inArr[4] ^ (x4 + x[4])
+				outArr[5] = inArr[5] ^ (x5 + x[5])
+				outArr[6] = inArr[6] ^ (x6 + x[6])
+				outArr[7] = inArr[7] ^ (x7 + x[7])
+				outArr[8] = inArr[8] ^ (x8 + x[8])
+				outArr[9] = inArr[9] ^ (x9 + x[9])
+				outArr[10] = inArr[10] ^ (x10 + x[10])
+				outArr[11] = inArr[11] ^ (x11 + x[11])
+				outArr[12] = inArr[12] ^ (x12 + x[12])
+				outArr[13] = inArr[13] ^ (x13 + x[13])
+				outArr[14] = inArr[14] ^ (x14 + x[14])
+				outArr[15] = inArr[15] ^ (x15 + x[15])
+			} else {
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = x0 + sigma0
+				outArr[1] = x1 + sigma1
+				outArr[2] = x2 + sigma2
+				outArr[3] = x3 + sigma3
+				outArr[4] = x4 + x[4]
+				outArr[5] = x5 + x[5]
+				outArr[6] = x6 + x[6]
+				outArr[7] = x7 + x[7]
+				outArr[8] = x8 + x[8]
+				outArr[9] = x9 + x[9]
+				outArr[10] = x10 + x[10]
+				outArr[11] = x11 + x[11]
+				outArr[12] = x12 + x[12]
+				outArr[13] = x13 + x[13]
+				outArr[14] = x14 + x[14]
+				outArr[15] = x15 + x[15]
+			}
+		} else {
+			// Slow path: either the architecture cares about alignment, or it is not little endian.
+			x0 += sigma0
+			x1 += sigma1
+			x2 += sigma2
+			x3 += sigma3
+			x4 += x[4]
+			x5 += x[5]
+			x6 += x[6]
+			x7 += x[7]
+			x8 += x[8]
+			x9 += x[9]
+			x10 += x[10]
+			x11 += x[11]
+			x12 += x[12]
+			x13 += x[13]
+			x14 += x[14]
+			x15 += x[15]
+			if in != nil {
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+				in = in[BlockSize:]
+			} else {
+				binary.LittleEndian.PutUint32(out[0:4], x0)
+				binary.LittleEndian.PutUint32(out[4:8], x1)
+				binary.LittleEndian.PutUint32(out[8:12], x2)
binary.LittleEndian.PutUint32(out[12:16], x3)
+				binary.LittleEndian.PutUint32(out[16:20], x4)
+				binary.LittleEndian.PutUint32(out[20:24], x5)
+				binary.LittleEndian.PutUint32(out[24:28], x6)
+				binary.LittleEndian.PutUint32(out[28:32], x7)
+				binary.LittleEndian.PutUint32(out[32:36], x8)
+				binary.LittleEndian.PutUint32(out[36:40], x9)
+				binary.LittleEndian.PutUint32(out[40:44], x10)
+				binary.LittleEndian.PutUint32(out[44:48], x11)
+				binary.LittleEndian.PutUint32(out[48:52], x12)
+				binary.LittleEndian.PutUint32(out[52:56], x13)
+				binary.LittleEndian.PutUint32(out[56:60], x14)
+				binary.LittleEndian.PutUint32(out[60:64], x15)
+			}
+			out = out[BlockSize:]
+		}
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+		ctr := uint64(x[13])<<32 | uint64(x[12])
+		ctr++
+		x[12] = uint32(ctr)
+		x[13] = uint32(ctr >> 32)
+	}
+}
+
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+	for i := chachaRounds; i > 0; i -= 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 16) | (x12 >> 16)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 12) | (x4 >> 20)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 8) | (x12 >> 24)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 7) | (x4 >> 25)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 16) | (x13 >> 16)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 12) | (x5 >> 20)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 8) | (x13 >> 24)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 16) | (x14 >> 16)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 12) | (x6 >> 20)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 8) | (x14 >> 24)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 16) | (x15 >> 16)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 12) | (x7 >> 20)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 8) | (x15 >> 24)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 16) | (x15 >> 16)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 12) | (x5 >> 20)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 8) | (x15 >> 24)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 16) | (x12 >> 16)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 12) | (x6 >> 20)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 8) | (x12 >> 24)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 16) | (x13 >> 16)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 12) | (x7 >> 20)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 8) | (x13 >> 24)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 16) | (x14 >> 16)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 12) | (x4 >> 20)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 8) | (x14 >> 24)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 7) | (x4 >> 25)
+	}
+
+	// HChaCha returns x0...x3 | x12...x15, which corresponds to the
+	// indexes of the ChaCha constant and the indexes of the IV.
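+	// The eight selected words are serialized little-endian below, mirroring
+	// the output path of blocksRef above.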
+	if useUnsafe {
+		outArr := (*[16]uint32)(unsafe.Pointer(&out[0]))
+		outArr[0] = x0
+		outArr[1] = x1
+		outArr[2] = x2
+		outArr[3] = x3
+		outArr[4] = x12
+		outArr[5] = x13
+		outArr[6] = x14
+		outArr[7] = x15
+	} else {
+		binary.LittleEndian.PutUint32(out[0:4], x0)
+		binary.LittleEndian.PutUint32(out[4:8], x1)
+		binary.LittleEndian.PutUint32(out[8:12], x2)
+		binary.LittleEndian.PutUint32(out[12:16], x3)
+		binary.LittleEndian.PutUint32(out[16:20], x12)
+		binary.LittleEndian.PutUint32(out[20:24], x13)
+		binary.LittleEndian.PutUint32(out[24:28], x14)
+		binary.LittleEndian.PutUint32(out[28:32], x15)
+	}
+	return
+}
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
new file mode 100644
index 0000000..8405c22
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
@@ -0,0 +1,395 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <https://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build go1.9
+
+package chacha20
+
+import (
+	"encoding/binary"
+	"math"
+	"math/bits"
+	"unsafe"
+)
+
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+	if isIetf {
+		var totalBlocks uint64
+		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+		if totalBlocks > math.MaxUint32 {
+			panic("chacha20: Exceeded keystream per nonce limit")
+		}
+	}
+
+	// This routine ignores x[0]...x[3] in favor of the const values since it's
+	// ever so slightly faster.
+
+	for n := 0; n < nrBlocks; n++ {
+		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		for i := chachaRounds; i > 0; i -= 2 {
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = bits.RotateLeft32(x12, 16)
+			x8 += x12
+			x4 ^= x8
+			x4 = bits.RotateLeft32(x4, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = bits.RotateLeft32(x12, 8)
+			x8 += x12
+			x4 ^= x8
+			x4 = bits.RotateLeft32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ^= x1
+			x13 = bits.RotateLeft32(x13, 16)
+			x9 += x13
+			x5 ^= x9
+			x5 = bits.RotateLeft32(x5, 12)
+			x1 += x5
+			x13 ^= x1
+			x13 = bits.RotateLeft32(x13, 8)
+			x9 += x13
+			x5 ^= x9
+			x5 = bits.RotateLeft32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ^= x2
+			x14 = bits.RotateLeft32(x14, 16)
+			x10 += x14
+			x6 ^= x10
+			x6 = bits.RotateLeft32(x6, 12)
+			x2 += x6
+			x14 ^= x2
+			x14 = bits.RotateLeft32(x14, 8)
+			x10 += x14
+			x6 ^= x10
+			x6 = bits.RotateLeft32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ^= x3
+			x15 = bits.RotateLeft32(x15, 16)
+			x11 += x15
+			x7 ^= x11
+			x7 = bits.RotateLeft32(x7, 12)
+			x3 += x7
+			x15 ^= x3
+			x15 = bits.RotateLeft32(x15, 8)
+			x11 += x15
+			x7 ^= x11
+			x7 = bits.RotateLeft32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ^= x0
+			x15 = bits.RotateLeft32(x15, 16)
+			x10 += x15
+			x5 ^= x10
+			x5 = bits.RotateLeft32(x5, 12)
+			x0 += x5
+			x15 ^= x0
+			x15 = bits.RotateLeft32(x15, 8)
+			x10 += x15
+			x5 ^= x10
+			x5 = bits.RotateLeft32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = bits.RotateLeft32(x12, 16)
+			x11 += x12
+			x6 ^= x11
+			x6 = bits.RotateLeft32(x6, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = bits.RotateLeft32(x12, 8)
+			x11 += x12
+			x6 ^= x11
+			x6 = bits.RotateLeft32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+ x13 ^= x2 + x13 = bits.RotateLeft32(x13, 16) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 12) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 8) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 16) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 12) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 8) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 7) + } + + // On amd64 at least, this is a rather big boost. + if useUnsafe { + if in != nil { + inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize])) + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = inArr[0] ^ (x0 + sigma0) + outArr[1] = inArr[1] ^ (x1 + sigma1) + outArr[2] = inArr[2] ^ (x2 + sigma2) + outArr[3] = inArr[3] ^ (x3 + sigma3) + outArr[4] = inArr[4] ^ (x4 + x[4]) + outArr[5] = inArr[5] ^ (x5 + x[5]) + outArr[6] = inArr[6] ^ (x6 + x[6]) + outArr[7] = inArr[7] ^ (x7 + x[7]) + outArr[8] = inArr[8] ^ (x8 + x[8]) + outArr[9] = inArr[9] ^ (x9 + x[9]) + outArr[10] = inArr[10] ^ (x10 + x[10]) + outArr[11] = inArr[11] ^ (x11 + x[11]) + outArr[12] = inArr[12] ^ (x12 + x[12]) + outArr[13] = inArr[13] ^ (x13 + x[13]) + outArr[14] = inArr[14] ^ (x14 + x[14]) + outArr[15] = inArr[15] ^ (x15 + x[15]) + } else { + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = x0 + sigma0 + outArr[1] = x1 + sigma1 + outArr[2] = x2 + sigma2 + outArr[3] = x3 + sigma3 + outArr[4] = x4 + x[4] + outArr[5] = x5 + x[5] + outArr[6] = x6 + x[6] + outArr[7] = x7 + x[7] + outArr[8] = x8 + x[8] + outArr[9] = x9 + x[9] + outArr[10] = x10 + x[10] + outArr[11] = x11 + x[11] + outArr[12] = x12 + x[12] + outArr[13] = x13 + x[13] + outArr[14] = x14 + x[14] + outArr[15] = x15 + x[15] + } + } else { + // Slow path, either the architecture cares about alignment, or is not little endian. 
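+			// Add the saved input state back into the working state, then
+			// serialize each word (XORing with the input if present) through
+			// encoding/binary, which is safe for any alignment and byte order.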
+			x0 += sigma0
+			x1 += sigma1
+			x2 += sigma2
+			x3 += sigma3
+			x4 += x[4]
+			x5 += x[5]
+			x6 += x[6]
+			x7 += x[7]
+			x8 += x[8]
+			x9 += x[9]
+			x10 += x[10]
+			x11 += x[11]
+			x12 += x[12]
+			x13 += x[13]
+			x14 += x[14]
+			x15 += x[15]
+			if in != nil {
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+				in = in[BlockSize:]
+			} else {
+				binary.LittleEndian.PutUint32(out[0:4], x0)
+				binary.LittleEndian.PutUint32(out[4:8], x1)
+				binary.LittleEndian.PutUint32(out[8:12], x2)
+				binary.LittleEndian.PutUint32(out[12:16], x3)
+				binary.LittleEndian.PutUint32(out[16:20], x4)
+				binary.LittleEndian.PutUint32(out[20:24], x5)
+				binary.LittleEndian.PutUint32(out[24:28], x6)
+				binary.LittleEndian.PutUint32(out[28:32], x7)
+				binary.LittleEndian.PutUint32(out[32:36], x8)
+				binary.LittleEndian.PutUint32(out[36:40], x9)
+				binary.LittleEndian.PutUint32(out[40:44], x10)
+				binary.LittleEndian.PutUint32(out[44:48], x11)
+				binary.LittleEndian.PutUint32(out[48:52], x12)
+				binary.LittleEndian.PutUint32(out[52:56], x13)
+				binary.LittleEndian.PutUint32(out[56:60], x14)
+				binary.LittleEndian.PutUint32(out[60:64], x15)
+			}
+			out = out[BlockSize:]
+		}
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
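+		// The 64-bit block counter is kept split across x[12] (low word)
+		// and x[13] (high word); recombine it, increment, and store it back.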
+ ctr := uint64(x[13])<<32 | uint64(x[12]) + ctr++ + x[12] = uint32(ctr) + x[13] = uint32(ctr >> 32) + } +} + +func hChaChaRef(x *[stateSize]uint32, out *[32]byte) { + x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11] + + for i := chachaRounds; i > 0; i -= 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 16) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 8) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 16) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 12) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 8) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 16) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 12) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 8) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 16) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 12) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 8) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 16) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 12) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 8) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 16) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 8) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 16) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 12) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 8) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 16) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 12) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 8) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 7) + } + + // HChaCha returns x0...x3 | x12...x15, which corresponds to the + // indexes of the ChaCha constant and the indexes of the IV. 
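+	// The eight selected words are serialized little-endian below, mirroring
+	// the output path of blocksRef above.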
+ if useUnsafe { + outArr := (*[16]uint32)(unsafe.Pointer(&out[0])) + outArr[0] = x0 + outArr[1] = x1 + outArr[2] = x2 + outArr[3] = x3 + outArr[4] = x12 + outArr[5] = x13 + outArr[6] = x14 + outArr[7] = x15 + } else { + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + } + return +} diff --git a/vendor/github.com/dgryski/go-camellia/camellia.go b/vendor/github.com/dgryski/go-camellia/camellia.go new file mode 100644 index 0000000..048b2e3 --- /dev/null +++ b/vendor/github.com/dgryski/go-camellia/camellia.go @@ -0,0 +1,368 @@ +// Copyright (c) 2013 Damian Gryski +// Licensed under the GPLv3 or, at your option, any later version. + +// Package camellia is an implementation of the CAMELLIA encryption algorithm +/* + + This is an unoptimized version based on the description in RFC 3713. + + References: + http://en.wikipedia.org/wiki/Camellia_%28cipher%29 + https://info.isl.ntt.co.jp/crypt/eng/camellia/ +*/ +package camellia + +import ( + "crypto/cipher" + "encoding/binary" + "strconv" +) + +const BlockSize = 16 + +type KeySizeError int + +func (k KeySizeError) Error() string { + return "camellia: invalid key size " + strconv.Itoa(int(k)) +} + +type camelliaCipher struct { + kw [5]uint64 + k [25]uint64 + ke [7]uint64 + klen int +} + +const ( + sigma1 = 0xA09E667F3BCC908B + sigma2 = 0xB67AE8584CAA73B2 + sigma3 = 0xC6EF372FE94F82BE + sigma4 = 0x54FF53A5F1D36F1C + sigma5 = 0x10E527FADE682D1D + sigma6 = 0xB05688C2B3E6C1FD +) + +func init() { + // initialize other sboxes + for i := range sbox1 { + sbox2[i] = rotl8(sbox1[i], 1) + sbox3[i] = rotl8(sbox1[i], 7) + sbox4[i] = sbox1[rotl8(uint8(i), 1)] + } +} + +func rotl128(k [2]uint64, rot uint) (hi, lo uint64) { + + if rot > 64 { + rot -= 64 + k[0], k[1] = k[1], k[0] + } + + t := k[0] >> (64 - rot) + hi = (k[0] << rot) | (k[1] >> (64 - rot)) + lo = (k[1] << rot) | t + return hi, lo +} + +func rotl32(k uint32, rot uint) uint32 { + return (k << rot) | (k >> (32 - rot)) +} + +func rotl8(k byte, rot uint) byte { + return (k << rot) | (k >> (8 - rot)) +} + +// New creates and returns a new cipher.Block. +// The key argument should be 16, 24, or 32 bytes. 
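+//
+// A minimal usage sketch (the zero key, zero IV, and CBC mode here are
+// illustrative only, not part of this package):
+//
+//	block, err := camellia.New(make([]byte, 16))
+//	if err != nil {
+//		// handle KeySizeError
+//	}
+//	iv := make([]byte, camellia.BlockSize)
+//	cbc := cipher.NewCBCEncrypter(block, iv)
+//	pt := make([]byte, camellia.BlockSize) // plaintext length must be a multiple of BlockSize
+//	ct := make([]byte, len(pt))
+//	cbc.CryptBlocks(ct, pt)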
+func New(key []byte) (cipher.Block, error) { + + klen := len(key) + switch klen { + default: + return nil, KeySizeError(klen) + case 16, 24, 32: + break + } + + var d1, d2 uint64 + + var kl [2]uint64 + var kr [2]uint64 + var ka [2]uint64 + var kb [2]uint64 + + kl[0] = binary.BigEndian.Uint64(key[0:]) + kl[1] = binary.BigEndian.Uint64(key[8:]) + + switch klen { + case 24: + kr[0] = binary.BigEndian.Uint64(key[16:]) + kr[1] = ^kr[0] + case 32: + kr[0] = binary.BigEndian.Uint64(key[16:]) + kr[1] = binary.BigEndian.Uint64(key[24:]) + + } + + d1 = (kl[0] ^ kr[0]) + d2 = (kl[1] ^ kr[1]) + + d2 = d2 ^ f(d1, sigma1) + d1 = d1 ^ f(d2, sigma2) + + d1 = d1 ^ (kl[0]) + d2 = d2 ^ (kl[1]) + d2 = d2 ^ f(d1, sigma3) + d1 = d1 ^ f(d2, sigma4) + ka[0] = d1 + ka[1] = d2 + d1 = (ka[0] ^ kr[0]) + d2 = (ka[1] ^ kr[1]) + d2 = d2 ^ f(d1, sigma5) + d1 = d1 ^ f(d2, sigma6) + kb[0] = d1 + kb[1] = d2 + + // here we generate our keys + c := new(camelliaCipher) + + c.klen = klen + + if klen == 16 { + + c.kw[1], c.kw[2] = rotl128(kl, 0) + + c.k[1], c.k[2] = rotl128(ka, 0) + c.k[3], c.k[4] = rotl128(kl, 15) + c.k[5], c.k[6] = rotl128(ka, 15) + + c.ke[1], c.ke[2] = rotl128(ka, 30) + + c.k[7], c.k[8] = rotl128(kl, 45) + c.k[9], _ = rotl128(ka, 45) + _, c.k[10] = rotl128(kl, 60) + c.k[11], c.k[12] = rotl128(ka, 60) + + c.ke[3], c.ke[4] = rotl128(kl, 77) + + c.k[13], c.k[14] = rotl128(kl, 94) + c.k[15], c.k[16] = rotl128(ka, 94) + c.k[17], c.k[18] = rotl128(kl, 111) + + c.kw[3], c.kw[4] = rotl128(ka, 111) + + } else { + // 24 or 32 + + c.kw[1], c.kw[2] = rotl128(kl, 0) + + c.k[1], c.k[2] = rotl128(kb, 0) + c.k[3], c.k[4] = rotl128(kr, 15) + c.k[5], c.k[6] = rotl128(ka, 15) + + c.ke[1], c.ke[2] = rotl128(kr, 30) + + c.k[7], c.k[8] = rotl128(kb, 30) + c.k[9], c.k[10] = rotl128(kl, 45) + c.k[11], c.k[12] = rotl128(ka, 45) + + c.ke[3], c.ke[4] = rotl128(kl, 60) + + c.k[13], c.k[14] = rotl128(kr, 60) + c.k[15], c.k[16] = rotl128(kb, 60) + c.k[17], c.k[18] = rotl128(kl, 77) + + c.ke[5], c.ke[6] = rotl128(ka, 77) + + c.k[19], c.k[20] = rotl128(kr, 94) + c.k[21], c.k[22] = rotl128(ka, 94) + c.k[23], c.k[24] = rotl128(kl, 111) + + c.kw[3], c.kw[4] = rotl128(kb, 111) + } + + return c, nil +} + +func (c *camelliaCipher) Encrypt(dst, src []byte) { + + d1 := binary.BigEndian.Uint64(src[0:]) + d2 := binary.BigEndian.Uint64(src[8:]) + + d1 ^= c.kw[1] + d2 ^= c.kw[2] + + d2 = d2 ^ f(d1, c.k[1]) + d1 = d1 ^ f(d2, c.k[2]) + d2 = d2 ^ f(d1, c.k[3]) + d1 = d1 ^ f(d2, c.k[4]) + d2 = d2 ^ f(d1, c.k[5]) + d1 = d1 ^ f(d2, c.k[6]) + + d1 = fl(d1, c.ke[1]) + d2 = flinv(d2, c.ke[2]) + + d2 = d2 ^ f(d1, c.k[7]) + d1 = d1 ^ f(d2, c.k[8]) + d2 = d2 ^ f(d1, c.k[9]) + d1 = d1 ^ f(d2, c.k[10]) + d2 = d2 ^ f(d1, c.k[11]) + d1 = d1 ^ f(d2, c.k[12]) + + d1 = fl(d1, c.ke[3]) + d2 = flinv(d2, c.ke[4]) + + d2 = d2 ^ f(d1, c.k[13]) + d1 = d1 ^ f(d2, c.k[14]) + d2 = d2 ^ f(d1, c.k[15]) + d1 = d1 ^ f(d2, c.k[16]) + d2 = d2 ^ f(d1, c.k[17]) + d1 = d1 ^ f(d2, c.k[18]) + + if c.klen > 16 { + // 24 or 32 + + d1 = fl(d1, c.ke[5]) + d2 = flinv(d2, c.ke[6]) + + d2 = d2 ^ f(d1, c.k[19]) + d1 = d1 ^ f(d2, c.k[20]) + d2 = d2 ^ f(d1, c.k[21]) + d1 = d1 ^ f(d2, c.k[22]) + d2 = d2 ^ f(d1, c.k[23]) + d1 = d1 ^ f(d2, c.k[24]) + } + + d2 = d2 ^ c.kw[3] + d1 = d1 ^ c.kw[4] + + binary.BigEndian.PutUint64(dst[0:], d2) + binary.BigEndian.PutUint64(dst[8:], d1) +} + +func (c *camelliaCipher) Decrypt(dst, src []byte) { + + d2 := binary.BigEndian.Uint64(src[0:]) + d1 := binary.BigEndian.Uint64(src[8:]) + + d1 = d1 ^ c.kw[4] + d2 = d2 ^ c.kw[3] + + if c.klen > 16 { + // 24 or 32 + + d1 = 
d1 ^ f(d2, c.k[24]) + d2 = d2 ^ f(d1, c.k[23]) + d1 = d1 ^ f(d2, c.k[22]) + d2 = d2 ^ f(d1, c.k[21]) + d1 = d1 ^ f(d2, c.k[20]) + d2 = d2 ^ f(d1, c.k[19]) + + d2 = fl(d2, c.ke[6]) + d1 = flinv(d1, c.ke[5]) + } + + d1 = d1 ^ f(d2, c.k[18]) + d2 = d2 ^ f(d1, c.k[17]) + d1 = d1 ^ f(d2, c.k[16]) + d2 = d2 ^ f(d1, c.k[15]) + d1 = d1 ^ f(d2, c.k[14]) + d2 = d2 ^ f(d1, c.k[13]) + + d2 = fl(d2, c.ke[4]) + d1 = flinv(d1, c.ke[3]) + + d1 = d1 ^ f(d2, c.k[12]) + d2 = d2 ^ f(d1, c.k[11]) + d1 = d1 ^ f(d2, c.k[10]) + d2 = d2 ^ f(d1, c.k[9]) + d1 = d1 ^ f(d2, c.k[8]) + d2 = d2 ^ f(d1, c.k[7]) + + d2 = fl(d2, c.ke[2]) + d1 = flinv(d1, c.ke[1]) + + d1 = d1 ^ f(d2, c.k[6]) + d2 = d2 ^ f(d1, c.k[5]) + d1 = d1 ^ f(d2, c.k[4]) + d2 = d2 ^ f(d1, c.k[3]) + d1 = d1 ^ f(d2, c.k[2]) + d2 = d2 ^ f(d1, c.k[1]) + + d2 ^= c.kw[2] + d1 ^= c.kw[1] + + binary.BigEndian.PutUint64(dst[0:], d1) + binary.BigEndian.PutUint64(dst[8:], d2) +} + +func (c *camelliaCipher) BlockSize() int { + return BlockSize +} + +func f(fin, ke uint64) uint64 { + var x uint64 + x = fin ^ ke + t1 := sbox1[uint8(x>>56)] + t2 := sbox2[uint8(x>>48)] + t3 := sbox3[uint8(x>>40)] + t4 := sbox4[uint8(x>>32)] + t5 := sbox2[uint8(x>>24)] + t6 := sbox3[uint8(x>>16)] + t7 := sbox4[uint8(x>>8)] + t8 := sbox1[uint8(x)] + y1 := t1 ^ t3 ^ t4 ^ t6 ^ t7 ^ t8 + y2 := t1 ^ t2 ^ t4 ^ t5 ^ t7 ^ t8 + y3 := t1 ^ t2 ^ t3 ^ t5 ^ t6 ^ t8 + y4 := t2 ^ t3 ^ t4 ^ t5 ^ t6 ^ t7 + y5 := t1 ^ t2 ^ t6 ^ t7 ^ t8 + y6 := t2 ^ t3 ^ t5 ^ t7 ^ t8 + y7 := t3 ^ t4 ^ t5 ^ t6 ^ t8 + y8 := t1 ^ t4 ^ t5 ^ t6 ^ t7 + return uint64(y1)<<56 | uint64(y2)<<48 | uint64(y3)<<40 | uint64(y4)<<32 | uint64(y5)<<24 | uint64(y6)<<16 | uint64(y7)<<8 | uint64(y8) +} + +func fl(flin, ke uint64) uint64 { + x1 := uint32(flin >> 32) + x2 := uint32(flin & 0xffffffff) + k1 := uint32(ke >> 32) + k2 := uint32(ke & 0xffffffff) + x2 = x2 ^ rotl32(x1&k1, 1) + x1 = x1 ^ (x2 | k2) + return uint64(x1)<<32 | uint64(x2) +} + +func flinv(flin, ke uint64) uint64 { + y1 := uint32(flin >> 32) + y2 := uint32(flin & 0xffffffff) + k1 := uint32(ke >> 32) + k2 := uint32(ke & 0xffffffff) + y1 = y1 ^ (y2 | k2) + y2 = y2 ^ rotl32(y1&k1, 1) + return uint64(y1)<<32 | uint64(y2) +} + +var sbox1 = [...]byte{ + 0x70, 0x82, 0x2c, 0xec, 0xb3, 0x27, 0xc0, 0xe5, 0xe4, 0x85, 0x57, 0x35, 0xea, 0x0c, 0xae, 0x41, + 0x23, 0xef, 0x6b, 0x93, 0x45, 0x19, 0xa5, 0x21, 0xed, 0x0e, 0x4f, 0x4e, 0x1d, 0x65, 0x92, 0xbd, + 0x86, 0xb8, 0xaf, 0x8f, 0x7c, 0xeb, 0x1f, 0xce, 0x3e, 0x30, 0xdc, 0x5f, 0x5e, 0xc5, 0x0b, 0x1a, + 0xa6, 0xe1, 0x39, 0xca, 0xd5, 0x47, 0x5d, 0x3d, 0xd9, 0x01, 0x5a, 0xd6, 0x51, 0x56, 0x6c, 0x4d, + 0x8b, 0x0d, 0x9a, 0x66, 0xfb, 0xcc, 0xb0, 0x2d, 0x74, 0x12, 0x2b, 0x20, 0xf0, 0xb1, 0x84, 0x99, + 0xdf, 0x4c, 0xcb, 0xc2, 0x34, 0x7e, 0x76, 0x05, 0x6d, 0xb7, 0xa9, 0x31, 0xd1, 0x17, 0x04, 0xd7, + 0x14, 0x58, 0x3a, 0x61, 0xde, 0x1b, 0x11, 0x1c, 0x32, 0x0f, 0x9c, 0x16, 0x53, 0x18, 0xf2, 0x22, + 0xfe, 0x44, 0xcf, 0xb2, 0xc3, 0xb5, 0x7a, 0x91, 0x24, 0x08, 0xe8, 0xa8, 0x60, 0xfc, 0x69, 0x50, + 0xaa, 0xd0, 0xa0, 0x7d, 0xa1, 0x89, 0x62, 0x97, 0x54, 0x5b, 0x1e, 0x95, 0xe0, 0xff, 0x64, 0xd2, + 0x10, 0xc4, 0x00, 0x48, 0xa3, 0xf7, 0x75, 0xdb, 0x8a, 0x03, 0xe6, 0xda, 0x09, 0x3f, 0xdd, 0x94, + 0x87, 0x5c, 0x83, 0x02, 0xcd, 0x4a, 0x90, 0x33, 0x73, 0x67, 0xf6, 0xf3, 0x9d, 0x7f, 0xbf, 0xe2, + 0x52, 0x9b, 0xd8, 0x26, 0xc8, 0x37, 0xc6, 0x3b, 0x81, 0x96, 0x6f, 0x4b, 0x13, 0xbe, 0x63, 0x2e, + 0xe9, 0x79, 0xa7, 0x8c, 0x9f, 0x6e, 0xbc, 0x8e, 0x29, 0xf5, 0xf9, 0xb6, 0x2f, 0xfd, 0xb4, 0x59, + 0x78, 0x98, 0x06, 0x6a, 0xe7, 0x46, 0x71, 0xba, 0xd4, 0x25, 0xab, 0x42, 0x88, 
0xa2, 0x8d, 0xfa,
+	0x72, 0x07, 0xb9, 0x55, 0xf8, 0xee, 0xac, 0x0a, 0x36, 0x49, 0x2a, 0x68, 0x3c, 0x38, 0xf1, 0xa4,
+	0x40, 0x28, 0xd3, 0x7b, 0xbb, 0xc9, 0x43, 0xc1, 0x15, 0xe3, 0xad, 0xf4, 0x77, 0xc7, 0x80, 0x9e,
+}
+
+var sbox2 [256]byte
+var sbox3 [256]byte
+var sbox4 [256]byte
+
+var _ cipher.Block = &camelliaCipher{}
diff --git a/vendor/github.com/dgryski/go-camellia/t_camellia.pl b/vendor/github.com/dgryski/go-camellia/t_camellia.pl
new file mode 100644
index 0000000..f95ba48
--- /dev/null
+++ b/vendor/github.com/dgryski/go-camellia/t_camellia.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+
+# to run the full verification suite:
+# curl http://info.isl.ntt.co.jp/crypt/eng/camellia/dl/cryptrec/t_camellia.txt |perl ./t_camellia.pl >cfull_test.go
+# perl -pi -e 's/range camelliaTests/$&Full/' camellia_test.go
+# go test
+
+print <<EOF;
+package camellia
+
+var camelliaTestsFull = []struct {
+	key    []byte
+	plain  []byte
+	cipher []byte
+}{
+EOF
+
+sub linetostr {
+	my $l = shift;
+	$l =~ s/^[KPC][^:]*://;
+	my @bytes = $l =~ /([0-9a-fA-F]{2})/g;
+	return join(", ", map { "0x$_" } @bytes);
+}
+
+while (<>) {
+	next if /^\s*$/ or /^Camellia/;
+	if (/^K/) {
+		$k = linetostr($_);
+	}
+
+	if (/^P/) {
+		$p = linetostr($_);
+	}
+
+	if (/^C/) {
+		$c = linetostr($_);
+		print <<"GOCODE";
+	{
+		[]byte{$k},
+		[]byte{$p},
+		[]byte{$c},
+	},
+GOCODE
+	}
+}
+
+print <<EOF;
+}
+EOF
diff --git a/vendor/github.com/dgryski/go-idea/idea.go b/vendor/github.com/dgryski/go-idea/idea.go
new file mode 100644
--- /dev/null
+++ b/vendor/github.com/dgryski/go-idea/idea.go
+// Package idea implements the IDEA block cipher
+//
+// https://en.wikipedia.org/wiki/International_Data_Encryption_Algorithm
+package idea
+
+import (
+	"crypto/cipher"
+	"encoding/binary"
+	"strconv"
+)
+
+// The IDEA block size in bytes
+const BlockSize = 8
+
+const (
+	rounds = 8
+	keyLen = 6*rounds + 4
+)
+
+type ideaCipher struct {
+	ek [keyLen]uint16
+	dk [keyLen]uint16
+}
+
+// KeySizeError indicates the supplied key was invalid
+type KeySizeError int
+
+func (k KeySizeError) Error() string { return "idea: invalid key size " + strconv.Itoa(int(k)) }
+
+// NewCipher returns a cipher.Block implementing IDEA. The key must be 16 bytes.
+func NewCipher(key []byte) (cipher.Block, error) {
+	if l := len(key); l != 16 {
+		return nil, KeySizeError(l)
+	}
+	c := new(ideaCipher)
+	expandKey(key, c.ek[:])
+	invertKey(c.ek[:], c.dk[:])
+	return c, nil
+}
+
+func (c *ideaCipher) BlockSize() int { return BlockSize }
+
+func (c *ideaCipher) Encrypt(dst, src []byte) { crypt(src, dst, c.ek[:]) }
+
+func (c *ideaCipher) Decrypt(dst, src []byte) { crypt(src, dst, c.dk[:]) }
+
+// mulInv computes the multiplicative inverse of x mod 2^16+1
+func mulInv(x uint16) (t1 uint16) {
+	if x <= 1 {
+		return x // 0 and 1 are self-inverse
+	}
+	t1 = uint16(0x10001 / uint32(x)) // Since x >= 2, this fits into 16 bits
+	y := uint16(0x10001 % uint32(x))
+
+	if y == 1 {
+		return 1 - t1
+	}
+
+	var t0 uint16 = 1
+	var q uint16
+
+	for y != 1 {
+		q = x / y
+		x = x % y
+		t0 += q * t1
+		if x == 1 {
+			return t0
+		}
+		q = y / x
+		y = y % x
+		t1 += q * t0
+	}
+	return 1 - t1
+}
+
+// mul computes x*y mod 2^16+1
+func mul(x, y uint16) uint16 {
+
+	if y == 0 {
+		return 1 - x
+	}
+
+	if x == 0 {
+		return 1 - y
+	}
+
+	t32 := uint32(x) * uint32(y)
+	x = uint16(t32)
+	y = uint16(t32 >> 16)
+
+	if x < y {
+		return x - y + 1
+	}
+
+	return x - y
+}
+
+// expandKey computes encryption round-keys from a user-supplied key
+func expandKey(key []byte, EK []uint16) {
+	var i, j int
+
+	for j = 0; j < 8; j++ {
+		EK[j] = (uint16(key[0]) << 8) + uint16(key[1])
+		key = key[2:]
+	}
+	for i = 0; j < keyLen; j++ {
+		i++
+		EK[i+7] = EK[i&7]<<9 | EK[(i+1)&7]>>7
+		EK = EK[i&8:]
+		i &= 7
+	}
+}
+
+// invertKey computes the decryption round-keys from a set of encryption round-keys
+func invertKey(EK []uint16, DK []uint16) {
+
+	var t1, t2, t3 uint16
+	var p [keyLen]uint16
+	pidx := keyLen
+	ekidx := 0
+
+	t1 = mulInv(EK[ekidx])
+	ekidx++
+	t2 = -EK[ekidx]
+	ekidx++
+	t3 = -EK[ekidx]
+	ekidx++
+	pidx--
+	p[pidx] = mulInv(EK[ekidx])
+	ekidx++
+	pidx--
+	p[pidx] = t3
+	pidx--
+	p[pidx] = t2
+	pidx--
+	p[pidx] = t1
+
+	for i := 0; i < rounds-1; i++ {
+		t1 = EK[ekidx]
+		ekidx++
+		pidx--
+		p[pidx] = EK[ekidx]
+		ekidx++
+		pidx--
+		p[pidx] = t1
+
+		t1 = mulInv(EK[ekidx])
+		ekidx++
+		t2 = -EK[ekidx]
+		ekidx++
+		t3 = -EK[ekidx]
+		ekidx++
+		pidx--
+		p[pidx] = mulInv(EK[ekidx])
+		ekidx++
+		pidx--
+		p[pidx] = t2
+		pidx--
+		p[pidx] = t3
+		pidx--
+		p[pidx] = t1
+	}
+
+	t1 = EK[ekidx]
+	ekidx++
+	pidx--
+	p[pidx] = EK[ekidx]
+	ekidx++
+	pidx--
+	p[pidx] = t1
+
+	t1 = mulInv(EK[ekidx])
+	ekidx++
+	t2 = -EK[ekidx]
+	ekidx++
+	t3 = -EK[ekidx]
+	ekidx++
+	pidx--
+	p[pidx] = mulInv(EK[ekidx])
+	pidx--
+	p[pidx] = t3
+	pidx--
+	p[pidx] = t2
+	pidx--
+	p[pidx] = t1
+
+	copy(DK, p[:])
+}
+
+// crypt performs IDEA encryption given input/output buffers and a set of round-keys
+func crypt(inbuf, outbuf []byte, key []uint16) {
+
+	var x1, x2, x3, x4, s2, s3 uint16
+
+	x1 = binary.BigEndian.Uint16(inbuf[0:])
+	x2 = binary.BigEndian.Uint16(inbuf[2:])
+	x3 = binary.BigEndian.Uint16(inbuf[4:])
+	x4 = binary.BigEndian.Uint16(inbuf[6:])
+
+	for r := rounds; r > 0; r-- {
+
+		x1 = mul(x1, key[0])
+		key = key[1:]
+		x2 += key[0]
+		key = key[1:]
+		x3 += key[0]
+		key = key[1:]
+
+ x4 = mul(x4, key[0]) + key = key[1:] + + s3 = x3 + x3 ^= x1 + x3 = mul(x3, key[0]) + key = key[1:] + s2 = x2 + + x2 ^= x4 + x2 += x3 + x2 = mul(x2, key[0]) + key = key[1:] + x3 += x2 + + x1 ^= x2 + x4 ^= x3 + + x2 ^= s3 + x3 ^= s2 + + } + x1 = mul(x1, key[0]) + key = key[1:] + + x3 += key[0] + key = key[1:] + x2 += key[0] + key = key[1:] + x4 = mul(x4, key[0]) + + binary.BigEndian.PutUint16(outbuf[0:], x1) + binary.BigEndian.PutUint16(outbuf[2:], x3) + binary.BigEndian.PutUint16(outbuf[4:], x2) + binary.BigEndian.PutUint16(outbuf[6:], x4) +} diff --git a/vendor/github.com/dgryski/go-rc2/LICENSE b/vendor/github.com/dgryski/go-rc2/LICENSE new file mode 100644 index 0000000..039a2e4 --- /dev/null +++ b/vendor/github.com/dgryski/go-rc2/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Damian Gryski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/dgryski/go-rc2/rc2.go b/vendor/github.com/dgryski/go-rc2/rc2.go new file mode 100644 index 0000000..aa194e5 --- /dev/null +++ b/vendor/github.com/dgryski/go-rc2/rc2.go @@ -0,0 +1,284 @@ +// Package rc2 implements the RC2 cipher +/* +https://www.ietf.org/rfc/rfc2268.txt +http://people.csail.mit.edu/rivest/pubs/KRRR98.pdf + +This code is licensed under the MIT license. 
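+
+A minimal usage sketch (the all-zero 16-byte key and the effective key
+size of 128 bits are illustrative only):
+
+	block, err := rc2.New(make([]byte, 16), 128)
+	if err != nil {
+		// handle KeySizeError / EffectiveKeySizeError
+	}
+	var pt, ct [rc2.BlockSize]byte
+	block.Encrypt(ct[:], pt[:])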
+*/
+package rc2
+
+import (
+	"crypto/cipher"
+	"encoding/binary"
+	"strconv"
+)
+
+// The rc2 block size in bytes
+const BlockSize = 8
+
+type rc2Cipher struct {
+	k [64]uint16
+}
+
+// KeySizeError indicates the supplied key was invalid
+type KeySizeError int
+
+func (k KeySizeError) Error() string { return "rc2: invalid key size " + strconv.Itoa(int(k)) }
+
+// EffectiveKeySizeError indicates the supplied effective key length was invalid
+type EffectiveKeySizeError int
+
+func (k EffectiveKeySizeError) Error() string {
+	return "rc2: invalid effective key size " + strconv.Itoa(int(k))
+}
+
+// New returns a new rc2 cipher with the given key and effective key length t1
+func New(key []byte, t1 int) (cipher.Block, error) {
+	if l := len(key); l == 0 || l > 128 {
+		return nil, KeySizeError(l)
+	}
+
+	if t1 < 8 || t1 > 1024 {
+		return nil, EffectiveKeySizeError(t1)
+	}
+
+	return &rc2Cipher{
+		k: expandKey(key, t1),
+	}, nil
+}
+
+func (c *rc2Cipher) BlockSize() int { return BlockSize }
+
+var piTable = [256]byte{
+	0xd9, 0x78, 0xf9, 0xc4, 0x19, 0xdd, 0xb5, 0xed, 0x28, 0xe9, 0xfd, 0x79, 0x4a, 0xa0, 0xd8, 0x9d,
+	0xc6, 0x7e, 0x37, 0x83, 0x2b, 0x76, 0x53, 0x8e, 0x62, 0x4c, 0x64, 0x88, 0x44, 0x8b, 0xfb, 0xa2,
+	0x17, 0x9a, 0x59, 0xf5, 0x87, 0xb3, 0x4f, 0x13, 0x61, 0x45, 0x6d, 0x8d, 0x09, 0x81, 0x7d, 0x32,
+	0xbd, 0x8f, 0x40, 0xeb, 0x86, 0xb7, 0x7b, 0x0b, 0xf0, 0x95, 0x21, 0x22, 0x5c, 0x6b, 0x4e, 0x82,
+	0x54, 0xd6, 0x65, 0x93, 0xce, 0x60, 0xb2, 0x1c, 0x73, 0x56, 0xc0, 0x14, 0xa7, 0x8c, 0xf1, 0xdc,
+	0x12, 0x75, 0xca, 0x1f, 0x3b, 0xbe, 0xe4, 0xd1, 0x42, 0x3d, 0xd4, 0x30, 0xa3, 0x3c, 0xb6, 0x26,
+	0x6f, 0xbf, 0x0e, 0xda, 0x46, 0x69, 0x07, 0x57, 0x27, 0xf2, 0x1d, 0x9b, 0xbc, 0x94, 0x43, 0x03,
+	0xf8, 0x11, 0xc7, 0xf6, 0x90, 0xef, 0x3e, 0xe7, 0x06, 0xc3, 0xd5, 0x2f, 0xc8, 0x66, 0x1e, 0xd7,
+	0x08, 0xe8, 0xea, 0xde, 0x80, 0x52, 0xee, 0xf7, 0x84, 0xaa, 0x72, 0xac, 0x35, 0x4d, 0x6a, 0x2a,
+	0x96, 0x1a, 0xd2, 0x71, 0x5a, 0x15, 0x49, 0x74, 0x4b, 0x9f, 0xd0, 0x5e, 0x04, 0x18, 0xa4, 0xec,
+	0xc2, 0xe0, 0x41, 0x6e, 0x0f, 0x51, 0xcb, 0xcc, 0x24, 0x91, 0xaf, 0x50, 0xa1, 0xf4, 0x70, 0x39,
+	0x99, 0x7c, 0x3a, 0x85, 0x23, 0xb8, 0xb4, 0x7a, 0xfc, 0x02, 0x36, 0x5b, 0x25, 0x55, 0x97, 0x31,
+	0x2d, 0x5d, 0xfa, 0x98, 0xe3, 0x8a, 0x92, 0xae, 0x05, 0xdf, 0x29, 0x10, 0x67, 0x6c, 0xba, 0xc9,
+	0xd3, 0x00, 0xe6, 0xcf, 0xe1, 0x9e, 0xa8, 0x2c, 0x63, 0x16, 0x01, 0x3f, 0x58, 0xe2, 0x89, 0xa9,
+	0x0d, 0x38, 0x34, 0x1b, 0xab, 0x33, 0xff, 0xb0, 0xbb, 0x48, 0x0c, 0x5f, 0xb9, 0xb1, 0xcd, 0x2e,
+	0xc5, 0xf3, 0xdb, 0x47, 0xe5, 0xa5, 0x9c, 0x77, 0x0a, 0xa6, 0x20, 0x68, 0xfe, 0x7f, 0xc1, 0xad,
+}
+
+func expandKey(key []byte, t1 int) [64]uint16 {
+
+	l := make([]byte, 128)
+	copy(l, key)
+
+	var t = len(key)
+	var t8 = (t1 + 7) / 8
+	var tm = byte(255 % uint(1<<(8+uint(t1)-8*uint(t8))))
+
+	for i := len(key); i < 128; i++ {
+		l[i] = piTable[l[i-1]+l[uint8(i-t)]]
+	}
+
+	l[128-t8] = piTable[l[128-t8]&tm]
+
+	for i := 127 - t8; i >= 0; i-- {
+		l[i] = piTable[l[i+1]^l[i+t8]]
+	}
+
+	var k [64]uint16
+
+	for i := range k {
+		k[i] = uint16(l[2*i]) + uint16(l[2*i+1])*256
+	}
+
+	return k
+}
+
+func rotl16(x uint16, b uint) uint16 {
+	return (x >> (16 - b)) | (x << b)
+}
+
+func (c *rc2Cipher) Encrypt(dst, src []byte) {
+
+	r0 := binary.LittleEndian.Uint16(src[0:])
+	r1 := binary.LittleEndian.Uint16(src[2:])
+	r2 := binary.LittleEndian.Uint16(src[4:])
+	r3 := binary.LittleEndian.Uint16(src[6:])
+
+	var j int
+
+	// These three mix blocks have not been extracted to a common function for performance reasons.
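+	// Each mix step computes r[i] += k[j] + (r[i-1] & r[i-2]) + (^r[i-1] & r[i-3])
+	// (indices mod 4) and then rotates r[i] left by s[i], where s = {1, 2, 3, 5},
+	// as specified in RFC 2268.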
+ for j <= 16 { + // mix r0 + r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1) + r0 = rotl16(r0, 1) + j++ + + // mix r1 + r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2) + r1 = rotl16(r1, 2) + j++ + + // mix r2 + r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3) + r2 = rotl16(r2, 3) + j++ + + // mix r3 + r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0) + r3 = rotl16(r3, 5) + j++ + } + + r0 = r0 + c.k[r3&63] + r1 = r1 + c.k[r0&63] + r2 = r2 + c.k[r1&63] + r3 = r3 + c.k[r2&63] + + for j <= 40 { + // mix r0 + r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1) + r0 = rotl16(r0, 1) + j++ + + // mix r1 + r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2) + r1 = rotl16(r1, 2) + j++ + + // mix r2 + r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3) + r2 = rotl16(r2, 3) + j++ + + // mix r3 + r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0) + r3 = rotl16(r3, 5) + j++ + } + + r0 = r0 + c.k[r3&63] + r1 = r1 + c.k[r0&63] + r2 = r2 + c.k[r1&63] + r3 = r3 + c.k[r2&63] + + for j <= 60 { + // mix r0 + r0 = r0 + c.k[j] + (r3 & r2) + ((^r3) & r1) + r0 = rotl16(r0, 1) + j++ + + // mix r1 + r1 = r1 + c.k[j] + (r0 & r3) + ((^r0) & r2) + r1 = rotl16(r1, 2) + j++ + + // mix r2 + r2 = r2 + c.k[j] + (r1 & r0) + ((^r1) & r3) + r2 = rotl16(r2, 3) + j++ + + // mix r3 + r3 = r3 + c.k[j] + (r2 & r1) + ((^r2) & r0) + r3 = rotl16(r3, 5) + j++ + } + + binary.LittleEndian.PutUint16(dst[0:], r0) + binary.LittleEndian.PutUint16(dst[2:], r1) + binary.LittleEndian.PutUint16(dst[4:], r2) + binary.LittleEndian.PutUint16(dst[6:], r3) +} + +func (c *rc2Cipher) Decrypt(dst, src []byte) { + + r0 := binary.LittleEndian.Uint16(src[0:]) + r1 := binary.LittleEndian.Uint16(src[2:]) + r2 := binary.LittleEndian.Uint16(src[4:]) + r3 := binary.LittleEndian.Uint16(src[6:]) + + j := 63 + + for j >= 44 { + // unmix r3 + r3 = rotl16(r3, 16-5) + r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0) + j-- + + // unmix r2 + r2 = rotl16(r2, 16-3) + r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3) + j-- + + // unmix r1 + r1 = rotl16(r1, 16-2) + r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2) + j-- + + // unmix r0 + r0 = rotl16(r0, 16-1) + r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1) + j-- + } + + r3 = r3 - c.k[r2&63] + r2 = r2 - c.k[r1&63] + r1 = r1 - c.k[r0&63] + r0 = r0 - c.k[r3&63] + + for j >= 20 { + // unmix r3 + r3 = rotl16(r3, 16-5) + r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0) + j-- + + // unmix r2 + r2 = rotl16(r2, 16-3) + r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3) + j-- + + // unmix r1 + r1 = rotl16(r1, 16-2) + r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2) + j-- + + // unmix r0 + r0 = rotl16(r0, 16-1) + r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1) + j-- + } + + r3 = r3 - c.k[r2&63] + r2 = r2 - c.k[r1&63] + r1 = r1 - c.k[r0&63] + r0 = r0 - c.k[r3&63] + + for j >= 0 { + // unmix r3 + r3 = rotl16(r3, 16-5) + r3 = r3 - c.k[j] - (r2 & r1) - ((^r2) & r0) + j-- + + // unmix r2 + r2 = rotl16(r2, 16-3) + r2 = r2 - c.k[j] - (r1 & r0) - ((^r1) & r3) + j-- + + // unmix r1 + r1 = rotl16(r1, 16-2) + r1 = r1 - c.k[j] - (r0 & r3) - ((^r0) & r2) + j-- + + // unmix r0 + r0 = rotl16(r0, 16-1) + r0 = r0 - c.k[j] - (r3 & r2) - ((^r3) & r1) + j-- + } + + binary.LittleEndian.PutUint16(dst[0:], r0) + binary.LittleEndian.PutUint16(dst[2:], r1) + binary.LittleEndian.PutUint16(dst[4:], r2) + binary.LittleEndian.PutUint16(dst[6:], r3) +} diff --git a/vendor/github.com/nadoo/conflag/LICENSE b/vendor/github.com/nadoo/conflag/LICENSE new file mode 100644 index 0000000..9cecc1d --- /dev/null +++ b/vendor/github.com/nadoo/conflag/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free 
Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+ + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. 
You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. 
+ + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. 
+ + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. 
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    {one line to give the program's name and a brief idea of what it does.}
+    Copyright (C) {year}  {name of author}
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    {project}  Copyright (C) {year}  {fullname}
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/vendor/github.com/nadoo/conflag/README.md b/vendor/github.com/nadoo/conflag/README.md
new file mode 100644
index 0000000..6a1b368
--- /dev/null
+++ b/vendor/github.com/nadoo/conflag/README.md
@@ -0,0 +1,111 @@
+# conflag
+conflag is a config file and command line parser based on Go's standard flag package.
+
+## Usage
+
+### Your code:
+```Go
+package main
+
+import (
+	"fmt"
+
+	"github.com/nadoo/conflag"
+)
+
+var conf struct {
+	Name string
+	Age  int
+	Male bool
+}
+
+func main() {
+	// get a new conflag instance
+	flag := conflag.New()
+
+	// set up flags just like the standard flag package
+	flag.StringVar(&conf.Name, "name", "", "your name")
+	flag.IntVar(&conf.Age, "age", 0, "your age")
+	flag.BoolVar(&conf.Male, "male", false, "your sex")
+
+	// parse before accessing flags
+	flag.Parse()
+
+	// now you're able to get the parsed flag values
+	fmt.Printf(" Name: %s\n", conf.Name)
+	fmt.Printf(" Age: %d\n", conf.Age)
+	fmt.Printf(" Male: %v\n", conf.Male)
+}
+```
+
+### Run without a config file:
+command:
+```bash
+sample -name Jay -age 30
+```
+output:
+```bash
+ Name: Jay
+ Age: 30
+ Male: false
+```
+
+### Run with a config file (-config):
+sample.conf:
+```bash
+name=Jason
+age=20
+male
+```
+command: **use the "-config" flag to specify the config file path.**
+```bash
+sample -config sample.conf
+```
+output:
+```bash
+ Name: Jason
+ Age: 20
+ Male: true
+```
+
+### Run with a config file and OVERRIDE a flag value from the command line:
+sample.conf:
+```bash
+name=Jason
+age=20
+male
+```
+command:
+```bash
+sample -config sample.conf -name Michael
+```
+output:
+```bash
+ Name: Michael
+ Age: 20
+ Male: true
+```
+
+## Config File
+- format: KEY=VALUE
+
+**just use the command line flag name as the key name**:
+
+```bash
+## config file
+# comment lines start with "#"
+
+# format:
+#KEY=VALUE,
+# just use the command line flag name as the key name
+
+# your name
+name=Jason
+
+# your age
+age=20
+
+# are you male?
+male
+```
+See [simple.conf](examples/simple/simple.conf)
\ No newline at end of file
diff --git a/vendor/github.com/nadoo/conflag/conflag.go b/vendor/github.com/nadoo/conflag/conflag.go
new file mode 100644
index 0000000..c011540
--- /dev/null
+++ b/vendor/github.com/nadoo/conflag/conflag.go
@@ -0,0 +1,151 @@
+package conflag
+
+import (
+	"bufio"
+	"flag"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// Conflag is a flag.FlagSet wrapper that can also read flags from a config file.
+type Conflag struct {
+	*flag.FlagSet
+
+	app     string
+	osArgs  []string
+	cfgFile string
+	args    []string
+
+	includes []string
+
+	// TODO: add shorthand? or just use pflag?
+	// shorthand map[byte]string
+}
+
+// New returns a new Conflag; when no args are given, os.Args is used.
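+// A usage sketch (the "name" flag is illustrative):
+//
+//	c := conflag.New()
+//	name := c.String("name", "", "your name")
+//	if err := c.Parse(); err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Println(*name)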
+func New(args ...string) *Conflag {
+	if args == nil {
+		args = os.Args
+	}
+
+	c := &Conflag{}
+	c.app = args[0]
+	c.osArgs = args[1:]
+	c.FlagSet = flag.NewFlagSet(c.app, flag.ExitOnError)
+	c.FlagSet.StringVar(&c.cfgFile, "config", "", "config file path")
+
+	return c
+}
+
+// NewFromFile returns a new Conflag that reads flags from cfgFile.
+func NewFromFile(app, cfgFile string) *Conflag {
+	c := &Conflag{}
+
+	if app != "" {
+		c.app = app
+	} else {
+		c.app = os.Args[0]
+	}
+
+	c.cfgFile = cfgFile
+	c.FlagSet = flag.NewFlagSet(c.app, flag.ExitOnError)
+
+	c.StringSliceUniqVar(&c.includes, "include", nil, "include file")
+
+	return c
+}
+
+// Parse parses flags from the command line and the config file,
+// including any files pulled in via "include".
+func (c *Conflag) Parse() (err error) {
+	// parse 1st time and see whether there is a conf file.
+	err = c.FlagSet.Parse(c.osArgs)
+	if err != nil {
+		return err
+	}
+
+	// if there are no args, just try to load the app.conf file.
+	if c.cfgFile == "" && len(c.osArgs) == 0 {
+		// trim the app extension
+		for i := len(c.app) - 1; i >= 0 && c.app[i] != '/' && c.app[i] != '\\'; i-- {
+			if c.app[i] == '.' {
+				c.cfgFile = c.app[:i]
+				break
+			}
+		}
+
+		if c.cfgFile == "" {
+			c.cfgFile = c.app
+		}
+
+		c.cfgFile += ".conf"
+	}
+
+	if c.cfgFile == "" {
+		return nil
+	}
+
+	fargs, err := parseFile(c.cfgFile)
+	if err != nil {
+		return err
+	}
+
+	c.args = fargs
+	c.args = append(c.args, c.osArgs...)
+
+	// parse 2nd time to get the include file values
+	err = c.FlagSet.Parse(c.args)
+	if err != nil {
+		return err
+	}
+
+	dir := filepath.Dir(c.cfgFile)
+
+	// parse 3rd time to parse flags in include files
+	for _, include := range c.includes {
+		include = filepath.Join(dir, include)
+		fargs, err := parseFile(include)
+		if err != nil {
+			return err
+		}
+
+		c.args = fargs
+		c.args = append(c.args, c.osArgs...)
+
+		if err = c.FlagSet.Parse(c.args); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func parseFile(cfgFile string) ([]string, error) {
+	var s []string
+
+	fp, err := os.Open(cfgFile)
+	if err != nil {
+		return nil, err
+	}
+	defer fp.Close()
+
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		line := scanner.Text()
+		line = strings.TrimSpace(line)
+		if len(line) == 0 || line[:1] == "#" {
+			continue
+		}
+		s = append(s, "-"+line)
+	}
+
+	return s, scanner.Err()
+}
+
+// AppDir returns the app dir
+func (c *Conflag) AppDir() string {
+	return filepath.Dir(os.Args[0])
+}
+
+// ConfDir returns the config file dir
+func (c *Conflag) ConfDir() string {
+	return filepath.Dir(c.cfgFile)
+}
diff --git a/vendor/github.com/nadoo/conflag/string_slice.go b/vendor/github.com/nadoo/conflag/string_slice.go
new file mode 100644
index 0000000..e7a02f1
--- /dev/null
+++ b/vendor/github.com/nadoo/conflag/string_slice.go
@@ -0,0 +1,47 @@
+// source: https://github.com/spf13/pflag/blob/master/string_slice.go
+
+package conflag
+
+type stringSliceValue struct {
+	value   *[]string
+	changed bool
+}
+
+func newStringSliceValue(val []string, p *[]string) *stringSliceValue {
+	ssv := new(stringSliceValue)
+	ssv.value = p
+	*ssv.value = val
+	return ssv
+}
+
+func (s *stringSliceValue) Set(val string) error {
+	if !s.changed {
+		*s.value = []string{val}
+		s.changed = true
+	} else {
+		*s.value = append(*s.value, val)
+	}
+	return nil
+}
+
+func (s *stringSliceValue) Type() string {
+	return "stringSlice"
+}
+
+func (s *stringSliceValue) String() string {
+	return ""
+}
+
+// StringSliceVar defines a string flag with specified name, default value, and usage string.
+// The argument p points to a []string variable in which to store the value of the flag.
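+// Setting the flag multiple times appends values: "-tag a -tag b" yields
+// []string{"a", "b"}. A sketch ("tag" is an illustrative flag name):
+//
+//	var tags []string
+//	c.StringSliceVar(&tags, "tag", nil, "tag, may be repeated")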
+func (c *Conflag) StringSliceVar(p *[]string, name string, value []string, usage string) { + c.Var(newStringSliceValue(value, p), name, usage) +} + +// StringSlice defines a string flag with specified name, default value, and usage string. +// The return value is the address of a []string variable that stores the value of the flag. +func (c *Conflag) StringSlice(name string, value []string, usage string) *[]string { + p := []string{} + c.StringSliceVar(&p, name, value, usage) + return &p +} diff --git a/vendor/github.com/nadoo/conflag/string_slice_uniq.go b/vendor/github.com/nadoo/conflag/string_slice_uniq.go new file mode 100644 index 0000000..70d7389 --- /dev/null +++ b/vendor/github.com/nadoo/conflag/string_slice_uniq.go @@ -0,0 +1,51 @@ +package conflag + +type stringSliceUniqValue struct { + *stringSliceValue +} + +func newStringSliceUniqValue(val []string, p *[]string) *stringSliceUniqValue { + return &stringSliceUniqValue{stringSliceValue: newStringSliceValue(val, p)} +} + +func (s *stringSliceUniqValue) Set(val string) error { + if !s.changed { + *s.value = []string{val} + s.changed = true + } + + dup := false + for _, v := range *s.value { + if v == val { + dup = true + } + } + + if !dup { + *s.value = append(*s.value, val) + } + + return nil +} + +func (s *stringSliceUniqValue) Type() string { + return "stringSliceUniq" +} + +func (s *stringSliceUniqValue) String() string { + return "" +} + +// StringSliceUniqVar defines a string flag with specified name, default value, and usage string. +// The argument p points to a []string variable in which to store the value of the flag. +func (c *Conflag) StringSliceUniqVar(p *[]string, name string, value []string, usage string) { + c.Var(newStringSliceUniqValue(value, p), name, usage) +} + +// StringUniqSlice defines a string flag with specified name, default value, and usage string. +// The return value is the address of a []string variable that stores the value of the flag. +func (c *Conflag) StringUniqSlice(name string, value []string, usage string) *[]string { + p := []string{} + c.StringSliceUniqVar(&p, name, value, usage) + return &p +} diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE b/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go new file mode 100644 index 0000000..c672510 --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/core/cipher.go @@ -0,0 +1,144 @@ +package core + +import ( + "crypto/md5" + "errors" + "net" + "sort" + "strings" + + "github.com/shadowsocks/go-shadowsocks2/shadowaead" + "github.com/shadowsocks/go-shadowsocks2/shadowstream" +) + +type Cipher interface { + StreamConnCipher + PacketConnCipher +} + +type StreamConnCipher interface { + StreamConn(net.Conn) net.Conn +} + +type PacketConnCipher interface { + PacketConn(net.PacketConn) net.PacketConn +} + +// ErrCipherNotSupported occurs when a cipher is not supported (likely because of security concerns). +var ErrCipherNotSupported = errors.New("cipher not supported") + +// List of AEAD ciphers: key size in bytes and constructor +var aeadList = map[string]struct { + KeySize int + New func([]byte) (shadowaead.Cipher, error) +}{ + "AEAD_AES_128_GCM": {16, shadowaead.AESGCM}, + "AEAD_AES_192_GCM": {24, shadowaead.AESGCM}, + "AEAD_AES_256_GCM": {32, shadowaead.AESGCM}, + "AEAD_CHACHA20_POLY1305": {32, shadowaead.Chacha20Poly1305}, +} + +// List of stream ciphers: key size in bytes and constructor +var streamList = map[string]struct { + KeySize int + New func(key []byte) (shadowstream.Cipher, error) +}{ + "AES-128-CTR": {16, shadowstream.AESCTR}, + "AES-192-CTR": {24, shadowstream.AESCTR}, + "AES-256-CTR": {32, shadowstream.AESCTR}, + "AES-128-CFB": {16, shadowstream.AESCFB}, + "AES-192-CFB": {24, shadowstream.AESCFB}, + "AES-256-CFB": {32, shadowstream.AESCFB}, + "CHACHA20-IETF": {32, shadowstream.Chacha20IETF}, + "XCHACHA20": {32, shadowstream.Xchacha20}, +} + +// ListCipher returns a list of available cipher names sorted alphabetically. +func ListCipher() []string { + var l []string + for k := range aeadList { + l = append(l, k) + } + for k := range streamList { + l = append(l, k) + } + sort.Strings(l) + return l +} + +// PickCipher returns a Cipher of the given name. Derive key from password if given key is empty. 
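+// A usage sketch (the address and password are placeholders; Dial is
+// defined in stream.go of this package):
+//
+//	ciph, err := PickCipher("AEAD_AES_256_GCM", nil, "password")
+//	if err != nil {
+//		return err
+//	}
+//	conn, err := Dial("tcp", "example.com:8488", ciph)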
+func PickCipher(name string, key []byte, password string) (Cipher, error) {
+	name = strings.ToUpper(name)
+
+	switch name {
+	case "DUMMY":
+		return &dummy{}, nil
+	case "CHACHA20-IETF-POLY1305":
+		name = "AEAD_CHACHA20_POLY1305"
+	case "AES-128-GCM":
+		name = "AEAD_AES_128_GCM"
+	case "AES-192-GCM":
+		name = "AEAD_AES_192_GCM"
+	case "AES-256-GCM":
+		name = "AEAD_AES_256_GCM"
+	}
+
+	if choice, ok := aeadList[name]; ok {
+		if len(key) == 0 {
+			key = kdf(password, choice.KeySize)
+		}
+		if len(key) != choice.KeySize {
+			return nil, shadowaead.KeySizeError(choice.KeySize)
+		}
+		aead, err := choice.New(key)
+		return &aeadCipher{aead}, err
+	}
+
+	if choice, ok := streamList[name]; ok {
+		if len(key) == 0 {
+			key = kdf(password, choice.KeySize)
+		}
+		if len(key) != choice.KeySize {
+			return nil, shadowstream.KeySizeError(choice.KeySize)
+		}
+		ciph, err := choice.New(key)
+		return &streamCipher{ciph}, err
+	}
+
+	return nil, ErrCipherNotSupported
+}
+
+type aeadCipher struct{ shadowaead.Cipher }
+
+func (aead *aeadCipher) StreamConn(c net.Conn) net.Conn { return shadowaead.NewConn(c, aead) }
+func (aead *aeadCipher) PacketConn(c net.PacketConn) net.PacketConn {
+	return shadowaead.NewPacketConn(c, aead)
+}
+
+type streamCipher struct{ shadowstream.Cipher }
+
+func (ciph *streamCipher) StreamConn(c net.Conn) net.Conn { return shadowstream.NewConn(c, ciph) }
+func (ciph *streamCipher) PacketConn(c net.PacketConn) net.PacketConn {
+	return shadowstream.NewPacketConn(c, ciph)
+}
+
+// dummy cipher does not encrypt
+
+type dummy struct{}
+
+func (dummy) StreamConn(c net.Conn) net.Conn             { return c }
+func (dummy) PacketConn(c net.PacketConn) net.PacketConn { return c }
+
+// key-derivation function from original Shadowsocks
+func kdf(password string, keyLen int) []byte {
+	var b, prev []byte
+	h := md5.New()
+	for len(b) < keyLen {
+		h.Write(prev)
+		h.Write([]byte(password))
+		b = h.Sum(b)
+		prev = b[len(b)-h.Size():]
+		h.Reset()
+	}
+	return b[:keyLen]
+}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go
new file mode 100644
index 0000000..4001c10
--- /dev/null
+++ b/vendor/github.com/shadowsocks/go-shadowsocks2/core/doc.go
@@ -0,0 +1,2 @@
+// Package core implements essential parts of Shadowsocks
+package core
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go
new file mode 100644
index 0000000..641aa13
--- /dev/null
+++ b/vendor/github.com/shadowsocks/go-shadowsocks2/core/packet.go
@@ -0,0 +1,8 @@
+package core
+
+import "net"
+
+func ListenPacket(network, address string, ciph PacketConnCipher) (net.PacketConn, error) {
+	c, err := net.ListenPacket(network, address)
+	return ciph.PacketConn(c), err
+}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go
new file mode 100644
index 0000000..5c773cd
--- /dev/null
+++ b/vendor/github.com/shadowsocks/go-shadowsocks2/core/stream.go
@@ -0,0 +1,23 @@
+package core
+
+import "net"
+
+type listener struct {
+	net.Listener
+	StreamConnCipher
+}
+
+func Listen(network, address string, ciph StreamConnCipher) (net.Listener, error) {
+	l, err := net.Listen(network, address)
+	return &listener{l, ciph}, err
+}
+
+func (l *listener) Accept() (net.Conn, error) {
+	c, err := l.Listener.Accept()
+	return l.StreamConn(c), err
+}
+
+func Dial(network, address string, ciph StreamConnCipher) (net.Conn, error) {
+	c, err := net.Dial(network, address)
+	return ciph.StreamConn(c), err
+}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go
new file mode 100644
index 0000000..19410df
--- /dev/null
+++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/cipher.go
@@ -0,0 +1,83 @@
+package shadowaead
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/sha1"
+	"io"
+	"strconv"
+
+	"golang.org/x/crypto/chacha20poly1305"
+	"golang.org/x/crypto/hkdf"
+)
+
+type Cipher interface {
+	KeySize() int
+	SaltSize() int
+	Encrypter(salt []byte) (cipher.AEAD, error)
+	Decrypter(salt []byte) (cipher.AEAD, error)
+}
+
+type KeySizeError int
+
+func (e KeySizeError) Error() string {
+	return "key size error: need " + strconv.Itoa(int(e)) + " bytes"
+}
+
+func hkdfSHA1(secret, salt, info, outkey []byte) {
+	r := hkdf.New(sha1.New, secret, salt, info)
+	if _, err := io.ReadFull(r, outkey); err != nil {
+		panic(err) // should never happen
+	}
+}
+
+type metaCipher struct {
+	psk      []byte
+	makeAEAD func(key []byte) (cipher.AEAD, error)
+}
+
+func (a *metaCipher) KeySize() int { return len(a.psk) }
+func (a *metaCipher) SaltSize() int {
+	if ks := a.KeySize(); ks > 16 {
+		return ks
+	}
+	return 16
+}
+func (a *metaCipher) Encrypter(salt []byte) (cipher.AEAD, error) {
+	subkey := make([]byte, a.KeySize())
+	hkdfSHA1(a.psk, salt, []byte("ss-subkey"), subkey)
+	return a.makeAEAD(subkey)
+}
+func (a *metaCipher) Decrypter(salt []byte) (cipher.AEAD, error) {
+	subkey := make([]byte, a.KeySize())
+	hkdfSHA1(a.psk, salt, []byte("ss-subkey"), subkey)
+	return a.makeAEAD(subkey)
+}
+
+func aesGCM(key []byte) (cipher.AEAD, error) {
+	blk, err := aes.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return cipher.NewGCM(blk)
+}
+
+// AESGCM creates a new Cipher with a pre-shared key. len(psk) must be
+// one of 16, 24, or 32 to select AES-128/192/256-GCM.
+func AESGCM(psk []byte) (Cipher, error) {
+	switch l := len(psk); l {
+	case 16, 24, 32: // AES 128/192/256
+	default:
+		return nil, aes.KeySizeError(l)
+	}
+	return &metaCipher{psk: psk, makeAEAD: aesGCM}, nil
+}
+
+// Chacha20Poly1305 creates a new Cipher with a pre-shared key. len(psk)
+// must be 32.
+func Chacha20Poly1305(psk []byte) (Cipher, error) {
+	if len(psk) != chacha20poly1305.KeySize {
+		return nil, KeySizeError(chacha20poly1305.KeySize)
+	}
+	return &metaCipher{psk: psk, makeAEAD: chacha20poly1305.New}, nil
+}
diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go
new file mode 100644
index 0000000..8d1e286
--- /dev/null
+++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/doc.go
@@ -0,0 +1,35 @@
+/*
+Package shadowaead implements a simple AEAD-protected secure protocol.
+
+In general, there are two types of connections: stream-oriented and packet-oriented.
+Stream-oriented connections (e.g. TCP) assume reliable and orderly delivery of bytes.
+Packet-oriented connections (e.g. UDP) assume unreliable and out-of-order delivery of packets,
+where each packet is either delivered intact or lost.
+
+An encrypted stream starts with a random salt to derive a session key, followed by any number of
+encrypted records. Each encrypted record has the following structure:
+
+	[encrypted payload length]
+	[payload length tag]
+	[encrypted payload]
+	[payload tag]
+
+Payload length is a 2-byte unsigned big-endian integer capped at 0x3FFF (16383).
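+For example, with AEAD_AES_128_GCM (16-byte tags), a 1000-byte payload costs
+2 + 16 + 1000 + 16 = 1034 bytes on the wire: the encrypted length, the length
+tag, the encrypted payload, and the payload tag.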
+The higher 2 bits are reserved and must be set to zero. The first AEAD encrypt/decrypt +operation uses a counting nonce starting from 0. After each encrypt/decrypt operation, +the nonce is incremented by one as if it were an unsigned little-endian integer. + + +Each encrypted packet transmitted on a packet-oriented connection has the following structure: + + [random salt] + [encrypted payload] + [payload tag] + +The salt is used to derive a subkey to initiate an AEAD. Packets are encrypted/decrypted independently +using zero nonce. + +In both stream-oriented and packet-oriented connections, length of nonce and tag varies +depending on which AEAD is used. Salt should be at least 16-byte long. +*/ +package shadowaead diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go new file mode 100644 index 0000000..d8f20a4 --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/packet.go @@ -0,0 +1,93 @@ +package shadowaead + +import ( + "crypto/rand" + "errors" + "io" + "net" + "sync" +) + +// ErrShortPacket means that the packet is too short for a valid encrypted packet. +var ErrShortPacket = errors.New("short packet") + +var _zerononce [128]byte // read-only. 128 bytes is more than enough. + +// Pack encrypts plaintext using Cipher with a randomly generated salt and +// returns a slice of dst containing the encrypted packet and any error occurred. +// Ensure len(dst) >= ciph.SaltSize() + len(plaintext) + aead.Overhead(). +func Pack(dst, plaintext []byte, ciph Cipher) ([]byte, error) { + saltSize := ciph.SaltSize() + salt := dst[:saltSize] + if _, err := io.ReadFull(rand.Reader, salt); err != nil { + return nil, err + } + + aead, err := ciph.Encrypter(salt) + if err != nil { + return nil, err + } + + if len(dst) < saltSize+len(plaintext)+aead.Overhead() { + return nil, io.ErrShortBuffer + } + b := aead.Seal(dst[saltSize:saltSize], _zerononce[:aead.NonceSize()], plaintext, nil) + return dst[:saltSize+len(b)], nil +} + +// Unpack decrypts pkt using Cipher and returns a slice of dst containing the decrypted payload and any error occurred. +// Ensure len(dst) >= len(pkt) - aead.SaltSize() - aead.Overhead(). +func Unpack(dst, pkt []byte, ciph Cipher) ([]byte, error) { + saltSize := ciph.SaltSize() + if len(pkt) < saltSize { + return nil, ErrShortPacket + } + salt := pkt[:saltSize] + aead, err := ciph.Decrypter(salt) + if err != nil { + return nil, err + } + if len(pkt) < saltSize+aead.Overhead() { + return nil, ErrShortPacket + } + if saltSize+len(dst)+aead.Overhead() < len(pkt) { + return nil, io.ErrShortBuffer + } + b, err := aead.Open(dst[:0], _zerononce[:aead.NonceSize()], pkt[saltSize:], nil) + return b, err +} + +type packetConn struct { + net.PacketConn + Cipher + sync.Mutex + buf []byte // write lock +} + +// NewPacketConn wraps a net.PacketConn with cipher +func NewPacketConn(c net.PacketConn, ciph Cipher) net.PacketConn { + const maxPacketSize = 64 * 1024 + return &packetConn{PacketConn: c, Cipher: ciph, buf: make([]byte, maxPacketSize)} +} + +// WriteTo encrypts b and write to addr using the embedded PacketConn. +func (c *packetConn) WriteTo(b []byte, addr net.Addr) (int, error) { + c.Lock() + defer c.Unlock() + buf, err := Pack(c.buf, b, c) + if err != nil { + return 0, err + } + _, err = c.PacketConn.WriteTo(buf, addr) + return len(b), err +} + +// ReadFrom reads from the embedded PacketConn and decrypts into b. 
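+// Decryption happens in place, so b must be large enough to hold the whole
+// encrypted datagram. A receive-loop sketch (pc is a net.PacketConn wrapped
+// by NewPacketConn; handlePacket is a placeholder):
+//
+//	buf := make([]byte, 64*1024)
+//	for {
+//		n, addr, err := pc.ReadFrom(buf)
+//		if err != nil {
+//			break
+//		}
+//		handlePacket(buf[:n], addr)
+//	}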
+func (c *packetConn) ReadFrom(b []byte) (int, net.Addr, error) { + n, addr, err := c.PacketConn.ReadFrom(b) + if err != nil { + return n, addr, err + } + b, err = Unpack(b, b[:n], c) + return len(b), addr, err +} diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go new file mode 100644 index 0000000..5f499a2 --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowaead/stream.go @@ -0,0 +1,270 @@ +package shadowaead + +import ( + "bytes" + "crypto/cipher" + "crypto/rand" + "io" + "net" +) + +// payloadSizeMask is the maximum size of payload in bytes. +const payloadSizeMask = 0x3FFF // 16*1024 - 1 + +type writer struct { + io.Writer + cipher.AEAD + nonce []byte + buf []byte +} + +// NewWriter wraps an io.Writer with AEAD encryption. +func NewWriter(w io.Writer, aead cipher.AEAD) io.Writer { return newWriter(w, aead) } + +func newWriter(w io.Writer, aead cipher.AEAD) *writer { + return &writer{ + Writer: w, + AEAD: aead, + buf: make([]byte, 2+aead.Overhead()+payloadSizeMask+aead.Overhead()), + nonce: make([]byte, aead.NonceSize()), + } +} + +// Write encrypts b and writes to the embedded io.Writer. +func (w *writer) Write(b []byte) (int, error) { + n, err := w.ReadFrom(bytes.NewBuffer(b)) + return int(n), err +} + +// ReadFrom reads from the given io.Reader until EOF or error, encrypts and +// writes to the embedded io.Writer. Returns number of bytes read from r and +// any error encountered. +func (w *writer) ReadFrom(r io.Reader) (n int64, err error) { + for { + buf := w.buf + payloadBuf := buf[2+w.Overhead() : 2+w.Overhead()+payloadSizeMask] + nr, er := r.Read(payloadBuf) + + if nr > 0 { + n += int64(nr) + buf = buf[:2+w.Overhead()+nr+w.Overhead()] + payloadBuf = payloadBuf[:nr] + buf[0], buf[1] = byte(nr>>8), byte(nr) // big-endian payload size + w.Seal(buf[:0], w.nonce, buf[:2], nil) + increment(w.nonce) + + w.Seal(payloadBuf[:0], w.nonce, payloadBuf, nil) + increment(w.nonce) + + _, ew := w.Writer.Write(buf) + if ew != nil { + err = ew + break + } + } + + if er != nil { + if er != io.EOF { // ignore EOF as per io.ReaderFrom contract + err = er + } + break + } + } + + return n, err +} + +type reader struct { + io.Reader + cipher.AEAD + nonce []byte + buf []byte + leftover []byte +} + +// NewReader wraps an io.Reader with AEAD decryption. +func NewReader(r io.Reader, aead cipher.AEAD) io.Reader { return newReader(r, aead) } + +func newReader(r io.Reader, aead cipher.AEAD) *reader { + return &reader{ + Reader: r, + AEAD: aead, + buf: make([]byte, payloadSizeMask+aead.Overhead()), + nonce: make([]byte, aead.NonceSize()), + } +} + +// read and decrypt a record into the internal buffer. Return decrypted payload length and any error encountered. +func (r *reader) read() (int, error) { + // decrypt payload size + buf := r.buf[:2+r.Overhead()] + _, err := io.ReadFull(r.Reader, buf) + if err != nil { + return 0, err + } + + _, err = r.Open(buf[:0], r.nonce, buf, nil) + increment(r.nonce) + if err != nil { + return 0, err + } + + size := (int(buf[0])<<8 + int(buf[1])) & payloadSizeMask + + // decrypt payload + buf = r.buf[:size+r.Overhead()] + _, err = io.ReadFull(r.Reader, buf) + if err != nil { + return 0, err + } + + _, err = r.Open(buf[:0], r.nonce, buf, nil) + increment(r.nonce) + if err != nil { + return 0, err + } + + return size, nil +} + +// Read reads from the embedded io.Reader, decrypts and writes to b. 
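+// If b is shorter than the decrypted record, the remainder is kept as
+// leftover and returned by subsequent calls to Read.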
+func (r *reader) Read(b []byte) (int, error) { + // copy decrypted bytes (if any) from previous record first + if len(r.leftover) > 0 { + n := copy(b, r.leftover) + r.leftover = r.leftover[n:] + return n, nil + } + + n, err := r.read() + m := copy(b, r.buf[:n]) + if m < n { // insufficient len(b), keep leftover for next read + r.leftover = r.buf[m:n] + } + return m, err +} + +// WriteTo reads from the embedded io.Reader, decrypts and writes to w until +// there's no more data to write or when an error occurs. Return number of +// bytes written to w and any error encountered. +func (r *reader) WriteTo(w io.Writer) (n int64, err error) { + // write decrypted bytes left over from previous record + for len(r.leftover) > 0 { + nw, ew := w.Write(r.leftover) + r.leftover = r.leftover[nw:] + n += int64(nw) + if ew != nil { + return n, ew + } + } + + for { + nr, er := r.read() + if nr > 0 { + nw, ew := w.Write(r.buf[:nr]) + n += int64(nw) + + if ew != nil { + err = ew + break + } + } + + if er != nil { + if er != io.EOF { // ignore EOF as per io.Copy contract (using src.WriteTo shortcut) + err = er + } + break + } + } + + return n, err +} + +// increment little-endian encoded unsigned integer b. Wrap around on overflow. +func increment(b []byte) { + for i := range b { + b[i]++ + if b[i] != 0 { + return + } + } +} + +type streamConn struct { + net.Conn + Cipher + r *reader + w *writer +} + +func (c *streamConn) initReader() error { + salt := make([]byte, c.SaltSize()) + if _, err := io.ReadFull(c.Conn, salt); err != nil { + return err + } + + aead, err := c.Decrypter(salt) + if err != nil { + return err + } + + c.r = newReader(c.Conn, aead) + return nil +} + +func (c *streamConn) Read(b []byte) (int, error) { + if c.r == nil { + if err := c.initReader(); err != nil { + return 0, err + } + } + return c.r.Read(b) +} + +func (c *streamConn) WriteTo(w io.Writer) (int64, error) { + if c.r == nil { + if err := c.initReader(); err != nil { + return 0, err + } + } + return c.r.WriteTo(w) +} + +func (c *streamConn) initWriter() error { + salt := make([]byte, c.SaltSize()) + if _, err := io.ReadFull(rand.Reader, salt); err != nil { + return err + } + aead, err := c.Encrypter(salt) + if err != nil { + return err + } + _, err = c.Conn.Write(salt) + if err != nil { + return err + } + c.w = newWriter(c.Conn, aead) + return nil +} + +func (c *streamConn) Write(b []byte) (int, error) { + if c.w == nil { + if err := c.initWriter(); err != nil { + return 0, err + } + } + return c.w.Write(b) +} + +func (c *streamConn) ReadFrom(r io.Reader) (int64, error) { + if c.w == nil { + if err := c.initWriter(); err != nil { + return 0, err + } + } + return c.w.ReadFrom(r) +} + +// NewConn wraps a stream-oriented net.Conn with cipher. +func NewConn(c net.Conn, ciph Cipher) net.Conn { return &streamConn{Conn: c, Cipher: ciph} } diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go new file mode 100644 index 0000000..dea233e --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/cipher.go @@ -0,0 +1,91 @@ +package shadowstream + +import ( + "crypto/aes" + "crypto/cipher" + "strconv" + + "github.com/Yawning/chacha20" +) + +// Cipher generates a pair of stream ciphers for encryption and decryption. 
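+// A sketch of direct use (key must be 16, 24 or 32 bytes for AES; a real
+// connection sends the random IV before the ciphertext, as NewConn does):
+//
+//	ciph, err := AESCTR(key)
+//	if err != nil {
+//		return err
+//	}
+//	iv := make([]byte, ciph.IVSize()) // fill with crypto/rand in real use
+//	ciph.Encrypter(iv).XORKeyStream(dst, src)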
+type Cipher interface { + IVSize() int + Encrypter(iv []byte) cipher.Stream + Decrypter(iv []byte) cipher.Stream +} + +type KeySizeError int + +func (e KeySizeError) Error() string { + return "key size error: need " + strconv.Itoa(int(e)) + " bytes" +} + +// CTR mode +type ctrStream struct{ cipher.Block } + +func (b *ctrStream) IVSize() int { return b.BlockSize() } +func (b *ctrStream) Decrypter(iv []byte) cipher.Stream { return b.Encrypter(iv) } +func (b *ctrStream) Encrypter(iv []byte) cipher.Stream { return cipher.NewCTR(b, iv) } + +func AESCTR(key []byte) (Cipher, error) { + blk, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + return &ctrStream{blk}, nil +} + +// CFB mode +type cfbStream struct{ cipher.Block } + +func (b *cfbStream) IVSize() int { return b.BlockSize() } +func (b *cfbStream) Decrypter(iv []byte) cipher.Stream { return cipher.NewCFBDecrypter(b, iv) } +func (b *cfbStream) Encrypter(iv []byte) cipher.Stream { return cipher.NewCFBEncrypter(b, iv) } + +func AESCFB(key []byte) (Cipher, error) { + blk, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + return &cfbStream{blk}, nil +} + +// IETF-variant of chacha20 +type chacha20ietfkey []byte + +func (k chacha20ietfkey) IVSize() int { return chacha20.INonceSize } +func (k chacha20ietfkey) Decrypter(iv []byte) cipher.Stream { return k.Encrypter(iv) } +func (k chacha20ietfkey) Encrypter(iv []byte) cipher.Stream { + ciph, err := chacha20.NewCipher(k, iv) + if err != nil { + panic(err) // should never happen + } + return ciph +} + +func Chacha20IETF(key []byte) (Cipher, error) { + if len(key) != chacha20.KeySize { + return nil, KeySizeError(chacha20.KeySize) + } + return chacha20ietfkey(key), nil +} + +type xchacha20key []byte + +func (k xchacha20key) IVSize() int { return chacha20.XNonceSize } +func (k xchacha20key) Decrypter(iv []byte) cipher.Stream { return k.Encrypter(iv) } +func (k xchacha20key) Encrypter(iv []byte) cipher.Stream { + ciph, err := chacha20.NewCipher(k, iv) + if err != nil { + panic(err) // should never happen + } + return ciph +} + +func Xchacha20(key []byte) (Cipher, error) { + if len(key) != chacha20.KeySize { + return nil, KeySizeError(chacha20.KeySize) + } + return xchacha20key(key), nil +} diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go new file mode 100644 index 0000000..4c0897a --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/doc.go @@ -0,0 +1,2 @@ +// Package shadowstream implements the original Shadowsocks protocol protected by stream cipher. +package shadowstream diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go new file mode 100644 index 0000000..8bbb27b --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/packet.go @@ -0,0 +1,76 @@ +package shadowstream + +import ( + "crypto/rand" + "errors" + "io" + "net" + "sync" +) + +// ErrShortPacket means the packet is too short to be a valid encrypted packet. +var ErrShortPacket = errors.New("short packet") + +// Pack encrypts plaintext using stream cipher s and a random IV. +// Returns a slice of dst containing random IV and ciphertext. +// Ensure len(dst) >= s.IVSize() + len(plaintext). 
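+// For example, an AES-based cipher has a 16-byte IV, so packing a 500-byte
+// payload needs len(dst) >= 516 and returns dst[:516].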
+func Pack(dst, plaintext []byte, s Cipher) ([]byte, error) { + if len(dst) < s.IVSize()+len(plaintext) { + return nil, io.ErrShortBuffer + } + iv := dst[:s.IVSize()] + _, err := io.ReadFull(rand.Reader, iv) + if err != nil { + return nil, err + } + + s.Encrypter(iv).XORKeyStream(dst[len(iv):], plaintext) + return dst[:len(iv)+len(plaintext)], nil +} + +// Unpack decrypts pkt using stream cipher s. +// Returns a slice of dst containing decrypted plaintext. +func Unpack(dst, pkt []byte, s Cipher) ([]byte, error) { + if len(pkt) < s.IVSize() { + return nil, ErrShortPacket + } + + if len(dst) < len(pkt)-s.IVSize() { + return nil, io.ErrShortBuffer + } + iv := pkt[:s.IVSize()] + s.Decrypter(iv).XORKeyStream(dst, pkt[len(iv):]) + return dst[:len(pkt)-len(iv)], nil +} + +type packetConn struct { + net.PacketConn + Cipher + buf []byte + sync.Mutex // write lock +} + +// NewPacketConn wraps a net.PacketConn with stream cipher encryption/decryption. +func NewPacketConn(c net.PacketConn, ciph Cipher) net.PacketConn { + return &packetConn{PacketConn: c, Cipher: ciph, buf: make([]byte, 64*1024)} +} + +func (c *packetConn) WriteTo(b []byte, addr net.Addr) (int, error) { + c.Lock() + defer c.Unlock() + buf, err := Pack(c.buf, b, c.Cipher) + if err != nil { + return 0, err + } + _, err = c.PacketConn.WriteTo(buf, addr) + return len(b), err +} + +func (c *packetConn) ReadFrom(b []byte) (int, net.Addr, error) { + n, addr, err := c.PacketConn.ReadFrom(b) + if err != nil { + return n, addr, err + } + b, err = Unpack(b, b[:n], c.Cipher) + return len(b), addr, err +} diff --git a/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go new file mode 100644 index 0000000..eb4d967 --- /dev/null +++ b/vendor/github.com/shadowsocks/go-shadowsocks2/shadowstream/stream.go @@ -0,0 +1,171 @@ +package shadowstream + +import ( + "bytes" + "crypto/cipher" + "crypto/rand" + "io" + "net" +) + +const bufSize = 32 * 1024 + +type writer struct { + io.Writer + cipher.Stream + buf []byte +} + +// NewWriter wraps an io.Writer with stream cipher encryption. +func NewWriter(w io.Writer, s cipher.Stream) io.Writer { + return &writer{Writer: w, Stream: s, buf: make([]byte, bufSize)} +} + +func (w *writer) ReadFrom(r io.Reader) (n int64, err error) { + for { + buf := w.buf + nr, er := r.Read(buf) + if nr > 0 { + n += int64(nr) + buf = buf[:nr] + w.XORKeyStream(buf, buf) + _, ew := w.Writer.Write(buf) + if ew != nil { + err = ew + return + } + } + + if er != nil { + if er != io.EOF { // ignore EOF as per io.ReaderFrom contract + err = er + } + return + } + } +} + +func (w *writer) Write(b []byte) (int, error) { + n, err := w.ReadFrom(bytes.NewBuffer(b)) + return int(n), err +} + +type reader struct { + io.Reader + cipher.Stream + buf []byte +} + +// NewReader wraps an io.Reader with stream cipher decryption. 
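+// A pairing sketch (both ends must derive their streams from the same key
+// and IV; NewConn below exchanges the IV automatically):
+//
+//	r := NewReader(conn, ciph.Decrypter(iv))
+//	w := NewWriter(conn, ciph.Encrypter(iv))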
+func NewReader(r io.Reader, s cipher.Stream) io.Reader { + return &reader{Reader: r, Stream: s, buf: make([]byte, bufSize)} +} + +func (r *reader) Read(b []byte) (int, error) { + + n, err := r.Reader.Read(b) + if err != nil { + return 0, err + } + b = b[:n] + r.XORKeyStream(b, b) + return n, nil +} + +func (r *reader) WriteTo(w io.Writer) (n int64, err error) { + for { + buf := r.buf + nr, er := r.Read(buf) + if nr > 0 { + nw, ew := w.Write(buf[:nr]) + n += int64(nw) + + if ew != nil { + err = ew + return + } + } + + if er != nil { + if er != io.EOF { // ignore EOF as per io.Copy contract (using src.WriteTo shortcut) + err = er + } + return + } + } +} + +type conn struct { + net.Conn + Cipher + r *reader + w *writer +} + +// NewConn wraps a stream-oriented net.Conn with stream cipher encryption/decryption. +func NewConn(c net.Conn, ciph Cipher) net.Conn { + return &conn{Conn: c, Cipher: ciph} +} + +func (c *conn) initReader() error { + if c.r == nil { + buf := make([]byte, bufSize) + iv := buf[:c.IVSize()] + if _, err := io.ReadFull(c.Conn, iv); err != nil { + return err + } + c.r = &reader{Reader: c.Conn, Stream: c.Decrypter(iv), buf: buf} + } + return nil +} + +func (c *conn) Read(b []byte) (int, error) { + if c.r == nil { + if err := c.initReader(); err != nil { + return 0, err + } + } + return c.r.Read(b) +} + +func (c *conn) WriteTo(w io.Writer) (int64, error) { + if c.r == nil { + if err := c.initReader(); err != nil { + return 0, err + } + } + return c.r.WriteTo(w) +} + +func (c *conn) initWriter() error { + if c.w == nil { + buf := make([]byte, bufSize) + iv := buf[:c.IVSize()] + if _, err := io.ReadFull(rand.Reader, iv); err != nil { + return err + } + if _, err := c.Conn.Write(iv); err != nil { + return err + } + c.w = &writer{Writer: c.Conn, Stream: c.Encrypter(iv), buf: buf} + } + return nil +} + +func (c *conn) Write(b []byte) (int, error) { + if c.w == nil { + if err := c.initWriter(); err != nil { + return 0, err + } + } + return c.w.Write(b) +} + +func (c *conn) ReadFrom(r io.Reader) (int64, error) { + if c.w == nil { + if err := c.initWriter(); err != nil { + return 0, err + } + } + return c.w.ReadFrom(r) +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/LICENSE b/vendor/github.com/sun8911879/shadowsocksR/LICENSE new file mode 100644 index 0000000..bed05e0 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 YanXin Sun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/vendor/github.com/sun8911879/shadowsocksR/README.md b/vendor/github.com/sun8911879/shadowsocksR/README.md
new file mode 100644
index 0000000..93938b4
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/README.md
@@ -0,0 +1,84 @@
+# shadowsocksR
+
+[shadowsocksR](https://github.com/sun8911879/shadowsocksR) is a ShadowsocksR client library for Go.
+
+* Based on [avege](https://github.com/avege/avege) and [shadowsocksR for Python](https://github.com/shadowsocksr-backup/shadowsocksr).
+* Fixes avege's SSR communication bugs and trims the code down to a streamlined version that is ready for normal use.
+
+#### Usage
+
+```go
+bi := &BackendInfo{
+	Address: "www.domain.com:445",
+	Type:    "ssr",
+	SSInfo: SSInfo{
+		EncryptMethod:   "aes-128-cfb",
+		EncryptPassword: "password",
+		SSRInfo: SSRInfo{
+			Protocol:      "auth_aes128_sha1",
+			ProtocolParam: "",
+			Obfs:          "tls1.2_ticket_auth",
+			ObfsParam:     "",
+		},
+	},
+}
+dst, err := bi.DialSSRConn(rawaddr)
+bi.Pipe(src, dst)
+bi.Pipe(dst, src)
+```
+
+See `example/client.go` for detailed usage.
+
+#### SS encryption methods
+
+* aes-128-cfb
+* aes-192-cfb
+* aes-256-cfb
+* aes-128-ctr
+* aes-192-ctr
+* aes-256-ctr
+* aes-128-ofb
+* aes-192-ofb
+* aes-256-ofb
+* des-cfb
+* bf-cfb
+* cast5-cfb
+* rc4-md5
+* chacha20
+* chacha20-ietf
+* salsa20
+* camellia-128-cfb
+* camellia-192-cfb
+* camellia-256-cfb
+* idea-cfb
+* rc2-cfb
+* seed-cfb
+
+#### SSR Obfs
+
+* plain
+* http_simple
+* http_post
+* random_head
+* tls1.2_ticket_auth
+
+#### SSR Protocol
+
+* origin
+* verify_sha1, aka one-time auth (OTA)
+* auth_sha1_v4
+* auth_aes128_md5
+* auth_aes128_sha1
+
+## Todo (help wanted)
+
+* Optimize performance
+
+### Thanks to the avege project
+* [avege](https://github.com/avege/avege)
+
+### References
+* [avege](https://github.com/avege/avege)
+* [shadowsocks-go](https://github.com/shadowsocks/shadowsocks-go)
+* [go-shadowsocks2](https://github.com/shadowsocks/go-shadowsocks2)
+* [ShadowsocksR](https://github.com/shadowsocksr-backup/shadowsocksr)
\ No newline at end of file
diff --git a/vendor/github.com/sun8911879/shadowsocksR/client.go b/vendor/github.com/sun8911879/shadowsocksR/client.go
new file mode 100644
index 0000000..2c3d7ad
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/client.go
@@ -0,0 +1,61 @@
+package shadowsocksr
+
+import (
+	"errors"
+	"net"
+	"net/url"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/sun8911879/shadowsocksR/obfs"
+	"github.com/sun8911879/shadowsocksR/protocol"
+	"github.com/sun8911879/shadowsocksR/ssr"
+)
+
+func NewSSRClient(u *url.URL) (*SSTCPConn, error) {
+	query := u.Query()
+	encryptMethod := query.Get("encrypt-method")
+	encryptKey := query.Get("encrypt-key")
+	cipher, err := NewStreamCipher(encryptMethod, encryptKey)
+	if err != nil {
+		return nil, err
+	}
+
+	dialer := net.Dialer{
+		Timeout:   time.Millisecond * 500,
+		DualStack: true,
+	}
+	conn, err := dialer.Dial("tcp", u.Host)
+	if err != nil {
+		return nil, err
+	}
+
+	ssconn := NewSSTCPConn(conn, cipher)
+	if ssconn.Conn == nil || ssconn.RemoteAddr() == nil {
+		return nil, errors.New("nil connection")
+	}
+
+	// should initialize obfs/protocol now
+	// NOTE: splitting on ":" assumes an IPv4 or hostname address; a bare
+	// IPv6 host:port would need net.SplitHostPort instead.
+	rs := strings.Split(ssconn.RemoteAddr().String(), ":")
+	port, _ := strconv.Atoi(rs[1])
+
+	ssconn.IObfs = obfs.NewObfs(query.Get("obfs"))
+	obfsServerInfo := &ssr.ServerInfoForObfs{
+		Host:   rs[0],
+		Port:   uint16(port),
+		TcpMss: 1460,
+		Param:  query.Get("obfs-param"),
+	}
+	ssconn.IObfs.SetServerInfo(obfsServerInfo)
+	ssconn.IProtocol = protocol.NewProtocol(query.Get("protocol"))
+	protocolServerInfo := &ssr.ServerInfoForObfs{
+		Host:   rs[0],
+		Port:   uint16(port),
+		TcpMss: 1460,
+		Param:  query.Get("protocol-param"),
+	}
+	ssconn.IProtocol.SetServerInfo(protocolServerInfo)
+
+	return ssconn, nil
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/encrypt.go b/vendor/github.com/sun8911879/shadowsocksR/encrypt.go
new file mode 100644
index 0000000..3f5e6b9
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/encrypt.go
@@ -0,0 +1,296 @@
+package shadowsocksr
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/des"
+	"crypto/md5"
+	"crypto/rand"
+	"crypto/rc4"
+	"encoding/binary"
+	"errors"
+
+	"github.com/sun8911879/shadowsocksR/tools"
+	"github.com/sun8911879/shadowsocksR/tools/leakybuf"
+
+	"github.com/Yawning/chacha20"
+	"github.com/dgryski/go-camellia"
+	"github.com/dgryski/go-idea"
+	"github.com/dgryski/go-rc2"
+	"golang.org/x/crypto/blowfish"
+	"golang.org/x/crypto/cast5"
+	"golang.org/x/crypto/salsa20/salsa"
+)
+
+var errEmptyPassword = errors.New("empty key")
+
+type DecOrEnc int
+
+const (
+	Decrypt DecOrEnc = iota
+	Encrypt
+)
+
+func newCTRStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	if err != nil {
+		return nil, err
+	}
+	return cipher.NewCTR(block, iv), nil
+}
+
+func newAESCTRStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := aes.NewCipher(key)
+	return newCTRStream(block, err, key, iv, doe)
+}
+
+func newOFBStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	if err != nil {
+		return nil, err
+	}
+	return cipher.NewOFB(block, iv), nil
+}
+
+func newAESOFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := aes.NewCipher(key)
+	return newOFBStream(block, err, key, iv, doe)
+}
+
+func newCFBStream(block cipher.Block, err error, key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	if err != nil {
+		return nil, err
+	}
+	if doe == Encrypt {
+		return cipher.NewCFBEncrypter(block, iv), nil
+	}
+	return cipher.NewCFBDecrypter(block, iv), nil
+}
+
+func newAESCFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := aes.NewCipher(key)
+	return newCFBStream(block, err, key, iv, doe)
+}
+
+func newDESStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := des.NewCipher(key)
+	return newCFBStream(block, err, key, iv, doe)
+}
+
+func newBlowFishStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := blowfish.NewCipher(key)
+	return newCFBStream(block, err, key, iv, doe)
+}
+
+func newCast5Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+	block, err := cast5.NewCipher(key)
+	return newCFBStream(block, err, key, iv, doe)
+}
+
+func newRC4MD5Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+	h := md5.New()
+	h.Write(key)
+	h.Write(iv)
+	rc4key := h.Sum(nil)
+
+	return rc4.NewCipher(rc4key)
+}
+
+// Yawning/chacha20 selects the variant from the nonce length, so the same
+// constructor serves both the original and the IETF flavor.
+func newChaCha20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+	return chacha20.NewCipher(key, iv)
+}
+
+func newChacha20IETFStream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+	return chacha20.NewCipher(key, iv)
+}
+
+type salsaStreamCipher struct {
+	nonce   [8]byte
+	key     [32]byte
+	counter int
+}
+
+func (c *salsaStreamCipher) XORKeyStream(dst, src []byte) {
+	var buf []byte
+	padLen := c.counter % 64
+	dataSize := len(src) + padLen
+	if cap(dst) >= dataSize {
+		buf = dst[:dataSize]
+	} else if leakybuf.GlobalLeakyBufSize >= dataSize {
+		buf = leakybuf.GlobalLeakyBuf.Get()
+		defer leakybuf.GlobalLeakyBuf.Put(buf)
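+		// trim the pooled buffer down to the padded size needed for this call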
+ buf = buf[:dataSize] + } else { + buf = make([]byte, dataSize) + } + + var subNonce [16]byte + copy(subNonce[:], c.nonce[:]) + binary.LittleEndian.PutUint64(subNonce[len(c.nonce):], uint64(c.counter/64)) + + // It's difficult to avoid data copy here. src or dst maybe slice from + // Conn.Read/Write, which can't have padding. + copy(buf[padLen:], src[:]) + salsa.XORKeyStream(buf, buf, &subNonce, &c.key) + copy(dst, buf[padLen:]) + + c.counter += len(src) +} + +func newSalsa20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) { + var c salsaStreamCipher + copy(c.nonce[:], iv[:8]) + copy(c.key[:], key[:32]) + return &c, nil +} + +func newCamelliaStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := camellia.New(key) + return newCFBStream(block, err, key, iv, doe) +} + +func newIdeaStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := idea.NewCipher(key) + return newCFBStream(block, err, key, iv, doe) +} + +func newRC2Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := rc2.New(key, 16) + return newCFBStream(block, err, key, iv, doe) +} + +func newSeedStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + // TODO: SEED block cipher implementation is required + block, err := rc2.New(key, 16) + return newCFBStream(block, err, key, iv, doe) +} + +type cipherInfo struct { + keyLen int + ivLen int + newStream func(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) +} + +var streamCipherMethod = map[string]*cipherInfo{ + "aes-128-cfb": {16, 16, newAESCFBStream}, + "aes-192-cfb": {24, 16, newAESCFBStream}, + "aes-256-cfb": {32, 16, newAESCFBStream}, + "aes-128-ctr": {16, 16, newAESCTRStream}, + "aes-192-ctr": {24, 16, newAESCTRStream}, + "aes-256-ctr": {32, 16, newAESCTRStream}, + "aes-128-ofb": {16, 16, newAESOFBStream}, + "aes-192-ofb": {24, 16, newAESOFBStream}, + "aes-256-ofb": {32, 16, newAESOFBStream}, + "des-cfb": {8, 8, newDESStream}, + "bf-cfb": {16, 8, newBlowFishStream}, + "cast5-cfb": {16, 8, newCast5Stream}, + "rc4-md5": {16, 16, newRC4MD5Stream}, + "rc4-md5-6": {16, 6, newRC4MD5Stream}, + "chacha20": {32, 8, newChaCha20Stream}, + "chacha20-ietf": {32, 12, newChacha20IETFStream}, + "salsa20": {32, 8, newSalsa20Stream}, + "camellia-128-cfb": {16, 16, newCamelliaStream}, + "camellia-192-cfb": {24, 16, newCamelliaStream}, + "camellia-256-cfb": {32, 16, newCamelliaStream}, + "idea-cfb": {16, 8, newIdeaStream}, + "rc2-cfb": {16, 8, newRC2Stream}, + "seed-cfb": {16, 8, newSeedStream}, +} + +func CheckCipherMethod(method string) error { + if method == "" { + method = "rc4-md5" + } + _, ok := streamCipherMethod[method] + if !ok { + return errors.New("Unsupported encryption method: " + method) + } + return nil +} + +type StreamCipher struct { + enc cipher.Stream + dec cipher.Stream + key []byte + info *cipherInfo + iv []byte +} + +// NewStreamCipher creates a cipher that can be used in Dial() etc. +// Use cipher.Copy() to create a new cipher with the same method and password +// to avoid the cost of repeated cipher initialization. 
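+//
+// A minimal sketch of typical client-side use (method, password, and tcpConn
+// are illustrative):
+//
+//	ciph, err := NewStreamCipher("aes-128-cfb", "your-password")
+//	if err != nil {
+//		// handle error
+//	}
+//	conn := NewSSTCPConn(tcpConn, ciph)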
+func NewStreamCipher(method, password string) (*StreamCipher, error) {
+	if password == "" {
+		return nil, errEmptyPassword
+	}
+	if method == "" {
+		method = "rc4-md5"
+	}
+	mi, ok := streamCipherMethod[method]
+	if !ok {
+		return nil, errors.New("Unsupported encryption method: " + method)
+	}
+
+	key := tools.EVPBytesToKey(password, mi.keyLen)
+
+	return &StreamCipher{key: key, info: mi}, nil
+}
+
+// initEncrypt initializes the encrypt stream, generating a random IV on the
+// first call, and returns the IV in use.
+func (c *StreamCipher) initEncrypt() (iv []byte, err error) {
+	if c.iv == nil {
+		iv = make([]byte, c.info.ivLen)
+		if _, err = rand.Read(iv); err != nil {
+			return nil, err
+		}
+		c.iv = iv
+	} else {
+		iv = c.iv
+	}
+	c.enc, err = c.info.newStream(c.key, iv, Encrypt)
+	return
+}
+
+func (c *StreamCipher) initDecrypt(iv []byte) (err error) {
+	c.dec, err = c.info.newStream(c.key, iv, Decrypt)
+	return
+}
+
+func (c *StreamCipher) encrypt(dst, src []byte) {
+	c.enc.XORKeyStream(dst, src)
+}
+
+func (c *StreamCipher) decrypt(dst, src []byte) {
+	c.dec.XORKeyStream(dst, src)
+}
+
+// Copy creates a new cipher at its initial state.
+func (c *StreamCipher) Copy() *StreamCipher {
+	// This optimization may not be necessary. But without this function, we
+	// need to maintain a table cache for newTableCipher and use a lock to
+	// protect concurrent access to that cache.
+
+	// AES and DES ciphers do not return specific types, so it's difficult
+	// to create a copy. But their initialization time is less than 4000ns on
+	// my 2.26 GHz Intel Core 2 Duo processor, so there is no need to worry.
+
+	// Currently, blowfish and cast5 initialization cost is an order of
+	// magnitude slower than other ciphers. (I'm not sure whether this is
+	// because the current implementation is not highly optimized, or this is
+	// the nature of the algorithm.)
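+
+	// Shallow-copy the cipher, then clear the per-connection stream state so
+	// the copy starts fresh with the shared key and method configuration.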
+	nc := *c
+	nc.enc = nil
+	nc.dec = nil
+	return &nc
+}
+
+func (c *StreamCipher) Key() (key []byte, keyLen int) {
+	return c.key, c.info.keyLen
+}
+
+func (c *StreamCipher) IV() ([]byte, int) {
+	return c.iv, c.info.ivLen
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go
new file mode 100644
index 0000000..a57732e
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/base.go
@@ -0,0 +1,35 @@
+package obfs
+
+import (
+	"strings"
+
+	"github.com/sun8911879/shadowsocksR/ssr"
+)
+
+type creator func() IObfs
+
+var (
+	creatorMap = make(map[string]creator)
+)
+
+type IObfs interface {
+	SetServerInfo(s *ssr.ServerInfoForObfs)
+	GetServerInfo() (s *ssr.ServerInfoForObfs)
+	Encode(data []byte) ([]byte, error)
+	Decode(data []byte) ([]byte, uint64, error)
+	SetData(data interface{})
+	GetData() interface{}
+}
+
+func register(name string, c creator) {
+	creatorMap[name] = c
+}
+
+// NewObfs creates an obfs instance by name and returns it as an IObfs interface.
+func NewObfs(name string) IObfs {
+	c, ok := creatorMap[strings.ToLower(name)]
+	if ok {
+		return c()
+	}
+	return nil
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go
new file mode 100644
index 0000000..46513d7
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_post.go
@@ -0,0 +1,19 @@
+package obfs
+
+import (
+	"math/rand"
+)
+
+func init() {
+	register("http_post", newHttpPost)
+}
+
+// newHttpPost creates an http_post object, which reuses the http_simple
+// implementation with the POST verb.
+func newHttpPost() IObfs {
+	t := &httpSimplePost{
+		userAgentIndex: rand.Intn(len(requestUserAgent)),
+		getOrPost:      false,
+	}
+	return t
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go
new file mode 100644
index 0000000..521a3e9
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/http_simple.go
@@ -0,0 +1,178 @@
+package obfs
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"math/rand"
+	"strings"
+
+	"github.com/sun8911879/shadowsocksR/ssr"
+)
+
+var (
+	requestPath = []string{
+		"", "",
+		"login.php?redir=", "",
+		"register.php?code=", "",
+		"s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&bar=&wd=", "&rn=",
+		"post.php?id=", "&goto=view.php",
+	}
+	requestUserAgent = []string{
+		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
+		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/44.0",
+		"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
+		"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
+		"Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)",
+		"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+		"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C)",
+		"Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
+		"Mozilla/5.0 (Linux; Android 4.4; Nexus 5 Build/BuildID) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
+		"Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3", + } +) + +// HttpSimple http_simple obfs encapsulate +type httpSimplePost struct { + ssr.ServerInfoForObfs + rawTransSent bool + rawTransReceived bool + userAgentIndex int + getOrPost bool // true for get, false for post +} + +func init() { + register("http_simple", newHttpSimple) +} + +// newHttpSimple create a http_simple object +func newHttpSimple() IObfs { + t := &httpSimplePost{ + rawTransSent: false, + rawTransReceived: false, + userAgentIndex: rand.Intn(len(requestUserAgent)), + getOrPost: true, + } + return t +} + +func (t *httpSimplePost) SetServerInfo(s *ssr.ServerInfoForObfs) { + t.ServerInfoForObfs = *s +} + +func (t *httpSimplePost) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &t.ServerInfoForObfs +} + +func (t *httpSimplePost) SetData(data interface{}) { + +} + +func (t *httpSimplePost) GetData() interface{} { + return nil +} + +func (t *httpSimplePost) boundary() (ret string) { + set := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + for i := 0; i < 32; i++ { + ret = fmt.Sprintf("%s%c", ret, set[rand.Intn(len(set))]) + } + return +} + +func (t *httpSimplePost) data2URLEncode(data []byte) (ret string) { + for i := 0; i < len(data); i++ { + ret = fmt.Sprintf("%s%%%s", ret, hex.EncodeToString([]byte{data[i]})) + } + return +} + +func (t *httpSimplePost) Encode(data []byte) (encodedData []byte, err error) { + if t.rawTransSent { + return data, nil + } + + dataLength := len(data) + var headData []byte + if headSize := t.IVLen + t.HeadLen; dataLength-headSize > 64 { + headData = make([]byte, headSize+rand.Intn(64)) + } else { + headData = make([]byte, dataLength) + } + copy(headData, data[0:len(headData)]) + requestPathIndex := rand.Intn(len(requestPath)/2) * 2 + host := t.Host + var customHead string + + if len(t.Param) > 0 { + customHeads := strings.Split(t.Param, "#") + if len(customHeads) > 2 { + customHeads = customHeads[0:2] + } + param := t.Param + if len(customHeads) > 1 { + customHead = customHeads[1] + param = customHeads[0] + } + hosts := strings.Split(param, ",") + if len(hosts) > 0 { + host = strings.TrimSpace(hosts[rand.Intn(len(hosts))]) + } + } + method := "GET /" + if !t.getOrPost { + method = "POST /" + } + httpBuf := fmt.Sprintf("%s%s%s%s HTTP/1.1\r\nHost: %s:%d\r\n", + method, + requestPath[requestPathIndex], + t.data2URLEncode(headData), + requestPath[requestPathIndex+1], + host, + t.Port) + if len(customHead) > 0 { + httpBuf = httpBuf + strings.Replace(customHead, "\\n", "\r\n", -1) + "\r\n\r\n" + } else { + var contentType string + if !t.getOrPost { + contentType = "Content-Type: multipart/form-data; boundary=" + t.boundary() + "\r\n" + } + httpBuf = httpBuf + + "User-Agent: " + requestUserAgent[t.userAgentIndex] + "\r\n" + + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" + + "Accept-Language: en-US,en;q=0.8\r\n" + + "Accept-Encoding: gzip, deflate\r\n" + + contentType + + "DNT: 1\r\n" + + "Connection: keep-alive\r\n" + + "\r\n" + } + + if len(headData) < dataLength { + encodedData = make([]byte, len(httpBuf)+(dataLength-len(headData))) + copy(encodedData, []byte(httpBuf)) + copy(encodedData[len(httpBuf):], data[len(headData):]) + } else { + encodedData = []byte(httpBuf) + } + t.rawTransSent = true + + return +} + +func (t *httpSimplePost) Decode(data []byte) ([]byte, uint64, error) { + if t.rawTransReceived { + return data, 0, nil + } + + pos := bytes.Index(data, 
[]byte("\r\n\r\n")) + if pos > 0 { + decodedData := make([]byte, len(data)-pos-4) + copy(decodedData, data[pos+4:]) + t.rawTransReceived = true + return decodedData, 0, nil + } + return nil, 0, nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go new file mode 100644 index 0000000..4ee2682 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/plain.go @@ -0,0 +1,42 @@ +package obfs + +import ( + "github.com/sun8911879/shadowsocksR/ssr" +) + +func init() { + register("plain", newPlainObfs) +} + +type plain struct { + ssr.ServerInfoForObfs +} + +func newPlainObfs() IObfs { + p := &plain{} + return p +} + +func (p *plain) SetServerInfo(s *ssr.ServerInfoForObfs) { + p.ServerInfoForObfs = *s +} + +func (p *plain) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &p.ServerInfoForObfs +} + +func (p *plain) Encode(data []byte) (encodedData []byte, err error) { + return data, nil +} + +func (p *plain) Decode(data []byte) ([]byte, uint64, error) { + return data, 0, nil +} + +func (p *plain) SetData(data interface{}) { + +} + +func (p *plain) GetData() interface{} { + return nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go new file mode 100644 index 0000000..6e0366f --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/random_head.go @@ -0,0 +1,79 @@ +package obfs + +import ( + "math/rand" + + "github.com/sun8911879/shadowsocksR/ssr" +) + +type randomHead struct { + ssr.ServerInfoForObfs + rawTransSent bool + rawTransReceived bool + hasSentHeader bool + dataBuffer []byte +} + +func init() { + register("random_head", newRandomHead) +} + +func newRandomHead() IObfs { + p := &randomHead{} + return p +} + +func (r *randomHead) SetServerInfo(s *ssr.ServerInfoForObfs) { + r.ServerInfoForObfs = *s +} + +func (r *randomHead) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &r.ServerInfoForObfs +} + +func (r *randomHead) SetData(data interface{}) { + +} + +func (r *randomHead) GetData() interface{} { + return nil +} + +func (r *randomHead) Encode(data []byte) (encodedData []byte, err error) { + if r.rawTransSent { + return data, nil + } + + dataLength := len(data) + if r.hasSentHeader { + if dataLength > 0 { + d := make([]byte, len(r.dataBuffer)+dataLength) + copy(d, r.dataBuffer) + copy(d[len(r.dataBuffer):], data) + r.dataBuffer = d + } else { + encodedData = r.dataBuffer + r.dataBuffer = nil + r.rawTransSent = true + } + } else { + size := rand.Intn(96) + 8 + encodedData = make([]byte, size) + rand.Read(encodedData) + ssr.SetCRC32(encodedData, size) + + d := make([]byte, dataLength) + copy(d, data) + r.dataBuffer = d + } + r.hasSentHeader = true + return +} + +func (r *randomHead) Decode(data []byte) ([]byte, uint64, error) { + if r.rawTransReceived { + return data, 0, nil + } + r.rawTransReceived = true + return data, 0, nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go b/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go new file mode 100644 index 0000000..847c2e6 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/obfs/tls12_ticket_auth.go @@ -0,0 +1,279 @@ +package obfs + +import ( + "crypto/hmac" + "encoding/binary" + "fmt" + "math/rand" + "strings" + "time" + + "github.com/sun8911879/shadowsocksR/ssr" + "github.com/sun8911879/shadowsocksR/tools" +) + +func init() { + register("tls1.2_ticket_auth", newTLS12TicketAuth) +} + +type 
tlsAuthData struct {
+	localClientID [32]byte
+}
+
+// tls12TicketAuth implements the tls1.2_ticket_auth obfs.
+type tls12TicketAuth struct {
+	ssr.ServerInfoForObfs
+	data            *tlsAuthData
+	sendID          int
+	handshakeStatus int
+	sendBuffer      []byte
+}
+
+// newTLS12TicketAuth creates a tls1.2_ticket_auth object.
+func newTLS12TicketAuth() IObfs {
+	return &tls12TicketAuth{}
+}
+
+func (t *tls12TicketAuth) SetServerInfo(s *ssr.ServerInfoForObfs) {
+	t.ServerInfoForObfs = *s
+}
+
+func (t *tls12TicketAuth) GetServerInfo() (s *ssr.ServerInfoForObfs) {
+	return &t.ServerInfoForObfs
+}
+
+func (t *tls12TicketAuth) SetData(data interface{}) {
+	if auth, ok := data.(*tlsAuthData); ok {
+		t.data = auth
+	}
+}
+
+func (t *tls12TicketAuth) GetData() interface{} {
+	if t.data == nil {
+		t.data = &tlsAuthData{}
+		b := make([]byte, 32)
+		rand.Read(b)
+		copy(t.data.localClientID[:], b)
+	}
+	return t.data
+}
+
+func (t *tls12TicketAuth) getHost() string {
+	host := t.Host
+	if len(t.Param) > 0 {
+		hosts := strings.Split(t.Param, ",")
+		if len(hosts) > 0 {
+			host = hosts[rand.Intn(len(hosts))]
+			host = strings.TrimSpace(host)
+		}
+	}
+	// omit SNI when the host looks like an IP literal (trailing digit) and no
+	// parameter override is set
+	if len(host) > 0 && host[len(host)-1] >= byte('0') && host[len(host)-1] <= byte('9') && len(t.Param) == 0 {
+		host = ""
+	}
+	return host
+}
+
+func (t *tls12TicketAuth) Encode(data []byte) (encodedData []byte, err error) {
+	if t.handshakeStatus == -1 {
+		return data, nil
+	}
+	dataLength := len(data)
+
+	if t.handshakeStatus == 8 {
+		encodedData = make([]byte, dataLength+4096)
+		start := 0
+		outLength := 0
+
+		for t.sendID <= 4 && dataLength-start > 256 {
+			length := rand.Intn(512) + 64
+			if length > dataLength-start {
+				length = dataLength - start
+			}
+			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
+			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
+			copy(encodedData[outLength+5:], data[start:start+length])
+			start += length
+			outLength += length + 5
+			t.sendID++
+		}
+		for dataLength-start > 2048 {
+			length := rand.Intn(3990) + 100
+			if length > dataLength-start {
+				length = dataLength - start
+			}
+			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
+			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
+			copy(encodedData[outLength+5:], data[start:start+length])
+			start += length
+			outLength += length + 5
+			t.sendID++
+		}
+		if dataLength-start > 0 {
+			length := dataLength - start
+			copy(encodedData[outLength:], []byte{0x17, 0x3, 0x3})
+			binary.BigEndian.PutUint16(encodedData[outLength+3:], uint16(length&0xFFFF))
+			copy(encodedData[outLength+5:], data[start:start+length])
+			// start need not be advanced for the final fragment
+			outLength += length + 5
+			t.sendID++
+		}
+		encodedData = encodedData[:outLength]
+		return
+	}
+
+	if t.handshakeStatus == 1 {
+		if dataLength > 0 {
+			b := make([]byte, len(t.sendBuffer)+dataLength+5)
+			copy(b, t.sendBuffer)
+			copy(b[len(t.sendBuffer):], []byte{0x17, 0x3, 0x3})
+			binary.BigEndian.PutUint16(b[len(t.sendBuffer)+3:], uint16(dataLength&0xFFFF))
+			copy(b[len(t.sendBuffer)+5:], data)
+			t.sendBuffer = b
+			return []byte{}, nil
+		}
+
+		hmacData := make([]byte, 43)
+		rnd := make([]byte, 22)
+		rand.Read(rnd)
+
+		handshakeFinish := []byte("\x14\x03\x03\x00\x01\x01\x16\x03\x03\x00\x20")
+		copy(hmacData, handshakeFinish)
+		copy(hmacData[len(handshakeFinish):], rnd)
+
+		h := t.hmacSHA1(hmacData[:33])
+		copy(hmacData[33:], h)
+
+		encodedData = make([]byte, len(hmacData)+len(t.sendBuffer))
+		copy(encodedData, hmacData)
+		copy(encodedData[len(hmacData):], t.sendBuffer)
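+		// fake ChangeCipherSpec + Finished sent: flush the buffered payload
+		// and switch to the established state (handshakeStatus == 8)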
t.sendBuffer = nil + t.handshakeStatus = 8 + + return + } + + rnd := t.packAuthData() + + tlsData0 := []byte("\x00\x1c\xc0\x2b\xc0\x2f\xcc\xa9\xcc\xa8\xcc\x14\xcc\x13\xc0\x0a\xc0\x14\xc0\x09\xc0\x13\x00\x9c\x00\x35\x00\x2f\x00\x0a\x01\x00") + tlsData1 := []byte("\xff\x01\x00\x01\x00") + tlsData2 := []byte("\x00\x17\x00\x00\x00\x23\x00\xd0") + tlsData3 := []byte("\x00\x0d\x00\x16\x00\x14\x06\x01\x06\x03\x05\x01\x05\x03\x04\x01\x04\x03\x03\x01\x03\x03\x02\x01\x02\x03\x00\x05\x00\x05\x01\x00\x00\x00\x00\x00\x12\x00\x00\x75\x50\x00\x00\x00\x0b\x00\x02\x01\x00\x00\x0a\x00\x06\x00\x04\x00\x17\x00\x18" + + "\x00\x15\x00\x66\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00") + + var sslBuf []byte + sslBuf = append(sslBuf, rnd...) + sslBuf = append(sslBuf, byte(32)) + sslBuf = append(sslBuf, t.data.localClientID[:]...) + sslBuf = append(sslBuf, tlsData0...) + + var extBuf []byte + extBuf = append(extBuf, tlsData1...) + + host := t.getHost() + + extBuf = append(extBuf, t.sni(host)...) + extBuf = append(extBuf, tlsData2...) + ticket := make([]byte, 208) + rand.Read(ticket) + extBuf = append(extBuf, ticket...) + extBuf = append(extBuf, tlsData3...) + extBuf = append([]byte{byte(len(extBuf) / 256), byte(len(extBuf) % 256)}, extBuf...) + + sslBuf = append(sslBuf, extBuf...) + // client version + sslBuf = append([]byte{3, 3}, sslBuf...) + // length + sslBuf = append([]byte{1, 0, byte(len(sslBuf) / 256), byte(len(sslBuf) % 256)}, sslBuf...) + // length + sslBuf = append([]byte{byte(len(sslBuf) / 256), byte(len(sslBuf) % 256)}, sslBuf...) + // version + sslBuf = append([]byte{0x16, 3, 1}, sslBuf...) 
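+	// the assembled sslBuf is a fake TLS 1.2 ClientHello record; the real
+	// payload is framed into t.sendBuffer below and flushed after the fake
+	// handshake completes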
+ + encodedData = sslBuf + + d := make([]byte, dataLength+5) + copy(d[0:], []byte{0x17, 0x3, 0x3}) + binary.BigEndian.PutUint16(d[3:], uint16(dataLength&0xFFFF)) + copy(d[5:], data) + b := make([]byte, len(t.sendBuffer)+len(d)) + copy(b, t.sendBuffer) + copy(b[len(t.sendBuffer):], d) + t.sendBuffer = b + + t.handshakeStatus = 1 + + return +} + +func (t *tls12TicketAuth) Decode(data []byte) ([]byte, uint64, error) { + if t.handshakeStatus == -1 { + return data, 0, nil + } + dataLength := len(data) + + if t.handshakeStatus == 8 { + if dataLength < 5 { + return nil, 5, fmt.Errorf("data need minimum length: 5 ,data only length: %d", dataLength) + } + if data[0] != 0x17 { + return nil, 0, ssr.ErrTLS12TicketAuthIncorrectMagicNumber + } + size := int(binary.BigEndian.Uint16(data[3:5])) + if size+5 > dataLength { + return nil, uint64(size + 5), fmt.Errorf("unexpected data length: %d ,data only length: %d", size+5, dataLength) + } + if dataLength == size+5 { + return data[5:], 0, nil + } + return data[5 : 5+size], uint64(size + 5), nil + } + + if dataLength < 11+32+1+32 { + return nil, 0, ssr.ErrTLS12TicketAuthTooShortData + } + + hash := t.hmacSHA1(data[11 : 11+22]) + + if !hmac.Equal(data[33:33+ssr.ObfsHMACSHA1Len], hash) { + return nil, 0, ssr.ErrTLS12TicketAuthHMACError + } + return nil, 1, nil +} + +func (t *tls12TicketAuth) packAuthData() (outData []byte) { + outSize := 32 + outData = make([]byte, outSize) + + now := time.Now().Unix() + binary.BigEndian.PutUint32(outData[0:4], uint32(now)) + + rand.Read(outData[4 : 4+18]) + + hash := t.hmacSHA1(outData[:outSize-ssr.ObfsHMACSHA1Len]) + copy(outData[outSize-ssr.ObfsHMACSHA1Len:], hash) + + return +} + +func (t *tls12TicketAuth) hmacSHA1(data []byte) []byte { + key := make([]byte, t.KeyLen+32) + copy(key, t.Key) + copy(key[t.KeyLen:], t.data.localClientID[:]) + + sha1Data := tools.HmacSHA1(key, data) + return sha1Data[:ssr.ObfsHMACSHA1Len] +} + +func (t *tls12TicketAuth) sni(u string) []byte { + bURL := []byte(u) + length := len(bURL) + ret := make([]byte, length+9) + copy(ret[9:9+length], bURL) + binary.BigEndian.PutUint16(ret[7:], uint16(length&0xFFFF)) + length += 3 + binary.BigEndian.PutUint16(ret[4:], uint16(length&0xFFFF)) + length += 2 + binary.BigEndian.PutUint16(ret[2:], uint16(length&0xFFFF)) + return ret +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go new file mode 100644 index 0000000..818b78d --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_md5.go @@ -0,0 +1,282 @@ +package protocol + +import ( + "bytes" + "crypto/aes" + "crypto/cipher" + "encoding/base64" + "encoding/binary" + "math/rand" + "strconv" + "strings" + "time" + + "github.com/sun8911879/shadowsocksR/ssr" + "github.com/sun8911879/shadowsocksR/tools" +) + +type hmacMethod func(key []byte, data []byte) []byte +type hashDigestMethod func(data []byte) []byte + +func init() { + register("auth_aes128_md5", NewAuthAES128MD5) +} + +func NewAuthAES128MD5() IProtocol { + a := &authAES128{ + salt: "auth_aes128_md5", + hmac: tools.HmacMD5, + hashDigest: tools.MD5Sum, + packID: 1, + recvInfo: recvInfo{ + recvID: 1, + buffer: bytes.NewBuffer(nil), + }, + data: &authData{ + connectionID: 0xFF000001, + }, + } + return a +} + +type recvInfo struct { + recvID uint32 + buffer *bytes.Buffer +} + +type authAES128 struct { + ssr.ServerInfoForObfs + recvInfo + data *authData + hasSentHeader bool + packID uint32 + userKey []byte + salt string + hmac 
hmacMethod + hashDigest hashDigestMethod +} + +func (a *authAES128) SetServerInfo(s *ssr.ServerInfoForObfs) { + a.ServerInfoForObfs = *s +} + +func (a *authAES128) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &a.ServerInfoForObfs +} + +func (a *authAES128) SetData(data interface{}) { + if auth, ok := data.(*authData); ok { + a.data = auth + } +} + +func (a *authAES128) GetData() interface{} { + if a.data == nil { + a.data = &authData{} + } + return a.data +} + +func (a *authAES128) packData(data []byte) (outData []byte) { + dataLength := len(data) + randLength := 1 + if dataLength <= 1200 { + if a.packID > 4 { + randLength += rand.Intn(32) + } else { + if dataLength > 900 { + randLength += rand.Intn(128) + } else { + randLength += rand.Intn(512) + } + } + } + + outLength := randLength + dataLength + 8 + outData = make([]byte, outLength) + // 0~1, out length + binary.LittleEndian.PutUint16(outData[0:], uint16(outLength&0xFFFF)) + // 2~3, hmac + key := make([]byte, len(a.userKey)+4) + copy(key, a.userKey) + binary.LittleEndian.PutUint32(key[len(key)-4:], a.packID) + h := a.hmac(key, outData[0:2]) + copy(outData[2:4], h[:2]) + // 4~rand length+4, rand number + rand.Read(outData[4 : 4+randLength]) + // 4, rand length + if randLength < 128 { + outData[4] = byte(randLength & 0xFF) + } else { + // 4, magic number 0xFF + outData[4] = 0xFF + // 5~6, rand length + binary.LittleEndian.PutUint16(outData[5:], uint16(randLength&0xFFFF)) + } + // rand length+4~out length-4, data + if dataLength > 0 { + copy(outData[randLength+4:], data) + } + a.packID++ + h = a.hmac(key, outData[:outLength-4]) + copy(outData[outLength-4:], h[:4]) + return +} + +func (a *authAES128) packAuthData(data []byte) (outData []byte) { + dataLength := len(data) + var randLength int + if dataLength > 400 { + randLength = rand.Intn(512) + } else { + randLength = rand.Intn(1024) + } + + dataOffset := randLength + 16 + 4 + 4 + 7 + outLength := dataOffset + dataLength + 4 + outData = make([]byte, outLength) + encrypt := make([]byte, 24) + key := make([]byte, a.IVLen+a.KeyLen) + copy(key, a.IV) + copy(key[a.IVLen:], a.Key) + + rand.Read(outData[dataOffset-randLength:]) + + if a.data.connectionID > 0xFF000000 { + a.data.clientID = nil + } + if len(a.data.clientID) == 0 { + a.data.clientID = make([]byte, 4) + rand.Read(a.data.clientID) + b := make([]byte, 4) + rand.Read(b) + a.data.connectionID = binary.LittleEndian.Uint32(b) & 0xFFFFFF + } + a.data.connectionID++ + copy(encrypt[4:], a.data.clientID) + binary.LittleEndian.PutUint32(encrypt[8:], a.data.connectionID) + + now := time.Now().Unix() + binary.LittleEndian.PutUint32(encrypt[0:4], uint32(now)) + + binary.LittleEndian.PutUint16(encrypt[12:], uint16(outLength&0xFFFF)) + binary.LittleEndian.PutUint16(encrypt[14:], uint16(randLength&0xFFFF)) + + params := strings.Split(a.Param, ":") + uid := make([]byte, 4) + if len(params) >= 2 { + if userID, err := strconv.ParseUint(params[0], 10, 32); err != nil { + rand.Read(uid) + } else { + binary.LittleEndian.PutUint32(uid, uint32(userID)) + a.userKey = a.hashDigest([]byte(params[1])) + } + } else { + rand.Read(uid) + } + + if a.userKey == nil { + a.userKey = make([]byte, a.KeyLen) + copy(a.userKey, a.Key) + } + + encryptKey := make([]byte, len(a.userKey)) + copy(encryptKey, a.userKey) + + aesCipherKey := tools.EVPBytesToKey(base64.StdEncoding.EncodeToString(encryptKey)+a.salt, 16) + block, err := aes.NewCipher(aesCipherKey) + if err != nil { + return nil + } + + encryptData := make([]byte, 16) + iv := make([]byte, aes.BlockSize) + cbc 
:= cipher.NewCBCEncrypter(block, iv) + cbc.CryptBlocks(encryptData, encrypt[0:16]) + copy(encrypt[4:4+16], encryptData) + copy(encrypt[0:4], uid) + + h := a.hmac(key, encrypt[0:20]) + copy(encrypt[20:], h[:4]) + + rand.Read(outData[0:1]) + h = a.hmac(key, outData[0:1]) + copy(outData[1:], h[0:7-1]) + + copy(outData[7:], encrypt) + copy(outData[dataOffset:], data) + + h = a.hmac(a.userKey, outData[0:outLength-4]) + copy(outData[outLength-4:], h[:4]) + + return +} + +func (a *authAES128) PreEncrypt(plainData []byte) (outData []byte, err error) { + dataLength := len(plainData) + offset := 0 + if !a.hasSentHeader { + authLength := dataLength + if authLength > 1200 { + authLength = 1200 + } + packData := a.packAuthData(plainData[:authLength]) + a.hasSentHeader = true + outData = append(outData, packData...) + dataLength -= authLength + offset += authLength + } + const blockSize = 4096 + for dataLength > blockSize { + packData := a.packData(plainData[offset : offset+blockSize]) + outData = append(outData, packData...) + dataLength -= blockSize + offset += blockSize + } + if dataLength > 0 { + packData := a.packData(plainData[offset:]) + outData = append(outData, packData...) + } + + return +} + +func (a *authAES128) PostDecrypt(plainData []byte) ([]byte, int, error) { + a.buffer.Reset() + plainLength := len(plainData) + datalength := plainLength + readlenth := 0 + key := make([]byte, len(a.userKey)+4) + copy(key, a.userKey) + for plainLength > 4 { + binary.LittleEndian.PutUint32(key[len(key)-4:], a.recvID) + + h := a.hmac(key, plainData[0:2]) + if h[0] != plainData[2] || h[1] != plainData[3] { + return nil, 0, ssr.ErrAuthAES128HMACError + } + length := int(binary.LittleEndian.Uint16(plainData[0:2])) + if length >= 8192 || length < 8 { + return nil, 0, ssr.ErrAuthAES128DataLengthError + } + if length > plainLength { + break + } + a.recvID++ + pos := int(plainData[4]) + if pos < 255 { + pos += 4 + } else { + pos = int(binary.LittleEndian.Uint16(plainData[5:7])) + 4 + } + + a.buffer.Write(plainData[pos : length-4]) + plainData = plainData[length:] + plainLength -= length + readlenth += length + } + if datalength == readlenth { + readlenth = -1 + } + return a.buffer.Bytes(), readlenth, nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go new file mode 100644 index 0000000..c6c2b0e --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_aes128_sha1.go @@ -0,0 +1,28 @@ +package protocol + +import ( + "bytes" + + "github.com/sun8911879/shadowsocksR/tools" +) + +func init() { + register("auth_aes128_sha1", NewAuthAES128SHA1) +} + +func NewAuthAES128SHA1() IProtocol { + a := &authAES128{ + salt: "auth_aes128_sha1", + hmac: tools.HmacSHA1, + hashDigest: tools.SHA1Sum, + packID: 1, + recvInfo: recvInfo{ + recvID: 1, + buffer: bytes.NewBuffer(nil), + }, + data: &authData{ + connectionID: 0xFF000001, + }, + } + return a +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go new file mode 100644 index 0000000..ffec01b --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/auth_sha1_v4.go @@ -0,0 +1,232 @@ +package protocol + +import ( + "encoding/binary" + "math/rand" + "time" + + "github.com/sun8911879/shadowsocksR/ssr" + "github.com/sun8911879/shadowsocksR/tools" +) + +func init() { + register("auth_sha1_v4", NewAuthSHA1v4) +} + +type authSHA1v4 struct { + 
ssr.ServerInfoForObfs + data *authData + hasSentHeader bool + recvBuffer []byte + recvBufferLength int +} + +func NewAuthSHA1v4() IProtocol { + a := &authSHA1v4{} + return a +} + +func (a *authSHA1v4) SetServerInfo(s *ssr.ServerInfoForObfs) { + a.ServerInfoForObfs = *s +} + +func (a *authSHA1v4) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &a.ServerInfoForObfs +} + +func (a *authSHA1v4) SetData(data interface{}) { + if auth, ok := data.(*authData); ok { + a.data = auth + } +} + +func (a *authSHA1v4) GetData() interface{} { + if a.data == nil { + a.data = &authData{} + } + return a.data +} + +func (a *authSHA1v4) packData(data []byte) (outData []byte) { + dataLength := len(data) + randLength := 1 + if dataLength <= 1300 { + if dataLength > 400 { + randLength += rand.Intn(128) + } else { + randLength += rand.Intn(1024) + } + } + + outLength := randLength + dataLength + 8 + outData = make([]byte, outLength) + // 0~1, out length + binary.BigEndian.PutUint16(outData[0:2], uint16(outLength&0xFFFF)) + // 2~3, crc of out length + crc32 := ssr.CalcCRC32(outData, 2, 0xFFFFFFFF) + binary.LittleEndian.PutUint16(outData[2:4], uint16(crc32&0xFFFF)) + // 4~rand length+4, rand number + rand.Read(outData[4 : 4+randLength]) + // 4, rand length + if randLength < 128 { + outData[4] = byte(randLength & 0xFF) + } else { + // 4, magic number 0xFF + outData[4] = 0xFF + // 5~6, rand length + binary.BigEndian.PutUint16(outData[5:], uint16(randLength&0xFFFF)) + } + // rand length+4~out length-4, data + if dataLength > 0 { + copy(outData[randLength+4:], data) + } + // out length-4~end, adler32 of full data + adler := ssr.CalcAdler32(outData[:outLength-4]) + binary.LittleEndian.PutUint32(outData[outLength-4:], adler) + + return outData +} + +func (a *authSHA1v4) packAuthData(data []byte) (outData []byte) { + dataLength := len(data) + randLength := 1 + if dataLength <= 1300 { + if dataLength > 400 { + randLength += rand.Intn(128) + } else { + randLength += rand.Intn(1024) + } + } + dataOffset := randLength + 4 + 2 + outLength := dataOffset + dataLength + 12 + ssr.ObfsHMACSHA1Len + outData = make([]byte, outLength) + + a.data.connectionID++ + if a.data.connectionID > 0xFF000000 { + a.data.clientID = nil + } + if len(a.data.clientID) == 0 { + a.data.clientID = make([]byte, 8) + rand.Read(a.data.clientID) + b := make([]byte, 4) + rand.Read(b) + a.data.connectionID = binary.LittleEndian.Uint32(b) & 0xFFFFFF + } + // 0-1, out length + binary.BigEndian.PutUint16(outData[0:], uint16(outLength&0xFFFF)) + + // 2~6, crc of out length+salt+key + salt := []byte("auth_sha1_v4") + crcData := make([]byte, len(salt)+a.KeyLen+2) + copy(crcData[0:2], outData[0:2]) + copy(crcData[2:], salt) + copy(crcData[2+len(salt):], a.Key) + crc32 := ssr.CalcCRC32(crcData, len(crcData), 0xFFFFFFFF) + // 2~6, crc of out length+salt+key + binary.LittleEndian.PutUint32(outData[2:], crc32) + // 6~rand length+6, rand numbers + rand.Read(outData[dataOffset-randLength : dataOffset]) + // 6, rand length + if randLength < 128 { + outData[6] = byte(randLength & 0xFF) + } else { + // 6, magic number 0xFF + outData[6] = 0xFF + // 7-8, rand length + binary.BigEndian.PutUint16(outData[7:], uint16(randLength&0xFFFF)) + } + // rand length+6~rand length+10, time stamp + now := time.Now().Unix() + binary.LittleEndian.PutUint32(outData[dataOffset:dataOffset+4], uint32(now)) + // rand length+10~rand length+14, client ID + copy(outData[dataOffset+4:dataOffset+4+4], a.data.clientID[0:4]) + // rand length+14~rand length+18, connection ID + 
binary.LittleEndian.PutUint32(outData[dataOffset+8:dataOffset+8+4], a.data.connectionID) + // rand length+18~rand length+18+data length, data + copy(outData[dataOffset+12:], data) + + key := make([]byte, a.IVLen+a.KeyLen) + copy(key, a.IV) + copy(key[a.IVLen:], a.Key) + + h := tools.HmacSHA1(key, outData[:outLength-ssr.ObfsHMACSHA1Len]) + // out length-10~out length/rand length+18+data length~end, hmac + copy(outData[outLength-ssr.ObfsHMACSHA1Len:], h[0:ssr.ObfsHMACSHA1Len]) + + return outData +} + +func (a *authSHA1v4) PreEncrypt(plainData []byte) (outData []byte, err error) { + dataLength := len(plainData) + offset := 0 + if !a.hasSentHeader && dataLength > 0 { + authLength := dataLength + if headSize := ssr.GetHeadSize(plainData, 30); headSize <= dataLength { + authLength = headSize + } + packData := a.packAuthData(plainData[:authLength]) + a.hasSentHeader = true + outData = append(outData, packData...) + dataLength -= authLength + offset += authLength + } + const blockSize = 4096 + for dataLength > blockSize { + packData := a.packData(plainData[offset : offset+blockSize]) + outData = append(outData, packData...) + dataLength -= blockSize + offset += blockSize + } + if dataLength > 0 { + packData := a.packData(plainData[offset:]) + outData = append(outData, packData...) + } + + return +} + +func (a *authSHA1v4) PostDecrypt(plainData []byte) ([]byte, int, error) { + var outData []byte + dataLength := len(plainData) + b := make([]byte, len(a.recvBuffer)+dataLength) + copy(b, a.recvBuffer) + copy(b[len(a.recvBuffer):], plainData) + a.recvBuffer = b + a.recvBufferLength = len(b) + for a.recvBufferLength > 4 { + crc32 := ssr.CalcCRC32(a.recvBuffer, 2, 0xFFFFFFFF) + if binary.LittleEndian.Uint16(a.recvBuffer[2:4]) != uint16(crc32&0xFFFF) { + return nil, 0, ssr.ErrAuthSHA1v4CRC32Error + } + length := int(binary.BigEndian.Uint16(a.recvBuffer[0:2])) + if length >= 8192 || length < 8 { + a.recvBufferLength = 0 + a.recvBuffer = nil + return nil, 0, ssr.ErrAuthSHA1v4DataLengthError + } + if length > a.recvBufferLength { + break + } + + if ssr.CheckAdler32(a.recvBuffer, length) { + pos := int(a.recvBuffer[4]) + if pos != 0xFF { + pos += 4 + } else { + pos = int(binary.BigEndian.Uint16(a.recvBuffer[5:5+2])) + 4 + } + outLength := length - pos - 4 + b = make([]byte, len(outData)+outLength) + copy(b, outData) + copy(b[len(outData):], a.recvBuffer[pos:pos+outLength]) + outData = b + a.recvBufferLength -= length + a.recvBuffer = a.recvBuffer[length:] + } else { + a.recvBufferLength = 0 + a.recvBuffer = nil + return nil, 0, ssr.ErrAuthSHA1v4IncorrectChecksum + } + } + return outData, 0, nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go new file mode 100644 index 0000000..4fff0aa --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/base.go @@ -0,0 +1,39 @@ +package protocol + +import ( + "strings" + + "github.com/sun8911879/shadowsocksR/ssr" +) + +type creator func() IProtocol + +var ( + creatorMap = make(map[string]creator) +) + +type IProtocol interface { + SetServerInfo(s *ssr.ServerInfoForObfs) + GetServerInfo() *ssr.ServerInfoForObfs + PreEncrypt(data []byte) ([]byte, error) + PostDecrypt(data []byte) ([]byte, int, error) + SetData(data interface{}) + GetData() interface{} +} + +type authData struct { + clientID []byte + connectionID uint32 +} + +func register(name string, c creator) { + creatorMap[name] = c +} + +func NewProtocol(name string) IProtocol { + c, ok := 
creatorMap[strings.ToLower(name)] + if ok { + return c() + } + return nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go new file mode 100644 index 0000000..8dd1851 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/origin.go @@ -0,0 +1,42 @@ +package protocol + +import ( + "github.com/sun8911879/shadowsocksR/ssr" +) + +func init() { + register("origin", NewOrigin) +} + +type origin struct { + ssr.ServerInfoForObfs +} + +func NewOrigin() IProtocol { + a := &origin{} + return a +} + +func (o *origin) SetServerInfo(s *ssr.ServerInfoForObfs) { + o.ServerInfoForObfs = *s +} + +func (o *origin) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &o.ServerInfoForObfs +} + +func (o *origin) PreEncrypt(data []byte) (encryptedData []byte, err error) { + return data, nil +} + +func (o *origin) PostDecrypt(data []byte) ([]byte, int, error) { + return data, 0, nil +} + +func (o *origin) SetData(data interface{}) { + +} + +func (o *origin) GetData() interface{} { + return nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go b/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go new file mode 100644 index 0000000..42543e5 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/protocol/verify_sha1.go @@ -0,0 +1,101 @@ +package protocol + +import ( + "bytes" + "encoding/binary" + + "github.com/sun8911879/shadowsocksR/ssr" + "github.com/sun8911879/shadowsocksR/tools" +) + +func init() { + register("verify_sha1", NewVerifySHA1) + register("ota", NewVerifySHA1) +} + +type verifySHA1 struct { + ssr.ServerInfoForObfs + hasSentHeader bool + chunkId uint32 +} + +const ( + oneTimeAuthMask byte = 0x10 +) + +func NewVerifySHA1() IProtocol { + a := &verifySHA1{} + return a +} + +func (v *verifySHA1) otaConnectAuth(data []byte) []byte { + return append(data, tools.HmacSHA1(append(v.IV, v.Key...), data)...) +} + +func (v *verifySHA1) otaReqChunkAuth(chunkId uint32, data []byte) []byte { + nb := make([]byte, 2) + binary.BigEndian.PutUint16(nb, uint16(len(data))) + chunkIdBytes := make([]byte, 4) + binary.BigEndian.PutUint32(chunkIdBytes, chunkId) + header := append(nb, tools.HmacSHA1(append(v.IV, chunkIdBytes...), data)...) + return append(header, data...) 
+} + +func (v *verifySHA1) otaVerifyAuth(iv []byte, chunkId uint32, data []byte, expectedHmacSha1 []byte) bool { + chunkIdBytes := make([]byte, 4) + binary.BigEndian.PutUint32(chunkIdBytes, chunkId) + actualHmacSha1 := tools.HmacSHA1(append(iv, chunkIdBytes...), data) + return bytes.Equal(expectedHmacSha1, actualHmacSha1) +} + +func (v *verifySHA1) getAndIncreaseChunkId() (chunkId uint32) { + chunkId = v.chunkId + v.chunkId += 1 + return +} + +func (v *verifySHA1) SetServerInfo(s *ssr.ServerInfoForObfs) { + v.ServerInfoForObfs = *s +} + +func (v *verifySHA1) GetServerInfo() (s *ssr.ServerInfoForObfs) { + return &v.ServerInfoForObfs +} + +func (v *verifySHA1) SetData(data interface{}) { + +} + +func (v *verifySHA1) GetData() interface{} { + return nil +} + +func (v *verifySHA1) PreEncrypt(data []byte) (encryptedData []byte, err error) { + dataLength := len(data) + offset := 0 + if !v.hasSentHeader { + data[0] |= oneTimeAuthMask + encryptedData = v.otaConnectAuth(data[:v.HeadLen]) + v.hasSentHeader = true + dataLength -= v.HeadLen + offset += v.HeadLen + } + const blockSize = 4096 + for dataLength > blockSize { + chunkId := v.getAndIncreaseChunkId() + b := v.otaReqChunkAuth(chunkId, data[offset:offset+blockSize]) + encryptedData = append(encryptedData, b...) + dataLength -= blockSize + offset += blockSize + } + if dataLength > 0 { + chunkId := v.getAndIncreaseChunkId() + b := v.otaReqChunkAuth(chunkId, data[offset:]) + encryptedData = append(encryptedData, b...) + } + return +} + +func (v *verifySHA1) PostDecrypt(data []byte) ([]byte, int, error) { + return data, 0, nil +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go new file mode 100644 index 0000000..6bda937 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/ssr/adler32.go @@ -0,0 +1,31 @@ +package ssr + +import "encoding/binary" + +func calcShortAdler32(input []byte, a, b uint32) (uint32, uint32) { + for _, i := range input { + a += uint32(i) + b += a + } + a %= 65521 + b %= 65521 + return a, b +} + +func CalcAdler32(input []byte) uint32 { + var a uint32 = 1 + var b uint32 = 0 + const nMax = 5552 + for length := len(input); length > nMax; length -= nMax { + a, b = calcShortAdler32(input[:nMax], a, b) + input = input[nMax:] + } + a, b = calcShortAdler32(input, a, b) + return (b << 16) + a +} + +func CheckAdler32(input []byte, l int) bool { + adler32 := CalcAdler32(input[:l-4]) + checksum := binary.LittleEndian.Uint32(input[l-4:]) + return adler32 == checksum +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go new file mode 100644 index 0000000..9cf6cc6 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/ssr/crc32.go @@ -0,0 +1,52 @@ +package ssr + +import "encoding/binary" + +var ( + crc32Table = make([]uint32, 256) +) + +func init() { + createCRC32Table() +} + +func createCRC32Table() { + for i := 0; i < 256; i++ { + crc := uint32(i) + for j := 8; j > 0; j-- { + if crc&1 == 1 { + crc = (crc >> 1) ^ 0xEDB88320 + } else { + crc >>= 1 + } + } + crc32Table[i] = crc + } +} + +func CalcCRC32(input []byte, length int, value uint32) uint32 { + value = 0xFFFFFFFF + return DoCalcCRC32(input, 0, length, value) +} + +func DoCalcCRC32(input []byte, index int, length int, value uint32) uint32 { + buffer := input + for i := index; i < length; i++ { + value = (value >> 8) ^ crc32Table[byte(value&0xFF)^buffer[i]] + } + return value ^ 0xFFFFFFFF +} + +func DoSetCRC32(buffer 
[]byte, index int, length int) { + crc := CalcCRC32(buffer[:length-4], length-4, 0xFFFFFFFF) + binary.LittleEndian.PutUint32(buffer[length-4:], crc^0xFFFFFFFF) +} + +func SetCRC32(buffer []byte, length int) { + DoSetCRC32(buffer, 0, length) +} + +func CheckCRC32(buffer []byte, length int) bool { + crc := CalcCRC32(buffer, length, 0xFFFFFFFF) + return crc == 0xFFFFFFFF +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go b/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go new file mode 100644 index 0000000..cee94a2 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/ssr/obfs.go @@ -0,0 +1,55 @@ +package ssr + +import "errors" + +const ObfsHMACSHA1Len = 10 + +var ( + ErrAuthSHA1v4CRC32Error = errors.New("auth_sha1_v4 post decrypt data crc32 error") + ErrAuthSHA1v4DataLengthError = errors.New("auth_sha1_v4 post decrypt data length error") + ErrAuthSHA1v4IncorrectChecksum = errors.New("auth_sha1_v4 post decrypt incorrect checksum") + ErrAuthAES128HMACError = errors.New("auth_aes128_* post decrypt hmac error") + ErrAuthAES128DataLengthError = errors.New("auth_aes128_* post decrypt length mismatch") + ErrAuthAES128IncorrectChecksum = errors.New("auth_aes128_* post decrypt incorrect checksum") + ErrTLS12TicketAuthTooShortData = errors.New("tls1.2_ticket_auth too short data") + ErrTLS12TicketAuthHMACError = errors.New("tls1.2_ticket_auth hmac verifying failed") + ErrTLS12TicketAuthIncorrectMagicNumber = errors.New("tls1.2_ticket_auth incorrect magic number") +) + +type ServerInfoForObfs struct { + Host string + Port uint16 + Param string + IV []byte + IVLen int + RecvIV []byte + RecvIVLen int + Key []byte + KeyLen int + HeadLen int + TcpMss int +} + +func GetHeadSize(data []byte, defaultValue int) int { + if data == nil || len(data) < 2 { + return defaultValue + } + headType := data[0] & 0x07 + switch headType { + case 1: + // IPv4 1+4+2 + return 7 + case 4: + // IPv6 1+16+2 + return 19 + case 3: + // domain name, variant length + return 4 + int(data[1]) + } + + return defaultValue +} + +func (s *ServerInfoForObfs) SetHeadLen(data []byte, defaultValue int) { + s.HeadLen = GetHeadSize(data, defaultValue) +} diff --git a/vendor/github.com/sun8911879/shadowsocksR/tcp.go b/vendor/github.com/sun8911879/shadowsocksR/tcp.go new file mode 100644 index 0000000..4e86d98 --- /dev/null +++ b/vendor/github.com/sun8911879/shadowsocksR/tcp.go @@ -0,0 +1,251 @@ +package shadowsocksr + +import ( + "bytes" + "fmt" + "net" + "sync" + + "github.com/sun8911879/shadowsocksR/obfs" + "github.com/sun8911879/shadowsocksR/protocol" + "github.com/sun8911879/shadowsocksR/tools/leakybuf" +) + +// SSTCPConn the struct that override the net.Conn methods +type SSTCPConn struct { + net.Conn + sync.RWMutex + *StreamCipher + IObfs obfs.IObfs + IProtocol protocol.IProtocol + readBuf []byte + readDecodeBuf *bytes.Buffer + readIObfsBuf *bytes.Buffer + readEncryptBuf *bytes.Buffer + readIndex uint64 + readUserBuf *bytes.Buffer + writeBuf []byte + lastReadError error +} + +func NewSSTCPConn(c net.Conn, cipher *StreamCipher) *SSTCPConn { + return &SSTCPConn{ + Conn: c, + StreamCipher: cipher, + readBuf: leakybuf.GlobalLeakyBuf.Get(), + readDecodeBuf: bytes.NewBuffer(nil), + readIObfsBuf: bytes.NewBuffer(nil), + readUserBuf: bytes.NewBuffer(nil), + readEncryptBuf: bytes.NewBuffer(nil), + writeBuf: leakybuf.GlobalLeakyBuf.Get(), + } +} + +func (c *SSTCPConn) Close() error { + leakybuf.GlobalLeakyBuf.Put(c.readBuf) + leakybuf.GlobalLeakyBuf.Put(c.writeBuf) + return c.Conn.Close() +} + +func (c *SSTCPConn) 
+func (c *SSTCPConn) GetIv() (iv []byte) {
+	iv = make([]byte, len(c.iv))
+	copy(iv, c.iv)
+	return
+}
+
+func (c *SSTCPConn) GetKey() (key []byte) {
+	key = make([]byte, len(c.key))
+	copy(key, c.key)
+	return
+}
+
+func (c *SSTCPConn) initEncryptor(b []byte) (iv []byte, err error) {
+	if c.enc == nil {
+		iv, err = c.initEncrypt()
+		if err != nil {
+			return nil, err
+		}
+
+		// The IV is ready now, so the obfs and protocol layers can be
+		// initialized.
+		obfsServerInfo := c.IObfs.GetServerInfo()
+		obfsServerInfo.SetHeadLen(b, 30)
+		obfsServerInfo.IV, obfsServerInfo.IVLen = c.IV()
+		obfsServerInfo.Key, obfsServerInfo.KeyLen = c.Key()
+		c.IObfs.SetServerInfo(obfsServerInfo)
+
+		protocolServerInfo := c.IProtocol.GetServerInfo()
+		protocolServerInfo.SetHeadLen(b, 30)
+		protocolServerInfo.IV, protocolServerInfo.IVLen = c.IV()
+		protocolServerInfo.Key, protocolServerInfo.KeyLen = c.Key()
+		c.IProtocol.SetServerInfo(protocolServerInfo)
+	}
+	return
+}
+
+func (c *SSTCPConn) Read(b []byte) (n int, err error) {
+	for {
+		n, err = c.doRead(b)
+		if b == nil || n != 0 || err != nil {
+			return n, err
+		}
+	}
+}
+
+func (c *SSTCPConn) doRead(b []byte) (n int, err error) {
+	// Drain data that has already been decrypted first.
+	if c.readUserBuf.Len() > 0 {
+		return c.readUserBuf.Read(b)
+	}
+	// Not enough data buffered yet: read more from the wire and decode it.
+	decodelength := c.readDecodeBuf.Len()
+	if (decodelength == 0 || c.readEncryptBuf.Len() > 0 || (c.readIndex != 0 && c.readIndex > uint64(decodelength))) && c.lastReadError == nil {
+		c.readIndex = 0
+		n, c.lastReadError = c.Conn.Read(c.readBuf)
+		// Append what was read to the decode buffer.
+		c.readDecodeBuf.Write(c.readBuf[0:n])
+	}
+	// Nothing buffered to decode: surface the read error.
+	if c.lastReadError != nil && (decodelength == 0 || uint64(decodelength) < c.readIndex) {
+		return 0, c.lastReadError
+	}
+	decodelength = c.readDecodeBuf.Len()
+	decodebytes := c.readDecodeBuf.Bytes()
+	c.readDecodeBuf.Reset()
+
+	for {
+
+		decodedData, length, err := c.IObfs.Decode(decodebytes)
+		if length == 0 && err != nil {
+			return 0, err
+		}
+
+		// length == 1 is a sentinel: the obfs layer wants to send data
+		// back, so trigger a write with an empty payload.
+		if length == 1 {
+			c.Write(make([]byte, 0))
+			return 0, nil
+		}
+
+		// Not enough data for a complete obfs unit yet.
+		if err != nil {
+			if uint64(decodelength) >= length {
+				return 0, fmt.Errorf("obfs decode error: buffered %d bytes but decoder reports needing %d", decodelength, length)
+			}
+			c.readIndex = length
+			c.readDecodeBuf.Write(decodebytes)
+			if c.readIObfsBuf.Len() == 0 {
+				return 0, nil
+			}
+			break
+		}
+
+		if length >= 1 {
+			// Decoded one unit but extra bytes remain; keep consuming.
+			c.readIObfsBuf.Write(decodedData)
+			decodebytes = decodebytes[length:]
+			decodelength = len(decodebytes)
+			continue
+		}
+
+		// Everything was consumed (length == 0).
+		c.readIObfsBuf.Write(decodedData)
+		break
+	}
+
+	decodedData := c.readIObfsBuf.Bytes()
+	decodelength = c.readIObfsBuf.Len()
+	c.readIObfsBuf.Reset()
+
+	if c.dec == nil {
+		iv := decodedData[0:c.info.ivLen]
+		if err = c.initDecrypt(iv); err != nil {
+			return 0, err
+		}
+
+		if len(c.iv) == 0 {
+			c.iv = iv
+		}
+		decodelength -= c.info.ivLen
+		if decodelength <= 0 {
+			return 0, nil
+		}
+		decodedData = decodedData[c.info.ivLen:]
+	}
+
+	buf := make([]byte, decodelength)
+	c.decrypt(buf, decodedData)
+
+	c.readEncryptBuf.Write(buf)
+	encryptbuf := c.readEncryptBuf.Bytes()
+	c.readEncryptBuf.Reset()
+	postDecryptedData, length, err := c.IProtocol.PostDecrypt(encryptbuf)
+	if err != nil {
+		return 0, err
+	}
+	if length == 0 {
+		c.readEncryptBuf.Write(encryptbuf)
+		return 0, nil
+	}
+
+	if length > 0 {
+		c.readEncryptBuf.Write(encryptbuf[length:])
+	}
+
+	postDecryptedlength := len(postDecryptedData)
+	blength := len(b)
+	copy(b, postDecryptedData)
+	if blength > postDecryptedlength {
+		return postDecryptedlength, nil
+	}
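+	// b was too small for everything we decrypted: stash the remainder in
+	// readUserBuf so the next Read call drains it first (see the check at
+	// the top of doRead).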
+	c.readUserBuf.Write(postDecryptedData[len(b):])
+	return blength, nil
+}
+
+func (c *SSTCPConn) preWrite(b []byte) (outData []byte, err error) {
+	var iv []byte
+	if iv, err = c.initEncryptor(b); err != nil {
+		return
+	}
+
+	var preEncryptedData []byte
+	preEncryptedData, err = c.IProtocol.PreEncrypt(b)
+	if err != nil {
+		return
+	}
+	preEncryptedDataLen := len(preEncryptedData)
+	//c.encrypt(cipherData[len(iv):], b)
+	encryptedData := make([]byte, preEncryptedDataLen)
+	// NOTE: the expected output buffer length must be exact; it is
+	// preEncryptedDataLen here.
+	c.encrypt(encryptedData[0:preEncryptedDataLen], preEncryptedData)
+
+	//common.Info("len(b)=", len(b), ", b:", b,
+	//	", pre encrypted data length:", preEncryptedDataLen,
+	//	", pre encrypted data:", preEncryptedData,
+	//	", encrypted data length:", preEncryptedDataLen)
+
+	cipherData := c.writeBuf
+	dataSize := len(encryptedData) + len(iv)
+	if dataSize > len(cipherData) {
+		cipherData = make([]byte, dataSize)
+	} else {
+		cipherData = cipherData[:dataSize]
+	}
+
+	if iv != nil {
+		// Put the initialization vector at the front of the buffer before
+		// it is encoded.
+		copy(cipherData, iv)
+	}
+	copy(cipherData[len(iv):], encryptedData)
+
+	return c.IObfs.Encode(cipherData)
+}
+
+func (c *SSTCPConn) Write(b []byte) (n int, err error) {
+	outData, err := c.preWrite(b)
+	if err != nil {
+		// Propagate encode errors instead of silently reporting success.
+		return 0, err
+	}
+	n, err = c.Conn.Write(outData)
+	if err != nil {
+		return n, err
+	}
+	// Report the caller's length: outData is longer than b once the
+	// protocol and obfs layers add their framing.
+	return len(b), nil
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go b/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go
new file mode 100644
index 0000000..5ecb3b9
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/tools/encrypt.go
@@ -0,0 +1,51 @@
+package tools
+
+import (
+	"crypto/hmac"
+	"crypto/md5"
+	"crypto/sha1"
+)
+
+// HmacMD5 returns the first 10 bytes of the HMAC-MD5 of data under key.
+func HmacMD5(key []byte, data []byte) []byte {
+	hmacMD5 := hmac.New(md5.New, key)
+	hmacMD5.Write(data)
+	return hmacMD5.Sum(nil)[:10]
+}
+
+// HmacSHA1 returns the first 10 bytes of the HMAC-SHA1 of data under key.
+func HmacSHA1(key []byte, data []byte) []byte {
+	hmacSHA1 := hmac.New(sha1.New, key)
+	hmacSHA1.Write(data)
+	return hmacSHA1.Sum(nil)[:10]
+}
+
+func MD5Sum(d []byte) []byte {
+	h := md5.New()
+	h.Write(d)
+	return h.Sum(nil)
+}
+
+func SHA1Sum(d []byte) []byte {
+	h := sha1.New()
+	h.Write(d)
+	return h.Sum(nil)
+}
+
+// EVPBytesToKey implements OpenSSL's EVP_BytesToKey key derivation
+// (MD5 digest, no salt).
+func EVPBytesToKey(password string, keyLen int) (key []byte) {
+	const md5Len = 16
+
+	cnt := (keyLen-1)/md5Len + 1
+	m := make([]byte, cnt*md5Len)
+	copy(m, MD5Sum([]byte(password)))
+
+	// Repeatedly call md5 until bytes generated is enough.
+	// Each call to md5 uses data: prev md5 sum + password.
+	d := make([]byte, md5Len+len(password))
+	start := 0
+	for i := 1; i < cnt; i++ {
+		start += md5Len
+		copy(d, m[start-md5Len:start])
+		copy(d[md5Len:], password)
+		copy(m[start:], MD5Sum(d))
+	}
+	return m[:keyLen]
+}
diff --git a/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go b/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go
new file mode 100644
index 0000000..096c9cb
--- /dev/null
+++ b/vendor/github.com/sun8911879/shadowsocksR/tools/leakybuf/leakybuf.go
@@ -0,0 +1,47 @@
+// Package leakybuf provides a leaky buffer, based on the example in
+// Effective Go.
+package leakybuf
+
+type LeakyBuf struct {
+	bufSize  int // size of each buffer
+	freeList chan []byte
+}
+
+// NewLeakyBuf creates a leaky buffer which can hold at most n buffers, each
+// with bufSize bytes.
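+//
+// A minimal usage sketch (illustrative only; conn stands for any net.Conn):
+//
+//	pool := NewLeakyBuf(2048, 4096)
+//	buf := pool.Get()
+//	defer pool.Put(buf)
+//	n, err := conn.Read(buf)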
+func NewLeakyBuf(n, bufSize int) *LeakyBuf {
+	return &LeakyBuf{
+		bufSize:  bufSize,
+		freeList: make(chan []byte, n),
+	}
+}
+
+// Get returns a buffer from the free list, or creates a new one if none is
+// available.
+func (lb *LeakyBuf) Get() (b []byte) {
+	select {
+	case b = <-lb.freeList:
+	default:
+		b = make([]byte, lb.bufSize)
+	}
+	return
+}
+
+// Put adds the buffer back into the free pool for reuse. It panics if the
+// buffer size is not the same as the leaky buffer's; this is intended to
+// expose erroneous usage.
+func (lb *LeakyBuf) Put(b []byte) {
+	if len(b) != lb.bufSize {
+		panic("invalid buffer size that's put into leaky buffer")
+	}
+	select {
+	case lb.freeList <- b:
+	default:
+	}
+}
+
+const (
+	GlobalLeakyBufSize = 32 * 1024 // data.len(2) + hmacsha1(10) + data(4096)
+	maxNBuf            = 8192
+)
+
+var GlobalLeakyBuf = NewLeakyBuf(maxNBuf, GlobalLeakyBufSize)
diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE
new file mode 100644
index 0000000..6a66aea
--- /dev/null
+++ b/vendor/golang.org/x/crypto/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/golang.org/x/crypto/PATENTS b/vendor/golang.org/x/crypto/PATENTS
new file mode 100644
index 0000000..7330990
--- /dev/null
+++ b/vendor/golang.org/x/crypto/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Go project.
+
+Google hereby grants to You a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this section)
+patent license to make, have made, use, offer to sell, sell, import,
+transfer and otherwise run, modify and propagate the contents of this
+implementation of Go, where such license applies only to those patent
+claims, both currently owned or controlled by Google and acquired in
+the future, licensable by Google that are necessarily infringed by this
+implementation of Go. 
This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/vendor/golang.org/x/crypto/blowfish/block.go b/vendor/golang.org/x/crypto/blowfish/block.go new file mode 100644 index 0000000..9d80f19 --- /dev/null +++ b/vendor/golang.org/x/crypto/blowfish/block.go @@ -0,0 +1,159 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blowfish + +// getNextWord returns the next big-endian uint32 value from the byte slice +// at the given position in a circular manner, updating the position. +func getNextWord(b []byte, pos *int) uint32 { + var w uint32 + j := *pos + for i := 0; i < 4; i++ { + w = w<<8 | uint32(b[j]) + j++ + if j >= len(b) { + j = 0 + } + } + *pos = j + return w +} + +// ExpandKey performs a key expansion on the given *Cipher. Specifically, it +// performs the Blowfish algorithm's key schedule which sets up the *Cipher's +// pi and substitution tables for calls to Encrypt. This is used, primarily, +// by the bcrypt package to reuse the Blowfish key schedule during its +// set up. It's unlikely that you need to use this directly. +func ExpandKey(key []byte, c *Cipher) { + j := 0 + for i := 0; i < 18; i++ { + // Using inlined getNextWord for performance. + var d uint32 + for k := 0; k < 4; k++ { + d = d<<8 | uint32(key[j]) + j++ + if j >= len(key) { + j = 0 + } + } + c.p[i] ^= d + } + + var l, r uint32 + for i := 0; i < 18; i += 2 { + l, r = encryptBlock(l, r, c) + c.p[i], c.p[i+1] = l, r + } + + for i := 0; i < 256; i += 2 { + l, r = encryptBlock(l, r, c) + c.s0[i], c.s0[i+1] = l, r + } + for i := 0; i < 256; i += 2 { + l, r = encryptBlock(l, r, c) + c.s1[i], c.s1[i+1] = l, r + } + for i := 0; i < 256; i += 2 { + l, r = encryptBlock(l, r, c) + c.s2[i], c.s2[i+1] = l, r + } + for i := 0; i < 256; i += 2 { + l, r = encryptBlock(l, r, c) + c.s3[i], c.s3[i+1] = l, r + } +} + +// This is similar to ExpandKey, but folds the salt during the key +// schedule. While ExpandKey is essentially expandKeyWithSalt with an all-zero +// salt passed in, reusing ExpandKey turns out to be a place of inefficiency +// and specializing it here is useful. 
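+// The salt is consumed circularly through getNextWord, so a salt shorter
+// than the schedule (e.g. bcrypt's 16 bytes) simply wraps around while it
+// is XORed into l and r below.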
+func expandKeyWithSalt(key []byte, salt []byte, c *Cipher) { + j := 0 + for i := 0; i < 18; i++ { + c.p[i] ^= getNextWord(key, &j) + } + + j = 0 + var l, r uint32 + for i := 0; i < 18; i += 2 { + l ^= getNextWord(salt, &j) + r ^= getNextWord(salt, &j) + l, r = encryptBlock(l, r, c) + c.p[i], c.p[i+1] = l, r + } + + for i := 0; i < 256; i += 2 { + l ^= getNextWord(salt, &j) + r ^= getNextWord(salt, &j) + l, r = encryptBlock(l, r, c) + c.s0[i], c.s0[i+1] = l, r + } + + for i := 0; i < 256; i += 2 { + l ^= getNextWord(salt, &j) + r ^= getNextWord(salt, &j) + l, r = encryptBlock(l, r, c) + c.s1[i], c.s1[i+1] = l, r + } + + for i := 0; i < 256; i += 2 { + l ^= getNextWord(salt, &j) + r ^= getNextWord(salt, &j) + l, r = encryptBlock(l, r, c) + c.s2[i], c.s2[i+1] = l, r + } + + for i := 0; i < 256; i += 2 { + l ^= getNextWord(salt, &j) + r ^= getNextWord(salt, &j) + l, r = encryptBlock(l, r, c) + c.s3[i], c.s3[i+1] = l, r + } +} + +func encryptBlock(l, r uint32, c *Cipher) (uint32, uint32) { + xl, xr := l, r + xl ^= c.p[0] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[1] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[2] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[3] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[4] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[5] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[6] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[7] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[8] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[9] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[10] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[11] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[12] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[13] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[14] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[15] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[16] + xr ^= c.p[17] + return xr, xl +} + +func decryptBlock(l, r uint32, c *Cipher) (uint32, uint32) { + xl, xr := l, r + xl ^= c.p[17] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[16] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[15] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[14] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[13] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[12] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[11] + xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[10] + xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[9] + xr ^= 
((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[8]
+	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[7]
+	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[6]
+	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[5]
+	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[4]
+	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[3]
+	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[2]
+	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[1]
+	xr ^= c.p[0]
+	return xr, xl
+}
diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go
new file mode 100644
index 0000000..2641dad
--- /dev/null
+++ b/vendor/golang.org/x/crypto/blowfish/cipher.go
@@ -0,0 +1,91 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blowfish implements Bruce Schneier's Blowfish encryption algorithm.
+package blowfish // import "golang.org/x/crypto/blowfish"
+
+// The code is a port of Bruce Schneier's C implementation.
+// See https://www.schneier.com/blowfish.html.
+
+import "strconv"
+
+// The Blowfish block size in bytes.
+const BlockSize = 8
+
+// A Cipher is an instance of Blowfish encryption using a particular key.
+type Cipher struct {
+	p              [18]uint32
+	s0, s1, s2, s3 [256]uint32
+}
+
+type KeySizeError int
+
+func (k KeySizeError) Error() string {
+	return "crypto/blowfish: invalid key size " + strconv.Itoa(int(k))
+}
+
+// NewCipher creates and returns a Cipher.
+// The key argument should be the Blowfish key, from 1 to 56 bytes.
+func NewCipher(key []byte) (*Cipher, error) {
+	var result Cipher
+	if k := len(key); k < 1 || k > 56 {
+		return nil, KeySizeError(k)
+	}
+	initCipher(&result)
+	ExpandKey(key, &result)
+	return &result, nil
+}
+
+// NewSaltedCipher creates and returns a Cipher that folds a salt into its key
+// schedule. For most purposes, NewCipher, instead of NewSaltedCipher, is
+// sufficient and desirable. For bcrypt compatibility, the key can be over 56
+// bytes.
+func NewSaltedCipher(key, salt []byte) (*Cipher, error) {
+	if len(salt) == 0 {
+		return NewCipher(key)
+	}
+	var result Cipher
+	if k := len(key); k < 1 {
+		return nil, KeySizeError(k)
+	}
+	initCipher(&result)
+	expandKeyWithSalt(key, salt, &result)
+	return &result, nil
+}
+
+// BlockSize returns the Blowfish block size, 8 bytes.
+// It is necessary to satisfy the Block interface in the
+// package "crypto/cipher".
+func (c *Cipher) BlockSize() int { return BlockSize }
+
+// Encrypt encrypts the 8-byte buffer src using the key k
+// and stores the result in dst.
+// Note that for amounts of data larger than a block,
+// it is not safe to just call Encrypt on successive blocks;
+// instead, use an encryption mode like CBC (see crypto/cipher/cbc.go).
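+//
+// A minimal single-block sketch (illustrative only; key and src are
+// assumed caller-supplied byte slices, error handling elided):
+//
+//	c, _ := NewCipher(key)
+//	var dst [BlockSize]byte
+//	c.Encrypt(dst[:], src[:BlockSize])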
+func (c *Cipher) Encrypt(dst, src []byte) { + l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) + r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7]) + l, r = encryptBlock(l, r, c) + dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l) + dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r) +} + +// Decrypt decrypts the 8-byte buffer src using the key k +// and stores the result in dst. +func (c *Cipher) Decrypt(dst, src []byte) { + l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) + r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7]) + l, r = decryptBlock(l, r, c) + dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l) + dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r) +} + +func initCipher(c *Cipher) { + copy(c.p[0:], p[0:]) + copy(c.s0[0:], s0[0:]) + copy(c.s1[0:], s1[0:]) + copy(c.s2[0:], s2[0:]) + copy(c.s3[0:], s3[0:]) +} diff --git a/vendor/golang.org/x/crypto/blowfish/const.go b/vendor/golang.org/x/crypto/blowfish/const.go new file mode 100644 index 0000000..d040775 --- /dev/null +++ b/vendor/golang.org/x/crypto/blowfish/const.go @@ -0,0 +1,199 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The startup permutation array and substitution boxes. +// They are the hexadecimal digits of PI; see: +// https://www.schneier.com/code/constants.txt. + +package blowfish + +var s0 = [256]uint32{ + 0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96, + 0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16, + 0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658, + 0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013, + 0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e, + 0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60, + 0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6, + 0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a, + 0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c, + 0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193, + 0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1, + 0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239, + 0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a, + 0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3, + 0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176, + 0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe, + 0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706, + 0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b, + 0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b, + 0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463, + 0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c, + 0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3, + 0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a, + 0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8, + 0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760, + 0x53317b48, 0x3e00df82, 0x9e5c57bb, 
0xca6f8ca0, 0x1a87562e, 0xdf1769db, + 0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8, + 0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b, + 0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33, + 0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4, + 0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0, + 0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c, + 0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777, + 0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299, + 0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705, + 0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf, + 0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e, + 0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa, + 0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9, + 0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915, + 0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f, + 0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664, + 0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a, +} + +var s1 = [256]uint32{ + 0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d, + 0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1, + 0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65, + 0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1, + 0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9, + 0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737, + 0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d, + 0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd, + 0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc, + 0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41, + 0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908, + 0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af, + 0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124, + 0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c, + 0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908, + 0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd, + 0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b, + 0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e, + 0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa, + 0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a, + 0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d, + 0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66, + 0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5, + 0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84, + 0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96, + 0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14, + 0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca, + 0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7, + 0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77, + 0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99, + 0x57f584a5, 0x1b227263, 
0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054, + 0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73, + 0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea, + 0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105, + 0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646, + 0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285, + 0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea, + 0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb, + 0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e, + 0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc, + 0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd, + 0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20, + 0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7, +} + +var s2 = [256]uint32{ + 0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7, + 0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af, + 0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af, + 0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504, + 0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4, + 0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee, + 0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec, + 0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b, + 0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332, + 0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527, + 0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58, + 0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c, + 0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22, + 0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17, + 0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60, + 0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115, + 0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99, + 0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0, + 0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74, + 0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d, + 0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3, + 0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3, + 0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979, + 0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c, + 0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa, + 0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a, + 0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086, + 0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc, + 0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24, + 0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2, + 0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84, + 0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c, + 0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09, + 0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10, + 0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe, + 0xa1e2ce9b, 
0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027, + 0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0, + 0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634, + 0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188, + 0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc, + 0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8, + 0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837, + 0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0, +} + +var s3 = [256]uint32{ + 0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742, + 0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b, + 0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79, + 0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6, + 0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a, + 0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4, + 0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1, + 0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59, + 0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797, + 0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28, + 0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6, + 0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28, + 0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba, + 0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a, + 0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5, + 0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f, + 0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce, + 0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680, + 0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd, + 0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb, + 0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb, + 0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370, + 0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc, + 0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048, + 0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc, + 0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9, + 0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a, + 0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f, + 0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a, + 0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1, + 0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b, + 0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e, + 0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e, + 0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f, + 0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623, + 0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc, + 0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a, + 0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6, + 0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3, + 0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060, + 
0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c, + 0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f, + 0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6, +} + +var p = [18]uint32{ + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0, + 0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, + 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b, +} diff --git a/vendor/golang.org/x/crypto/cast5/cast5.go b/vendor/golang.org/x/crypto/cast5/cast5.go new file mode 100644 index 0000000..0b4af37 --- /dev/null +++ b/vendor/golang.org/x/crypto/cast5/cast5.go @@ -0,0 +1,526 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cast5 implements CAST5, as defined in RFC 2144. CAST5 is a common +// OpenPGP cipher. +package cast5 // import "golang.org/x/crypto/cast5" + +import "errors" + +const BlockSize = 8 +const KeySize = 16 + +type Cipher struct { + masking [16]uint32 + rotate [16]uint8 +} + +func NewCipher(key []byte) (c *Cipher, err error) { + if len(key) != KeySize { + return nil, errors.New("CAST5: keys must be 16 bytes") + } + + c = new(Cipher) + c.keySchedule(key) + return +} + +func (c *Cipher) BlockSize() int { + return BlockSize +} + +func (c *Cipher) Encrypt(dst, src []byte) { + l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) + r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7]) + + l, r = r, l^f1(r, c.masking[0], c.rotate[0]) + l, r = r, l^f2(r, c.masking[1], c.rotate[1]) + l, r = r, l^f3(r, c.masking[2], c.rotate[2]) + l, r = r, l^f1(r, c.masking[3], c.rotate[3]) + + l, r = r, l^f2(r, c.masking[4], c.rotate[4]) + l, r = r, l^f3(r, c.masking[5], c.rotate[5]) + l, r = r, l^f1(r, c.masking[6], c.rotate[6]) + l, r = r, l^f2(r, c.masking[7], c.rotate[7]) + + l, r = r, l^f3(r, c.masking[8], c.rotate[8]) + l, r = r, l^f1(r, c.masking[9], c.rotate[9]) + l, r = r, l^f2(r, c.masking[10], c.rotate[10]) + l, r = r, l^f3(r, c.masking[11], c.rotate[11]) + + l, r = r, l^f1(r, c.masking[12], c.rotate[12]) + l, r = r, l^f2(r, c.masking[13], c.rotate[13]) + l, r = r, l^f3(r, c.masking[14], c.rotate[14]) + l, r = r, l^f1(r, c.masking[15], c.rotate[15]) + + dst[0] = uint8(r >> 24) + dst[1] = uint8(r >> 16) + dst[2] = uint8(r >> 8) + dst[3] = uint8(r) + dst[4] = uint8(l >> 24) + dst[5] = uint8(l >> 16) + dst[6] = uint8(l >> 8) + dst[7] = uint8(l) +} + +func (c *Cipher) Decrypt(dst, src []byte) { + l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) + r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7]) + + l, r = r, l^f1(r, c.masking[15], c.rotate[15]) + l, r = r, l^f3(r, c.masking[14], c.rotate[14]) + l, r = r, l^f2(r, c.masking[13], c.rotate[13]) + l, r = r, l^f1(r, c.masking[12], c.rotate[12]) + + l, r = r, l^f3(r, c.masking[11], c.rotate[11]) + l, r = r, l^f2(r, c.masking[10], c.rotate[10]) + l, r = r, l^f1(r, c.masking[9], c.rotate[9]) + l, r = r, l^f3(r, c.masking[8], c.rotate[8]) + + l, r = r, l^f2(r, c.masking[7], c.rotate[7]) + l, r = r, l^f1(r, c.masking[6], c.rotate[6]) + l, r = r, l^f3(r, c.masking[5], c.rotate[5]) + l, r = r, l^f2(r, c.masking[4], c.rotate[4]) + + l, r = r, l^f1(r, c.masking[3], c.rotate[3]) + l, r = r, l^f3(r, c.masking[2], c.rotate[2]) + l, r = r, l^f2(r, c.masking[1], c.rotate[1]) + l, r = r, l^f1(r, c.masking[0], c.rotate[0]) + + dst[0] 
= uint8(r >> 24) + dst[1] = uint8(r >> 16) + dst[2] = uint8(r >> 8) + dst[3] = uint8(r) + dst[4] = uint8(l >> 24) + dst[5] = uint8(l >> 16) + dst[6] = uint8(l >> 8) + dst[7] = uint8(l) +} + +type keyScheduleA [4][7]uint8 +type keyScheduleB [4][5]uint8 + +// keyScheduleRound contains the magic values for a round of the key schedule. +// The keyScheduleA deals with the lines like: +// z0z1z2z3 = x0x1x2x3 ^ S5[xD] ^ S6[xF] ^ S7[xC] ^ S8[xE] ^ S7[x8] +// Conceptually, both x and z are in the same array, x first. The first +// element describes which word of this array gets written to and the +// second, which word gets read. So, for the line above, it's "4, 0", because +// it's writing to the first word of z, which, being after x, is word 4, and +// reading from the first word of x: word 0. +// +// Next are the indexes into the S-boxes. Now the array is treated as bytes. So +// "xD" is 0xd. The first byte of z is written as "16 + 0", just to be clear +// that it's z that we're indexing. +// +// keyScheduleB deals with lines like: +// K1 = S5[z8] ^ S6[z9] ^ S7[z7] ^ S8[z6] ^ S5[z2] +// "K1" is ignored because key words are always written in order. So the five +// elements are the S-box indexes. They use the same form as in keyScheduleA, +// above. + +type keyScheduleRound struct{} +type keySchedule []keyScheduleRound + +var schedule = []struct { + a keyScheduleA + b keyScheduleB +}{ + { + keyScheduleA{ + {4, 0, 0xd, 0xf, 0xc, 0xe, 0x8}, + {5, 2, 16 + 0, 16 + 2, 16 + 1, 16 + 3, 0xa}, + {6, 3, 16 + 7, 16 + 6, 16 + 5, 16 + 4, 9}, + {7, 1, 16 + 0xa, 16 + 9, 16 + 0xb, 16 + 8, 0xb}, + }, + keyScheduleB{ + {16 + 8, 16 + 9, 16 + 7, 16 + 6, 16 + 2}, + {16 + 0xa, 16 + 0xb, 16 + 5, 16 + 4, 16 + 6}, + {16 + 0xc, 16 + 0xd, 16 + 3, 16 + 2, 16 + 9}, + {16 + 0xe, 16 + 0xf, 16 + 1, 16 + 0, 16 + 0xc}, + }, + }, + { + keyScheduleA{ + {0, 6, 16 + 5, 16 + 7, 16 + 4, 16 + 6, 16 + 0}, + {1, 4, 0, 2, 1, 3, 16 + 2}, + {2, 5, 7, 6, 5, 4, 16 + 1}, + {3, 7, 0xa, 9, 0xb, 8, 16 + 3}, + }, + keyScheduleB{ + {3, 2, 0xc, 0xd, 8}, + {1, 0, 0xe, 0xf, 0xd}, + {7, 6, 8, 9, 3}, + {5, 4, 0xa, 0xb, 7}, + }, + }, + { + keyScheduleA{ + {4, 0, 0xd, 0xf, 0xc, 0xe, 8}, + {5, 2, 16 + 0, 16 + 2, 16 + 1, 16 + 3, 0xa}, + {6, 3, 16 + 7, 16 + 6, 16 + 5, 16 + 4, 9}, + {7, 1, 16 + 0xa, 16 + 9, 16 + 0xb, 16 + 8, 0xb}, + }, + keyScheduleB{ + {16 + 3, 16 + 2, 16 + 0xc, 16 + 0xd, 16 + 9}, + {16 + 1, 16 + 0, 16 + 0xe, 16 + 0xf, 16 + 0xc}, + {16 + 7, 16 + 6, 16 + 8, 16 + 9, 16 + 2}, + {16 + 5, 16 + 4, 16 + 0xa, 16 + 0xb, 16 + 6}, + }, + }, + { + keyScheduleA{ + {0, 6, 16 + 5, 16 + 7, 16 + 4, 16 + 6, 16 + 0}, + {1, 4, 0, 2, 1, 3, 16 + 2}, + {2, 5, 7, 6, 5, 4, 16 + 1}, + {3, 7, 0xa, 9, 0xb, 8, 16 + 3}, + }, + keyScheduleB{ + {8, 9, 7, 6, 3}, + {0xa, 0xb, 5, 4, 7}, + {0xc, 0xd, 3, 2, 8}, + {0xe, 0xf, 1, 0, 0xd}, + }, + }, +} + +func (c *Cipher) keySchedule(in []byte) { + var t [8]uint32 + var k [32]uint32 + + for i := 0; i < 4; i++ { + j := i * 4 + t[i] = uint32(in[j])<<24 | uint32(in[j+1])<<16 | uint32(in[j+2])<<8 | uint32(in[j+3]) + } + + x := []byte{6, 7, 4, 5} + ki := 0 + + for half := 0; half < 2; half++ { + for _, round := range schedule { + for j := 0; j < 4; j++ { + var a [7]uint8 + copy(a[:], round.a[j][:]) + w := t[a[1]] + w ^= sBox[4][(t[a[2]>>2]>>(24-8*(a[2]&3)))&0xff] + w ^= sBox[5][(t[a[3]>>2]>>(24-8*(a[3]&3)))&0xff] + w ^= sBox[6][(t[a[4]>>2]>>(24-8*(a[4]&3)))&0xff] + w ^= sBox[7][(t[a[5]>>2]>>(24-8*(a[5]&3)))&0xff] + w ^= sBox[x[j]][(t[a[6]>>2]>>(24-8*(a[6]&3)))&0xff] + t[a[0]] = w + } + + for j := 0; j < 4; j++ { + var b [5]uint8 + 
copy(b[:], round.b[j][:]) + w := sBox[4][(t[b[0]>>2]>>(24-8*(b[0]&3)))&0xff] + w ^= sBox[5][(t[b[1]>>2]>>(24-8*(b[1]&3)))&0xff] + w ^= sBox[6][(t[b[2]>>2]>>(24-8*(b[2]&3)))&0xff] + w ^= sBox[7][(t[b[3]>>2]>>(24-8*(b[3]&3)))&0xff] + w ^= sBox[4+j][(t[b[4]>>2]>>(24-8*(b[4]&3)))&0xff] + k[ki] = w + ki++ + } + } + } + + for i := 0; i < 16; i++ { + c.masking[i] = k[i] + c.rotate[i] = uint8(k[16+i] & 0x1f) + } +} + +// These are the three 'f' functions. See RFC 2144, section 2.2. +func f1(d, m uint32, r uint8) uint32 { + t := m + d + I := (t << r) | (t >> (32 - r)) + return ((sBox[0][I>>24] ^ sBox[1][(I>>16)&0xff]) - sBox[2][(I>>8)&0xff]) + sBox[3][I&0xff] +} + +func f2(d, m uint32, r uint8) uint32 { + t := m ^ d + I := (t << r) | (t >> (32 - r)) + return ((sBox[0][I>>24] - sBox[1][(I>>16)&0xff]) + sBox[2][(I>>8)&0xff]) ^ sBox[3][I&0xff] +} + +func f3(d, m uint32, r uint8) uint32 { + t := m - d + I := (t << r) | (t >> (32 - r)) + return ((sBox[0][I>>24] + sBox[1][(I>>16)&0xff]) ^ sBox[2][(I>>8)&0xff]) - sBox[3][I&0xff] +} + +var sBox = [8][256]uint32{ + { + 0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949, + 0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e, + 0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d, + 0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1, 0xaa54166b, 0x22568e3a, 0xa2d341d0, + 0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac, 0x4a97c1d8, 0x527644b7, 0xb5f437a7, + 0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0, 0x90ecf52e, 0x22b0c054, 0xbc8e5935, + 0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290, 0xe93b159f, 0xb48ee411, 0x4bff345d, + 0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad, 0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50, + 0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f, 0xc59c5319, 0xb949e354, 0xb04669fe, + 0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5, 0x6a390493, 0xe63d37e0, 0x2a54f6b3, + 0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5, 0xf61b1891, 0xbb72275e, 0xaa508167, + 0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427, 0xa2d1936b, 0x2ad286af, 0xaa56d291, + 0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d, 0x73e2bb14, 0xa0bebc3c, 0x54623779, + 0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e, 0x89fe78e6, 0x3fab0950, 0x325ff6c2, + 0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf, 0x380782d5, 0xc7fa5cf6, 0x8ac31511, + 0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241, 0x051ef495, 0xaa573b04, 0x4a805d8d, + 0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b, 0x50afd341, 0xa7c13275, 0x915a0bf5, + 0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265, 0xab85c5f3, 0x1b55db94, 0xaad4e324, + 0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3, 0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c, + 0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6, 0x22513f1e, 0xaa51a79b, 0x2ad344cc, + 0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6, 0x032268d4, 0xc9600acc, 0xce387e6d, + 0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da, 0x4736f464, 0x5ad328d8, 0xb347cc96, + 0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc, 0xbfc5fe4a, 0xa70aec10, 0xac39570a, + 0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f, 0x1cacd68d, 0x2ad37c96, 0x0175cb9d, + 0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4, 0xb11c3274, 0xdd24cb9e, 0x7e1c54bd, + 0xf01144f9, 0xd2240eb1, 0x9675b3fd, 
0xa3ac3755, 0xd47c27af, 0x51c85f4d, 0x56907596, 0xa5bb15e6, + 0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a, 0x3526ffa0, 0xc37b4d09, 0xbc306ed9, + 0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf, 0x700b45e1, 0xd5ea50f1, 0x85a92872, + 0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198, 0x0cd0ede7, 0x26470db8, 0xf881814c, + 0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e, + 0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9, + 0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf, + }, + { + 0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651, + 0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3, + 0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb, + 0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb, 0xe4e7ef5b, 0x25a1ff41, 0xe180f806, + 0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f, 0x77e83f4e, 0x79929269, 0x24fa9f7b, + 0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154, 0x0d554b63, 0x5d681121, 0xc866c359, + 0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181, 0x39f7627f, 0x361e3084, 0xe4eb573b, + 0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c, 0x99847ab4, 0xa0e3df79, 0xba6cf38c, + 0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a, 0x8f458c74, 0xd9e0a227, 0x4ec73a34, + 0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c, 0x1d804366, 0x721d9bfd, 0xa58684bb, + 0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1, 0x27e19ba5, 0xd5a6c252, 0xe49754bd, + 0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9, 0xe0b56714, 0x21f043b7, 0xe5d05860, + 0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf, 0x68561be6, 0x83ca6b94, 0x2d6ed23b, + 0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c, 0x397bc8d6, 0x5ee22b95, 0x5f0e5304, + 0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122, 0xb96726d1, 0x8049a7e8, 0x22b7da7b, + 0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402, 0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf, + 0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53, 0xe3214517, 0xb4542835, 0x9f63293c, + 0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6, 0x30a22c95, 0x31a70850, 0x60930f13, + 0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6, 0xa02b1741, 0x7cbad9a2, 0x2180036f, + 0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676, 0x25a75e7b, 0xe4e6d1fc, 0x20c710e6, + 0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb, 0x846a3bae, 0x8ff77888, 0xee5d60f6, + 0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54, 0x157fd7fa, 0xef8579cc, 0xd152de58, + 0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5, 0xc242fa0f, 0xa7e3ebb0, 0xc68e4906, + 0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8, 0xbec0c560, 0x61a3c9e8, 0xbca8f54d, + 0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc, 0x301e16e6, 0x273be979, 0xb0ffeaa6, + 0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a, 0xf7e19798, 0x7619b72f, 0x8f1c9ba4, + 0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e, 0x1a513742, 0xef6828bc, 0x520365d6, + 0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb, 0x5eea29cb, 0x145892f5, 0x91584f7f, + 0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4, 0x0d23e0f9, 0x6c387e8a, 0x0ae6d249, + 0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 
0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa, + 0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9, + 0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1, + }, + { + 0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90, + 0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5, + 0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e, + 0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee, 0x825b1bfd, 0x9255c5ed, 0x1257a240, + 0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf, 0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5, + 0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1, 0x1fb78dfc, 0x8e6bd2c1, 0x437be59b, + 0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c, 0x4a012d6e, 0xc5884a28, 0xccc36f71, + 0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850, 0xd7c07f7e, 0x02507fbf, 0x5afb9a04, + 0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e, 0x727cc3c4, 0x0a0fb402, 0x0f7fef82, + 0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0, 0x1eac5790, 0x796fb449, 0x8252dc15, + 0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403, 0xe83ec305, 0x4f91751a, 0x925669c2, + 0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574, 0x927985b2, 0x8276dbcb, 0x02778176, + 0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83, 0x340ce5c8, 0x96bbb682, 0x93b4b148, + 0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20, 0x8437aa88, 0x7d29dc96, 0x2756d3dc, + 0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e, 0x3cf8209d, 0x6094d1e3, 0xcd9ca341, + 0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9, 0xbda8229c, 0x127dadaa, 0x438a074e, + 0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff, 0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51, + 0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a, 0x76a2e214, 0xb9a40368, 0x925d958f, + 0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623, 0x193cbcfa, 0x27627545, 0x825cf47a, + 0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7, 0x8272a972, 0x9270c4a8, 0x127de50b, + 0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb, 0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b, + 0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11, 0x236a5cae, 0x12deca4d, 0x2c3f8cc5, + 0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c, 0xb9b6a80c, 0x5c8f82bc, 0x89d36b45, + 0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40, 0x7c34671c, 0x02717ef6, 0x4feb5536, + 0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1, 0x006e1888, 0xa2e53f55, 0xb9e6d4bc, + 0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33, 0xabcc4f33, 0x7688c55d, 0x7b00a6b0, + 0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff, 0x856302e0, 0x72dbd92b, 0xee971b69, + 0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2, 0x61efc8c2, 0xf1ac2571, 0xcc8239c2, + 0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38, 0x0ff0443d, 0x606e6dc6, 0x60543a49, + 0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d, + 0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a, + 0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783, + }, + { + 0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1, + 0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 
0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf, + 0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15, + 0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f, 0x0c13fefe, 0x081b08ca, 0x05170121, + 0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f, 0x06df4261, 0xbb9e9b8a, 0x7293ea25, + 0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400, 0x547eebe6, 0x446d4ca0, 0x6cf3d6f5, + 0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061, 0x11b638e1, 0x72500e03, 0xf80eb2bb, + 0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400, 0x6920318f, 0x081dbb99, 0xffc304a5, + 0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea, 0x9f926f91, 0x9f46222f, 0x3991467d, + 0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8, 0x3fb6180c, 0x18f8931e, 0x281658e6, + 0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25, 0x79098b02, 0xe4eabb81, 0x28123b23, + 0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9, 0x0014377b, 0x041e8ac8, 0x09114003, + 0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de, 0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6, + 0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0, 0x56c8c391, 0x6b65811c, 0x5e146119, + 0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d, 0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24, + 0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a, 0xeca1d7c7, 0x041afa32, 0x1d16625a, + 0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb, 0xc70b8b46, 0xd9e66a48, 0x56e55a79, + 0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3, 0xedda04eb, 0x17a9be04, 0x2c18f4df, + 0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254, 0xe5b6a035, 0x213d42f6, 0x2c1c7c26, + 0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2, 0x0418f2c8, 0x001a96a6, 0x0d1526ab, + 0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86, 0x311170a7, 0x3e9b640c, 0xcc3e10d7, + 0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1, 0x1f9af36e, 0xcfcbd12f, 0xc1de8417, + 0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca, 0xb4be31cd, 0xd8782806, 0x12a3a4e2, + 0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5, 0x9711aac5, 0x001d7b95, 0x82e5e7d2, + 0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415, 0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a, + 0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7, 0x0ce454a9, 0xd60acd86, 0x015f1919, + 0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe, 0x8b75e387, 0xb3c50651, 0xb8a5c3ef, + 0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb, 0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876, + 0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8, 0x296b299e, 0x492fc295, 0x9266beab, + 0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04, + 0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282, + 0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2, + }, + { + 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f, + 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a, + 0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, 0x15b0a848, 0xe68b18cb, 0x4caadeff, + 0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, 0x248eb6fb, 0x8dba1cfe, 0x41a99b02, + 0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, 0x97a5980a, 0xc539b9aa, 0x4d79fe6a, + 0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, 
0x8709e6b0, 0xd7e07156, 0x4e29fea7, + 0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, 0x578535f2, 0x2261be02, 0xd642a0c9, + 0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, 0xc8adedb3, 0x28a87fc9, 0x3d959981, + 0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, 0x4fb96976, 0x90c79505, 0xb0a8a774, + 0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, 0x0ec50966, 0xdfdd55bc, 0x29de0655, + 0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, 0x524755f4, 0x03b63cc9, 0x0cc844b2, + 0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, 0x64ee2d7e, 0xcddbb1da, 0x01c94910, + 0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, 0x50f5b616, 0xf24766e3, 0x8eca36c1, + 0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, 0x3063fcdf, 0xb6f589de, 0xec2941da, + 0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, 0xc1bacb7f, 0xe5ff550f, 0xb6083049, + 0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, 0x9e0885f9, 0x68cb3e47, 0x086c010f, + 0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, 0xcbb3d550, 0x1793084d, 0xb0d70eba, + 0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, 0x0f5755d1, 0xe0e1e56e, 0x6184b5be, + 0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, 0x05687715, 0x646c6bd7, 0x44904db3, + 0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, 0x2cb6356a, 0x85808573, 0x4991f840, + 0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, 0xc1092910, 0x8bc95fc6, 0x7d869cf4, + 0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, 0x7d161bba, 0x9cad9010, 0xaf462ba2, + 0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, 0x176d486f, 0x097c13ea, 0x631da5c7, + 0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, 0x6e5dd2f3, 0x20936079, 0x459b80a5, + 0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, 0xf6721b2c, 0x1ad2fff3, 0x8c25404e, + 0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, 0x75922283, 0x784d6b17, 0x58ebb16e, + 0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, 0xaaf47556, 0x5f46b02a, 0x2b092801, + 0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, 0x95055110, 0x1b5ad7a8, 0xf61ed5ad, + 0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0, + 0x5ce96c28, 0xe176eda3, 0x6bac307f, 0x376829d2, 0x85360fa9, 0x17e3fe2a, 0x24b79767, 0xf5a96b20, + 0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, 0xeeb9491d, 0x34010718, 0xbb30cab8, + 0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, 0xb1534546, 0x6d47de08, 0xefe9e7d4, + }, + { + 0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, 0x016843b4, 0xeced5cbc, 0x325553ac, + 0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, 0xde5ebe39, 0xf38ff732, 0x8989b138, + 0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, 0x4e23e33c, 0x79cbd7cc, 0x48a14367, + 0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, 0x09a8486f, 0xa888614a, 0x2900af98, + 0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, 0xcf0fec14, 0xf7ca07d2, 0xd0a82072, + 0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, 0x4c7f4448, 0xdab5d440, 0x6dba0ec3, + 0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, 0x64bdb941, 0x2c0e636a, 0xba7dd9cd, + 0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, 0xb88153e2, 0x08a19866, 0x1ae2eac8, + 0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, 0x9aea3906, 0xefe8c36e, 0xf890cdd9, + 0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, 0x221db3a6, 
0x9a69a02f, 0x68818a54, + 0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, 0xcf222ebf, 0x25ac6f48, 0xa9a99387, + 0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, 0xe8a11be9, 0x4980740d, 0xc8087dfc, + 0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, 0x9528cd89, 0xfd339fed, 0xb87834bf, + 0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, 0x57f55ec5, 0xe2220abe, 0xd2916ebf, + 0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, 0xa8dc8af0, 0x7345c106, 0xf41e232f, + 0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, 0x692573e4, 0xe9a9d848, 0xf3160289, + 0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, 0x4576698d, 0xb6fad407, 0x592af950, + 0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, 0xc50dfe5d, 0xfcd707ab, 0x0921c42f, + 0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, 0x48b9d585, 0xdc049441, 0xc8098f9b, + 0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, 0x890072d6, 0x28207682, 0xa9a9f7be, + 0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, 0x1f8fb214, 0xd372cf08, 0xcc3c4a13, + 0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, 0xb6c85283, 0x3cc2acfb, 0x3fc06976, + 0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, 0x513021a5, 0x6c5b68b7, 0x822f8aa0, + 0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, 0x0c5ec241, 0x8809286c, 0xf592d891, + 0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, 0xb173ecc0, 0xbc60b42a, 0x953498da, + 0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, 0x257f0c3d, 0x9348af49, 0x361400bc, + 0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, 0xda41e7f9, 0xc25ad33a, 0x54f4a084, + 0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, 0xb6f6deaf, 0x3a479c3a, 0x5302da25, + 0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, 0x44136c76, 0x0404a8c8, 0xb8e5a121, + 0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5, + 0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, 0xf544edeb, 0xb0e93524, 0xbebb8fbd, + 0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, 0xa65b1db8, 0x851c97bd, 0xd675cf2f, + }, + { + 0x85e04019, 0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, 0xab9bc912, 0xde6008a1, 0x2028da1f, + 0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, 0xb232e75c, 0x4b3695f2, 0xb28707de, + 0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43, + 0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, 0xbaeeadf4, 0x1286becf, 0xb6eacb19, + 0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, 0x28136086, 0x0bd8dfa8, 0x356d1cf2, + 0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, 0xeb12ff82, 0xe3486911, 0xd34d7516, + 0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88, + 0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, 0x4437f107, 0xb6e79962, 0x42d2d816, + 0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, 0xf9583745, 0xcf19df58, 0xbec3f756, + 0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, 0x38bc46e9, 0xc6e6fa14, 0xbae8584a, + 0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, 0xaff60ff4, 0xea2c4e6d, 0x16e39264, + 0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, 0xb2856e6e, 0x1aec3ca9, 0xbe838688, + 0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, 0x61fe033c, 0x16746233, 0x3c034c28, + 0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, 0x1626a49f, 0xeed82b29, 
0x1d382fe3, + 0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, 0xd45230c7, 0x2bd1408b, 0x60c03eb7, + 0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06, + 0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033, + 0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, 0x79d34217, 0x021a718d, 0x9ac6336a, + 0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, 0x4eeb8476, 0x488dcf25, 0x36c9d566, + 0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509, + 0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, 0x2b9f4fd5, 0x625aba82, 0x6a017962, + 0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, 0xe32dbf9a, 0x058745b9, 0x3453dc1e, + 0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, 0x19de7eae, 0x053e561a, 0x15ad6f8c, + 0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, 0x58d4f2ae, 0x9ea294fb, 0x52cf564c, + 0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, 0xa1e7160e, 0xe4f2dfa6, 0x693ed285, + 0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, 0x3d321c5d, 0xc3f5e194, 0x4b269301, + 0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, 0x296693f4, 0x3d1fce6f, 0xc61e45be, + 0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, 0xb5229301, 0xcfd2a87f, 0x60aeb767, + 0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, 0x589dd390, 0x5479f8e6, 0x1cb8d647, + 0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, 0x462e1b78, 0x6580f87e, 0xf3817914, + 0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, 0x3d40f021, 0xc3c0bdae, 0x4958c24c, + 0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, 0x94e01be8, 0x90716f4b, 0x954b8aa3, + }, + { + 0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, 0xe6c1121b, 0x0e241600, 0x052ce8b5, + 0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, 0x76e38111, 0xb12def3a, 0x37ddddfc, + 0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, 0xb4d137cf, 0xb44e79f0, 0x049eedfd, + 0x0b15a15d, 0x480d3168, 0x8bbbde5a, 0x669ded42, 0xc7ece831, 0x3f8f95e7, 0x72df191b, 0x7580330d, + 0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, 0x02e7d1ca, 0x53571dae, 0x7a3182a2, + 0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, 0xce949ad4, 0xb84769ad, 0x965bd862, + 0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, 0xc28ec4b8, 0x57e8726e, 0x647a78fc, + 0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, 0xae63aff2, 0x7e8bd632, 0x70108c0c, + 0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, 0x06918548, 0x58cb7e07, 0x3b74ef2e, + 0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, 0x19b47a38, 0x424f7618, 0x35856039, + 0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, 0xc18910b1, 0xe11dbf7b, 0x06cd1af8, + 0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, 0x3dd00db3, 0x708f8f34, 0x77d51b42, + 0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, 0x3e378160, 0x7895cda5, 0x859c15a5, + 0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, 0x31842e7b, 0x24259fd7, 0xf8bef472, + 0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, 0xe2506d3d, 0x4f9b12ea, 0xf215f225, + 0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, 0xea7a6e98, 0x7cd16efc, 0x1436876c, + 0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, 0x92ecbae6, 0xdd67016d, 0x151682eb, + 0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, 0xe139673b, 0xefa63fb8, 0x71873054, + 
0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, 0x844a1be5, 0xbae7dfdc, 0x42cbda70, + 0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, 0x79d130a4, 0x3486ebfb, 0x33d3cddc, + 0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, 0xc5c8b37e, 0x0d809ea2, 0x398feb7c, + 0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, 0x37df932b, 0xc4248289, 0xacf3ebc3, + 0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, 0x5e410fab, 0xb48a2465, 0x2eda7fa4, + 0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, 0xdb485694, 0x38d7e5b2, 0x57720101, + 0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, 0x7523d24a, 0xe0779695, 0xf9c17a8f, + 0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, 0xad1163ed, 0xea7b5965, 0x1a00726e, + 0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, 0x9eedc364, 0x22ebe6a8, 0xcee7d28a, + 0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, 0x8951570f, 0xdf09822b, 0xbd691a6c, + 0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, 0x0d771c2b, 0x67cdb156, 0x350d8384, + 0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, 0x8360d87b, 0x1fa98b0c, 0x1149382c, + 0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, 0x0d2059d1, 0xa466bb1e, 0xf8da0a82, + 0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, 0xeaee6801, 0x8db2a283, 0xea8bf59e, + }, +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go new file mode 100644 index 0000000..e28f49d --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go @@ -0,0 +1,91 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD as specified in RFC 7539. +package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" + +import ( + "crypto/cipher" + "encoding/binary" + "errors" +) + +const ( + // KeySize is the size of the key used by this AEAD, in bytes. + KeySize = 32 + // NonceSize is the size of the nonce used with this AEAD, in bytes. + NonceSize = 12 +) + +type chacha20poly1305 struct { + key [8]uint32 +} + +// New returns a ChaCha20-Poly1305 AEAD that uses the given, 256-bit key. 
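+//
+// A minimal usage sketch (illustrative only: key, nonce, plaintext and
+// additionalData are placeholder byte slices of the appropriate lengths):
+//
+//	aead, err := New(key) // len(key) must be KeySize
+//	if err != nil {
+//		panic(err)
+//	}
+//	box := aead.Seal(nil, nonce, plaintext, additionalData) // len(nonce) must be NonceSize
+//	msg, err := aead.Open(nil, nonce, box, additionalData)  // msg == plaintext on success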
+func New(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(chacha20poly1305) + ret.key[0] = binary.LittleEndian.Uint32(key[0:4]) + ret.key[1] = binary.LittleEndian.Uint32(key[4:8]) + ret.key[2] = binary.LittleEndian.Uint32(key[8:12]) + ret.key[3] = binary.LittleEndian.Uint32(key[12:16]) + ret.key[4] = binary.LittleEndian.Uint32(key[16:20]) + ret.key[5] = binary.LittleEndian.Uint32(key[20:24]) + ret.key[6] = binary.LittleEndian.Uint32(key[24:28]) + ret.key[7] = binary.LittleEndian.Uint32(key[28:32]) + return ret, nil +} + +func (c *chacha20poly1305) NonceSize() int { + return NonceSize +} + +func (c *chacha20poly1305) Overhead() int { + return 16 +} + +func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + return c.seal(dst, nonce, plaintext, additionalData) +} + +var errOpen = errors.New("chacha20poly1305: message authentication failed") + +func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + return c.open(dst, nonce, ciphertext, additionalData) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go new file mode 100644 index 0000000..07d18a3 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -0,0 +1,80 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.7,amd64,!gccgo,!appengine + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/sys/cpu" +) + +//go:noescape +func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool + +//go:noescape +func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) + +var ( + useASM = cpu.X86.HasSSSE3 + useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 +) + +// setupState writes a ChaCha20 input matrix to state. See +// https://tools.ietf.org/html/rfc7539#section-2.3. 
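+//
+// The resulting 4x4 matrix of 32-bit words is:
+//
+//	constant constant constant constant
+//	key      key      key      key
+//	key      key      key      key
+//	counter  nonce    nonce    nonce
+//
+// state[12] is the 32-bit block counter and is left at zero here; the three
+// nonce words are loaded little-endian.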
+func setupState(state *[16]uint32, key *[8]uint32, nonce []byte) { + state[0] = 0x61707865 + state[1] = 0x3320646e + state[2] = 0x79622d32 + state[3] = 0x6b206574 + + state[4] = key[0] + state[5] = key[1] + state[6] = key[2] + state[7] = key[3] + state[8] = key[4] + state[9] = key[5] + state[10] = key[6] + state[11] = key[7] + + state[12] = 0 + state[13] = binary.LittleEndian.Uint32(nonce[:4]) + state[14] = binary.LittleEndian.Uint32(nonce[4:8]) + state[15] = binary.LittleEndian.Uint32(nonce[8:12]) +} + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + if !useASM { + return c.sealGeneric(dst, nonce, plaintext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ret, out := sliceForAppend(dst, len(plaintext)+16) + chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData) + return ret +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if !useASM { + return c.openGeneric(dst, nonce, ciphertext, additionalData) + } + + var state [16]uint32 + setupState(&state, &c.key, nonce) + + ciphertext = ciphertext[:len(ciphertext)-16] + ret, out := sliceForAppend(dst, len(ciphertext)) + if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s new file mode 100644 index 0000000..af76bbc --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -0,0 +1,2695 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. 
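+//
+// It implements the open/seal cores of the AEAD, interleaving the Poly1305
+// limb arithmetic with the ChaCha20 column and diagonal rounds; the SSE and
+// AVX2 paths are selected at run time via useASM/useAVX2 in
+// chacha20poly1305_amd64.go.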
+ +// +build go1.7,amd64,!gccgo,!appengine + +#include "textflag.h" +// General register allocation +#define oup DI +#define inp SI +#define inl BX +#define adp CX // free to reuse, after we hash the additional data +#define keyp R8 // free to reuse, when we copy the key to stack +#define itr2 R9 // general iterator +#define itr1 CX // general iterator +#define acc0 R10 +#define acc1 R11 +#define acc2 R12 +#define t0 R13 +#define t1 R14 +#define t2 R15 +#define t3 R8 +// Register and stack allocation for the SSE code +#define rStore (0*16)(BP) +#define sStore (1*16)(BP) +#define state1Store (2*16)(BP) +#define state2Store (3*16)(BP) +#define tmpStore (4*16)(BP) +#define ctr0Store (5*16)(BP) +#define ctr1Store (6*16)(BP) +#define ctr2Store (7*16)(BP) +#define ctr3Store (8*16)(BP) +#define A0 X0 +#define A1 X1 +#define A2 X2 +#define B0 X3 +#define B1 X4 +#define B2 X5 +#define C0 X6 +#define C1 X7 +#define C2 X8 +#define D0 X9 +#define D1 X10 +#define D2 X11 +#define T0 X12 +#define T1 X13 +#define T2 X14 +#define T3 X15 +#define A3 T0 +#define B3 T1 +#define C3 T2 +#define D3 T3 +// Register and stack allocation for the AVX2 code +#define rsStoreAVX2 (0*32)(BP) +#define state1StoreAVX2 (1*32)(BP) +#define state2StoreAVX2 (2*32)(BP) +#define ctr0StoreAVX2 (3*32)(BP) +#define ctr1StoreAVX2 (4*32)(BP) +#define ctr2StoreAVX2 (5*32)(BP) +#define ctr3StoreAVX2 (6*32)(BP) +#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack +#define AA0 Y0 +#define AA1 Y5 +#define AA2 Y6 +#define AA3 Y7 +#define BB0 Y14 +#define BB1 Y9 +#define BB2 Y10 +#define BB3 Y11 +#define CC0 Y12 +#define CC1 Y13 +#define CC2 Y8 +#define CC3 Y15 +#define DD0 Y4 +#define DD1 Y1 +#define DD2 Y2 +#define DD3 Y3 +#define TT0 DD3 +#define TT1 AA3 +#define TT2 BB3 +#define TT3 CC3 +// ChaCha20 constants +DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 +// <<< 16 with PSHUFB +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +// <<< 8 with PSHUFB +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B + +DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 +DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 + +DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 +DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 +// Poly1305 key clamp +DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF + +DATA ·sseIncMask<>+0x00(SB)/8, $0x1 +DATA ·sseIncMask<>+0x08(SB)/8, $0x0 +// To load/store the last < 16 bytes in a buffer +DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA 
·andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 +GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 +GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 +// No PALIGNR in Go ASM yet (but VPALIGNR is present). +#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 +#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 +#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 +#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 +#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 +#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 +#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 +#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 +#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 +#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 +#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 +#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 +#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 +#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 +#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 +#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; 
BYTE $0x0c // PALIGNR $12, X13, X13 +#define shiftC0Right shiftC0Left +#define shiftC1Right shiftC1Left +#define shiftC2Right shiftC2Left +#define shiftC3Right shiftC3Left +#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 +#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 +#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 +#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 +// Some macros +#define chachaQR(A, B, C, D, T) \ + PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ + PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B + +#define chachaQR_AVX2(A, B, C, D, T) \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B + +#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 +#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX +#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 +#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 + +#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 +#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 + +#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage +#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage +// ---------------------------------------------------------------------------- +TEXT polyHashADInternal<>(SB), NOSPLIT, $0 + // adp points to beginning of additional data + // itr2 holds ad length + XORQ acc0, acc0 + XORQ acc1, acc1 + XORQ acc2, acc2 + CMPQ itr2, $13 + JNE hashADLoop + +openFastTLSAD: + // Special treatment for the TLS case of 13 bytes + MOVQ (adp), acc0 + MOVQ 5(adp), acc1 + SHRQ $24, acc1 + MOVQ $1, acc2 + polyMul + RET + +hashADLoop: + // Hash in 16 byte chunks + CMPQ itr2, $16 + JB hashADTail + polyAdd(0(adp)) + LEAQ (1*16)(adp), adp + SUBQ $16, itr2 + polyMul + JMP hashADLoop + +hashADTail: + CMPQ itr2, $0 + JE hashADDone + + // Hash last < 16 byte tail + XORQ t0, t0 + XORQ t1, t1 + XORQ t2, t2 + ADDQ itr2, adp + +hashADTailLoop: + SHLQ $8, t1:t0 + SHLQ $8, t0 + MOVB -1(adp), t2 + XORQ t2, t0 + DECQ adp + DECQ itr2 + JNE hashADTailLoop + +hashADTailFinish: + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Finished AD +hashADDone: + RET + +// 
---------------------------------------------------------------------------- +// func chacha20Poly1305Open(dst, key, src, ad []byte) bool +TEXT ·chacha20Poly1305Open(SB), 0, $288-97 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + // Check for AVX2 support + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Open_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE openSSE128 // About 16% faster + + // For long buffers, prepare the poly key first + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + MOVO D0, T1 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + MOVO D0, ctr3Store + MOVQ $10, itr2 + +openSSEPreparePolyKey: + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + DECQ itr2 + JNE openSSEPreparePolyKey + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore; MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSEMainLoop: + CMPQ inl, $256 + JB openSSEMainLoopDone + + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $4, itr1 + MOVQ inp, itr2 + +openSSEInternalLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(itr2)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(itr2), itr2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr1 + JGE openSSEInternalLoop + + polyAdd(0(itr2)) + polyMul + LEAQ (2*8)(itr2), itr2 + + CMPQ itr1, $-6 + JG openSSEInternalLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; 
PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Load - xor - store + MOVO D3, tmpStore + MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) + MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) + MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) + MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) + MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) + MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) + MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) + MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) + MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) + MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) + MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) + MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) + MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) + MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) + LEAQ 256(inp), inp + LEAQ 256(oup), oup + SUBQ $256, inl + JMP openSSEMainLoop + +openSSEMainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $64 + JBE openSSETail64 + CMPQ inl, $128 + JBE openSSETail128 + CMPQ inl, $192 + JBE openSSETail192 + JMP openSSETail256 + +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $1, DX + XORQ (0*8)(inp), acc0 + XORQ (1*8)(inp), acc1 + ORQ acc1, acc0 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes +openSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +openSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE openSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore; MOVOU B0, sStore + + // Hash + MOVQ
ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSE128Open: + CMPQ inl, $16 + JB openSSETail16 + SUBQ $16, inl + + // Load for hashing + polyAdd(0(inp)) + + // Load for decryption + MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP openSSE128Open + +openSSETail16: + TESTQ inl, inl + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVOU (inp), T0 + ADDQ inl, inp + PAND -16(t0)(itr2*1), T0 + MOVO T0, 0+tmpStore + MOVQ T0, t0 + MOVQ 8+tmpStore, t1 + PXOR A1, T0 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ T0, t3 + MOVB t3, (oup) + PSRLDQ $1, T0 + INCQ oup + DECQ inl + JNE openSSETail16Store + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + JMP openSSEFinalize + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext +openSSETail64: + // Need to decrypt up to 64 bytes - prepare single block + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + XORQ itr2, itr2 + MOVQ inl, itr1 + CMPQ itr1, $16 + JB openSSETail64LoopB + +openSSETail64LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + SUBQ $16, itr1 + +openSSETail64LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + + CMPQ itr1, $16 + JAE openSSETail64LoopA + + CMPQ itr2, $160 + JNE openSSETail64LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + +openSSETail64DecLoop: + CMPQ inl, $16 + JB openSSETail64DecLoopDone + SUBQ $16, inl + MOVOU (inp), T0 + PXOR T0, A0 + MOVOU A0, (oup) + LEAQ 16(inp), inp + LEAQ 16(oup), oup + MOVO B0, A0 + MOVO C0, B0 + MOVO D0, C0 + JMP openSSETail64DecLoop + +openSSETail64DecLoopDone: + MOVO A0, A1 + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openSSETail128: + // Need to decrypt up to 128 bytes - prepare two blocks + MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSETail128LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + CMPQ itr2, itr1 + JB openSSETail128LoopA + + CMPQ itr2, $160 + JNE openSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL 
state2Store, C1 + PADDL ctr1Store, D0; PADDL ctr0Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + + SUBQ $64, inl + LEAQ 64(inp), inp + LEAQ 64(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext +openSSETail192: + // Need to decrypt up to 192 bytes - prepare three blocks + MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store + MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store + + MOVQ inl, itr1 + MOVQ $160, itr2 + CMPQ itr1, $160 + CMOVQGT itr2, itr1 + ANDQ $-16, itr1 + XORQ itr2, itr2 + +openSSLTail192LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSLTail192LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + CMPQ itr2, itr1 + JB openSSLTail192LoopA + + CMPQ itr2, $160 + JNE openSSLTail192LoopB + + CMPQ inl, $176 + JB openSSLTail192Store + + polyAdd(160(inp)) + polyMul + + CMPQ inl, $192 + JB openSSLTail192Store + + polyAdd(176(inp)) + polyMul + +openSSLTail192Store: + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 + MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) + + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + SUBQ $128, inl + LEAQ 128(inp), inp + LEAQ 128(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openSSETail256: + // Need to decrypt up to 256 bytes - prepare four blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + XORQ itr2, itr2 + 
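+ // itr2 counts the bytes of ciphertext hashed so far: each pass of the
+ // loop below hashes 16 bytes while running one ChaCha20 double round
+ // over all four blocks, stopping once 160 bytes (10 double rounds) are done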
+openSSETail256Loop: + // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication + polyAdd(0(inp)(itr2*1)) + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulStage3 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + ADDQ $2*8, itr2 + CMPQ itr2, $160 + JB openSSETail256Loop + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail256HashLoop: + polyAdd(0(inp)(itr2*1)) + polyMul + ADDQ $2*8, itr2 + CMPQ itr2, itr1 + JB openSSETail256HashLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + LEAQ 192(inp), inp + LEAQ 192(oup), oup + SUBQ $192, inl + MOVO A3, A0 + MOVO B3, B0 + MOVO C3, C0 + MOVO tmpStore, D0 + + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Open_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimization, for very short buffers + CMPQ inl, $192 + JBE openAVX2192 + CMPQ inl, $320 + JBE openAVX2320 + + // For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, state2StoreAVX2 + VMOVDQA DD0, ctr3StoreAVX2 + MOVQ $10, itr2 + +openAVX2PreparePolyKey: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0,
CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + DECQ itr2 + JNE openAVX2PreparePolyKey + + VPADDD ·chacha20Constants<>(SB), AA0, AA0 + VPADDD state1StoreAVX2, BB0, BB0 + VPADDD state2StoreAVX2, CC0, CC0 + VPADDD ctr3StoreAVX2, DD0, DD0 + + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for the first 64 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + + // Hash AD + first 64 bytes + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +openAVX2InitialHash64: + polyAdd(0(inp)(itr1*1)) + polyMulAVX2 + ADDQ $16, itr1 + CMPQ itr1, $64 + JNE openAVX2InitialHash64 + + // Decrypt the first 64 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), BB0, BB0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU BB0, (1*32)(oup) + LEAQ (2*32)(inp), inp + LEAQ (2*32)(oup), oup + SUBQ $64, inl + +openAVX2MainLoop: + CMPQ inl, $512 + JB openAVX2MainLoopDone + + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + +openAVX2InternalLoop: + // Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications + // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext + polyAdd(0*8(inp)(itr1*1)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(inp)(itr1*1)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7,
BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(inp)(itr1*1)) + LEAQ (6*8)(itr1), itr1 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + CMPQ itr1, $480 + JNE openAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(480(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR 
(1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(496(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + LEAQ (32*16)(oup), oup + SUBQ $(32*16), inl + JMP openAVX2MainLoop + +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $128 + JBE openAVX2Tail128 + CMPQ inl, $256 + JBE openAVX2Tail256 + CMPQ inl, $384 + JBE openAVX2Tail384 + JMP openAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +openAVX2192: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +openAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE openAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ inl, $32 + JB openAVX2ShortTail32 + SUBQ $32, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + polyAdd(2*8(inp)) + polyMulAVX2 + + // Load for 
decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2ShortDone + + SUBQ $16, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +openAVX2320: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +openAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE openAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP openAVX2ShortOpen + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD1 + VMOVDQA DD1, DD0 + + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + TESTQ itr1, itr1 + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + // Perform ChaCha rounds, while hashing the 
remaining input + polyAdd(0(inp)(itr2*1)) + polyMulAVX2 + +openAVX2Tail128LoopB: + ADDQ $16, itr2 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail128LoopA + CMPQ itr2, $160 + JNE openAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC1, CC1 + VPADDD DD0, DD1, DD1 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + +openAVX2TailLoop: + CMPQ inl, $32 + JB openAVX2Tail + SUBQ $32, inl + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + JMP openAVX2TailLoop + +openAVX2Tail: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2TailDone + SUBQ $16, inl + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare four blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + + // Compute the number of iterations that will hash data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $128, itr1 + SHRQ $4, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + +openAVX2Tail256LoopA: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail256LoopA + + CMPQ itr2, $10 + JNE openAVX2Tail256LoopB + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + + // Hash the remainder of data (if any) +openAVX2Tail256Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail256HashEnd + polyAdd (0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail256Hash + +// Store 128 bytes safely, then go to store loop +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; 
VPERM2I128 $0x13, CC0, DD0, DD2 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + + VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 + VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) + LEAQ (4*32)(inp), inp + LEAQ (4*32)(oup), oup + SUBQ $4*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, ctr0StoreAVX2 + VMOVDQA DD1, ctr1StoreAVX2 + VMOVDQA DD2, ctr2StoreAVX2 + + // Compute the number of iterations that will hash two blocks of data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $256, itr1 + SHRQ $4, itr1 + ADDQ $6, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail384LoopB: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + +openAVX2Tail384LoopA: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + + CMPQ itr2, itr1 + JB openAVX2Tail384LoopB + + CMPQ itr2, $10 + JNE openAVX2Tail384LoopA + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + +openAVX2Tail384Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail384HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail384Hash + +// Store 256 bytes safely, then go to store loop +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, 
(2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + LEAQ (8*32)(inp), inp + LEAQ (8*32)(oup), oup + SUBQ $8*32, inl + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + MOVQ inp, itr2 + +openAVX2Tail512LoopB: + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ (2*8)(itr2), itr2 + +openAVX2Tail512LoopA: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(itr2)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; 
VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(itr2)) + polyMulAVX2 + LEAQ (4*8)(itr2), itr2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + INCQ itr1 + CMPQ itr1, $4 + JLT openAVX2Tail512LoopB + + CMPQ itr1, $10 + JNE openAVX2Tail512LoopA + + MOVQ inl, itr1 + SUBQ $384, itr1 + ANDQ $-16, itr1 + +openAVX2Tail512HashLoop: + TESTQ itr1, itr1 + JE openAVX2Tail512HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + SUBQ $16, itr1 + JMP openAVX2Tail512HashLoop + +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + 
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + LEAQ (12*32)(inp), inp + LEAQ (12*32)(oup), oup + SUBQ $12*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Seal(dst, key, src, ad []byte) +TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Seal_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE sealSSE128 // About 15% faster + + // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + + // Load state, increment counter blocks + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVQ $10, itr2 + +sealSSEIntroLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JNE sealSSEIntroLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore + MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + MOVOU (4*16)(inp), A0; 
MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) + + MOVQ $128, itr1 + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 + + CMPQ inl, $64 + JBE sealSSE128SealHash + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) + + ADDQ $64, itr1 + SUBQ $64, inl + LEAQ 64(inp), inp + + MOVQ $2, itr1 + MOVQ $8, itr2 + + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + CMPQ inl, $192 + JBE sealSSETail192 + +sealSSEMainLoop: + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + +sealSSEInnerLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(oup)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(oup), oup + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JGE sealSSEInnerLoop + polyAdd(0(oup)) + polyMul + LEAQ (2*8)(oup), oup + DECQ itr1 + JG sealSSEInnerLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVO tmpStore, D3 + + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); 
MOVOU D2, (11*16)(oup) + ADDQ $192, inp + MOVQ $192, itr1 + SUBQ $192, inl + MOVO A3, A1 + MOVO B3, B1 + MOVO C3, C1 + MOVO D3, D1 + CMPQ inl, $64 + JBE sealSSE128SealHash + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) + LEAQ 64(inp), inp + SUBQ $64, inl + MOVQ $6, itr1 + MOVQ $4, itr2 + CMPQ inl, $192 + JG sealSSEMainLoop + + MOVQ inl, itr1 + TESTQ inl, inl + JE sealSSE128SealHash + MOVQ $6, itr1 + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + JMP sealSSETail192 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext +sealSSETail64: + // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A1 + MOVO state1Store, B1 + MOVO state2Store, C1 + MOVO ctr3Store, D1 + PADDL ·sseIncMask<>(SB), D1 + MOVO D1, ctr0Store + +sealSSETail64LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail64LoopB: + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right; shiftC1Right; shiftD1Right + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + DECQ itr1 + JG sealSSETail64LoopA + + DECQ itr2 + JGE sealSSETail64LoopB + PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B1 + PADDL state2Store, C1 + PADDL ctr0Store, D1 + + JMP sealSSE128Seal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext +sealSSETail128: + // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + +sealSSETail128LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail128LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + DECQ itr1 + JG sealSSETail128LoopA + + DECQ itr2 + JGE sealSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr0Store, D0; PADDL ctr1Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + + MOVQ $64, itr1 + LEAQ 64(inp), inp + SUBQ $64, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext +sealSSETail192: + // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), 
A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+
+sealSSETail192LoopA:
+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealSSETail192LoopB:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left; shiftC0Left; shiftD0Left
+	shiftB1Left; shiftC1Left; shiftD1Left
+	shiftB2Left; shiftC2Left; shiftD2Left
+
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	shiftB1Right; shiftC1Right; shiftD1Right
+	shiftB2Right; shiftC2Right; shiftD2Right
+
+	DECQ itr1
+	JG sealSSETail192LoopA
+
+	DECQ itr2
+	JGE sealSSETail192LoopB
+
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
+	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
+	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
+
+	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+	PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
+	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
+	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
+	PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+
+	MOVO A2, A1
+	MOVO B2, B1
+	MOVO C2, C1
+	MOVO D2, D1
+	MOVQ $128, itr1
+	LEAQ 128(inp), inp
+	SUBQ $128, inl
+
+	JMP sealSSE128SealHash
+
+// ----------------------------------------------------------------------------
+// Special seal optimization for buffers smaller than 129 bytes
+sealSSE128:
+	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
+	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
+	MOVQ $10, itr2
+
+sealSSE128InnerCipherLoop:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left; shiftB1Left; shiftB2Left
+	shiftC0Left; shiftC1Left; shiftC2Left
+	shiftD0Left; shiftD1Left; shiftD2Left
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftB1Right; shiftB2Right
+	shiftC0Right; shiftC1Right; shiftC2Right
+	shiftD0Right; shiftD1Right; shiftD2Right
+	DECQ itr2
+	JNE sealSSE128InnerCipherLoop
+
+	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
+	PADDL T2, C1; PADDL T2, C2
+	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+	PAND ·polyClampMask<>(SB), A0
+	MOVOU A0, rStore
+	MOVOU B0, sStore
+
+	// Hash AD
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+	XORQ itr1, itr1
+
+sealSSE128SealHash:
+	// itr1 holds the number of bytes encrypted but not yet hashed
+	CMPQ itr1, $16
+	JB sealSSE128Seal
+	polyAdd(0(oup))
+	polyMul
+
+	SUBQ $16, itr1
+	ADDQ $16, oup
+
+	JMP sealSSE128SealHash
+
+sealSSE128Seal:
+	CMPQ inl, $16
+	JB sealSSETail
+	SUBQ $16, inl
+
+	// Load for encryption
+	MOVOU (inp), T0
+	PXOR T0, A1
+	MOVOU A1, (oup)
+	LEAQ (1*16)(inp), inp
+	LEAQ (1*16)(oup), oup
+
+	// Extract for hashing
+	MOVQ A1, t0
+	PSRLDQ $8, A1
+	MOVQ A1, t1
+	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+
+	// Shift the stream "left"
+	MOVO B1, A1
+	MOVO C1, B1
+	MOVO D1, C1
+	MOVO A2, D1
+	MOVO B2, A2
+	MOVO C2, B2
+	MOVO D2, C2
+	JMP sealSSE128Seal
+
+sealSSETail:
+	TESTQ inl, inl
+	JE sealSSEFinalize
+
+	// We can only load the PT one byte at a time to avoid reading past the end of the buffer
+	MOVQ inl, itr2
+	SHLQ $4, itr2
+	LEAQ ·andMask<>(SB), t0
+	MOVQ inl, itr1
+	LEAQ -1(inp)(inl*1), inp
+	XORQ t2, t2
+	XORQ t3, t3
+	XORQ AX, AX
+
+sealSSETailLoadLoop:
+	SHLQ $8, t2, t3
+	SHLQ $8, t2
+	MOVB (inp), AX
+	XORQ AX, t2
+	LEAQ -1(inp), inp
+	DECQ itr1
+	JNE sealSSETailLoadLoop
+	MOVQ t2, 0+tmpStore
+	MOVQ t3, 8+tmpStore
+	PXOR 0+tmpStore, A1
+	MOVOU A1, (oup)
+	MOVOU -16(t0)(itr2*1), T0
+	PAND T0, A1
+	MOVQ A1, t0
+	PSRLDQ $8, A1
+	MOVQ A1, t1
+	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+
+	ADDQ inl, oup
+
+sealSSEFinalize:
+	// Hash in the buffer lengths
+	ADDQ ad_len+80(FP), acc0
+	ADCQ src_len+56(FP), acc1
+	ADCQ $1, acc2
+	polyMul
+
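+	// The three immediates below are the limbs of 2^130 - 5 in two's complement
+	// (-5, -1, 3), so the SUBQ/SBBQ/SBBQ chain computes h - (2^130 - 5); the
+	// CMOVQCS instructions then restore the saved h if the subtraction
+	// borrowed, i.e. if h was already below 2^130 - 5.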
+	// Final reduce
+	MOVQ acc0, t0
+	MOVQ acc1, t1
+	MOVQ acc2, t2
+	SUBQ $-5, acc0
+	SBBQ $-1, acc1
+	SBBQ $3, acc2
+	CMOVQCS t0, acc0
+	CMOVQCS t1, acc1
+	CMOVQCS t2, acc2
+
+	// Add in the "s" part of the key
+	ADDQ 0+sStore, acc0
+	ADCQ 8+sStore, acc1
+
+	// Finally store the tag at the end of the message
+	MOVQ acc0, (0*8)(oup)
+	MOVQ acc1, (1*8)(oup)
+	RET
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+chacha20Poly1305Seal_AVX2:
+	VZEROUPPER
+	VMOVDQU ·chacha20Constants<>(SB), AA0
+	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
+	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
+	BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
+	VPADDD ·avx2InitMask<>(SB), DD0, DD0
+
+	// Special optimizations, for very short buffers
+	CMPQ inl, $192
+	JBE seal192AVX2 // 33% faster
+	CMPQ inl, $320
+	JBE seal320AVX2 // 17% faster
+
+	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
+	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
+	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
+	VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
+	VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
+	VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
+	VMOVDQA DD3, ctr3StoreAVX2
+	MOVQ $10, itr2
+
+sealAVX2IntroLoop:
+	VMOVDQA CC3, tmpStoreAVX2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+	VMOVDQA tmpStoreAVX2, CC3
+	VMOVDQA CC1, tmpStoreAVX2
+	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+	VMOVDQA tmpStoreAVX2, CC1
+
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0,
DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr2 + JNE sealAVX2IntroLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + + VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 + VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key + VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), DD0, DD0 + VMOVDQA DD0, rsStoreAVX2 + + // Hash AD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + // Can store at least 320 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), CC0, CC0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU CC0, (1*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 + VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 + VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) + + MOVQ $320, itr1 + SUBQ $320, inl + LEAQ 320(inp), inp + + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 + CMPQ inl, $128 + JBE sealAVX2SealHash + + VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 + VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVQ $8, itr1 + MOVQ $2, itr2 + + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + CMPQ inl, $512 + JBE sealAVX2Tail512 + + // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + VMOVDQA 
·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + + SUBQ $16, oup // Adjust the pointer + MOVQ $9, itr1 + JMP sealAVX2InternalLoopStart + +sealAVX2MainLoop: + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr1 + +sealAVX2InternalLoop: + polyAdd(0*8(oup)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, 
DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + +sealAVX2InternalLoopStart: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(oup)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(oup)) + LEAQ (6*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, 
BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr1 + JNE sealAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(-2*8(oup)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + SUBQ $(32*16), inl + CMPQ inl, $512 + JG sealAVX2MainLoop + + // Tail can only hash 480 bytes + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ 32(oup), oup + + MOVQ $10, itr1 + MOVQ $0, itr2 + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +seal192AVX2: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + 
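+	// AA2/BB2/CC2 and, just below, DD2/TT3 save the initial state: the rounds
+	// run on the working copies, and the saved values are added back afterwards
+	// for the ChaCha feed-forward.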
VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +sealAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE sealAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +sealAVX2ShortSeal: + // Hash aad + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealAVX2SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealAVX2ShortSealLoop + polyAdd(0(oup)) + polyMul + SUBQ $16, itr1 + ADDQ $16, oup + JMP sealAVX2SealHash + +sealAVX2ShortSealLoop: + CMPQ inl, $32 + JB sealAVX2ShortTail32 + SUBQ $32, inl + + // Load for encryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + + // Now can hash + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP sealAVX2ShortSealLoop + +sealAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB sealAVX2ShortDone + + SUBQ $16, inl + + // Load for encryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + + // Hash + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +sealAVX2ShortDone: + VZEROUPPER + JMP sealSSETail + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +seal320AVX2: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +sealAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; 
VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+	DECQ itr2
+	JNE sealAVX2320InnerCipherLoop
+
+	VMOVDQA ·chacha20Constants<>(SB), TT0
+	VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
+	VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
+	VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
+	VMOVDQA ·avx2IncMask<>(SB), TT0
+	VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
+	VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
+	VPADDD TT3, DD2, DD2
+
+	// Clamp and store poly key
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPAND ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA TT0, rsStoreAVX2
+
+	// Stream for up to 320 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x02, AA1, BB1, CC0
+	VPERM2I128 $0x02, CC1, DD1, DD0
+	VPERM2I128 $0x13, AA1, BB1, AA1
+	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x02, AA2, BB2, CC1
+	VPERM2I128 $0x02, CC2, DD2, DD1
+	VPERM2I128 $0x13, AA2, BB2, AA2
+	VPERM2I128 $0x13, CC2, DD2, BB2
+	JMP sealAVX2ShortSeal
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of plaintext
+sealAVX2Tail128:
+	// Need to encrypt up to 128 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0
+	VMOVDQA state1StoreAVX2, BB0
+	VMOVDQA state2StoreAVX2, CC0
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD ·avx2IncMask<>(SB), DD0, DD0
+	VMOVDQA DD0, DD1
+
+sealAVX2Tail128LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail128LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0
+	VPALIGNR $8, CC0, CC0, CC0
+	VPALIGNR $12, DD0, DD0, DD0
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ 32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0
+	VPALIGNR $8, CC0, CC0, CC0
+	VPALIGNR $4, DD0, DD0, DD0
+	DECQ itr1
+	JG sealAVX2Tail128LoopA
+	DECQ itr2
+	JGE sealAVX2Tail128LoopB
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA1
+	VPADDD state1StoreAVX2, BB0, BB1
+	VPADDD state2StoreAVX2, CC0, CC1
+	VPADDD DD1, DD0, DD1
+
+	VPERM2I128 $0x02, AA1, BB1, AA0
+	VPERM2I128 $0x02, CC1, DD1, BB0
+	VPERM2I128 $0x13, AA1, BB1, CC0
+	VPERM2I128 $0x13, CC1, DD1, DD0
+	JMP sealAVX2ShortSealLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of plaintext
+sealAVX2Tail256:
+	// Need to encrypt up to 256 bytes - prepare four blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD ·avx2IncMask<>(SB), DD0, DD0
+	VPADDD ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA DD0, TT1
+	VMOVDQA DD1, TT2
+
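+	// LoopA absorbs one extra 16-byte Poly1305 block per pass to catch up on
+	// already-written ciphertext; LoopB interleaves two 16-byte blocks with
+	// each ChaCha double round so the scalar multiplies overlap the vector work.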
+sealAVX2Tail256LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail256LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ 32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+	DECQ itr1
+	JG sealAVX2Tail256LoopA
+	DECQ itr2
+	JGE sealAVX2Tail256LoopB
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
+	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
+	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
+	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPERM2I128 $0x02, CC0, DD0, TT1
+	VPERM2I128 $0x13, AA0, BB0, TT2
+	VPERM2I128 $0x13, CC0, DD0, TT3
+	VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+	MOVQ $128, itr1
+	LEAQ 128(inp), inp
+	SUBQ $128, inl
+	VPERM2I128 $0x02, AA1, BB1, AA0
+	VPERM2I128 $0x02, CC1, DD1, BB0
+	VPERM2I128 $0x13, AA1, BB1, CC0
+	VPERM2I128 $0x13, CC1, DD1, DD0
+
+	JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of plaintext
+sealAVX2Tail384:
+	// Need to encrypt up to 384 bytes - prepare six blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+
+sealAVX2Tail384LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail384LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ 32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+	DECQ itr1
+	JG sealAVX2Tail384LoopA
+	DECQ itr2
+	JGE sealAVX2Tail384LoopB
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
+	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
+	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
+	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
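+	// Each ymm register holds matching rows of two ChaCha blocks in its two
+	// 128-bit lanes; VPERM2I128 with $0x02/$0x13 gathers the low/high lanes so
+	// the key stream is contiguous before it is XORed with the input.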
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPERM2I128 $0x02, CC0, DD0, TT1
+	VPERM2I128 $0x13, AA0, BB0, TT2
+	VPERM2I128 $0x13, CC0, DD0, TT3
+	VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, TT0
+	VPERM2I128 $0x02, CC1, DD1, TT1
+	VPERM2I128 $0x13, AA1, BB1, TT2
+	VPERM2I128 $0x13, CC1, DD1, TT3
+	VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
+	VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
+	MOVQ $256, itr1
+	LEAQ 256(inp), inp
+	SUBQ $256, inl
+	VPERM2I128 $0x02, AA2, BB2, AA0
+	VPERM2I128 $0x02, CC2, DD2, BB0
+	VPERM2I128 $0x13, AA2, BB2, CC0
+	VPERM2I128 $0x13, CC2, DD2, DD0
+
+	JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of plaintext
+sealAVX2Tail512:
+	// Need to encrypt up to 512 bytes - prepare eight blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+
+sealAVX2Tail512LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail512LoopB:
+	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA CC3, tmpStoreAVX2
+	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA tmpStoreAVX2, CC3
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA CC3, tmpStoreAVX2
+	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD $7, BB3, CC3; VPSRLD $25,
BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + + DECQ itr1 + JG sealAVX2Tail512LoopA + DECQ itr2 + JGE sealAVX2Tail512LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3 + VPXOR (0*32)(inp), CC3, CC3 + VMOVDQU CC3, (0*32)(oup) + VPERM2I128 $0x02, CC0, DD0, CC3 + VPXOR (1*32)(inp), CC3, CC3 + VMOVDQU CC3, (1*32)(oup) + VPERM2I128 $0x13, AA0, BB0, CC3 + VPXOR (2*32)(inp), CC3, CC3 + VMOVDQU CC3, (2*32)(oup) + VPERM2I128 $0x13, CC0, DD0, CC3 + VPXOR (3*32)(inp), CC3, CC3 + VMOVDQU CC3, (3*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 
$0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + + MOVQ $384, itr1 + LEAQ 384(inp), inp + SUBQ $384, inl + VPERM2I128 $0x02, AA3, BB3, AA0 + VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 + VPERM2I128 $0x13, AA3, BB3, CC0 + VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + JMP sealAVX2SealHash diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go new file mode 100644 index 0000000..8d28ce2 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go @@ -0,0 +1,74 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/internal/chacha20" + "golang.org/x/crypto/poly1305" +) + +func roundTo16(n int) int { + return 16 * ((n + 15) / 16) +} + +func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte { + ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize) + + var polyKey [32]byte + s := chacha20.New(c.key, [3]uint32{ + binary.LittleEndian.Uint32(nonce[0:4]), + binary.LittleEndian.Uint32(nonce[4:8]), + binary.LittleEndian.Uint32(nonce[8:12]), + }) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.Advance() // skip the next 32 bytes + s.XORKeyStream(out, plaintext) + + polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8) + copy(polyInput, additionalData) + copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)]) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData))) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext))) + + var tag [poly1305.TagSize]byte + poly1305.Sum(&tag, polyInput, &polyKey) + copy(out[len(plaintext):], tag[:]) + + return ret +} + +func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + var tag [poly1305.TagSize]byte + copy(tag[:], ciphertext[len(ciphertext)-16:]) + ciphertext = ciphertext[:len(ciphertext)-16] + + var polyKey [32]byte + s := chacha20.New(c.key, [3]uint32{ + binary.LittleEndian.Uint32(nonce[0:4]), + binary.LittleEndian.Uint32(nonce[4:8]), + binary.LittleEndian.Uint32(nonce[8:12]), + }) + s.XORKeyStream(polyKey[:], polyKey[:]) + s.Advance() // skip the next 32 bytes + + polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8) + copy(polyInput, additionalData) + copy(polyInput[roundTo16(len(additionalData)):], ciphertext) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData))) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext))) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if !poly1305.Verify(&tag, polyInput, &polyKey) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } 
+ + s.XORKeyStream(out, ciphertext) + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go new file mode 100644 index 0000000..4c2eb70 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go @@ -0,0 +1,15 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 !go1.7 gccgo appengine + +package chacha20poly1305 + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + return c.sealGeneric(dst, nonce, plaintext, additionalData) +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + return c.openGeneric(dst, nonce, ciphertext, additionalData) +} diff --git a/vendor/golang.org/x/crypto/hkdf/hkdf.go b/vendor/golang.org/x/crypto/hkdf/hkdf.go new file mode 100644 index 0000000..5bc2463 --- /dev/null +++ b/vendor/golang.org/x/crypto/hkdf/hkdf.go @@ -0,0 +1,75 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package hkdf implements the HMAC-based Extract-and-Expand Key Derivation +// Function (HKDF) as defined in RFC 5869. +// +// HKDF is a cryptographic key derivation function (KDF) with the goal of +// expanding limited input keying material into one or more cryptographically +// strong secret keys. +// +// RFC 5869: https://tools.ietf.org/html/rfc5869 +package hkdf // import "golang.org/x/crypto/hkdf" + +import ( + "crypto/hmac" + "errors" + "hash" + "io" +) + +type hkdf struct { + expander hash.Hash + size int + + info []byte + counter byte + + prev []byte + cache []byte +} + +func (f *hkdf) Read(p []byte) (int, error) { + // Check whether enough data can be generated + need := len(p) + remains := len(f.cache) + int(255-f.counter+1)*f.size + if remains < need { + return 0, errors.New("hkdf: entropy limit reached") + } + // Read from the cache, if enough data is present + n := copy(p, f.cache) + p = p[n:] + + // Fill the buffer + for len(p) > 0 { + f.expander.Reset() + f.expander.Write(f.prev) + f.expander.Write(f.info) + f.expander.Write([]byte{f.counter}) + f.prev = f.expander.Sum(f.prev[:0]) + f.counter++ + + // Copy the new batch into p + f.cache = f.prev + n = copy(p, f.cache) + p = p[n:] + } + // Save leftovers for next run + f.cache = f.cache[n:] + + return need, nil +} + +// New returns a new HKDF using the given hash, the secret keying material to expand +// and optional salt and info fields. +func New(hash func() hash.Hash, secret, salt, info []byte) io.Reader { + if salt == nil { + salt = make([]byte, hash().Size()) + } + extractor := hmac.New(hash, salt) + extractor.Write(secret) + prk := extractor.Sum(nil) + + return &hkdf{hmac.New(hash, prk), extractor.Size(), info, 1, nil, nil} +} diff --git a/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s b/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s new file mode 100644 index 0000000..98427c5 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s @@ -0,0 +1,283 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
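For orientation, the hkdf package above is consumed through its single constructor, New, which returns an io.Reader of keying material. A minimal sketch of expanding one secret into two independent 16-byte subkeys follows; the hash choice and the secret/salt/info values are illustrative placeholders, not anything this diff itself uses:

    package main

    import (
        "crypto/sha256"
        "io"

        "golang.org/x/crypto/hkdf"
    )

    // deriveKeys expands one master secret into two independent subkeys.
    func deriveKeys(secret, salt []byte) (k1, k2 [16]byte, err error) {
        r := hkdf.New(sha256.New, secret, salt, []byte("illustrative context info"))
        if _, err = io.ReadFull(r, k1[:]); err != nil {
            return
        }
        _, err = io.ReadFull(r, k2[:])
        return
    }

Requests past the RFC 5869 limit of 255 hash-sized output blocks make Read fail with "hkdf: entropy limit reached", per the Read implementation above.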
+ +// +build s390x,!gccgo,!appengine + +#include "go_asm.h" +#include "textflag.h" + +// This is an implementation of the ChaCha20 encryption algorithm as +// specified in RFC 7539. It uses vector instructions to compute +// 4 keystream blocks in parallel (256 bytes) which are then XORed +// with the bytes in the input slice. + +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 +// BSWAP: swap bytes in each 4-byte element +DATA ·constants<>+0x00(SB)/4, $0x03020100 +DATA ·constants<>+0x04(SB)/4, $0x07060504 +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c +// J0: [j0, j1, j2, j3] +DATA ·constants<>+0x10(SB)/4, $0x61707865 +DATA ·constants<>+0x14(SB)/4, $0x3320646e +DATA ·constants<>+0x18(SB)/4, $0x79622d32 +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 + +// EXRL targets: +TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0 + MVC $1, (R1), (R8) + RET + +TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0 + MVC $1, (R8), (R9) + RET + +#define BSWAP V5 +#define J0 V6 +#define KEY0 V7 +#define KEY1 V8 +#define NONCE V9 +#define CTR V10 +#define M0 V11 +#define M1 V12 +#define M2 V13 +#define M3 V14 +#define INC V15 +#define X0 V16 +#define X1 V17 +#define X2 V18 +#define X3 V19 +#define X4 V20 +#define X5 V21 +#define X6 V22 +#define X7 V23 +#define X8 V24 +#define X9 V25 +#define X10 V26 +#define X11 V27 +#define X12 V28 +#define X13 V29 +#define X14 V30 +#define X15 V31 + +#define NUM_ROUNDS 20 + +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $16, a2, a2 \ + VERLLF $16, b2, b2 \ + VERLLF $16, c2, c2 \ + VERLLF $16, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $12, a1, a1 \ + VERLLF $12, b1, b1 \ + VERLLF $12, c1, c1 \ + VERLLF $12, d1, d1 \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $8, a2, a2 \ + VERLLF $8, b2, b2 \ + VERLLF $8, c2, c2 \ + VERLLF $8, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $7, a1, a1 \ + VERLLF $7, b1, b1 \ + VERLLF $7, c1, c1 \ + VERLLF $7, d1, d1 + +#define PERMUTE(mask, v0, v1, v2, v3) \ + VPERM v0, v0, mask, v0 \ + VPERM v1, v1, mask, v1 \ + VPERM v2, v2, mask, v2 \ + VPERM v3, v3, mask, v3 + +#define ADDV(x, v0, v1, v2, v3) \ + VAF x, v0, v0 \ + VAF x, v1, v1 \ + VAF x, v2, v2 \ + VAF x, v3, v3 + +#define XORV(off, dst, src, v0, v1, v2, v3) \ + VLM off(src), M0, M3 \ + PERMUTE(BSWAP, v0, v1, v2, v3) \ + VX v0, M0, M0 \ + VX v1, M1, M1 \ + VX v2, M2, M2 \ + VX v3, M3, M3 \ + VSTM M0, M3, off(dst) + +#define SHUFFLE(a, b, c, d, t, u, v, w) \ + VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} + VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} + VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} + VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} + VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} + VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} + VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} + VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD $·constants<>(SB), R1 + 
MOVD dst+0(FP), R2 // R2=&dst[0] + LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) + MOVD key+48(FP), R5 // R5=key + MOVD nonce+56(FP), R6 // R6=nonce + MOVD counter+64(FP), R7 // R7=counter + MOVD buf+72(FP), R8 // R8=buf + MOVD len+80(FP), R9 // R9=len + + // load BSWAP and J0 + VLM (R1), BSWAP, J0 + + // set up tail buffer + ADD $-1, R4, R12 + MOVBZ R12, R12 + CMPUBEQ R12, $255, aligned + MOVD R4, R1 + AND $~255, R1 + MOVD $(R3)(R1*1), R1 + EXRL $·mvcSrcToBuf(SB), R12 + MOVD $255, R0 + SUB R12, R0 + MOVD R0, (R9) // update len + +aligned: + // setup + MOVD $95, R0 + VLM (R5), KEY0, KEY1 + VLL R0, (R6), NONCE + VZERO M0 + VLEIB $7, $32, M0 + VSRLB M0, NONCE, NONCE + + // initialize counter values + VLREPF (R7), CTR + VZERO INC + VLEIF $1, $1, INC + VLEIF $2, $2, INC + VLEIF $3, $3, INC + VAF INC, CTR, CTR + VREPIF $4, INC + +chacha: + VREPF $0, J0, X0 + VREPF $1, J0, X1 + VREPF $2, J0, X2 + VREPF $3, J0, X3 + VREPF $0, KEY0, X4 + VREPF $1, KEY0, X5 + VREPF $2, KEY0, X6 + VREPF $3, KEY0, X7 + VREPF $0, KEY1, X8 + VREPF $1, KEY1, X9 + VREPF $2, KEY1, X10 + VREPF $3, KEY1, X11 + VLR CTR, X12 + VREPF $1, NONCE, X13 + VREPF $2, NONCE, X14 + VREPF $3, NONCE, X15 + + MOVD $(NUM_ROUNDS/2), R1 + +loop: + ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) + ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) + + ADD $-1, R1 + BNE loop + + // decrement length + ADD $-256, R4 + BLT tail + +continue: + // rearrange vectors + SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) + ADDV(J0, X0, X1, X2, X3) + SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) + ADDV(KEY0, X4, X5, X6, X7) + SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) + ADDV(KEY1, X8, X9, X10, X11) + VAF CTR, X12, X12 + SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) + ADDV(NONCE, X12, X13, X14, X15) + + // increment counters + VAF INC, CTR, CTR + + // xor keystream with plaintext + XORV(0*64, R2, R3, X0, X4, X8, X12) + XORV(1*64, R2, R3, X1, X5, X9, X13) + XORV(2*64, R2, R3, X2, X6, X10, X14) + XORV(3*64, R2, R3, X3, X7, X11, X15) + + // increment pointers + MOVD $256(R2), R2 + MOVD $256(R3), R3 + + CMPBNE R4, $0, chacha + CMPUBEQ R12, $255, return + EXRL $·mvcBufToDst(SB), R12 // len was updated during setup + +return: + VSTEF $0, CTR, (R7) + RET + +tail: + MOVD R2, R9 + MOVD R8, R2 + MOVD R8, R3 + MOVD $0, R4 + JMP continue + +// func hasVectorFacility() bool +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + +novector: + MOVB $0, ret+0(FP) // no vx + RET diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go new file mode 100644 index 0000000..7ed1cd9 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go @@ -0,0 +1,227 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package ChaCha20 implements the core ChaCha20 function as specified +// in https://tools.ietf.org/html/rfc7539#section-2.3. 
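Before the implementation, a sketch of the keystream layout that the chacha20poly1305 generic code earlier in this diff builds on: keystream block 0 supplies the one-time Poly1305 key, the rest of block 0 is discarded with Advance, and the payload is encrypted from block 1 onward. The import path is internal to x/crypto, so this compiles only inside that module; treat it as an illustration of the data flow, not a usable API:

    package chacha20poly1305 // sketch only: internal/chacha20 is not importable outside x/crypto

    import "golang.org/x/crypto/internal/chacha20"

    // keystreamLayout mirrors sealGeneric's use of the cipher.
    func keystreamLayout(key [8]uint32, nonce [3]uint32, plaintext []byte) (polyKey [32]byte, ct []byte) {
        s := chacha20.New(key, nonce)
        s.XORKeyStream(polyKey[:], polyKey[:]) // XOR with zeros = the raw first 32 keystream bytes
        s.Advance()                            // skip the remaining 32 bytes of block 0
        ct = make([]byte, len(plaintext))
        s.XORKeyStream(ct, plaintext) // encryption starts at block 1
        return polyKey, ct
    }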
+package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" +) + +// assert that *Cipher implements cipher.Stream +var _ cipher.Stream = (*Cipher)(nil) + +// Cipher is a stateful instance of ChaCha20 using a particular key +// and nonce. A *Cipher implements the cipher.Stream interface. +type Cipher struct { + key [8]uint32 + counter uint32 // incremented after each block + nonce [3]uint32 + buf [bufSize]byte // buffer for unused keystream bytes + len int // number of unused keystream bytes at end of buf +} + +// New creates a new ChaCha20 stream cipher with the given key and nonce. +// The initial counter value is set to 0. +func New(key [8]uint32, nonce [3]uint32) *Cipher { + return &Cipher{key: key, nonce: nonce} +} + +// XORKeyStream XORs each byte in the given slice with a byte from the +// cipher's key stream. Dst and src must overlap entirely or not at all. +// +// If len(dst) < len(src), XORKeyStream will panic. It is acceptable +// to pass a dst bigger than src, and in that case, XORKeyStream will +// only update dst[:len(src)] and will not touch the rest of dst. +// +// Multiple calls to XORKeyStream behave as if the concatenation of +// the src buffers was passed in a single run. That is, Cipher +// maintains state and does not reset at each XORKeyStream call. +func (s *Cipher) XORKeyStream(dst, src []byte) { + // xor src with buffered keystream first + if s.len != 0 { + buf := s.buf[len(s.buf)-s.len:] + if len(src) < len(buf) { + buf = buf[:len(src)] + } + td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint + for i, b := range buf { + td[i] = ts[i] ^ b + } + s.len -= len(buf) + if s.len != 0 { + return + } + s.buf = [len(s.buf)]byte{} // zero the empty buffer + src = src[len(buf):] + dst = dst[len(buf):] + } + + if len(src) == 0 { + return + } + if haveAsm { + s.xorKeyStreamAsm(dst, src) + return + } + + // set up a 64-byte buffer to pad out the final block if needed + // (hoisted out of the main loop to avoid spills) + rem := len(src) % 64 // length of final block + fin := len(src) - rem // index of final block + if rem > 0 { + copy(s.buf[len(s.buf)-64:], src[fin:]) + } + + // qr calculates a quarter round + qr := func(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { + a += b + d ^= a + d = (d << 16) | (d >> 16) + c += d + b ^= c + b = (b << 12) | (b >> 20) + a += b + d ^= a + d = (d << 8) | (d >> 24) + c += d + b ^= c + b = (b << 7) | (b >> 25) + return a, b, c, d + } + + // ChaCha20 constants + const ( + j0 = 0x61707865 + j1 = 0x3320646e + j2 = 0x79622d32 + j3 = 0x6b206574 + ) + + // pre-calculate most of the first round + s1, s5, s9, s13 := qr(j1, s.key[1], s.key[5], s.nonce[0]) + s2, s6, s10, s14 := qr(j2, s.key[2], s.key[6], s.nonce[1]) + s3, s7, s11, s15 := qr(j3, s.key[3], s.key[7], s.nonce[2]) + + n := len(src) + src, dst = src[:n:n], dst[:n:n] // BCE hint + for i := 0; i < n; i += 64 { + // calculate the remainder of the first round + s0, s4, s8, s12 := qr(j0, s.key[0], s.key[4], s.counter) + + // execute the second round + x0, x5, x10, x15 := qr(s0, s5, s10, s15) + x1, x6, x11, x12 := qr(s1, s6, s11, s12) + x2, x7, x8, x13 := qr(s2, s7, s8, s13) + x3, x4, x9, x14 := qr(s3, s4, s9, s14) + + // execute the remaining 18 rounds + for i := 0; i < 9; i++ { + x0, x4, x8, x12 = qr(x0, x4, x8, x12) + x1, x5, x9, x13 = qr(x1, x5, x9, x13) + x2, x6, x10, x14 = qr(x2, x6, x10, x14) + x3, x7, x11, x15 = qr(x3, x7, x11, x15) + + x0, x5, x10, x15 = qr(x0, x5, x10, x15) + x1, x6, x11, x12 = qr(x1, x6, x11, x12) + x2, x7, x8, x13 = qr(x2, x7, x8, x13) + x3, x4, x9, x14 
= qr(x3, x4, x9, x14) + } + + x0 += j0 + x1 += j1 + x2 += j2 + x3 += j3 + + x4 += s.key[0] + x5 += s.key[1] + x6 += s.key[2] + x7 += s.key[3] + x8 += s.key[4] + x9 += s.key[5] + x10 += s.key[6] + x11 += s.key[7] + + x12 += s.counter + x13 += s.nonce[0] + x14 += s.nonce[1] + x15 += s.nonce[2] + + // increment the counter + s.counter += 1 + if s.counter == 0 { + panic("chacha20: counter overflow") + } + + // pad to 64 bytes if needed + in, out := src[i:], dst[i:] + if i == fin { + // src[fin:] has already been copied into s.buf before + // the main loop + in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:] + } + in, out = in[:64], out[:64] // BCE hint + + // XOR the key stream with the source and write out the result + xor(out[0:], in[0:], x0) + xor(out[4:], in[4:], x1) + xor(out[8:], in[8:], x2) + xor(out[12:], in[12:], x3) + xor(out[16:], in[16:], x4) + xor(out[20:], in[20:], x5) + xor(out[24:], in[24:], x6) + xor(out[28:], in[28:], x7) + xor(out[32:], in[32:], x8) + xor(out[36:], in[36:], x9) + xor(out[40:], in[40:], x10) + xor(out[44:], in[44:], x11) + xor(out[48:], in[48:], x12) + xor(out[52:], in[52:], x13) + xor(out[56:], in[56:], x14) + xor(out[60:], in[60:], x15) + } + // copy any trailing bytes out of the buffer and into dst + if rem != 0 { + s.len = 64 - rem + copy(dst[fin:], s.buf[len(s.buf)-64:]) + } +} + +// Advance discards bytes in the key stream until the next 64 byte block +// boundary is reached and updates the counter accordingly. If the key +// stream is already at a block boundary no bytes will be discarded and +// the counter will be unchanged. +func (s *Cipher) Advance() { + s.len -= s.len % 64 + if s.len == 0 { + s.buf = [len(s.buf)]byte{} + } +} + +// XORKeyStream crypts bytes from in to out using the given key and counters. +// In and out must overlap entirely or not at all. Counter contains the raw +// ChaCha20 counter bytes (i.e. block counter followed by nonce). +func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { + s := Cipher{ + key: [8]uint32{ + binary.LittleEndian.Uint32(key[0:4]), + binary.LittleEndian.Uint32(key[4:8]), + binary.LittleEndian.Uint32(key[8:12]), + binary.LittleEndian.Uint32(key[12:16]), + binary.LittleEndian.Uint32(key[16:20]), + binary.LittleEndian.Uint32(key[20:24]), + binary.LittleEndian.Uint32(key[24:28]), + binary.LittleEndian.Uint32(key[28:32]), + }, + nonce: [3]uint32{ + binary.LittleEndian.Uint32(counter[4:8]), + binary.LittleEndian.Uint32(counter[8:12]), + binary.LittleEndian.Uint32(counter[12:16]), + }, + counter: binary.LittleEndian.Uint32(counter[0:4]), + } + s.XORKeyStream(out, in) +} diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go new file mode 100644 index 0000000..91520d1 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go @@ -0,0 +1,16 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !s390x gccgo appengine + +package chacha20 + +const ( + bufSize = 64 + haveAsm = false +) + +func (*Cipher) xorKeyStreamAsm(dst, src []byte) { + panic("not implemented") +} diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go new file mode 100644 index 0000000..0c1c671 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go @@ -0,0 +1,30 @@ +// Copyright 2018 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,!gccgo,!appengine + +package chacha20 + +var haveAsm = hasVectorFacility() + +const bufSize = 256 + +// hasVectorFacility reports whether the machine supports the vector +// facility (vx). +// Implementation in asm_s390x.s. +func hasVectorFacility() bool + +// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only +// be called when the vector facility is available. +// Implementation in asm_s390x.s. +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int) + +func (c *Cipher) xorKeyStreamAsm(dst, src []byte) { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len) +} + +// EXRL targets, DO NOT CALL! +func mvcSrcToBuf() +func mvcBufToDst() diff --git a/vendor/golang.org/x/crypto/internal/chacha20/xor.go b/vendor/golang.org/x/crypto/internal/chacha20/xor.go new file mode 100644 index 0000000..9c5ba0b --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/xor.go @@ -0,0 +1,43 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20 + +import ( + "runtime" +) + +// Platforms that have fast unaligned 32-bit little endian accesses. +const unaligned = runtime.GOARCH == "386" || + runtime.GOARCH == "amd64" || + runtime.GOARCH == "arm64" || + runtime.GOARCH == "ppc64le" || + runtime.GOARCH == "s390x" + +// xor reads a little endian uint32 from src, XORs it with u and +// places the result in little endian byte order in dst. +func xor(dst, src []byte, u uint32) { + _, _ = src[3], dst[3] // eliminate bounds checks + if unaligned { + // The compiler should optimize this code into + // 32-bit unaligned little endian loads and stores. + // TODO: delete once the compiler does a reliably + // good job with the generic code below. + // See issue #25111 for more details. + v := uint32(src[0]) + v |= uint32(src[1]) << 8 + v |= uint32(src[2]) << 16 + v |= uint32(src[3]) << 24 + v ^= u + dst[0] = byte(v) + dst[1] = byte(v >> 8) + dst[2] = byte(v >> 16) + dst[3] = byte(v >> 24) + } else { + dst[0] = src[0] ^ byte(u) + dst[1] = src[1] ^ byte(u>>8) + dst[2] = src[2] ^ byte(u>>16) + dst[3] = src[3] ^ byte(u>>24) + } +} diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go new file mode 100644 index 0000000..f562fa5 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go @@ -0,0 +1,33 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package poly1305 implements Poly1305 one-time message authentication code as +specified in https://cr.yp.to/mac/poly1305-20050329.pdf. + +Poly1305 is a fast, one-time authentication function. It is infeasible for an +attacker to generate an authenticator for a message without the key. However, a +key must only be used for a single message. Authenticating two different +messages with the same key allows an attacker to forge authenticators for other +messages with the same key. + +Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was +used with a fixed key in order to generate one-time keys from a nonce.
+However, in this package AES isn't used and the one-time key is specified +directly. +*/ +package poly1305 // import "golang.org/x/crypto/poly1305" + +import "crypto/subtle" + +// TagSize is the size, in bytes, of a poly1305 authenticator. +const TagSize = 16 + +// Verify returns true if mac is a valid authenticator for m with the given +// key. +func Verify(mac *[16]byte, m []byte, key *[32]byte) bool { + var tmp [16]byte + Sum(&tmp, m, key) + return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1 +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go new file mode 100644 index 0000000..4dd72fe --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64,!gccgo,!appengine + +package poly1305 + +// This function is implemented in sum_amd64.s +//go:noescape +func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte) + +// Sum generates an authenticator for m using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + var mPtr *byte + if len(m) > 0 { + mPtr = &m[0] + } + poly1305(out, mPtr, uint64(len(m)), key) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s new file mode 100644 index 0000000..2edae63 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s @@ -0,0 +1,125 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
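The package API added above is just Sum and Verify over a one-time 32-byte key. A self-contained sketch of one-shot use follows; the key bytes are illustrative, and a real key must be fresh, secret keystream that never authenticates two messages:

    package main

    import (
        "fmt"

        "golang.org/x/crypto/poly1305"
    )

    func main() {
        var key [32]byte // illustrative; derive from keystream in real use
        copy(key[:], "illustrative one-time key, 32 b!")

        msg := []byte("attested message")
        var tag [poly1305.TagSize]byte
        poly1305.Sum(&tag, msg, &key)
        fmt.Println(poly1305.Verify(&tag, msg, &key)) // true
    }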
+ +// +build amd64,!gccgo,!appengine + +#include "textflag.h" + +#define POLY1305_ADD(msg, h0, h1, h2) \ + ADDQ 0(msg), h0; \ + ADCQ 8(msg), h1; \ + ADCQ $1, h2; \ + LEAQ 16(msg), msg + +#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ + MOVQ r0, AX; \ + MULQ h0; \ + MOVQ AX, t0; \ + MOVQ DX, t1; \ + MOVQ r0, AX; \ + MULQ h1; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ r0, t2; \ + IMULQ h2, t2; \ + ADDQ DX, t2; \ + \ + MOVQ r1, AX; \ + MULQ h0; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ DX, h0; \ + MOVQ r1, t3; \ + IMULQ h2, t3; \ + MOVQ r1, AX; \ + MULQ h1; \ + ADDQ AX, t2; \ + ADCQ DX, t3; \ + ADDQ h0, t2; \ + ADCQ $0, t3; \ + \ + MOVQ t0, h0; \ + MOVQ t1, h1; \ + MOVQ t2, h2; \ + ANDQ $3, h2; \ + MOVQ t2, t0; \ + ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ + ADDQ t0, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2; \ + SHRQ $2, t3, t2; \ + SHRQ $2, t3; \ + ADDQ t2, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2 + +DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +GLOBL ·poly1305Mask<>(SB), RODATA, $16 + +// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte) +TEXT ·poly1305(SB), $0-32 + MOVQ out+0(FP), DI + MOVQ m+8(FP), SI + MOVQ mlen+16(FP), R15 + MOVQ key+24(FP), AX + + MOVQ 0(AX), R11 + MOVQ 8(AX), R12 + ANDQ ·poly1305Mask<>(SB), R11 // r0 + ANDQ ·poly1305Mask<>+8(SB), R12 // r1 + XORQ R8, R8 // h0 + XORQ R9, R9 // h1 + XORQ R10, R10 // h2 + + CMPQ R15, $16 + JB bytes_between_0_and_15 + +loop: + POLY1305_ADD(SI, R8, R9, R10) + +multiply: + POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) + SUBQ $16, R15 + CMPQ R15, $16 + JAE loop + +bytes_between_0_and_15: + TESTQ R15, R15 + JZ done + MOVQ $1, BX + XORQ CX, CX + XORQ R13, R13 + ADDQ R15, SI + +flush_buffer: + SHLQ $8, BX, CX + SHLQ $8, BX + MOVB -1(SI), R13 + XORQ R13, BX + DECQ SI + DECQ R15 + JNZ flush_buffer + + ADDQ BX, R8 + ADCQ CX, R9 + ADCQ $0, R10 + MOVQ $16, R15 + JMP multiply + +done: + MOVQ R8, AX + MOVQ R9, BX + SUBQ $0xFFFFFFFFFFFFFFFB, AX + SBBQ $0xFFFFFFFFFFFFFFFF, BX + SBBQ $3, R10 + CMOVQCS R8, AX + CMOVQCS R9, BX + MOVQ key+24(FP), R8 + ADDQ 16(R8), AX + ADCQ 24(R8), BX + + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.go b/vendor/golang.org/x/crypto/poly1305/sum_arm.go new file mode 100644 index 0000000..5dc321c --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.go @@ -0,0 +1,22 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm,!gccgo,!appengine,!nacl + +package poly1305 + +// This function is implemented in sum_arm.s +//go:noescape +func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte) + +// Sum generates an authenticator for m using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + var mPtr *byte + if len(m) > 0 { + mPtr = &m[0] + } + poly1305_auth_armv6(out, mPtr, uint32(len(m)), key) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.s b/vendor/golang.org/x/crypto/poly1305/sum_arm.s new file mode 100644 index 0000000..f70b4ac --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.s @@ -0,0 +1,427 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +// +build arm,!gccgo,!appengine,!nacl + +#include "textflag.h" + +// This code was translated into a form compatible with 5a from the public +// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305. + +DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff +DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03 +DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff +DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff +DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff +GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20 + +// Warning: the linker may use R11 to synthesize certain instructions. Please +// take care and verify that no synthetic instructions use it. + +TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 + // Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It + // might look like it's only 60 bytes of space but the final four bytes + // will be written by another function.) We need to skip over four + // bytes of stack because that's saving the value of 'g'. + ADD $4, R13, R8 + MOVM.IB [R4-R7], (R8) + MOVM.IA.W (R1), [R2-R5] + MOVW $·poly1305_init_constants_armv6<>(SB), R7 + MOVW R2, R8 + MOVW R2>>26, R9 + MOVW R3>>20, g + MOVW R4>>14, R11 + MOVW R5>>8, R12 + ORR R3<<6, R9, R9 + ORR R4<<12, g, g + ORR R5<<18, R11, R11 + MOVM.IA (R7), [R2-R6] + AND R8, R2, R2 + AND R9, R3, R3 + AND g, R4, R4 + AND R11, R5, R5 + AND R12, R6, R6 + MOVM.IA.W [R2-R6], (R0) + EOR R2, R2, R2 + EOR R3, R3, R3 + EOR R4, R4, R4 + EOR R5, R5, R5 + EOR R6, R6, R6 + MOVM.IA.W [R2-R6], (R0) + MOVM.IA.W (R1), [R2-R5] + MOVM.IA [R2-R6], (R0) + ADD $20, R13, R0 + MOVM.DA (R0), [R4-R7] + RET + +#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \ + MOVBU (offset+0)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+0)(Rdst); \ + MOVBU (offset+1)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+1)(Rdst); \ + MOVBU (offset+2)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+2)(Rdst); \ + MOVBU (offset+3)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+3)(Rdst) + +TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0 + // Needs 24 bytes of stack for saved registers and then 88 bytes of + // scratch space after that. We assume that 24 bytes at (R13) have + // already been used: four bytes for the link register saved in the + // prelude of poly1305_auth_armv6, four bytes for saving the value of g + // in that function and 16 bytes of scratch space used around + // poly1305_finish_ext_armv6_skip1. 
+ ADD $24, R13, R12 + MOVM.IB [R4-R8, R14], (R12) + MOVW R0, 88(R13) + MOVW R1, 92(R13) + MOVW R2, 96(R13) + MOVW R1, R14 + MOVW R2, R12 + MOVW 56(R0), R8 + WORD $0xe1180008 // TST R8, R8 not working see issue 5921 + EOR R6, R6, R6 + MOVW.EQ $(1<<24), R6 + MOVW R6, 84(R13) + ADD $116, R13, g + MOVM.IA (R0), [R0-R9] + MOVM.IA [R0-R4], (g) + CMP $16, R12 + BLO poly1305_blocks_armv6_done + +poly1305_blocks_armv6_mainloop: + WORD $0xe31e0003 // TST R14, #3 not working see issue 5921 + BEQ poly1305_blocks_armv6_mainloop_aligned + ADD $100, R13, g + MOVW_UNALIGNED(R14, g, R0, 0) + MOVW_UNALIGNED(R14, g, R0, 4) + MOVW_UNALIGNED(R14, g, R0, 8) + MOVW_UNALIGNED(R14, g, R0, 12) + MOVM.IA (g), [R0-R3] + ADD $16, R14 + B poly1305_blocks_armv6_mainloop_loaded + +poly1305_blocks_armv6_mainloop_aligned: + MOVM.IA.W (R14), [R0-R3] + +poly1305_blocks_armv6_mainloop_loaded: + MOVW R0>>26, g + MOVW R1>>20, R11 + MOVW R2>>14, R12 + MOVW R14, 92(R13) + MOVW R3>>8, R4 + ORR R1<<6, g, g + ORR R2<<12, R11, R11 + ORR R3<<18, R12, R12 + BIC $0xfc000000, R0, R0 + BIC $0xfc000000, g, g + MOVW 84(R13), R3 + BIC $0xfc000000, R11, R11 + BIC $0xfc000000, R12, R12 + ADD R0, R5, R5 + ADD g, R6, R6 + ORR R3, R4, R4 + ADD R11, R7, R7 + ADD $116, R13, R14 + ADD R12, R8, R8 + ADD R4, R9, R9 + MOVM.IA (R14), [R0-R4] + MULLU R4, R5, (R11, g) + MULLU R3, R5, (R14, R12) + MULALU R3, R6, (R11, g) + MULALU R2, R6, (R14, R12) + MULALU R2, R7, (R11, g) + MULALU R1, R7, (R14, R12) + ADD R4<<2, R4, R4 + ADD R3<<2, R3, R3 + MULALU R1, R8, (R11, g) + MULALU R0, R8, (R14, R12) + MULALU R0, R9, (R11, g) + MULALU R4, R9, (R14, R12) + MOVW g, 76(R13) + MOVW R11, 80(R13) + MOVW R12, 68(R13) + MOVW R14, 72(R13) + MULLU R2, R5, (R11, g) + MULLU R1, R5, (R14, R12) + MULALU R1, R6, (R11, g) + MULALU R0, R6, (R14, R12) + MULALU R0, R7, (R11, g) + MULALU R4, R7, (R14, R12) + ADD R2<<2, R2, R2 + ADD R1<<2, R1, R1 + MULALU R4, R8, (R11, g) + MULALU R3, R8, (R14, R12) + MULALU R3, R9, (R11, g) + MULALU R2, R9, (R14, R12) + MOVW g, 60(R13) + MOVW R11, 64(R13) + MOVW R12, 52(R13) + MOVW R14, 56(R13) + MULLU R0, R5, (R11, g) + MULALU R4, R6, (R11, g) + MULALU R3, R7, (R11, g) + MULALU R2, R8, (R11, g) + MULALU R1, R9, (R11, g) + ADD $52, R13, R0 + MOVM.IA (R0), [R0-R7] + MOVW g>>26, R12 + MOVW R4>>26, R14 + ORR R11<<6, R12, R12 + ORR R5<<6, R14, R14 + BIC $0xfc000000, g, g + BIC $0xfc000000, R4, R4 + ADD.S R12, R0, R0 + ADC $0, R1, R1 + ADD.S R14, R6, R6 + ADC $0, R7, R7 + MOVW R0>>26, R12 + MOVW R6>>26, R14 + ORR R1<<6, R12, R12 + ORR R7<<6, R14, R14 + BIC $0xfc000000, R0, R0 + BIC $0xfc000000, R6, R6 + ADD R14<<2, R14, R14 + ADD.S R12, R2, R2 + ADC $0, R3, R3 + ADD R14, g, g + MOVW R2>>26, R12 + MOVW g>>26, R14 + ORR R3<<6, R12, R12 + BIC $0xfc000000, g, R5 + BIC $0xfc000000, R2, R7 + ADD R12, R4, R4 + ADD R14, R0, R0 + MOVW R4>>26, R12 + BIC $0xfc000000, R4, R8 + ADD R12, R6, R9 + MOVW 96(R13), R12 + MOVW 92(R13), R14 + MOVW R0, R6 + CMP $32, R12 + SUB $16, R12, R12 + MOVW R12, 96(R13) + BHS poly1305_blocks_armv6_mainloop + +poly1305_blocks_armv6_done: + MOVW 88(R13), R12 + MOVW R5, 20(R12) + MOVW R6, 24(R12) + MOVW R7, 28(R12) + MOVW R8, 32(R12) + MOVW R9, 36(R12) + ADD $48, R13, R0 + MOVM.DA (R0), [R4-R8, R14] + RET + +#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \ + MOVBU.P 1(Rsrc), Rtmp; \ + MOVBU.P Rtmp, 1(Rdst); \ + MOVBU.P 1(Rsrc), Rtmp; \ + MOVBU.P Rtmp, 1(Rdst) + +#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \ + MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \ + MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) + +// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen 
uint32, key *[32]byte) +TEXT ·poly1305_auth_armv6(SB), $196-16 + // The value 196, just above, is the sum of 64 (the size of the context + // structure) and 132 (the amount of stack needed). + // + // At this point, the stack pointer (R13) has been moved down. It + // points to the saved link register and there's 196 bytes of free + // space above it. + // + // The stack for this function looks like: + // + // +--------------------- + // | + // | 64 bytes of context structure + // | + // +--------------------- + // | + // | 112 bytes for poly1305_blocks_armv6 + // | + // +--------------------- + // | 16 bytes of final block, constructed at + // | poly1305_finish_ext_armv6_skip8 + // +--------------------- + // | four bytes of saved 'g' + // +--------------------- + // | lr, saved by prelude <- R13 points here + // +--------------------- + MOVW g, 4(R13) + + MOVW out+0(FP), R4 + MOVW m+4(FP), R5 + MOVW mlen+8(FP), R6 + MOVW key+12(FP), R7 + + ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112 + MOVW R7, R1 + + // poly1305_init_ext_armv6 will write to the stack from R13+4, but + // that's ok because none of the other values have been written yet. + BL poly1305_init_ext_armv6<>(SB) + BIC.S $15, R6, R2 + BEQ poly1305_auth_armv6_noblocks + ADD $136, R13, R0 + MOVW R5, R1 + ADD R2, R5, R5 + SUB R2, R6, R6 + BL poly1305_blocks_armv6<>(SB) + +poly1305_auth_armv6_noblocks: + ADD $136, R13, R0 + MOVW R5, R1 + MOVW R6, R2 + MOVW R4, R3 + + MOVW R0, R5 + MOVW R1, R6 + MOVW R2, R7 + MOVW R3, R8 + AND.S R2, R2, R2 + BEQ poly1305_finish_ext_armv6_noremaining + EOR R0, R0 + ADD $8, R13, R9 // 8 = offset to 16 byte scratch space + MOVW R0, (R9) + MOVW R0, 4(R9) + MOVW R0, 8(R9) + MOVW R0, 12(R9) + WORD $0xe3110003 // TST R1, #3 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_aligned + WORD $0xe3120008 // TST R2, #8 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip8 + MOVWP_UNALIGNED(R1, R9, g) + MOVWP_UNALIGNED(R1, R9, g) + +poly1305_finish_ext_armv6_skip8: + WORD $0xe3120004 // TST $4, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip4 + MOVWP_UNALIGNED(R1, R9, g) + +poly1305_finish_ext_armv6_skip4: + WORD $0xe3120002 // TST $2, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip2 + MOVHUP_UNALIGNED(R1, R9, g) + B poly1305_finish_ext_armv6_skip2 + +poly1305_finish_ext_armv6_aligned: + WORD $0xe3120008 // TST R2, #8 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip8_aligned + MOVM.IA.W (R1), [g-R11] + MOVM.IA.W [g-R11], (R9) + +poly1305_finish_ext_armv6_skip8_aligned: + WORD $0xe3120004 // TST $4, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip4_aligned + MOVW.P 4(R1), g + MOVW.P g, 4(R9) + +poly1305_finish_ext_armv6_skip4_aligned: + WORD $0xe3120002 // TST $2, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip2 + MOVHU.P 2(R1), g + MOVH.P g, 2(R9) + +poly1305_finish_ext_armv6_skip2: + WORD $0xe3120001 // TST $1, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip1 + MOVBU.P 1(R1), g + MOVBU.P g, 1(R9) + +poly1305_finish_ext_armv6_skip1: + MOVW $1, R11 + MOVBU R11, 0(R9) + MOVW R11, 56(R5) + MOVW R5, R0 + ADD $8, R13, R1 + MOVW $16, R2 + BL poly1305_blocks_armv6<>(SB) + +poly1305_finish_ext_armv6_noremaining: + MOVW 20(R5), R0 + MOVW 24(R5), R1 + MOVW 28(R5), R2 + MOVW 32(R5), R3 + MOVW 36(R5), R4 + MOVW R4>>26, R12 + BIC $0xfc000000, R4, R4 + ADD R12<<2, R12, R12 + ADD R12, R0, R0 + MOVW R0>>26, R12 + BIC $0xfc000000, R0, R0 + ADD R12, R1, R1 + MOVW R1>>26, R12 + BIC $0xfc000000, R1, R1 + ADD
R12, R2, R2 + MOVW R2>>26, R12 + BIC $0xfc000000, R2, R2 + ADD R12, R3, R3 + MOVW R3>>26, R12 + BIC $0xfc000000, R3, R3 + ADD R12, R4, R4 + ADD $5, R0, R6 + MOVW R6>>26, R12 + BIC $0xfc000000, R6, R6 + ADD R12, R1, R7 + MOVW R7>>26, R12 + BIC $0xfc000000, R7, R7 + ADD R12, R2, g + MOVW g>>26, R12 + BIC $0xfc000000, g, g + ADD R12, R3, R11 + MOVW $-(1<<26), R12 + ADD R11>>26, R12, R12 + BIC $0xfc000000, R11, R11 + ADD R12, R4, R9 + MOVW R9>>31, R12 + SUB $1, R12 + AND R12, R6, R6 + AND R12, R7, R7 + AND R12, g, g + AND R12, R11, R11 + AND R12, R9, R9 + MVN R12, R12 + AND R12, R0, R0 + AND R12, R1, R1 + AND R12, R2, R2 + AND R12, R3, R3 + AND R12, R4, R4 + ORR R6, R0, R0 + ORR R7, R1, R1 + ORR g, R2, R2 + ORR R11, R3, R3 + ORR R9, R4, R4 + ORR R1<<26, R0, R0 + MOVW R1>>6, R1 + ORR R2<<20, R1, R1 + MOVW R2>>12, R2 + ORR R3<<14, R2, R2 + MOVW R3>>18, R3 + ORR R4<<8, R3, R3 + MOVW 40(R5), R6 + MOVW 44(R5), R7 + MOVW 48(R5), g + MOVW 52(R5), R11 + ADD.S R6, R0, R0 + ADC.S R7, R1, R1 + ADC.S g, R2, R2 + ADC.S R11, R3, R3 + MOVM.IA [R0-R3], (R8) + MOVW R5, R12 + EOR R0, R0, R0 + EOR R1, R1, R1 + EOR R2, R2, R2 + EOR R3, R3, R3 + EOR R4, R4, R4 + EOR R5, R5, R5 + EOR R6, R6, R6 + EOR R7, R7, R7 + MOVM.IA.W [R0-R7], (R12) + MOVM.IA [R0-R7], (R12) + MOVW 4(R13), g + RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go new file mode 100644 index 0000000..751eec5 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go @@ -0,0 +1,14 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl + +package poly1305 + +// Sum generates an authenticator for msg using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) { + sumGeneric(out, msg, key) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ref.go b/vendor/golang.org/x/crypto/poly1305/sum_ref.go new file mode 100644 index 0000000..c4d59bd --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_ref.go @@ -0,0 +1,139 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package poly1305 + +import "encoding/binary" + +// sumGeneric generates an authenticator for msg using a one-time key and +// puts the 16-byte result into out. This is the generic implementation of +// Sum and should be called if no assembly implementation is available. 
+func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) { + var ( + h0, h1, h2, h3, h4 uint32 // the hash accumulators + r0, r1, r2, r3, r4 uint64 // the r part of the key + ) + + r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff) + r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03) + r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff) + r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff) + r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff) + + R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5 + + for len(msg) >= TagSize { + // h += msg + h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff + h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff + h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff + h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff + h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24) + + // h *= r + d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1) + d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2) + d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3) + d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4) + d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0) + + // h %= p + h0 = uint32(d0) & 0x3ffffff + h1 = uint32(d1) & 0x3ffffff + h2 = uint32(d2) & 0x3ffffff + h3 = uint32(d3) & 0x3ffffff + h4 = uint32(d4) & 0x3ffffff + + h0 += uint32(d4>>26) * 5 + h1 += h0 >> 26 + h0 = h0 & 0x3ffffff + + msg = msg[TagSize:] + } + + if len(msg) > 0 { + var block [TagSize]byte + off := copy(block[:], msg) + block[off] = 0x01 + + // h += msg + h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff + h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff + h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff + h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff + h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8) + + // h *= r + d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1) + d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2) + d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3) + d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4) + d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0) + + // h %= p + h0 = uint32(d0) & 0x3ffffff + h1 = uint32(d1) & 0x3ffffff + h2 = uint32(d2) & 0x3ffffff + h3 = uint32(d3) & 0x3ffffff + h4 = uint32(d4) & 0x3ffffff + + h0 += uint32(d4>>26) * 5 + h1 += h0 >> 26 + h0 = h0 & 0x3ffffff + } + + // h %= p reduction + h2 += h1 >> 26 + h1 &= 0x3ffffff + h3 += h2 >> 26 + h2 &= 0x3ffffff + h4 += h3 >> 26 + h3 &= 0x3ffffff + h0 += 5 * (h4 >> 26) + h4 &= 0x3ffffff + h1 += h0 >> 26 + h0 &= 0x3ffffff + + // h - p + t0 := h0 + 5 + t1 := h1 + (t0 >> 26) + t2 := h2 + (t1 >> 26) + t3 := h3 + (t2 >> 26) + t4 := h4 + (t3 >> 26) - (1 << 26) + t0 &= 0x3ffffff + t1 &= 0x3ffffff + t2 &= 0x3ffffff + t3 &= 0x3ffffff + + // select h if h < p else h - p + t_mask := (t4 >> 31) - 1 + h_mask := ^t_mask + h0 = (h0 & h_mask) | 
(t0 & t_mask) + h1 = (h1 & h_mask) | (t1 & t_mask) + h2 = (h2 & h_mask) | (t2 & t_mask) + h3 = (h3 & h_mask) | (t3 & t_mask) + h4 = (h4 & h_mask) | (t4 & t_mask) + + // h %= 2^128 + h0 |= h1 << 26 + h1 = ((h1 >> 6) | (h2 << 20)) + h2 = ((h2 >> 12) | (h3 << 14)) + h3 = ((h3 >> 18) | (h4 << 8)) + + // s: the s part of the key + // tag = (h + s) % (2^128) + t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:])) + h0 = uint32(t) + t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32) + h1 = uint32(t) + t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32) + h2 = uint32(t) + t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32) + h3 = uint32(t) + + binary.LittleEndian.PutUint32(out[0:], h0) + binary.LittleEndian.PutUint32(out[4:], h1) + binary.LittleEndian.PutUint32(out[8:], h2) + binary.LittleEndian.PutUint32(out[12:], h3) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go new file mode 100644 index 0000000..7a266ce --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go @@ -0,0 +1,49 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,go1.11,!gccgo,!appengine + +package poly1305 + +// hasVectorFacility reports whether the machine supports +// the vector facility (vx). +func hasVectorFacility() bool + +// hasVMSLFacility reports whether the machine supports +// Vector Multiply Sum Logical (VMSL). +func hasVMSLFacility() bool + +var hasVX = hasVectorFacility() +var hasVMSL = hasVMSLFacility() + +// poly1305vx is an assembly implementation of Poly1305 that uses vector +// instructions. It must only be called if the vector facility (vx) is +// available. +//go:noescape +func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte) + +// poly1305vmsl is an assembly implementation of Poly1305 that uses vector +// instructions, including VMSL. It must only be called if the vector facility (vx) is +// available and if VMSL is supported. +//go:noescape +func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte) + +// Sum generates an authenticator for m using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + if hasVX { + var mPtr *byte + if len(m) > 0 { + mPtr = &m[0] + } + if hasVMSL && len(m) > 256 { + poly1305vmsl(out, mPtr, uint64(len(m)), key) + } else { + poly1305vx(out, mPtr, uint64(len(m)), key) + } + } else { + sumGeneric(out, m, key) + } +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s new file mode 100644 index 0000000..356c07a --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s @@ -0,0 +1,400 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,go1.11,!gccgo,!appengine + +#include "textflag.h" + +// Implementation of Poly1305 using the vector facility (vx). 
+ +// constants +#define MOD26 V0 +#define EX0 V1 +#define EX1 V2 +#define EX2 V3 + +// temporaries +#define T_0 V4 +#define T_1 V5 +#define T_2 V6 +#define T_3 V7 +#define T_4 V8 + +// key (r) +#define R_0 V9 +#define R_1 V10 +#define R_2 V11 +#define R_3 V12 +#define R_4 V13 +#define R5_1 V14 +#define R5_2 V15 +#define R5_3 V16 +#define R5_4 V17 +#define RSAVE_0 R5 +#define RSAVE_1 R6 +#define RSAVE_2 R7 +#define RSAVE_3 R8 +#define RSAVE_4 R9 +#define R5SAVE_1 V28 +#define R5SAVE_2 V29 +#define R5SAVE_3 V30 +#define R5SAVE_4 V31 + +// message block +#define F_0 V18 +#define F_1 V19 +#define F_2 V20 +#define F_3 V21 +#define F_4 V22 + +// accumulator +#define H_0 V23 +#define H_1 V24 +#define H_2 V25 +#define H_3 V26 +#define H_4 V27 + +GLOBL ·keyMask<>(SB), RODATA, $16 +DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f +DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f + +GLOBL ·bswapMask<>(SB), RODATA, $16 +DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908 +DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100 + +GLOBL ·constants<>(SB), RODATA, $64 +// MOD26 +DATA ·constants<>+0(SB)/8, $0x3ffffff +DATA ·constants<>+8(SB)/8, $0x3ffffff +// EX0 +DATA ·constants<>+16(SB)/8, $0x0006050403020100 +DATA ·constants<>+24(SB)/8, $0x1016151413121110 +// EX1 +DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706 +DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716 +// EX2 +DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d +DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d + +// h = (f*g) % (2**130-5) [partial reduction] +#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \ + VMLOF f0, g0, h0 \ + VMLOF f0, g1, h1 \ + VMLOF f0, g2, h2 \ + VMLOF f0, g3, h3 \ + VMLOF f0, g4, h4 \ + VMLOF f1, g54, T_0 \ + VMLOF f1, g0, T_1 \ + VMLOF f1, g1, T_2 \ + VMLOF f1, g2, T_3 \ + VMLOF f1, g3, T_4 \ + VMALOF f2, g53, h0, h0 \ + VMALOF f2, g54, h1, h1 \ + VMALOF f2, g0, h2, h2 \ + VMALOF f2, g1, h3, h3 \ + VMALOF f2, g2, h4, h4 \ + VMALOF f3, g52, T_0, T_0 \ + VMALOF f3, g53, T_1, T_1 \ + VMALOF f3, g54, T_2, T_2 \ + VMALOF f3, g0, T_3, T_3 \ + VMALOF f3, g1, T_4, T_4 \ + VMALOF f4, g51, h0, h0 \ + VMALOF f4, g52, h1, h1 \ + VMALOF f4, g53, h2, h2 \ + VMALOF f4, g54, h3, h3 \ + VMALOF f4, g0, h4, h4 \ + VAG T_0, h0, h0 \ + VAG T_1, h1, h1 \ + VAG T_2, h2, h2 \ + VAG T_3, h3, h3 \ + VAG T_4, h4, h4 + +// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4 +#define REDUCE(h0, h1, h2, h3, h4) \ + VESRLG $26, h0, T_0 \ + VESRLG $26, h3, T_1 \ + VN MOD26, h0, h0 \ + VN MOD26, h3, h3 \ + VAG T_0, h1, h1 \ + VAG T_1, h4, h4 \ + VESRLG $26, h1, T_2 \ + VESRLG $26, h4, T_3 \ + VN MOD26, h1, h1 \ + VN MOD26, h4, h4 \ + VESLG $2, T_3, T_4 \ + VAG T_3, T_4, T_4 \ + VAG T_2, h2, h2 \ + VAG T_4, h0, h0 \ + VESRLG $26, h2, T_0 \ + VESRLG $26, h0, T_1 \ + VN MOD26, h2, h2 \ + VN MOD26, h0, h0 \ + VAG T_0, h3, h3 \ + VAG T_1, h1, h1 \ + VESRLG $26, h3, T_2 \ + VN MOD26, h3, h3 \ + VAG T_2, h4, h4 + +// expand in0 into d[0] and in1 into d[1] +#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \ + VGBM $0x0707, d1 \ // d1=tmp + VPERM in0, in1, EX2, d4 \ + VPERM in0, in1, EX0, d0 \ + VPERM in0, in1, EX1, d2 \ + VN d1, d4, d4 \ + VESRLG $26, d0, d1 \ + VESRLG $30, d2, d3 \ + VESRLG $4, d2, d2 \ + VN MOD26, d0, d0 \ + VN MOD26, d1, d1 \ + VN MOD26, d2, d2 \ + VN MOD26, d3, d3 + +// pack h4:h0 into h1:h0 (no carry) +#define PACK(h0, h1, h2, h3, h4) \ + VESLG $26, h1, h1 \ + VESLG $26, h3, h3 \ + VO h0, h1, h0 \ + VO h2, h3, h2 \ + VESLG $4, h2, h2 \ + VLEIB $7, $48, h1 \ + VSLB h1, h2, h2 \ + VO h0, h2, h0 \ + VLEIB $7, 
$104, h1 \ + VSLB h1, h4, h3 \ + VO h3, h0, h0 \ + VLEIB $7, $24, h1 \ + VSRLB h1, h4, h1 + +// if h > 2**130-5 then h -= 2**130-5 +#define MOD(h0, h1, t0, t1, t2) \ + VZERO t0 \ + VLEIG $1, $5, t0 \ + VACCQ h0, t0, t1 \ + VAQ h0, t0, t0 \ + VONE t2 \ + VLEIG $1, $-4, t2 \ + VAQ t2, t1, t1 \ + VACCQ h1, t1, t1 \ + VONE t2 \ + VAQ t2, t1, t1 \ + VN h0, t1, t2 \ + VNC t0, t1, t1 \ + VO t1, t2, h0 + +// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte) +TEXT ·poly1305vx(SB), $0-32 + // This code processes up to 2 blocks (32 bytes) per iteration + // using the algorithm described in: + // NEON crypto, Daniel J. Bernstein & Peter Schwabe + // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key + + // load MOD26, EX0, EX1 and EX2 + MOVD $·constants<>(SB), R5 + VLM (R5), MOD26, EX2 + + // setup r + VL (R4), T_0 + MOVD $·keyMask<>(SB), R6 + VL (R6), T_1 + VN T_0, T_1, T_0 + EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4) + + // setup r*5 + VLEIG $0, $5, T_0 + VLEIG $1, $5, T_0 + + // store r (for final block) + VMLOF T_0, R_1, R5SAVE_1 + VMLOF T_0, R_2, R5SAVE_2 + VMLOF T_0, R_3, R5SAVE_3 + VMLOF T_0, R_4, R5SAVE_4 + VLGVG $0, R_0, RSAVE_0 + VLGVG $0, R_1, RSAVE_1 + VLGVG $0, R_2, RSAVE_2 + VLGVG $0, R_3, RSAVE_3 + VLGVG $0, R_4, RSAVE_4 + + // skip r**2 calculation + CMPBLE R3, $16, skip + + // calculate r**2 + MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4) + REDUCE(H_0, H_1, H_2, H_3, H_4) + VLEIG $0, $5, T_0 + VLEIG $1, $5, T_0 + VMLOF T_0, H_1, R5_1 + VMLOF T_0, H_2, R5_2 + VMLOF T_0, H_3, R5_3 + VMLOF T_0, H_4, R5_4 + VLR H_0, R_0 + VLR H_1, R_1 + VLR H_2, R_2 + VLR H_3, R_3 + VLR H_4, R_4 + + // initialize h + VZERO H_0 + VZERO H_1 + VZERO H_2 + VZERO H_3 + VZERO H_4 + +loop: + CMPBLE R3, $32, b2 + VLM (R2), T_0, T_1 + SUB $32, R3 + MOVD $32(R2), R2 + EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4) + VLEIB $4, $1, F_4 + VLEIB $12, $1, F_4 + +multiply: + VAG H_0, F_0, F_0 + VAG H_1, F_1, F_1 + VAG H_2, F_2, F_2 + VAG H_3, F_3, F_3 + VAG H_4, F_4, F_4 + MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4) + REDUCE(H_0, H_1, H_2, H_3, H_4) + CMPBNE R3, $0, loop + +finish: + // sum vectors + VZERO T_0 + VSUMQG H_0, T_0, H_0 + VSUMQG H_1, T_0, H_1 + VSUMQG H_2, T_0, H_2 + VSUMQG H_3, T_0, H_3 + VSUMQG H_4, T_0, H_4 + + // h may be >= 2*(2**130-5) so we need to reduce it again + REDUCE(H_0, H_1, H_2, H_3, H_4) + + // carry h1->h4 + VESRLG $26, H_1, T_1 + VN MOD26, H_1, H_1 + VAQ T_1, H_2, H_2 + VESRLG $26, H_2, T_2 + VN MOD26, H_2, H_2 + VAQ T_2, H_3, H_3 + VESRLG $26, H_3, T_3 + VN MOD26, H_3, H_3 + VAQ T_3, H_4, H_4 + + // h is now < 2*(2**130-5) + // pack h into h1 (hi) and h0 (lo) + PACK(H_0, H_1, H_2, H_3, H_4) + + // if h > 2**130-5 then h -= 2**130-5 + MOD(H_0, H_1, T_0, T_1, T_2) + + // h += s + MOVD $·bswapMask<>(SB), R5 + VL (R5), T_1 + VL 16(R4), T_0 + VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big) + VAQ T_0, H_0, H_0 + VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little) + VST H_0, (R1) + + RET + +b2: + CMPBLE R3, $16, b1 + + // 2 blocks remaining + SUB $17, R3 + VL (R2), T_0 + VLL R3, 16(R2), T_1 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, T_1 + EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4) + CMPBNE R3, $16, 2(PC) + VLEIB $12, $1, F_4 + VLEIB $4, $1, F_4 + + // setup [r²,r] + VLVGG $1, RSAVE_0, R_0 + VLVGG $1, RSAVE_1, R_1 + VLVGG $1, RSAVE_2, R_2 + VLVGG $1, RSAVE_3,
R_3 + VLVGG $1, RSAVE_4, R_4 + VPDI $0, R5_1, R5SAVE_1, R5_1 + VPDI $0, R5_2, R5SAVE_2, R5_2 + VPDI $0, R5_3, R5SAVE_3, R5_3 + VPDI $0, R5_4, R5SAVE_4, R5_4 + + MOVD $0, R3 + BR multiply + +skip: + VZERO H_0 + VZERO H_1 + VZERO H_2 + VZERO H_3 + VZERO H_4 + + CMPBEQ R3, $0, finish + +b1: + // 1 block remaining + SUB $1, R3 + VLL R3, (R2), T_0 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, T_0 + VZERO T_1 + EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4) + CMPBNE R3, $16, 2(PC) + VLEIB $4, $1, F_4 + VLEIG $1, $1, R_0 + VZERO R_1 + VZERO R_2 + VZERO R_3 + VZERO R_4 + VZERO R5_1 + VZERO R5_2 + VZERO R5_3 + VZERO R5_4 + + // setup [r, 1] + VLVGG $0, RSAVE_0, R_0 + VLVGG $0, RSAVE_1, R_1 + VLVGG $0, RSAVE_2, R_2 + VLVGG $0, RSAVE_3, R_3 + VLVGG $0, RSAVE_4, R_4 + VPDI $0, R5SAVE_1, R5_1, R5_1 + VPDI $0, R5SAVE_2, R5_2, R5_2 + VPDI $0, R5SAVE_3, R5_3, R5_3 + VPDI $0, R5SAVE_4, R5_4, R5_4 + + MOVD $0, R3 + BR multiply + +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + +novector: + MOVB $0, ret+0(FP) // no vx + RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s new file mode 100644 index 0000000..e548020 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s @@ -0,0 +1,931 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,go1.11,!gccgo,!appengine + +#include "textflag.h" + +// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction. 
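For orientation, both s390x routines in this change compute the same MAC as the portable algorithm: clamp r, then for each 16-byte block h = (h + block||0x01) * r mod 2**130-5, and finally tag = low 128 bits of h + s. A minimal big.Int sketch of that reference computation (illustrative only; poly1305Ref and leBytes are made-up names, and real code avoids big.Int because it is not constant time):

package main

import (
	"fmt"
	"math/big"
)

// leBytes interprets b as a little-endian unsigned integer.
func leBytes(b []byte) *big.Int {
	be := make([]byte, len(b))
	for i := range b {
		be[i] = b[len(b)-1-i]
	}
	return new(big.Int).SetBytes(be)
}

// poly1305Ref is a hypothetical reference; for testing, not production use.
func poly1305Ref(msg []byte, key *[32]byte) [16]byte {
	p := new(big.Int).Lsh(big.NewInt(1), 130)
	p.Sub(p, big.NewInt(5)) // p = 2**130 - 5

	r := make([]byte, 16)
	copy(r, key[:16])
	// The clamp below is what the ·keyMask<> constant applies in one VN.
	r[3] &= 15
	r[7] &= 15
	r[11] &= 15
	r[15] &= 15
	r[4] &= 252
	r[8] &= 252
	r[12] &= 252
	rInt := leBytes(r)

	h := new(big.Int)
	for len(msg) > 0 {
		n := 16
		if len(msg) < n {
			n = len(msg)
		}
		block := append(append([]byte{}, msg[:n]...), 1) // append the 0x01 pad
		h.Add(h, leBytes(block))
		h.Mul(h, rInt)
		h.Mod(h, p)
		msg = msg[n:]
	}
	h.Add(h, leBytes(key[16:])) // h += s; the tag keeps only the low 128 bits

	var tag [16]byte
	hb := h.Bytes() // big-endian; take the low 16 bytes, little-endian
	for i := 0; i < 16 && i < len(hb); i++ {
		tag[i] = hb[len(hb)-1-i]
	}
	return tag
}

func main() {
	var key [32]byte
	for i := range key {
		key[i] = byte(i + 1)
	}
	fmt.Printf("%x\n", poly1305Ref([]byte("test message"), &key))
}

The assembly splits h and r into 26-bit (vx) or 44/42-bit (vmsl) limbs so the per-block multiply and reduction fit vector lanes, but the arithmetic is exactly the loop above.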
+ +// constants +#define EX0 V1 +#define EX1 V2 +#define EX2 V3 + +// temporaries +#define T_0 V4 +#define T_1 V5 +#define T_2 V6 +#define T_3 V7 +#define T_4 V8 +#define T_5 V9 +#define T_6 V10 +#define T_7 V11 +#define T_8 V12 +#define T_9 V13 +#define T_10 V14 + +// r**2 & r**4 +#define R_0 V15 +#define R_1 V16 +#define R_2 V17 +#define R5_1 V18 +#define R5_2 V19 +// key (r) +#define RSAVE_0 R7 +#define RSAVE_1 R8 +#define RSAVE_2 R9 +#define R5SAVE_1 R10 +#define R5SAVE_2 R11 + +// message block +#define M0 V20 +#define M1 V21 +#define M2 V22 +#define M3 V23 +#define M4 V24 +#define M5 V25 + +// accumulator +#define H0_0 V26 +#define H1_0 V27 +#define H2_0 V28 +#define H0_1 V29 +#define H1_1 V30 +#define H2_1 V31 + +GLOBL ·keyMask<>(SB), RODATA, $16 +DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f +DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f + +GLOBL ·bswapMask<>(SB), RODATA, $16 +DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908 +DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100 + +GLOBL ·constants<>(SB), RODATA, $48 +// EX0 +DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f +DATA ·constants<>+8(SB)/8, $0x0000050403020100 +// EX1 +DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f +DATA ·constants<>+24(SB)/8, $0x00000a0908070605 +// EX2 +DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f +DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b + +GLOBL ·c<>(SB), RODATA, $48 +// EX0 +DATA ·c<>+0(SB)/8, $0x0000050403020100 +DATA ·c<>+8(SB)/8, $0x0000151413121110 +// EX1 +DATA ·c<>+16(SB)/8, $0x00000a0908070605 +DATA ·c<>+24(SB)/8, $0x00001a1918171615 +// EX2 +DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b +DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b + +GLOBL ·reduce<>(SB), RODATA, $32 +// 44 bit +DATA ·reduce<>+0(SB)/8, $0x0 +DATA ·reduce<>+8(SB)/8, $0xfffffffffff +// 42 bit +DATA ·reduce<>+16(SB)/8, $0x0 +DATA ·reduce<>+24(SB)/8, $0x3ffffffffff + +// h = (f*g) % (2**130-5) [partial reduction] +// uses T_0...T_9 temporary registers +// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2 +// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 +// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2 +#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \ + \ // Eliminate the dependency for the last 2 VMSLs + VMSLG m02_0, r_2, m4_2, m4_2 \ + VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined + VMSLG m02_0, r_0, m4_0, m4_0 \ + VMSLG m02_1, r5_2, V0, T_0 \ + VMSLG m02_0, r_1, m4_1, m4_1 \ + VMSLG m02_1, r_0, V0, T_1 \ + VMSLG m02_1, r_1, V0, T_2 \ + VMSLG m02_2, r5_1, V0, T_3 \ + VMSLG m02_2, r5_2, V0, T_4 \ + VMSLG m13_0, r_0, m5_0, m5_0 \ + VMSLG m13_1, r5_2, V0, T_5 \ + VMSLG m13_0, r_1, m5_1, m5_1 \ + VMSLG m13_1, r_0, V0, T_6 \ + VMSLG m13_1, r_1, V0, T_7 \ + VMSLG m13_2, r5_1, V0, T_8 \ + VMSLG m13_2, r5_2, V0, T_9 \ + VMSLG m02_2, r_0, m4_2, m4_2 \ + VMSLG m13_2, r_0, m5_2, m5_2 \ + VAQ m4_0, T_0, m02_0 \ + VAQ m4_1, T_1, m02_1 \ + VAQ m5_0, T_5, m13_0 \ + VAQ m5_1, T_6, m13_1 \ + VAQ m02_0, T_3, m02_0 \ + VAQ m02_1, T_4, m02_1 \ + VAQ m13_0, T_8, m13_0 \ + VAQ m13_1, T_9, m13_1 \ + VAQ m4_2, T_2, m02_2 \ + VAQ m5_2, T_7, m13_2 \ + +// SQUARE uses three limbs of r and r_2*5 to output square of r +// uses T_1, T_5 and T_7 temporary registers +// input: r_0, r_1, r_2, r5_2 +// temp: TEMP0, TEMP1, TEMP2 +// output: p0, p1, p2 +#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \ + VMSLG r_0, r_0, p0, p0 \ + VMSLG r_1, r5_2, V0, TEMP0 \ + VMSLG r_2, 
r5_2, p1, p1 \ + VMSLG r_0, r_1, V0, TEMP1 \ + VMSLG r_1, r_1, p2, p2 \ + VMSLG r_0, r_2, V0, TEMP2 \ + VAQ TEMP0, p0, p0 \ + VAQ TEMP1, p1, p1 \ + VAQ TEMP2, p2, p2 \ + VAQ TEMP0, p0, p0 \ + VAQ TEMP1, p1, p1 \ + VAQ TEMP2, p2, p2 \ +
+// carry h0->h1->h2->h0 || h3->h4->h5->h3 +// uses T_2, T_4, T_5, T_7, T_8, T_9 +// t6, t7, t8, t9, t10, t11 +// input: h0, h1, h2, h3, h4, h5 +// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11 +// output: h0, h1, h2, h3, h4, h5
+#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \ + VLM (R12), t6, t7 \ // 44 and 42 bit clear mask + VLEIB $7, $0x28, t10 \ // 5 byte shift mask + VREPIB $4, t8 \ // 4 bit shift mask + VREPIB $2, t11 \ // 2 bit shift mask + VSRLB t10, h0, t0 \ // h0 byte shift + VSRLB t10, h1, t1 \ // h1 byte shift + VSRLB t10, h2, t2 \ // h2 byte shift + VSRLB t10, h3, t3 \ // h3 byte shift + VSRLB t10, h4, t4 \ // h4 byte shift + VSRLB t10, h5, t5 \ // h5 byte shift + VSRL t8, t0, t0 \ // h0 bit shift + VSRL t8, t1, t1 \ // h1 bit shift + VSRL t11, t2, t2 \ // h2 bit shift + VSRL t8, t3, t3 \ // h3 bit shift + VSRL t8, t4, t4 \ // h4 bit shift + VESLG $2, t2, t9 \ // h2 carry x5 + VSRL t11, t5, t5 \ // h5 bit shift + VN t6, h0, h0 \ // h0 clear carry + VAQ t2, t9, t2 \ // h2 carry x5 + VESLG $2, t5, t9 \ // h5 carry x5 + VN t6, h1, h1 \ // h1 clear carry + VN t7, h2, h2 \ // h2 clear carry + VAQ t5, t9, t5 \ // h5 carry x5 + VN t6, h3, h3 \ // h3 clear carry + VN t6, h4, h4 \ // h4 clear carry + VN t7, h5, h5 \ // h5 clear carry + VAQ t0, h1, h1 \ // h0->h1 + VAQ t3, h4, h4 \ // h3->h4 + VAQ t1, h2, h2 \ // h1->h2 + VAQ t4, h5, h5 \ // h4->h5 + VAQ t2, h0, h0 \ // h2->h0 + VAQ t5, h3, h3 \ // h5->h3 + VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves + VREPG $1, t7, t7 \ + VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5] + VSLDB $8, h1, h1, h1 \ + VSLDB $8, h2, h2, h2 \ + VO h0, h3, h3 \ + VO h1, h4, h4 \ + VO h2, h5, h5 \ + VESRLG $44, h3, t0 \ // 44 bit shift right + VESRLG $44, h4, t1 \ + VESRLG $42, h5, t2 \ + VN t6, h3, h3 \ // clear carry bits + VN t6, h4, h4 \ + VN t7, h5, h5 \ + VESLG $2, t2, t9 \ // multiply carry by 5 + VAQ t9, t2, t2 \ + VAQ t0, h4, h4 \ + VAQ t1, h5, h5 \ + VAQ t2, h3, h3 \ +
+// carry h0->h1->h2->h0 +// input: h0, h1, h2 +// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8 +// output: h0, h1, h2
+#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \ + VLEIB $7, $0x28, t3 \ // 5 byte shift mask + VREPIB $4, t4 \ // 4 bit shift mask + VREPIB $2, t7 \ // 2 bit shift mask + VGBM $0x003F, t5 \ // mask to clear carry bits + VSRLB t3, h0, t0 \ + VSRLB t3, h1, t1 \ + VSRLB t3, h2, t2 \ + VESRLG $4, t5, t5 \ // 44 bit clear mask + VSRL t4, t0, t0 \ + VSRL t4, t1, t1 \ + VSRL t7, t2, t2 \ + VESRLG $2, t5, t6 \ // 42 bit clear mask + VESLG $2, t2, t8 \ + VAQ t8, t2, t2 \ + VN t5, h0, h0 \ + VN t5, h1, h1 \ + VN t6, h2, h2 \ + VAQ t0, h1, h1 \ + VAQ t1, h2, h2 \ + VAQ t2, h0, h0 \ + VSRLB t3, h0, t0 \ + VSRLB t3, h1, t1 \ + VSRLB t3, h2, t2 \ + VSRL t4, t0, t0 \ + VSRL t4, t1, t1 \ + VSRL t7, t2, t2 \ + VN t5, h0, h0 \ + VN t5, h1, h1 \ + VESLG $2, t2, t8 \ + VN t6, h2, h2 \ + VAQ t0, h1, h1 \ + VAQ t8, t2, t2 \ + VAQ t1, h2, h2 \ + VAQ t2, h0, h0 \ +
+// expands two message blocks into the lower halves of the d registers +// moves the contents of the d registers into the upper halves +// input: in1, in2, d0, d1, d2, d3, d4, d5 +// temp: TEMP0, TEMP1, TEMP2, TEMP3 +// output: d0, d1, d2, d3, d4, d5
+#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2,
TEMP3) \ + VGBM $0xff3f, TEMP0 \ + VGBM $0xff1f, TEMP1 \ + VESLG $4, d1, TEMP2 \ + VESLG $4, d4, TEMP3 \ + VESRLG $4, TEMP0, TEMP0 \ + VPERM in1, d0, EX0, d0 \ + VPERM in2, d3, EX0, d3 \ + VPERM in1, d2, EX2, d2 \ + VPERM in2, d5, EX2, d5 \ + VPERM in1, TEMP2, EX1, d1 \ + VPERM in2, TEMP3, EX1, d4 \ + VN TEMP0, d0, d0 \ + VN TEMP0, d3, d3 \ + VESRLG $4, d1, d1 \ + VESRLG $4, d4, d4 \ + VN TEMP1, d2, d2 \ + VN TEMP1, d5, d5 \ + VN TEMP0, d1, d1 \ + VN TEMP0, d4, d4 \ +
+// expands one message block into the lower halves of the d registers +// moves the contents of the d registers into the upper halves +// input: in, d0, d1, d2 +// temp: TEMP0, TEMP1, TEMP2 +// output: d0, d1, d2
+#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \ + VGBM $0xff3f, TEMP0 \ + VESLG $4, d1, TEMP2 \ + VGBM $0xff1f, TEMP1 \ + VPERM in, d0, EX0, d0 \ + VESRLG $4, TEMP0, TEMP0 \ + VPERM in, d2, EX2, d2 \ + VPERM in, TEMP2, EX1, d1 \ + VN TEMP0, d0, d0 \ + VN TEMP1, d2, d2 \ + VESRLG $4, d1, d1 \ + VN TEMP0, d1, d1 \ +
+// pack h2:h0 into h1:h0 (no carry) +// input: h0, h1, h2 +// output: h0, h1, h2
+#define PACK(h0, h1, h2) \ + VMRLG h1, h2, h2 \ // copy h1 to upper half h2 + VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20 + VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1 + VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1 + VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1 + VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1) + VLEIG $0, $0, h2 \ // clear upper half of h2 + VESRLG $40, h2, h1 \ // h1 now has upper two bits of result + VLEIB $7, $88, h1 \ // for byte shift (11 bytes) + VSLB h1, h2, h2 \ // shift h2 11 bytes to the left + VO h0, h2, h0 \ // combine h0 with 20 bits from limb 1 + VLEIG $0, $0, h1 \ // clear upper half of h1 +
+// if h > 2**130-5 then h -= 2**130-5 +// input: h0, h1 +// temp: t0, t1, t2 +// output: h0
+#define MOD(h0, h1, t0, t1, t2) \ + VZERO t0 \ + VLEIG $1, $5, t0 \ + VACCQ h0, t0, t1 \ + VAQ h0, t0, t0 \ + VONE t2 \ + VLEIG $1, $-4, t2 \ + VAQ t2, t1, t1 \ + VACCQ h1, t1, t1 \ + VONE t2 \ + VAQ t2, t1, t1 \ + VN h0, t1, t2 \ + VNC t0, t1, t1 \ + VO t1, t2, h0 \ +
+// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+TEXT ·poly1305vmsl(SB), $0-32 + // This code processes 6 + up to 4 blocks (32 bytes) per iteration + // using the algorithm described in: + // NEON crypto, Daniel J.
Bernstein & Peter Schwabe + // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + // And as modified for VMSL as described in + // Accelerating Poly1305 Cryptographic Message Authentication on the z14 + // O'Farrell et al, CASCON 2017, p48-55 + // https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht +
+ LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key + VZERO V0 // c +
+ // load EX0, EX1 and EX2 + MOVD $·constants<>(SB), R5 + VLM (R5), EX0, EX2 // c +
+ // setup r + VL (R4), T_0 + MOVD $·keyMask<>(SB), R6 + VL (R6), T_1 + VN T_0, T_1, T_0 + VZERO T_2 // limbs for r + VZERO T_3 + VZERO T_4 + EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7) +
+ // T_2, T_3, T_4: [0, r] +
+ // setup r*20 + VLEIG $0, $0, T_0 + VLEIG $1, $20, T_0 // T_0: [0, 20] + VZERO T_5 + VZERO T_6 + VMSLG T_0, T_3, T_5, T_5 + VMSLG T_0, T_4, T_6, T_6 +
+ // store r for final block in GR + VLGVG $1, T_2, RSAVE_0 // c + VLGVG $1, T_3, RSAVE_1 // c + VLGVG $1, T_4, RSAVE_2 // c + VLGVG $1, T_5, R5SAVE_1 // c + VLGVG $1, T_6, R5SAVE_2 // c +
+ // initialize h + VZERO H0_0 + VZERO H1_0 + VZERO H2_0 + VZERO H0_1 + VZERO H1_1 + VZERO H2_1 +
+ // initialize pointer for reduce constants + MOVD $·reduce<>(SB), R12 +
+ // calculate r**2 and 20*(r**2) + VZERO R_0 + VZERO R_1 + VZERO R_2 + SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7) + REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1) + VZERO R5_1 + VZERO R5_2 + VMSLG T_0, R_1, R5_1, R5_1 + VMSLG T_0, R_2, R5_2, R5_2 +
+ // skip r**4 calculation if 3 blocks or less + CMPBLE R3, $48, b4 +
+ // calculate r**4 and 20*(r**4) + VZERO T_8 + VZERO T_9 + VZERO T_10 + SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7) + REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1) + VZERO T_2 + VZERO T_3 + VMSLG T_0, T_9, T_2, T_2 + VMSLG T_0, T_10, T_3, T_3 +
+ // put r**2 to the right and r**4 to the left of R_0, R_1, R_2 + VSLDB $8, T_8, T_8, T_8 + VSLDB $8, T_9, T_9, T_9 + VSLDB $8, T_10, T_10, T_10 + VSLDB $8, T_2, T_2, T_2 + VSLDB $8, T_3, T_3, T_3 +
+ VO T_8, R_0, R_0 + VO T_9, R_1, R_1 + VO T_10, R_2, R_2 + VO T_2, R5_1, R5_1 + VO T_3, R5_2, R5_2 +
+ CMPBLE R3, $80, load // less than or equal to 5 blocks in message +
+ // 6(or 5+1) blocks + SUB $81, R3 + VLM (R2), M0, M4 + VLL R3, 80(R2), M5 + ADD $1, R3 + MOVBZ $1, R0 + CMPBGE R3, $16, 2(PC) + VLVGB R3, R0, M5 + MOVD $96(R2), R2 + EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3) + EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3) + VLEIB $2, $1, H2_0 + VLEIB $2, $1, H2_1 + VLEIB $10, $1, H2_0 + VLEIB $10, $1, H2_1 +
+ VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO T_4 + VZERO T_10 + EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3) + VLR T_4, M4 + VLEIB $10, $1, M2 + CMPBLT R3, $16, 2(PC) + VLEIB $10, $1, T_10 + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9) + VMRHG V0, H0_1, H0_0 + VMRHG V0, H1_1, H1_0 + VMRHG V0, H2_1, H2_0 + VMRLG V0, H0_1, H0_1 + VMRLG V0, H1_1, H1_1 + VMRLG V0, H2_1, H2_1 +
+ SUB $16, R3 + CMPBLE R3, $0, square +
+load: + // load EX0, EX1 and EX2 + MOVD $·c<>(SB), R5 + VLM (R5), EX0, EX2 +
+loop: + CMPBLE R3, $64, add // b4 // last 4 or less blocks left +
+ // next 4 full blocks + VLM (R2), M2, M5 + SUB $64, R3 + MOVD $64(R2), R2 + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5,
T_2, T_7, T_8, T_9) + + // expacc in-lined to create [m2, m3] limbs + VGBM $0x3f3f, T_0 // 44 bit clear mask + VGBM $0x1f1f, T_1 // 40 bit clear mask + VPERM M2, M3, EX0, T_3 + VESRLG $4, T_0, T_0 // 44 bit clear mask ready + VPERM M2, M3, EX1, T_4 + VPERM M2, M3, EX2, T_5 + VN T_0, T_3, T_3 + VESRLG $4, T_4, T_4 + VN T_1, T_5, T_5 + VN T_0, T_4, T_4 + VMRHG H0_1, T_3, H0_0 + VMRHG H1_1, T_4, H1_0 + VMRHG H2_1, T_5, H2_0 + VMRLG H0_1, T_3, H0_1 + VMRLG H1_1, T_4, H1_1 + VMRLG H2_1, T_5, H2_1 + VLEIB $10, $1, H2_0 + VLEIB $10, $1, H2_1 + VPERM M4, M5, EX0, T_3 + VPERM M4, M5, EX1, T_4 + VPERM M4, M5, EX2, T_5 + VN T_0, T_3, T_3 + VESRLG $4, T_4, T_4 + VN T_1, T_5, T_5 + VN T_0, T_4, T_4 + VMRHG V0, T_3, M0 + VMRHG V0, T_4, M1 + VMRHG V0, T_5, M2 + VMRLG V0, T_3, M3 + VMRLG V0, T_4, M4 + VMRLG V0, T_5, M5 + VLEIB $10, $1, M2 + VLEIB $10, $1, M5 + + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + CMPBNE R3, $0, loop + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9) + VMRHG V0, H0_1, H0_0 + VMRHG V0, H1_1, H1_0 + VMRHG V0, H2_1, H2_0 + VMRLG V0, H0_1, H0_1 + VMRLG V0, H1_1, H1_1 + VMRLG V0, H2_1, H2_1 + + // load EX0, EX1, EX2 + MOVD $·constants<>(SB), R5 + VLM (R5), EX0, EX2 + + // sum vectors + VAQ H0_0, H0_1, H0_0 + VAQ H1_0, H1_1, H1_0 + VAQ H2_0, H2_1, H2_0 + + // h may be >= 2*(2**130-5) so we need to reduce it again + // M0...M4 are used as temps here + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5) + +next: // carry h1->h2 + VLEIB $7, $0x28, T_1 + VREPIB $4, T_2 + VGBM $0x003F, T_3 + VESRLG $4, T_3 + + // byte shift + VSRLB T_1, H1_0, T_4 + + // bit shift + VSRL T_2, T_4, T_4 + + // clear h1 carry bits + VN T_3, H1_0, H1_0 + + // add carry + VAQ T_4, H2_0, H2_0 + + // h is now < 2*(2**130-5) + // pack h into h1 (hi) and h0 (lo) + PACK(H0_0, H1_0, H2_0) + + // if h > 2**130-5 then h -= 2**130-5 + MOD(H0_0, H1_0, T_0, T_1, T_2) + + // h += s + MOVD $·bswapMask<>(SB), R5 + VL (R5), T_1 + VL 16(R4), T_0 + VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big) + VAQ T_0, H0_0, H0_0 + VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little) + VST H0_0, (R1) + RET + +add: + // load EX0, EX1, EX2 + MOVD $·constants<>(SB), R5 + VLM (R5), EX0, EX2 + + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9) + VMRHG V0, H0_1, H0_0 + VMRHG V0, H1_1, H1_0 + VMRHG V0, H2_1, H2_0 + VMRLG V0, H0_1, H0_1 + VMRLG V0, H1_1, H1_1 + VMRLG V0, H2_1, H2_1 + CMPBLE R3, $64, b4 + +b4: + CMPBLE R3, $48, b3 // 3 blocks or less + + // 4(3+1) blocks remaining + SUB $49, R3 + VLM (R2), M0, M2 + VLL R3, 48(R2), M3 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, M3 + MOVD $64(R2), R2 + EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3) + VLEIB $10, $1, H2_0 + VLEIB $10, $1, H2_1 + VZERO M0 + VZERO M1 + VZERO M4 + VZERO M5 + VZERO T_4 + VZERO T_10 + EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3) + VLR T_4, M2 + VLEIB $10, $1, M4 + CMPBNE R3, $16, 2(PC) + VLEIB $10, $1, T_10 + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9) + VMRHG V0, H0_1, H0_0 + VMRHG V0, H1_1, H1_0 + VMRHG V0, H2_1, H2_0 + VMRLG V0, H0_1, H0_1 + VMRLG V0, H1_1, H1_1 + VMRLG V0, H2_1, H2_1 + 
SUB $16, R3 + CMPBLE R3, $0, square // this condition must always hold true! + +b3: + CMPBLE R3, $32, b2 + + // 3 blocks remaining + + // setup [r²,r] + VSLDB $8, R_0, R_0, R_0 + VSLDB $8, R_1, R_1, R_1 + VSLDB $8, R_2, R_2, R_2 + VSLDB $8, R5_1, R5_1, R5_1 + VSLDB $8, R5_2, R5_2, R5_2 + + VLVGG $1, RSAVE_0, R_0 + VLVGG $1, RSAVE_1, R_1 + VLVGG $1, RSAVE_2, R_2 + VLVGG $1, R5SAVE_1, R5_1 + VLVGG $1, R5SAVE_2, R5_2 + + // setup [h0, h1] + VSLDB $8, H0_0, H0_0, H0_0 + VSLDB $8, H1_0, H1_0, H1_0 + VSLDB $8, H2_0, H2_0, H2_0 + VO H0_1, H0_0, H0_0 + VO H1_1, H1_0, H1_0 + VO H2_1, H2_0, H2_0 + VZERO H0_1 + VZERO H1_1 + VZERO H2_1 + + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + + // H*[r**2, r] + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5) + + SUB $33, R3 + VLM (R2), M0, M1 + VLL R3, 32(R2), M2 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, M2 + + // H += m0 + VZERO T_1 + VZERO T_2 + VZERO T_3 + EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) + VLEIB $10, $1, T_3 + VAG H0_0, T_1, H0_0 + VAG H1_0, T_2, H1_0 + VAG H2_0, T_3, H2_0 + + VZERO M0 + VZERO M3 + VZERO M4 + VZERO M5 + VZERO T_10 + + // (H+m0)*r + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9) + + // H += m1 + VZERO V0 + VZERO T_1 + VZERO T_2 + VZERO T_3 + EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6) + VLEIB $10, $1, T_3 + VAQ H0_0, T_1, H0_0 + VAQ H1_0, T_2, H1_0 + VAQ H2_0, T_3, H2_0 + REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10) + + // [H, m2] * [r**2, r] + EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3) + CMPBNE R3, $16, 2(PC) + VLEIB $10, $1, H2_0 + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10) + SUB $16, R3 + CMPBLE R3, $0, next // this condition must always hold true! 
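The b4/b3/b2/b1 tails above and below all rebuild an [r², r] lane pair before their final MULTIPLY. The identity they rely on: if both lanes absorb a block and advance by r² each step, and the last step weights lane 0 by r² and lane 1 by r, the lane sum equals plain one-lane Horner evaluation. A small Go check of that identity (twoLaneHorner is a hypothetical model, not code from this change; math/big only, even block count assumed):

package main

import (
	"fmt"
	"math/big"
)

// twoLaneHorner models the [r**2, r] schedule: it returns
// m[0]*r^n + m[1]*r^(n-1) + ... + m[n-1]*r (mod p) for an even n = len(m).
func twoLaneHorner(m []*big.Int, r, p *big.Int) *big.Int {
	mod := func(x *big.Int) *big.Int { return x.Mod(x, p) }
	r2 := mod(new(big.Int).Mul(r, r))
	a, b := new(big.Int), new(big.Int) // even-index and odd-index lanes
	for i := 0; i < len(m)-2; i += 2 {
		mod(a.Mul(a.Add(a, m[i]), r2))   // (a + m)*r**2
		mod(b.Mul(b.Add(b, m[i+1]), r2)) // (b + m)*r**2
	}
	mod(a.Mul(a.Add(a, m[len(m)-2]), r2)) // last pair: lane 0 still by r**2,
	mod(b.Mul(b.Add(b, m[len(m)-1]), r))  // lane 1 by r only
	return mod(a.Add(a, b))
}

func main() {
	p := new(big.Int).Lsh(big.NewInt(1), 130)
	p.Sub(p, big.NewInt(5))
	r := big.NewInt(123456789)
	m := []*big.Int{big.NewInt(11), big.NewInt(22), big.NewInt(33), big.NewInt(44)}

	// plain one-lane Horner for comparison
	h := new(big.Int)
	for _, mi := range m {
		h.Add(h, mi).Mul(h, r).Mod(h, p)
	}
	fmt.Println(h.Cmp(twoLaneHorner(m, r, p)) == 0) // true
}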
+ +b2: + CMPBLE R3, $16, b1 + + // 2 blocks remaining + + // setup [r²,r] + VSLDB $8, R_0, R_0, R_0 + VSLDB $8, R_1, R_1, R_1 + VSLDB $8, R_2, R_2, R_2 + VSLDB $8, R5_1, R5_1, R5_1 + VSLDB $8, R5_2, R5_2, R5_2 + + VLVGG $1, RSAVE_0, R_0 + VLVGG $1, RSAVE_1, R_1 + VLVGG $1, RSAVE_2, R_2 + VLVGG $1, R5SAVE_1, R5_1 + VLVGG $1, R5SAVE_2, R5_2 + + // setup [h0, h1] + VSLDB $8, H0_0, H0_0, H0_0 + VSLDB $8, H1_0, H1_0, H1_0 + VSLDB $8, H2_0, H2_0, H2_0 + VO H0_1, H0_0, H0_0 + VO H1_1, H1_0, H1_0 + VO H2_1, H2_0, H2_0 + VZERO H0_1 + VZERO H1_1 + VZERO H2_1 + + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + + // H*[r**2, r] + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9) + VMRHG V0, H0_1, H0_0 + VMRHG V0, H1_1, H1_0 + VMRHG V0, H2_1, H2_0 + VMRLG V0, H0_1, H0_1 + VMRLG V0, H1_1, H1_1 + VMRLG V0, H2_1, H2_1 + + // move h to the left and 0s at the right + VSLDB $8, H0_0, H0_0, H0_0 + VSLDB $8, H1_0, H1_0, H1_0 + VSLDB $8, H2_0, H2_0, H2_0 + + // get message blocks and append 1 to start + SUB $17, R3 + VL (R2), M0 + VLL R3, 16(R2), M1 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, M1 + VZERO T_6 + VZERO T_7 + VZERO T_8 + EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3) + EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3) + VLEIB $2, $1, T_8 + CMPBNE R3, $16, 2(PC) + VLEIB $10, $1, T_8 + + // add [m0, m1] to h + VAG H0_0, T_6, H0_0 + VAG H1_0, T_7, H1_0 + VAG H2_0, T_8, H2_0 + + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + VZERO T_10 + VZERO M0 + + // at this point R_0 .. R5_2 look like [r**2, r] + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10) + SUB $16, R3, R3 + CMPBLE R3, $0, next + +b1: + CMPBLE R3, $0, next + + // 1 block remaining + + // setup [r²,r] + VSLDB $8, R_0, R_0, R_0 + VSLDB $8, R_1, R_1, R_1 + VSLDB $8, R_2, R_2, R_2 + VSLDB $8, R5_1, R5_1, R5_1 + VSLDB $8, R5_2, R5_2, R5_2 + + VLVGG $1, RSAVE_0, R_0 + VLVGG $1, RSAVE_1, R_1 + VLVGG $1, RSAVE_2, R_2 + VLVGG $1, R5SAVE_1, R5_1 + VLVGG $1, R5SAVE_2, R5_2 + + // setup [h0, h1] + VSLDB $8, H0_0, H0_0, H0_0 + VSLDB $8, H1_0, H1_0, H1_0 + VSLDB $8, H2_0, H2_0, H2_0 + VO H0_1, H0_0, H0_0 + VO H1_1, H1_0, H1_0 + VO H2_1, H2_0, H2_0 + VZERO H0_1 + VZERO H1_1 + VZERO H2_1 + + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + + // H*[r**2, r] + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5) + + // set up [0, m0] limbs + SUB $1, R3 + VLL R3, (R2), M0 + ADD $1, R3 + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, M0 + VZERO T_1 + VZERO T_2 + VZERO T_3 + EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m] + CMPBNE R3, $16, 2(PC) + VLEIB $10, $1, T_3 + + // h+m0 + VAQ H0_0, T_1, H0_0 + VAQ H1_0, T_2, H1_0 + VAQ H2_0, T_3, H2_0 + + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5) + + BR next + +square: + // setup [r²,r] + VSLDB $8, R_0, R_0, 
R_0 + VSLDB $8, R_1, R_1, R_1 + VSLDB $8, R_2, R_2, R_2 + VSLDB $8, R5_1, R5_1, R5_1 + VSLDB $8, R5_2, R5_2, R5_2 + + VLVGG $1, RSAVE_0, R_0 + VLVGG $1, RSAVE_1, R_1 + VLVGG $1, RSAVE_2, R_2 + VLVGG $1, R5SAVE_1, R5_1 + VLVGG $1, R5SAVE_2, R5_2 + + // setup [h0, h1] + VSLDB $8, H0_0, H0_0, H0_0 + VSLDB $8, H1_0, H1_0, H1_0 + VSLDB $8, H2_0, H2_0, H2_0 + VO H0_1, H0_0, H0_0 + VO H1_1, H1_0, H1_0 + VO H2_1, H2_0, H2_0 + VZERO H0_1 + VZERO H1_1 + VZERO H2_1 + + VZERO M0 + VZERO M1 + VZERO M2 + VZERO M3 + VZERO M4 + VZERO M5 + + // (h0*r**2) + (h1*r) + MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9) + REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5) + BR next + +TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x01, R1 + BEQ novmsl + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novmsl + MOVB $1, ret+0(FP) // have vx + RET + +novmsl: + MOVB $0, ret+0(FP) // no vx + RET diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go new file mode 100644 index 0000000..4c96147 --- /dev/null +++ b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go @@ -0,0 +1,144 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package salsa provides low-level access to functions in the Salsa family. +package salsa // import "golang.org/x/crypto/salsa20/salsa" + +// Sigma is the Salsa20 constant for 256-bit keys. +var Sigma = [16]byte{'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k'} + +// HSalsa20 applies the HSalsa20 core function to a 16-byte input in, 32-byte +// key k, and 16-byte constant c, and puts the result into the 32-byte array +// out. 
+func HSalsa20(out *[32]byte, in *[16]byte, k *[32]byte, c *[16]byte) { + x0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24 + x1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24 + x2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24 + x3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24 + x4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24 + x5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24 + x6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24 + x8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24 + x9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24 + x10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24 + x11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24 + x12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24 + x13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24 + x14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24 + x15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24 + + for i := 0; i < 20; i += 2 { + u := x0 + x12 + x4 ^= u<<7 | u>>(32-7) + u = x4 + x0 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x4 + x12 ^= u<<13 | u>>(32-13) + u = x12 + x8 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x1 + x9 ^= u<<7 | u>>(32-7) + u = x9 + x5 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x9 + x1 ^= u<<13 | u>>(32-13) + u = x1 + x13 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x6 + x14 ^= u<<7 | u>>(32-7) + u = x14 + x10 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x14 + x6 ^= u<<13 | u>>(32-13) + u = x6 + x2 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x11 + x3 ^= u<<7 | u>>(32-7) + u = x3 + x15 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x3 + x11 ^= u<<13 | u>>(32-13) + u = x11 + x7 + x15 ^= u<<18 | u>>(32-18) + + u = x0 + x3 + x1 ^= u<<7 | u>>(32-7) + u = x1 + x0 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x1 + x3 ^= u<<13 | u>>(32-13) + u = x3 + x2 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x4 + x6 ^= u<<7 | u>>(32-7) + u = x6 + x5 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x6 + x4 ^= u<<13 | u>>(32-13) + u = x4 + x7 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x9 + x11 ^= u<<7 | u>>(32-7) + u = x11 + x10 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x11 + x9 ^= u<<13 | u>>(32-13) + u = x9 + x8 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x14 + x12 ^= u<<7 | u>>(32-7) + u = x12 + x15 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x12 + x14 ^= u<<13 | u>>(32-13) + u = x14 + x13 + x15 ^= u<<18 | u>>(32-18) + } + out[0] = byte(x0) + out[1] = byte(x0 >> 8) + out[2] = byte(x0 >> 16) + out[3] = byte(x0 >> 24) + + out[4] = byte(x5) + out[5] = byte(x5 >> 8) + out[6] = byte(x5 >> 16) + out[7] = byte(x5 >> 24) + + out[8] = byte(x10) + out[9] = byte(x10 >> 8) + out[10] = byte(x10 >> 16) + out[11] = byte(x10 >> 24) + + out[12] = byte(x15) + out[13] = byte(x15 >> 8) + out[14] = byte(x15 >> 16) + out[15] = byte(x15 >> 24) + + out[16] = byte(x6) + out[17] = byte(x6 >> 8) + out[18] = byte(x6 >> 16) + out[19] = byte(x6 >> 24) + + out[20] = byte(x7) + out[21] = byte(x7 >> 8) + out[22] = byte(x7 >> 16) + out[23] = byte(x7 >> 24) + + out[24] = byte(x8) + out[25] = byte(x8 >> 8) + out[26] = byte(x8 >> 16) + out[27] = byte(x8 >> 24) + + out[28] = byte(x9) + out[29] = byte(x9 >> 8) + out[30] = byte(x9 >> 16) + out[31] = byte(x9 >> 24) +} 
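Downstream, HSalsa20's usual role is XSalsa20-style nonce extension: the first 16 nonce bytes are hashed with the key to derive a one-off subkey, and the remaining 8 bytes seed the Salsa20 counter block. A usage sketch against this vendored package (key, nonce and message values are placeholders):

package main

import (
	"fmt"

	"golang.org/x/crypto/salsa20/salsa"
)

func main() {
	var key [32]byte
	var nonce [24]byte // XSalsa20-style 24-byte nonce

	var hNonce [16]byte
	copy(hNonce[:], nonce[:16]) // first 16 nonce bytes feed HSalsa20

	var subKey [32]byte
	salsa.HSalsa20(&subKey, &hNonce, &key, &salsa.Sigma)

	var counter [16]byte
	copy(counter[:8], nonce[16:]) // last 8 nonce bytes; block counter starts at 0

	msg := []byte("attack at dawn")
	out := make([]byte, len(msg))
	salsa.XORKeyStream(out, msg, &counter, &subKey)
	fmt.Printf("%x\n", out)
}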
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s new file mode 100644 index 0000000..22afbdc --- /dev/null +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s @@ -0,0 +1,889 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64,!appengine,!gccgo + +// This code was translated into a form compatible with 6a from the public +// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html + +// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) +// This needs up to 64 bytes at 360(SP); hence the non-obvious frame size. +TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment + MOVQ out+0(FP),DI + MOVQ in+8(FP),SI + MOVQ n+16(FP),DX + MOVQ nonce+24(FP),CX + MOVQ key+32(FP),R8 + + MOVQ SP,R12 + MOVQ SP,R9 + ADDQ $31, R9 + ANDQ $~31, R9 + MOVQ R9, SP + + MOVQ DX,R9 + MOVQ CX,DX + MOVQ R8,R10 + CMPQ R9,$0 + JBE DONE + START: + MOVL 20(R10),CX + MOVL 0(R10),R8 + MOVL 0(DX),AX + MOVL 16(R10),R11 + MOVL CX,0(SP) + MOVL R8, 4 (SP) + MOVL AX, 8 (SP) + MOVL R11, 12 (SP) + MOVL 8(DX),CX + MOVL 24(R10),R8 + MOVL 4(R10),AX + MOVL 4(DX),R11 + MOVL CX,16(SP) + MOVL R8, 20 (SP) + MOVL AX, 24 (SP) + MOVL R11, 28 (SP) + MOVL 12(DX),CX + MOVL 12(R10),DX + MOVL 28(R10),R8 + MOVL 8(R10),AX + MOVL DX,32(SP) + MOVL CX, 36 (SP) + MOVL R8, 40 (SP) + MOVL AX, 44 (SP) + MOVQ $1634760805,DX + MOVQ $857760878,CX + MOVQ $2036477234,R8 + MOVQ $1797285236,AX + MOVL DX,48(SP) + MOVL CX, 52 (SP) + MOVL R8, 56 (SP) + MOVL AX, 60 (SP) + CMPQ R9,$256 + JB BYTESBETWEEN1AND255 + MOVOA 48(SP),X0 + PSHUFL $0X55,X0,X1 + PSHUFL $0XAA,X0,X2 + PSHUFL $0XFF,X0,X3 + PSHUFL $0X00,X0,X0 + MOVOA X1,64(SP) + MOVOA X2,80(SP) + MOVOA X3,96(SP) + MOVOA X0,112(SP) + MOVOA 0(SP),X0 + PSHUFL $0XAA,X0,X1 + PSHUFL $0XFF,X0,X2 + PSHUFL $0X00,X0,X3 + PSHUFL $0X55,X0,X0 + MOVOA X1,128(SP) + MOVOA X2,144(SP) + MOVOA X3,160(SP) + MOVOA X0,176(SP) + MOVOA 16(SP),X0 + PSHUFL $0XFF,X0,X1 + PSHUFL $0X55,X0,X2 + PSHUFL $0XAA,X0,X0 + MOVOA X1,192(SP) + MOVOA X2,208(SP) + MOVOA X0,224(SP) + MOVOA 32(SP),X0 + PSHUFL $0X00,X0,X1 + PSHUFL $0XAA,X0,X2 + PSHUFL $0XFF,X0,X0 + MOVOA X1,240(SP) + MOVOA X2,256(SP) + MOVOA X0,272(SP) + BYTESATLEAST256: + MOVL 16(SP),DX + MOVL 36 (SP),CX + MOVL DX,288(SP) + MOVL CX,304(SP) + ADDQ $1,DX + SHLQ $32,CX + ADDQ CX,DX + MOVQ DX,CX + SHRQ $32,CX + MOVL DX, 292 (SP) + MOVL CX, 308 (SP) + ADDQ $1,DX + SHLQ $32,CX + ADDQ CX,DX + MOVQ DX,CX + SHRQ $32,CX + MOVL DX, 296 (SP) + MOVL CX, 312 (SP) + ADDQ $1,DX + SHLQ $32,CX + ADDQ CX,DX + MOVQ DX,CX + SHRQ $32,CX + MOVL DX, 300 (SP) + MOVL CX, 316 (SP) + ADDQ $1,DX + SHLQ $32,CX + ADDQ CX,DX + MOVQ DX,CX + SHRQ $32,CX + MOVL DX,16(SP) + MOVL CX, 36 (SP) + MOVQ R9,352(SP) + MOVQ $20,DX + MOVOA 64(SP),X0 + MOVOA 80(SP),X1 + MOVOA 96(SP),X2 + MOVOA 256(SP),X3 + MOVOA 272(SP),X4 + MOVOA 128(SP),X5 + MOVOA 144(SP),X6 + MOVOA 176(SP),X7 + MOVOA 192(SP),X8 + MOVOA 208(SP),X9 + MOVOA 224(SP),X10 + MOVOA 304(SP),X11 + MOVOA 112(SP),X12 + MOVOA 160(SP),X13 + MOVOA 240(SP),X14 + MOVOA 288(SP),X15 + MAINLOOP1: + MOVOA X1,320(SP) + MOVOA X2,336(SP) + MOVOA X13,X1 + PADDL X12,X1 + MOVOA X1,X2 + PSLLL $7,X1 + PXOR X1,X14 + PSRLL $25,X2 + PXOR X2,X14 + MOVOA X7,X1 + PADDL X0,X1 + MOVOA X1,X2 + PSLLL $7,X1 + PXOR X1,X11 + PSRLL $25,X2 + PXOR X2,X11 + MOVOA X12,X1 + PADDL X14,X1 + MOVOA X1,X2 + PSLLL $9,X1 + PXOR X1,X15 + PSRLL $23,X2 + 
PXOR X2,X15 + MOVOA X0,X1 + PADDL X11,X1 + MOVOA X1,X2 + PSLLL $9,X1 + PXOR X1,X9 + PSRLL $23,X2 + PXOR X2,X9 + MOVOA X14,X1 + PADDL X15,X1 + MOVOA X1,X2 + PSLLL $13,X1 + PXOR X1,X13 + PSRLL $19,X2 + PXOR X2,X13 + MOVOA X11,X1 + PADDL X9,X1 + MOVOA X1,X2 + PSLLL $13,X1 + PXOR X1,X7 + PSRLL $19,X2 + PXOR X2,X7 + MOVOA X15,X1 + PADDL X13,X1 + MOVOA X1,X2 + PSLLL $18,X1 + PXOR X1,X12 + PSRLL $14,X2 + PXOR X2,X12 + MOVOA 320(SP),X1 + MOVOA X12,320(SP) + MOVOA X9,X2 + PADDL X7,X2 + MOVOA X2,X12 + PSLLL $18,X2 + PXOR X2,X0 + PSRLL $14,X12 + PXOR X12,X0 + MOVOA X5,X2 + PADDL X1,X2 + MOVOA X2,X12 + PSLLL $7,X2 + PXOR X2,X3 + PSRLL $25,X12 + PXOR X12,X3 + MOVOA 336(SP),X2 + MOVOA X0,336(SP) + MOVOA X6,X0 + PADDL X2,X0 + MOVOA X0,X12 + PSLLL $7,X0 + PXOR X0,X4 + PSRLL $25,X12 + PXOR X12,X4 + MOVOA X1,X0 + PADDL X3,X0 + MOVOA X0,X12 + PSLLL $9,X0 + PXOR X0,X10 + PSRLL $23,X12 + PXOR X12,X10 + MOVOA X2,X0 + PADDL X4,X0 + MOVOA X0,X12 + PSLLL $9,X0 + PXOR X0,X8 + PSRLL $23,X12 + PXOR X12,X8 + MOVOA X3,X0 + PADDL X10,X0 + MOVOA X0,X12 + PSLLL $13,X0 + PXOR X0,X5 + PSRLL $19,X12 + PXOR X12,X5 + MOVOA X4,X0 + PADDL X8,X0 + MOVOA X0,X12 + PSLLL $13,X0 + PXOR X0,X6 + PSRLL $19,X12 + PXOR X12,X6 + MOVOA X10,X0 + PADDL X5,X0 + MOVOA X0,X12 + PSLLL $18,X0 + PXOR X0,X1 + PSRLL $14,X12 + PXOR X12,X1 + MOVOA 320(SP),X0 + MOVOA X1,320(SP) + MOVOA X4,X1 + PADDL X0,X1 + MOVOA X1,X12 + PSLLL $7,X1 + PXOR X1,X7 + PSRLL $25,X12 + PXOR X12,X7 + MOVOA X8,X1 + PADDL X6,X1 + MOVOA X1,X12 + PSLLL $18,X1 + PXOR X1,X2 + PSRLL $14,X12 + PXOR X12,X2 + MOVOA 336(SP),X12 + MOVOA X2,336(SP) + MOVOA X14,X1 + PADDL X12,X1 + MOVOA X1,X2 + PSLLL $7,X1 + PXOR X1,X5 + PSRLL $25,X2 + PXOR X2,X5 + MOVOA X0,X1 + PADDL X7,X1 + MOVOA X1,X2 + PSLLL $9,X1 + PXOR X1,X10 + PSRLL $23,X2 + PXOR X2,X10 + MOVOA X12,X1 + PADDL X5,X1 + MOVOA X1,X2 + PSLLL $9,X1 + PXOR X1,X8 + PSRLL $23,X2 + PXOR X2,X8 + MOVOA X7,X1 + PADDL X10,X1 + MOVOA X1,X2 + PSLLL $13,X1 + PXOR X1,X4 + PSRLL $19,X2 + PXOR X2,X4 + MOVOA X5,X1 + PADDL X8,X1 + MOVOA X1,X2 + PSLLL $13,X1 + PXOR X1,X14 + PSRLL $19,X2 + PXOR X2,X14 + MOVOA X10,X1 + PADDL X4,X1 + MOVOA X1,X2 + PSLLL $18,X1 + PXOR X1,X0 + PSRLL $14,X2 + PXOR X2,X0 + MOVOA 320(SP),X1 + MOVOA X0,320(SP) + MOVOA X8,X0 + PADDL X14,X0 + MOVOA X0,X2 + PSLLL $18,X0 + PXOR X0,X12 + PSRLL $14,X2 + PXOR X2,X12 + MOVOA X11,X0 + PADDL X1,X0 + MOVOA X0,X2 + PSLLL $7,X0 + PXOR X0,X6 + PSRLL $25,X2 + PXOR X2,X6 + MOVOA 336(SP),X2 + MOVOA X12,336(SP) + MOVOA X3,X0 + PADDL X2,X0 + MOVOA X0,X12 + PSLLL $7,X0 + PXOR X0,X13 + PSRLL $25,X12 + PXOR X12,X13 + MOVOA X1,X0 + PADDL X6,X0 + MOVOA X0,X12 + PSLLL $9,X0 + PXOR X0,X15 + PSRLL $23,X12 + PXOR X12,X15 + MOVOA X2,X0 + PADDL X13,X0 + MOVOA X0,X12 + PSLLL $9,X0 + PXOR X0,X9 + PSRLL $23,X12 + PXOR X12,X9 + MOVOA X6,X0 + PADDL X15,X0 + MOVOA X0,X12 + PSLLL $13,X0 + PXOR X0,X11 + PSRLL $19,X12 + PXOR X12,X11 + MOVOA X13,X0 + PADDL X9,X0 + MOVOA X0,X12 + PSLLL $13,X0 + PXOR X0,X3 + PSRLL $19,X12 + PXOR X12,X3 + MOVOA X15,X0 + PADDL X11,X0 + MOVOA X0,X12 + PSLLL $18,X0 + PXOR X0,X1 + PSRLL $14,X12 + PXOR X12,X1 + MOVOA X9,X0 + PADDL X3,X0 + MOVOA X0,X12 + PSLLL $18,X0 + PXOR X0,X2 + PSRLL $14,X12 + PXOR X12,X2 + MOVOA 320(SP),X12 + MOVOA 336(SP),X0 + SUBQ $2,DX + JA MAINLOOP1 + PADDL 112(SP),X12 + PADDL 176(SP),X7 + PADDL 224(SP),X10 + PADDL 272(SP),X4 + MOVD X12,DX + MOVD X7,CX + MOVD X10,R8 + MOVD X4,R9 + PSHUFL $0X39,X12,X12 + PSHUFL $0X39,X7,X7 + PSHUFL $0X39,X10,X10 + PSHUFL $0X39,X4,X4 + XORL 0(SI),DX + XORL 4(SI),CX + XORL 8(SI),R8 + XORL 12(SI),R9 + MOVL DX,0(DI) + MOVL CX,4(DI) + MOVL 
R8,8(DI) + MOVL R9,12(DI) + MOVD X12,DX + MOVD X7,CX + MOVD X10,R8 + MOVD X4,R9 + PSHUFL $0X39,X12,X12 + PSHUFL $0X39,X7,X7 + PSHUFL $0X39,X10,X10 + PSHUFL $0X39,X4,X4 + XORL 64(SI),DX + XORL 68(SI),CX + XORL 72(SI),R8 + XORL 76(SI),R9 + MOVL DX,64(DI) + MOVL CX,68(DI) + MOVL R8,72(DI) + MOVL R9,76(DI) + MOVD X12,DX + MOVD X7,CX + MOVD X10,R8 + MOVD X4,R9 + PSHUFL $0X39,X12,X12 + PSHUFL $0X39,X7,X7 + PSHUFL $0X39,X10,X10 + PSHUFL $0X39,X4,X4 + XORL 128(SI),DX + XORL 132(SI),CX + XORL 136(SI),R8 + XORL 140(SI),R9 + MOVL DX,128(DI) + MOVL CX,132(DI) + MOVL R8,136(DI) + MOVL R9,140(DI) + MOVD X12,DX + MOVD X7,CX + MOVD X10,R8 + MOVD X4,R9 + XORL 192(SI),DX + XORL 196(SI),CX + XORL 200(SI),R8 + XORL 204(SI),R9 + MOVL DX,192(DI) + MOVL CX,196(DI) + MOVL R8,200(DI) + MOVL R9,204(DI) + PADDL 240(SP),X14 + PADDL 64(SP),X0 + PADDL 128(SP),X5 + PADDL 192(SP),X8 + MOVD X14,DX + MOVD X0,CX + MOVD X5,R8 + MOVD X8,R9 + PSHUFL $0X39,X14,X14 + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X5,X5 + PSHUFL $0X39,X8,X8 + XORL 16(SI),DX + XORL 20(SI),CX + XORL 24(SI),R8 + XORL 28(SI),R9 + MOVL DX,16(DI) + MOVL CX,20(DI) + MOVL R8,24(DI) + MOVL R9,28(DI) + MOVD X14,DX + MOVD X0,CX + MOVD X5,R8 + MOVD X8,R9 + PSHUFL $0X39,X14,X14 + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X5,X5 + PSHUFL $0X39,X8,X8 + XORL 80(SI),DX + XORL 84(SI),CX + XORL 88(SI),R8 + XORL 92(SI),R9 + MOVL DX,80(DI) + MOVL CX,84(DI) + MOVL R8,88(DI) + MOVL R9,92(DI) + MOVD X14,DX + MOVD X0,CX + MOVD X5,R8 + MOVD X8,R9 + PSHUFL $0X39,X14,X14 + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X5,X5 + PSHUFL $0X39,X8,X8 + XORL 144(SI),DX + XORL 148(SI),CX + XORL 152(SI),R8 + XORL 156(SI),R9 + MOVL DX,144(DI) + MOVL CX,148(DI) + MOVL R8,152(DI) + MOVL R9,156(DI) + MOVD X14,DX + MOVD X0,CX + MOVD X5,R8 + MOVD X8,R9 + XORL 208(SI),DX + XORL 212(SI),CX + XORL 216(SI),R8 + XORL 220(SI),R9 + MOVL DX,208(DI) + MOVL CX,212(DI) + MOVL R8,216(DI) + MOVL R9,220(DI) + PADDL 288(SP),X15 + PADDL 304(SP),X11 + PADDL 80(SP),X1 + PADDL 144(SP),X6 + MOVD X15,DX + MOVD X11,CX + MOVD X1,R8 + MOVD X6,R9 + PSHUFL $0X39,X15,X15 + PSHUFL $0X39,X11,X11 + PSHUFL $0X39,X1,X1 + PSHUFL $0X39,X6,X6 + XORL 32(SI),DX + XORL 36(SI),CX + XORL 40(SI),R8 + XORL 44(SI),R9 + MOVL DX,32(DI) + MOVL CX,36(DI) + MOVL R8,40(DI) + MOVL R9,44(DI) + MOVD X15,DX + MOVD X11,CX + MOVD X1,R8 + MOVD X6,R9 + PSHUFL $0X39,X15,X15 + PSHUFL $0X39,X11,X11 + PSHUFL $0X39,X1,X1 + PSHUFL $0X39,X6,X6 + XORL 96(SI),DX + XORL 100(SI),CX + XORL 104(SI),R8 + XORL 108(SI),R9 + MOVL DX,96(DI) + MOVL CX,100(DI) + MOVL R8,104(DI) + MOVL R9,108(DI) + MOVD X15,DX + MOVD X11,CX + MOVD X1,R8 + MOVD X6,R9 + PSHUFL $0X39,X15,X15 + PSHUFL $0X39,X11,X11 + PSHUFL $0X39,X1,X1 + PSHUFL $0X39,X6,X6 + XORL 160(SI),DX + XORL 164(SI),CX + XORL 168(SI),R8 + XORL 172(SI),R9 + MOVL DX,160(DI) + MOVL CX,164(DI) + MOVL R8,168(DI) + MOVL R9,172(DI) + MOVD X15,DX + MOVD X11,CX + MOVD X1,R8 + MOVD X6,R9 + XORL 224(SI),DX + XORL 228(SI),CX + XORL 232(SI),R8 + XORL 236(SI),R9 + MOVL DX,224(DI) + MOVL CX,228(DI) + MOVL R8,232(DI) + MOVL R9,236(DI) + PADDL 160(SP),X13 + PADDL 208(SP),X9 + PADDL 256(SP),X3 + PADDL 96(SP),X2 + MOVD X13,DX + MOVD X9,CX + MOVD X3,R8 + MOVD X2,R9 + PSHUFL $0X39,X13,X13 + PSHUFL $0X39,X9,X9 + PSHUFL $0X39,X3,X3 + PSHUFL $0X39,X2,X2 + XORL 48(SI),DX + XORL 52(SI),CX + XORL 56(SI),R8 + XORL 60(SI),R9 + MOVL DX,48(DI) + MOVL CX,52(DI) + MOVL R8,56(DI) + MOVL R9,60(DI) + MOVD X13,DX + MOVD X9,CX + MOVD X3,R8 + MOVD X2,R9 + PSHUFL $0X39,X13,X13 + PSHUFL $0X39,X9,X9 + PSHUFL $0X39,X3,X3 + PSHUFL $0X39,X2,X2 + XORL 112(SI),DX + XORL 116(SI),CX + XORL 
120(SI),R8 + XORL 124(SI),R9 + MOVL DX,112(DI) + MOVL CX,116(DI) + MOVL R8,120(DI) + MOVL R9,124(DI) + MOVD X13,DX + MOVD X9,CX + MOVD X3,R8 + MOVD X2,R9 + PSHUFL $0X39,X13,X13 + PSHUFL $0X39,X9,X9 + PSHUFL $0X39,X3,X3 + PSHUFL $0X39,X2,X2 + XORL 176(SI),DX + XORL 180(SI),CX + XORL 184(SI),R8 + XORL 188(SI),R9 + MOVL DX,176(DI) + MOVL CX,180(DI) + MOVL R8,184(DI) + MOVL R9,188(DI) + MOVD X13,DX + MOVD X9,CX + MOVD X3,R8 + MOVD X2,R9 + XORL 240(SI),DX + XORL 244(SI),CX + XORL 248(SI),R8 + XORL 252(SI),R9 + MOVL DX,240(DI) + MOVL CX,244(DI) + MOVL R8,248(DI) + MOVL R9,252(DI) + MOVQ 352(SP),R9 + SUBQ $256,R9 + ADDQ $256,SI + ADDQ $256,DI + CMPQ R9,$256 + JAE BYTESATLEAST256 + CMPQ R9,$0 + JBE DONE + BYTESBETWEEN1AND255: + CMPQ R9,$64 + JAE NOCOPY + MOVQ DI,DX + LEAQ 360(SP),DI + MOVQ R9,CX + REP; MOVSB + LEAQ 360(SP),DI + LEAQ 360(SP),SI + NOCOPY: + MOVQ R9,352(SP) + MOVOA 48(SP),X0 + MOVOA 0(SP),X1 + MOVOA 16(SP),X2 + MOVOA 32(SP),X3 + MOVOA X1,X4 + MOVQ $20,CX + MAINLOOP2: + PADDL X0,X4 + MOVOA X0,X5 + MOVOA X4,X6 + PSLLL $7,X4 + PSRLL $25,X6 + PXOR X4,X3 + PXOR X6,X3 + PADDL X3,X5 + MOVOA X3,X4 + MOVOA X5,X6 + PSLLL $9,X5 + PSRLL $23,X6 + PXOR X5,X2 + PSHUFL $0X93,X3,X3 + PXOR X6,X2 + PADDL X2,X4 + MOVOA X2,X5 + MOVOA X4,X6 + PSLLL $13,X4 + PSRLL $19,X6 + PXOR X4,X1 + PSHUFL $0X4E,X2,X2 + PXOR X6,X1 + PADDL X1,X5 + MOVOA X3,X4 + MOVOA X5,X6 + PSLLL $18,X5 + PSRLL $14,X6 + PXOR X5,X0 + PSHUFL $0X39,X1,X1 + PXOR X6,X0 + PADDL X0,X4 + MOVOA X0,X5 + MOVOA X4,X6 + PSLLL $7,X4 + PSRLL $25,X6 + PXOR X4,X1 + PXOR X6,X1 + PADDL X1,X5 + MOVOA X1,X4 + MOVOA X5,X6 + PSLLL $9,X5 + PSRLL $23,X6 + PXOR X5,X2 + PSHUFL $0X93,X1,X1 + PXOR X6,X2 + PADDL X2,X4 + MOVOA X2,X5 + MOVOA X4,X6 + PSLLL $13,X4 + PSRLL $19,X6 + PXOR X4,X3 + PSHUFL $0X4E,X2,X2 + PXOR X6,X3 + PADDL X3,X5 + MOVOA X1,X4 + MOVOA X5,X6 + PSLLL $18,X5 + PSRLL $14,X6 + PXOR X5,X0 + PSHUFL $0X39,X3,X3 + PXOR X6,X0 + PADDL X0,X4 + MOVOA X0,X5 + MOVOA X4,X6 + PSLLL $7,X4 + PSRLL $25,X6 + PXOR X4,X3 + PXOR X6,X3 + PADDL X3,X5 + MOVOA X3,X4 + MOVOA X5,X6 + PSLLL $9,X5 + PSRLL $23,X6 + PXOR X5,X2 + PSHUFL $0X93,X3,X3 + PXOR X6,X2 + PADDL X2,X4 + MOVOA X2,X5 + MOVOA X4,X6 + PSLLL $13,X4 + PSRLL $19,X6 + PXOR X4,X1 + PSHUFL $0X4E,X2,X2 + PXOR X6,X1 + PADDL X1,X5 + MOVOA X3,X4 + MOVOA X5,X6 + PSLLL $18,X5 + PSRLL $14,X6 + PXOR X5,X0 + PSHUFL $0X39,X1,X1 + PXOR X6,X0 + PADDL X0,X4 + MOVOA X0,X5 + MOVOA X4,X6 + PSLLL $7,X4 + PSRLL $25,X6 + PXOR X4,X1 + PXOR X6,X1 + PADDL X1,X5 + MOVOA X1,X4 + MOVOA X5,X6 + PSLLL $9,X5 + PSRLL $23,X6 + PXOR X5,X2 + PSHUFL $0X93,X1,X1 + PXOR X6,X2 + PADDL X2,X4 + MOVOA X2,X5 + MOVOA X4,X6 + PSLLL $13,X4 + PSRLL $19,X6 + PXOR X4,X3 + PSHUFL $0X4E,X2,X2 + PXOR X6,X3 + SUBQ $4,CX + PADDL X3,X5 + MOVOA X1,X4 + MOVOA X5,X6 + PSLLL $18,X5 + PXOR X7,X7 + PSRLL $14,X6 + PXOR X5,X0 + PSHUFL $0X39,X3,X3 + PXOR X6,X0 + JA MAINLOOP2 + PADDL 48(SP),X0 + PADDL 0(SP),X1 + PADDL 16(SP),X2 + PADDL 32(SP),X3 + MOVD X0,CX + MOVD X1,R8 + MOVD X2,R9 + MOVD X3,AX + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X1,X1 + PSHUFL $0X39,X2,X2 + PSHUFL $0X39,X3,X3 + XORL 0(SI),CX + XORL 48(SI),R8 + XORL 32(SI),R9 + XORL 16(SI),AX + MOVL CX,0(DI) + MOVL R8,48(DI) + MOVL R9,32(DI) + MOVL AX,16(DI) + MOVD X0,CX + MOVD X1,R8 + MOVD X2,R9 + MOVD X3,AX + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X1,X1 + PSHUFL $0X39,X2,X2 + PSHUFL $0X39,X3,X3 + XORL 20(SI),CX + XORL 4(SI),R8 + XORL 52(SI),R9 + XORL 36(SI),AX + MOVL CX,20(DI) + MOVL R8,4(DI) + MOVL R9,52(DI) + MOVL AX,36(DI) + MOVD X0,CX + MOVD X1,R8 + MOVD X2,R9 + MOVD X3,AX + PSHUFL $0X39,X0,X0 + PSHUFL $0X39,X1,X1 + PSHUFL 
$0X39,X2,X2 + PSHUFL $0X39,X3,X3 + XORL 40(SI),CX + XORL 24(SI),R8 + XORL 8(SI),R9 + XORL 56(SI),AX + MOVL CX,40(DI) + MOVL R8,24(DI) + MOVL R9,8(DI) + MOVL AX,56(DI) + MOVD X0,CX + MOVD X1,R8 + MOVD X2,R9 + MOVD X3,AX + XORL 60(SI),CX + XORL 44(SI),R8 + XORL 28(SI),R9 + XORL 12(SI),AX + MOVL CX,60(DI) + MOVL R8,44(DI) + MOVL R9,28(DI) + MOVL AX,12(DI) + MOVQ 352(SP),R9 + MOVL 16(SP),CX + MOVL 36 (SP),R8 + ADDQ $1,CX + SHLQ $32,R8 + ADDQ R8,CX + MOVQ CX,R8 + SHRQ $32,R8 + MOVL CX,16(SP) + MOVL R8, 36 (SP) + CMPQ R9,$64 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI,SI + MOVQ DX,DI + MOVQ R9,CX + REP; MOVSB + BYTESATLEAST64: + DONE: + MOVQ R12,SP + RET + BYTESATLEAST65: + SUBQ $64,R9 + ADDQ $64,DI + ADDQ $64,SI + JMP BYTESBETWEEN1AND255 diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go new file mode 100644 index 0000000..9bfc092 --- /dev/null +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go @@ -0,0 +1,199 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package salsa + +// Core208 applies the Salsa20/8 core function to the 64-byte array in and puts +// the result into the 64-byte array out. The input and output may be the same array. +func Core208(out *[64]byte, in *[64]byte) { + j0 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + j1 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24 + j2 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24 + j3 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24 + j4 := uint32(in[16]) | uint32(in[17])<<8 | uint32(in[18])<<16 | uint32(in[19])<<24 + j5 := uint32(in[20]) | uint32(in[21])<<8 | uint32(in[22])<<16 | uint32(in[23])<<24 + j6 := uint32(in[24]) | uint32(in[25])<<8 | uint32(in[26])<<16 | uint32(in[27])<<24 + j7 := uint32(in[28]) | uint32(in[29])<<8 | uint32(in[30])<<16 | uint32(in[31])<<24 + j8 := uint32(in[32]) | uint32(in[33])<<8 | uint32(in[34])<<16 | uint32(in[35])<<24 + j9 := uint32(in[36]) | uint32(in[37])<<8 | uint32(in[38])<<16 | uint32(in[39])<<24 + j10 := uint32(in[40]) | uint32(in[41])<<8 | uint32(in[42])<<16 | uint32(in[43])<<24 + j11 := uint32(in[44]) | uint32(in[45])<<8 | uint32(in[46])<<16 | uint32(in[47])<<24 + j12 := uint32(in[48]) | uint32(in[49])<<8 | uint32(in[50])<<16 | uint32(in[51])<<24 + j13 := uint32(in[52]) | uint32(in[53])<<8 | uint32(in[54])<<16 | uint32(in[55])<<24 + j14 := uint32(in[56]) | uint32(in[57])<<8 | uint32(in[58])<<16 | uint32(in[59])<<24 + j15 := uint32(in[60]) | uint32(in[61])<<8 | uint32(in[62])<<16 | uint32(in[63])<<24 + + x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8 + x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15 + + for i := 0; i < 8; i += 2 { + u := x0 + x12 + x4 ^= u<<7 | u>>(32-7) + u = x4 + x0 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x4 + x12 ^= u<<13 | u>>(32-13) + u = x12 + x8 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x1 + x9 ^= u<<7 | u>>(32-7) + u = x9 + x5 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x9 + x1 ^= u<<13 | u>>(32-13) + u = x1 + x13 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x6 + x14 ^= u<<7 | u>>(32-7) + u = x14 + x10 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x14 + x6 ^= u<<13 | u>>(32-13) + u = x6 + x2 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x11 + x3 ^= u<<7 | u>>(32-7) + u = x3 + x15 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x3 + x11 ^= u<<13 | 
u>>(32-13) + u = x11 + x7 + x15 ^= u<<18 | u>>(32-18) +
+ u = x0 + x3 + x1 ^= u<<7 | u>>(32-7) + u = x1 + x0 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x1 + x3 ^= u<<13 | u>>(32-13) + u = x3 + x2 + x0 ^= u<<18 | u>>(32-18) +
+ u = x5 + x4 + x6 ^= u<<7 | u>>(32-7) + u = x6 + x5 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x6 + x4 ^= u<<13 | u>>(32-13) + u = x4 + x7 + x5 ^= u<<18 | u>>(32-18) +
+ u = x10 + x9 + x11 ^= u<<7 | u>>(32-7) + u = x11 + x10 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x11 + x9 ^= u<<13 | u>>(32-13) + u = x9 + x8 + x10 ^= u<<18 | u>>(32-18) +
+ u = x15 + x14 + x12 ^= u<<7 | u>>(32-7) + u = x12 + x15 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x12 + x14 ^= u<<13 | u>>(32-13) + u = x14 + x13 + x15 ^= u<<18 | u>>(32-18) + } + x0 += j0 + x1 += j1 + x2 += j2 + x3 += j3 + x4 += j4 + x5 += j5 + x6 += j6 + x7 += j7 + x8 += j8 + x9 += j9 + x10 += j10 + x11 += j11 + x12 += j12 + x13 += j13 + x14 += j14 + x15 += j15 +
+ out[0] = byte(x0) + out[1] = byte(x0 >> 8) + out[2] = byte(x0 >> 16) + out[3] = byte(x0 >> 24) +
+ out[4] = byte(x1) + out[5] = byte(x1 >> 8) + out[6] = byte(x1 >> 16) + out[7] = byte(x1 >> 24) +
+ out[8] = byte(x2) + out[9] = byte(x2 >> 8) + out[10] = byte(x2 >> 16) + out[11] = byte(x2 >> 24) +
+ out[12] = byte(x3) + out[13] = byte(x3 >> 8) + out[14] = byte(x3 >> 16) + out[15] = byte(x3 >> 24) +
+ out[16] = byte(x4) + out[17] = byte(x4 >> 8) + out[18] = byte(x4 >> 16) + out[19] = byte(x4 >> 24) +
+ out[20] = byte(x5) + out[21] = byte(x5 >> 8) + out[22] = byte(x5 >> 16) + out[23] = byte(x5 >> 24) +
+ out[24] = byte(x6) + out[25] = byte(x6 >> 8) + out[26] = byte(x6 >> 16) + out[27] = byte(x6 >> 24) +
+ out[28] = byte(x7) + out[29] = byte(x7 >> 8) + out[30] = byte(x7 >> 16) + out[31] = byte(x7 >> 24) +
+ out[32] = byte(x8) + out[33] = byte(x8 >> 8) + out[34] = byte(x8 >> 16) + out[35] = byte(x8 >> 24) +
+ out[36] = byte(x9) + out[37] = byte(x9 >> 8) + out[38] = byte(x9 >> 16) + out[39] = byte(x9 >> 24) +
+ out[40] = byte(x10) + out[41] = byte(x10 >> 8) + out[42] = byte(x10 >> 16) + out[43] = byte(x10 >> 24) +
+ out[44] = byte(x11) + out[45] = byte(x11 >> 8) + out[46] = byte(x11 >> 16) + out[47] = byte(x11 >> 24) +
+ out[48] = byte(x12) + out[49] = byte(x12 >> 8) + out[50] = byte(x12 >> 16) + out[51] = byte(x12 >> 24) +
+ out[52] = byte(x13) + out[53] = byte(x13 >> 8) + out[54] = byte(x13 >> 16) + out[55] = byte(x13 >> 24) +
+ out[56] = byte(x14) + out[57] = byte(x14 >> 8) + out[58] = byte(x14 >> 16) + out[59] = byte(x14 >> 24) +
+ out[60] = byte(x15) + out[61] = byte(x15 >> 8) + out[62] = byte(x15 >> 16) + out[63] = byte(x15 >> 24) +}
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go new file mode 100644 index 0000000..f9269c3 --- /dev/null +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go @@ -0,0 +1,23 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +
+// +build amd64,!appengine,!gccgo +
+package salsa +
+// This function is implemented in salsa2020_amd64.s. +
+//go:noescape
+func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) +
+// XORKeyStream crypts bytes from in to out using the given key and counters.
+// In and out must overlap entirely or not at all. Counter
+// contains the raw salsa20 counter bytes (both nonce and block counter).
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { + if len(in) == 0 { + return + } + _ = out[len(in)-1] + salsa2020XORKeyStream(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0]) +} diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go new file mode 100644 index 0000000..22126d1 --- /dev/null +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go @@ -0,0 +1,234 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine gccgo + +package salsa + +const rounds = 20 + +// core applies the Salsa20 core function to 16-byte input in, 32-byte key k, +// and 16-byte constant c, and puts the result into 64-byte array out. +func core(out *[64]byte, in *[16]byte, k *[32]byte, c *[16]byte) { + j0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24 + j1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24 + j2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24 + j3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24 + j4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24 + j5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24 + j6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + j7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24 + j8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24 + j9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24 + j10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24 + j11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24 + j12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24 + j13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24 + j14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24 + j15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24 + + x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8 + x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15 + + for i := 0; i < rounds; i += 2 { + u := x0 + x12 + x4 ^= u<<7 | u>>(32-7) + u = x4 + x0 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x4 + x12 ^= u<<13 | u>>(32-13) + u = x12 + x8 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x1 + x9 ^= u<<7 | u>>(32-7) + u = x9 + x5 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x9 + x1 ^= u<<13 | u>>(32-13) + u = x1 + x13 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x6 + x14 ^= u<<7 | u>>(32-7) + u = x14 + x10 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x14 + x6 ^= u<<13 | u>>(32-13) + u = x6 + x2 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x11 + x3 ^= u<<7 | u>>(32-7) + u = x3 + x15 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x3 + x11 ^= u<<13 | u>>(32-13) + u = x11 + x7 + x15 ^= u<<18 | u>>(32-18) + + u = x0 + x3 + x1 ^= u<<7 | u>>(32-7) + u = x1 + x0 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x1 + x3 ^= u<<13 | u>>(32-13) + u = x3 + x2 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x4 + x6 ^= u<<7 | u>>(32-7) + u = x6 + x5 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x6 + x4 ^= u<<13 | u>>(32-13) + u = x4 + x7 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x9 + x11 ^= u<<7 | u>>(32-7) + u = x11 + x10 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x11 + x9 ^= u<<13 | u>>(32-13) + u = x9 + x8 + x10 ^= u<<18 | 
u>>(32-18) + + u = x15 + x14 + x12 ^= u<<7 | u>>(32-7) + u = x12 + x15 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x12 + x14 ^= u<<13 | u>>(32-13) + u = x14 + x13 + x15 ^= u<<18 | u>>(32-18) + } + x0 += j0 + x1 += j1 + x2 += j2 + x3 += j3 + x4 += j4 + x5 += j5 + x6 += j6 + x7 += j7 + x8 += j8 + x9 += j9 + x10 += j10 + x11 += j11 + x12 += j12 + x13 += j13 + x14 += j14 + x15 += j15 + + out[0] = byte(x0) + out[1] = byte(x0 >> 8) + out[2] = byte(x0 >> 16) + out[3] = byte(x0 >> 24) + + out[4] = byte(x1) + out[5] = byte(x1 >> 8) + out[6] = byte(x1 >> 16) + out[7] = byte(x1 >> 24) + + out[8] = byte(x2) + out[9] = byte(x2 >> 8) + out[10] = byte(x2 >> 16) + out[11] = byte(x2 >> 24) + + out[12] = byte(x3) + out[13] = byte(x3 >> 8) + out[14] = byte(x3 >> 16) + out[15] = byte(x3 >> 24) + + out[16] = byte(x4) + out[17] = byte(x4 >> 8) + out[18] = byte(x4 >> 16) + out[19] = byte(x4 >> 24) + + out[20] = byte(x5) + out[21] = byte(x5 >> 8) + out[22] = byte(x5 >> 16) + out[23] = byte(x5 >> 24) + + out[24] = byte(x6) + out[25] = byte(x6 >> 8) + out[26] = byte(x6 >> 16) + out[27] = byte(x6 >> 24) + + out[28] = byte(x7) + out[29] = byte(x7 >> 8) + out[30] = byte(x7 >> 16) + out[31] = byte(x7 >> 24) + + out[32] = byte(x8) + out[33] = byte(x8 >> 8) + out[34] = byte(x8 >> 16) + out[35] = byte(x8 >> 24) + + out[36] = byte(x9) + out[37] = byte(x9 >> 8) + out[38] = byte(x9 >> 16) + out[39] = byte(x9 >> 24) + + out[40] = byte(x10) + out[41] = byte(x10 >> 8) + out[42] = byte(x10 >> 16) + out[43] = byte(x10 >> 24) + + out[44] = byte(x11) + out[45] = byte(x11 >> 8) + out[46] = byte(x11 >> 16) + out[47] = byte(x11 >> 24) + + out[48] = byte(x12) + out[49] = byte(x12 >> 8) + out[50] = byte(x12 >> 16) + out[51] = byte(x12 >> 24) + + out[52] = byte(x13) + out[53] = byte(x13 >> 8) + out[54] = byte(x13 >> 16) + out[55] = byte(x13 >> 24) + + out[56] = byte(x14) + out[57] = byte(x14 >> 8) + out[58] = byte(x14 >> 16) + out[59] = byte(x14 >> 24) + + out[60] = byte(x15) + out[61] = byte(x15 >> 8) + out[62] = byte(x15 >> 16) + out[63] = byte(x15 >> 24) +} + +// XORKeyStream crypts bytes from in to out using the given key and counters. +// In and out must overlap entirely or not at all. Counter +// contains the raw salsa20 counter bytes (both nonce and block counter). +func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { + var block [64]byte + var counterCopy [16]byte + copy(counterCopy[:], counter[:]) + + for len(in) >= 64 { + core(&block, &counterCopy, key, &Sigma) + for i, x := range block { + out[i] = in[i] ^ x + } + u := uint32(1) + for i := 8; i < 16; i++ { + u += uint32(counterCopy[i]) + counterCopy[i] = byte(u) + u >>= 8 + } + in = in[64:] + out = out[64:] + } + + if len(in) > 0 { + core(&block, &counterCopy, key, &Sigma) + for i, v := range in { + out[i] = v ^ block[i] + } + } +} diff --git a/vendor/golang.org/x/sys/LICENSE b/vendor/golang.org/x/sys/LICENSE new file mode 100644 index 0000000..6a66aea --- /dev/null +++ b/vendor/golang.org/x/sys/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. 
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/golang.org/x/sys/PATENTS b/vendor/golang.org/x/sys/PATENTS
new file mode 100644
index 0000000..7330990
--- /dev/null
+++ b/vendor/golang.org/x/sys/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Go project.
+
+Google hereby grants to You a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this section)
+patent license to make, have made, use, offer to sell, sell, import,
+transfer and otherwise run, modify and propagate the contents of this
+implementation of Go, where such license applies only to those patent
+claims, both currently owned or controlled by Google and acquired in
+the future, licensable by Google that are necessarily infringed by this
+implementation of Go. This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation. If you or your agent or exclusive licensee institute or
+order or agree to the institution of patent litigation against any
+entity (including a cross-claim or counterclaim in a lawsuit) alleging
+that this implementation of Go or any code incorporated within this
+implementation of Go constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of Go
+shall terminate as of the date such litigation is filed.
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
new file mode 100644
index 0000000..3d88f86
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -0,0 +1,38 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cpu implements processor feature detection for
+// various CPU architectures.
+package cpu
+
+// CacheLinePad is used to pad structs to avoid false sharing.
+type CacheLinePad struct{ _ [cacheLineSize]byte }
+
+// X86 contains the supported CPU features of the
+// current X86/AMD64 platform. If the current platform
+// is not X86/AMD64 then all feature flags are false.
+//
+// X86 is padded to avoid false sharing. Further the HasAVX
+// and HasAVX2 are only set if the OS supports XMM and YMM
+// registers in addition to the CPUID feature bit being set.
+var X86 struct {
+	_            CacheLinePad
+	HasAES       bool // AES hardware implementation (AES NI)
+	HasADX       bool // Multi-precision add-carry instruction extensions
+	HasAVX       bool // Advanced vector extension
+	HasAVX2      bool // Advanced vector extension 2
+	HasBMI1      bool // Bit manipulation instruction set 1
+	HasBMI2      bool // Bit manipulation instruction set 2
+	HasERMS      bool // Enhanced REP for MOVSB and STOSB
+	HasFMA       bool // Fused-multiply-add instructions
+	HasOSXSAVE   bool // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
+	HasPCLMULQDQ bool // PCLMULQDQ instruction - most often used for AES-GCM
+	HasPOPCNT    bool // Hamming weight instruction POPCNT.
+	HasSSE2      bool // Streaming SIMD extension 2 (always available on amd64)
+	HasSSE3      bool // Streaming SIMD extension 3
+	HasSSSE3     bool // Supplemental streaming SIMD extension 3
+	HasSSE41     bool // Streaming SIMD extension 4 and 4.1
+	HasSSE42     bool // Streaming SIMD extension 4 and 4.2
+	_            CacheLinePad
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm.go b/vendor/golang.org/x/sys/cpu/cpu_arm.go
new file mode 100644
index 0000000..d93036f
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
new file mode 100644
index 0000000..1d2ab29
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 64
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
new file mode 100644
index 0000000..f7cb469
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+// +build !gccgo
+
+package cpu
+
+// cpuid is implemented in cpu_x86.s for gc compiler
+// and in cpu_gccgo.c for gccgo.
+func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+
+// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler
+// and in cpu_gccgo.c for gccgo.
+func xgetbv() (eax, edx uint32)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo.c b/vendor/golang.org/x/sys/cpu/cpu_gccgo.c
new file mode 100644
index 0000000..e363c7d
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo.c
@@ -0,0 +1,43 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+// +build gccgo
+
+#include <cpuid.h>
+#include <stdint.h>
+
+// Need to wrap __get_cpuid_count because it's declared as static.
+int
+gccgoGetCpuidCount(uint32_t leaf, uint32_t subleaf,
+                   uint32_t *eax, uint32_t *ebx,
+                   uint32_t *ecx, uint32_t *edx)
+{
+	return __get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx);
+}
+
+// xgetbv reads the contents of an XCR (Extended Control Register)
+// specified in the ECX register into registers EDX:EAX.
+// Currently, the only supported value for XCR is 0.
+//
+// TODO: Replace with a better alternative:
+//
+// #include <immintrin.h>
+//
+// #pragma GCC target("xsave")
+//
+// void gccgoXgetbv(uint32_t *eax, uint32_t *edx) {
+//   unsigned long long x = _xgetbv(0);
+//   *eax = x & 0xffffffff;
+//   *edx = (x >> 32) & 0xffffffff;
+// }
+//
+// Note that _xgetbv is defined starting with GCC 8.
+void
+gccgoXgetbv(uint32_t *eax, uint32_t *edx)
+{
+	__asm("  xorl %%ecx, %%ecx\n"
+	      "  xgetbv"
+	    : "=a"(*eax), "=d"(*edx));
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo.go
new file mode 100644
index 0000000..ba49b91
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+// +build gccgo
+
+package cpu
+
+//extern gccgoGetCpuidCount
+func gccgoGetCpuidCount(eaxArg, ecxArg uint32, eax, ebx, ecx, edx *uint32)
+
+func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) {
+	var a, b, c, d uint32
+	gccgoGetCpuidCount(eaxArg, ecxArg, &a, &b, &c, &d)
+	return a, b, c, d
+}
+
+//extern gccgoXgetbv
+func gccgoXgetbv(eax, edx *uint32)
+
+func xgetbv() (eax, edx uint32) {
+	var a, d uint32
+	gccgoXgetbv(&a, &d)
+	return a, d
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
new file mode 100644
index 0000000..6165f12
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+package cpu
+
+const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
new file mode 100644
index 0000000..1269eee
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+package cpu
+
+const cacheLineSize = 32
diff --git a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
new file mode 100644
index 0000000..d10759a
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package cpu
+
+const cacheLineSize = 128
diff --git a/vendor/golang.org/x/sys/cpu/cpu_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_s390x.go
new file mode 100644
index 0000000..684c4f0
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_s390x.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 256
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go
new file mode 100644
index 0000000..71e288b
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go
@@ -0,0 +1,55 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+
+package cpu
+
+const cacheLineSize = 64
+
+func init() {
+	maxID, _, _, _ := cpuid(0, 0)
+
+	if maxID < 1 {
+		return
+	}
+
+	_, _, ecx1, edx1 := cpuid(1, 0)
+	X86.HasSSE2 = isSet(26, edx1)
+
+	X86.HasSSE3 = isSet(0, ecx1)
+	X86.HasPCLMULQDQ = isSet(1, ecx1)
+	X86.HasSSSE3 = isSet(9, ecx1)
+	X86.HasFMA = isSet(12, ecx1)
+	X86.HasSSE41 = isSet(19, ecx1)
+	X86.HasSSE42 = isSet(20, ecx1)
+	X86.HasPOPCNT = isSet(23, ecx1)
+	X86.HasAES = isSet(25, ecx1)
+	X86.HasOSXSAVE = isSet(27, ecx1)
+
+	osSupportsAVX := false
+	// For XGETBV, OSXSAVE bit is required and sufficient.
+	if X86.HasOSXSAVE {
+		eax, _ := xgetbv()
+		// Check if XMM and YMM registers have OS support.
+		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
+	}
+
+	X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
+
+	if maxID < 7 {
+		return
+	}
+
+	_, ebx7, _, _ := cpuid(7, 0)
+	X86.HasBMI1 = isSet(3, ebx7)
+	X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
+	X86.HasBMI2 = isSet(8, ebx7)
+	X86.HasERMS = isSet(9, ebx7)
+	X86.HasADX = isSet(19, ebx7)
+}
+
+func isSet(bitpos uint, value uint32) bool {
+	return value&(1<<bitpos) != 0
+}
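
For reference, a minimal sketch of driving the vendored salsa package (the key, nonce, and message bytes below are illustrative placeholders, not values used anywhere in this change). The 16-byte counter holds the 8-byte nonce followed by the little-endian 64-bit block counter; XORKeyStream works on a private copy of that counter, so the caller's array is left untouched and repeating the same call with the same arguments regenerates the key stream, which is also how decryption works:

    package main

    import (
    	"fmt"

    	"golang.org/x/crypto/salsa20/salsa"
    )

    func main() {
    	var key [32]byte
    	copy(key[:], "an example 32-byte salsa20 key!!") // illustrative key

    	// Bytes 0-7: nonce; bytes 8-15: block counter (starts at zero).
    	var counter [16]byte
    	copy(counter[:8], "8bytenon") // illustrative nonce

    	msg := []byte("attack at dawn")
    	ct := make([]byte, len(msg))
    	salsa.XORKeyStream(ct, msg, &counter, &key)

    	// Same key and counter -> same key stream, so XORing again decrypts.
    	pt := make([]byte, len(ct))
    	salsa.XORKeyStream(pt, ct, &counter, &key)
    	fmt.Printf("%x -> %q\n", ct, pt)
    }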
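
Likewise, a small sketch of consuming the cpu package's feature flags. The flags are populated once by the package's init function via CPUID (and XGETBV for the AVX bits), so callers only read them; on non-x86 platforms every flag is simply false. The cipher-selection logic here is an illustrative assumption, not part of this change:

    package main

    import (
    	"fmt"

    	"golang.org/x/sys/cpu"
    )

    func main() {
    	// AES-NI plus PCLMULQDQ is the usual prerequisite for a fast
    	// hardware AES-GCM path; otherwise a software cipher is preferable.
    	if cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ {
    		fmt.Println("hardware AES-GCM path available")
    	} else {
    		fmt.Println("no AES acceleration: prefer a software cipher")
    	}
    	fmt.Println("AVX2 supported:", cpu.X86.HasAVX2)
    }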