lib/connections: Use our own fork of kcp (fixes #4063)

This updates kcp and uses our own fork which: 1. Keys sessions not just by remote address, but by remote address + conversation id 2. Allows not to close connections that were passed directly to the library. 3. Resets cache key if the session gets terminated. GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4339 LGTM: calmh
2024-11-16 10:28:49 -07:00 · 2017-09-02 06:04:35 +00:00 · 2017-09-02 06:04:35 +00:00 · cbcc3ea132
commit cbcc3ea132
parent ab132ff6fe
19 changed files with 1551 additions and 188 deletions
--- a/lib/connections/kcp_dial.go
+++ b/lib/connections/kcp_dial.go
@ -11,9 +11,9 @@ import (
 	"net/url"
 	"time"

+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/protocol"
-	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"
 )

@ -38,7 +38,7 @@ func (d *kcpDialer) Dial(id protocol.DeviceID, uri *url.URL) (internalConn, erro
 	// Try to dial via an existing listening connection
 	// giving better changes punching through NAT.
 	if f := getDialingFilter(); f != nil {
-		conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}))
+		conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}), false)
 		l.Debugf("dial %s using existing conn on %s", uri.String(), conn.LocalAddr())
 	} else {
 		conn, err = kcp.DialWithOptions(uri.Host, nil, 0, 0)
--- a/lib/connections/kcp_listen.go
+++ b/lib/connections/kcp_listen.go
@ -14,11 +14,11 @@ import (
 	"sync"
 	"time"

+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/AudriusButkevicius/pfilter"
 	"github.com/ccding/go-stun/stun"
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/nat"
-	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"
 )

--- a/lib/protocol/benchmark_test.go
+++ b/lib/protocol/benchmark_test.go
@ -8,8 +8,8 @@ import (
 	"net"
 	"testing"

+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/dialer"
-	"github.com/xtaci/kcp-go"
 )

 func BenchmarkRequestsRawTCP(b *testing.B) {
--- a/vendor/github.com/AudriusButkevicius/kcp-go/LICENSE
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/LICENSE
--- a/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go
@ -6,6 +6,8 @@ import (
 	"crypto/des"
 	"crypto/sha1"

+	"github.com/templexxx/xor"
+
 	"golang.org/x/crypto/blowfish"
 	"golang.org/x/crypto/cast5"
 	"golang.org/x/crypto/pbkdf2"
@ -218,8 +220,8 @@ func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
-func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }

 type noneBlockCrypt struct{}

@ -239,11 +241,11 @@ func encrypt(block cipher.Block, dst, src, buf []byte) {
 	n := len(src) / blocksize
 	base := 0
 	for i := 0; i < n; i++ {
-		xorWords(dst[base:], src[base:], tbl)
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		block.Encrypt(tbl, dst[base:])
 		base += blocksize
 	}
-	xorBytes(dst[base:], src[base:], tbl)
+	xor.BytesSrc0(dst[base:], src[base:], tbl)
 }

 func decrypt(block cipher.Block, dst, src, buf []byte) {
@ -255,9 +257,9 @@ func decrypt(block cipher.Block, dst, src, buf []byte) {
 	base := 0
 	for i := 0; i < n; i++ {
 		block.Encrypt(next, src[base:])
-		xorWords(dst[base:], src[base:], tbl)
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		tbl, next = next, tbl
 		base += blocksize
 	}
-	xorBytes(dst[base:], src[base:], tbl)
+	xor.BytesSrc0(dst[base:], src[base:], tbl)
 }
--- a/vendor/github.com/AudriusButkevicius/kcp-go/fec.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/fec.go
@ -22,8 +22,8 @@ type (
 		data  []byte
 	}

-	// FECDecoder for decoding incoming packets
-	FECDecoder struct {
+	// fecDecoder for decoding incoming packets
+	fecDecoder struct {
 		rxlimit      int // queue size limit
 		dataShards   int
 		parityShards int
@ -39,7 +39,7 @@ type (
 	}
 )

-func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
+func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
 	if dataShards <= 0 || parityShards <= 0 {
 		return nil
 	}
@ -47,7 +47,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
 		return nil
 	}

-	fec := new(FECDecoder)
+	fec := new(fecDecoder)
 	fec.rxlimit = rxlimit
 	fec.dataShards = dataShards
 	fec.parityShards = parityShards
@ -63,7 +63,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
 }

 // decodeBytes a fec packet
-func (dec *FECDecoder) decodeBytes(data []byte) fecPacket {
+func (dec *fecDecoder) decodeBytes(data []byte) fecPacket {
 	var pkt fecPacket
 	pkt.seqid = binary.LittleEndian.Uint32(data)
 	pkt.flag = binary.LittleEndian.Uint16(data[4:])
@ -74,8 +74,8 @@ func (dec *FECDecoder) decodeBytes(data []byte) fecPacket {
 	return pkt
 }

-// Decode a fec packet
-func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) {
+// decode a fec packet
+func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
 	// insertion
 	n := len(dec.rx) - 1
 	insertIdx := 0
@ -179,7 +179,7 @@ func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) {
 }

 // free a range of fecPacket, and zero for GC recycling
-func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
+func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
 	for i := first; i < first+n; i++ { // free
 		xmitBuf.Put(q[i].data)
 	}
@ -191,8 +191,8 @@ func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
 }

 type (
-	// FECEncoder for encoding outgoing packets
-	FECEncoder struct {
+	// fecEncoder for encoding outgoing packets
+	fecEncoder struct {
 		dataShards   int
 		parityShards int
 		shardSize    int
@ -214,11 +214,11 @@ type (
 	}
 )

-func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder {
+func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
 	if dataShards <= 0 || parityShards <= 0 {
 		return nil
 	}
-	fec := new(FECEncoder)
+	fec := new(fecEncoder)
 	fec.dataShards = dataShards
 	fec.parityShards = parityShards
 	fec.shardSize = dataShards + parityShards
@ -241,9 +241,9 @@ func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder {
 	return fec
 }

-// Encode the packet, output parity shards if we have enough datashards
-// the content of returned parityshards will change in next Encode
-func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) {
+// encode the packet, output parity shards if we have enough datashards
+// the content of returned parityshards will change in next encode
+func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 	enc.markData(b[enc.headerOffset:])
 	binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:])))

@ -290,13 +290,13 @@ func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) {
 	return
 }

-func (enc *FECEncoder) markData(data []byte) {
+func (enc *fecEncoder) markData(data []byte) {
 	binary.LittleEndian.PutUint32(data, enc.next)
 	binary.LittleEndian.PutUint16(data[4:], typeData)
 	enc.next++
 }

-func (enc *FECEncoder) markFEC(data []byte) {
+func (enc *fecEncoder) markFEC(data []byte) {
 	binary.LittleEndian.PutUint32(data, enc.next)
 	binary.LittleEndian.PutUint16(data[4:], typeFEC)
 	enc.next = (enc.next + 1) % enc.paws
--- a/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go
@ -30,8 +30,8 @@ const (
 	IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window
 )

-// Output is a closure which captures conn and calls conn.Write
-type Output func(buf []byte, size int)
+// output_callback is a prototype which ought capture conn and call conn.Write
+type output_callback func(buf []byte, size int)

 /* encode 8 bits unsigned int */
 func ikcp_encode8u(p []byte, c byte) []byte {
@ -91,8 +91,8 @@ func _itimediff(later, earlier uint32) int32 {
 	return (int32)(later - earlier)
 }

-// Segment defines a KCP segment
-type Segment struct {
+// segment defines a KCP segment
+type segment struct {
 	conv     uint32
 	cmd      uint8
 	frg      uint8
@ -108,11 +108,11 @@ type Segment struct {
 }

 // encode a segment into buffer
-func (seg *Segment) encode(ptr []byte) []byte {
+func (seg *segment) encode(ptr []byte) []byte {
 	ptr = ikcp_encode32u(ptr, seg.conv)
-	ptr = ikcp_encode8u(ptr, uint8(seg.cmd))
-	ptr = ikcp_encode8u(ptr, uint8(seg.frg))
-	ptr = ikcp_encode16u(ptr, uint16(seg.wnd))
+	ptr = ikcp_encode8u(ptr, seg.cmd)
+	ptr = ikcp_encode8u(ptr, seg.frg)
+	ptr = ikcp_encode16u(ptr, seg.wnd)
 	ptr = ikcp_encode32u(ptr, seg.ts)
 	ptr = ikcp_encode32u(ptr, seg.sn)
 	ptr = ikcp_encode32u(ptr, seg.una)
@ -137,15 +137,15 @@ type KCP struct {
 	fastresend     int32
 	nocwnd, stream int32

-	snd_queue []Segment
-	rcv_queue []Segment
-	snd_buf   []Segment
-	rcv_buf   []Segment
+	snd_queue []segment
+	rcv_queue []segment
+	snd_buf   []segment
+	rcv_buf   []segment

 	acklist []ackItem

 	buffer []byte
-	output Output
+	output output_callback
 }

 type ackItem struct {
@ -155,7 +155,7 @@ type ackItem struct {

 // NewKCP create a new kcp control object, 'conv' must equal in two endpoint
 // from the same connection.
-func NewKCP(conv uint32, output Output) *KCP {
+func NewKCP(conv uint32, output output_callback) *KCP {
 	kcp := new(KCP)
 	kcp.conv = conv
 	kcp.snd_wnd = IKCP_WND_SND
@ -175,13 +175,13 @@ func NewKCP(conv uint32, output Output) *KCP {
 }

 // newSegment creates a KCP segment
-func (kcp *KCP) newSegment(size int) (seg Segment) {
+func (kcp *KCP) newSegment(size int) (seg segment) {
 	seg.data = xmitBuf.Get().([]byte)[:size]
 	return
 }

 // delSegment recycles a KCP segment
-func (kcp *KCP) delSegment(seg Segment) {
+func (kcp *KCP) delSegment(seg segment) {
 	xmitBuf.Put(seg.data)
 }

@ -384,7 +384,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
 		if sn == seg.sn {
 			kcp.delSegment(*seg)
 			copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
-			kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{}
+			kcp.snd_buf[len(kcp.snd_buf)-1] = segment{}
 			kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
 			break
 		}
@ -430,7 +430,7 @@ func (kcp *KCP) ack_push(sn, ts uint32) {
 	kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
 }

-func (kcp *KCP) parse_data(newseg Segment) {
+func (kcp *KCP) parse_data(newseg segment) {
 	sn := newseg.sn
 	if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
 		_itimediff(sn, kcp.rcv_nxt) < 0 {
@ -458,7 +458,7 @@ func (kcp *KCP) parse_data(newseg Segment) {
 		if insert_idx == n+1 {
 			kcp.rcv_buf = append(kcp.rcv_buf, newseg)
 		} else {
-			kcp.rcv_buf = append(kcp.rcv_buf, Segment{})
+			kcp.rcv_buf = append(kcp.rcv_buf, segment{})
 			copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
 			kcp.rcv_buf[insert_idx] = newseg
 		}
@ -625,7 +625,7 @@ func (kcp *KCP) wnd_unused() uint16 {

 // flush pending data
 func (kcp *KCP) flush(ackOnly bool) {
-	var seg Segment
+	var seg segment
 	seg.conv = kcp.conv
 	seg.cmd = IKCP_CMD_ACK
 	seg.wnd = kcp.wnd_unused()
@ -989,10 +989,10 @@ func (kcp *KCP) WaitSnd() int {
 }

 // remove front n elements from queue
-func (kcp *KCP) remove_front(q []Segment, n int) []Segment {
+func (kcp *KCP) remove_front(q []segment, n int) []segment {
 	newn := copy(q, q[n:])
 	for i := newn; i < len(q); i++ {
-		q[i] = Segment{} // manual set nil for GC
+		q[i] = segment{} // manual set nil for GC
 	}
 	return q[:newn]
 }
--- a/vendor/github.com/AudriusButkevicius/kcp-go/sess.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/sess.go
@ -3,6 +3,7 @@ package kcp
 import (
 	"crypto/rand"
 	"encoding/binary"
+	"fmt"
 	"hash/crc32"
 	"io"
 	"net"
@ -54,9 +55,6 @@ var (
 	// global packet buffer
 	// shared among sending/receiving/FEC
 	xmitBuf sync.Pool
-
-	// monotonic session id
-	sid uint32
 )

 func init() {
@ -68,8 +66,9 @@ func init() {
 type (
 	// UDPSession defines a KCP session implemented by UDP
 	UDPSession struct {
-		sid   uint32         // session id(monotonic)
+		updaterIdx int            // record slice index in updater
 		conn       net.PacketConn // the underlying packet connection
+		closeConn  bool           // Should we close the underlying conn once UDPSession is closed.
 		kcp        *KCP           // KCP ARQ protocol
 		l          *Listener      // point to the Listener if it's accepted by Listener
 		block      BlockCrypt     // block encryption
@ -82,22 +81,23 @@ type (
 		ext []byte

 		// FEC
-		fecDecoder *FECDecoder
-		fecEncoder *FECEncoder
+		fecDecoder *fecDecoder
+		fecEncoder *fecEncoder

 		// settings
 		remote     net.Addr  // remote peer address
 		rd         time.Time // read deadline
 		wd         time.Time // write deadline
 		headerSize int       // the overall header size added before KCP frame
-		updateInterval time.Duration // interval in seconds to call kcp.flush()
 		ackNoDelay bool      // send ack immediately for each incoming packet
 		writeDelay bool      // delay kcp.flush() for Write() for bulk transfer
+		dup        int       // duplicate udp packets

 		// notifications
 		die          chan struct{} // notify session has Closed
 		chReadEvent  chan struct{} // notify Read() can be called without blocking
 		chWriteEvent chan struct{} // notify Write() can be called without blocking
+		chErrorEvent chan error    // notify Read() have an error

 		isClosed bool // flag the session has Closed
 		mu       sync.Mutex
@ -113,14 +113,15 @@ type (
 )

 // newUDPSession create a new udp session for client or server
-func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
+func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt, closeConn bool) *UDPSession {
 	sess := new(UDPSession)
-	sess.sid = atomic.AddUint32(&sid, 1)
 	sess.die = make(chan struct{})
 	sess.chReadEvent = make(chan struct{}, 1)
 	sess.chWriteEvent = make(chan struct{}, 1)
+	sess.chErrorEvent = make(chan error, 1)
 	sess.remote = remote
 	sess.conn = conn
+	sess.closeConn = closeConn
 	sess.l = l
 	sess.block = block
 	sess.recvbuf = make([]byte, mtuLimit)
@ -232,6 +233,11 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 		case <-s.chReadEvent:
 		case <-c:
 		case <-s.die:
+		case err = <-s.chErrorEvent:
+			if timeout != nil {
+				timeout.Stop()
+			}
+			return n, err
 		}

 		if timeout != nil {
@ -299,9 +305,11 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {

 // Close closes the connection.
 func (s *UDPSession) Close() error {
+	// remove this session from updater & listener(if necessary)
 	updater.removeSession(s)
 	if s.l != nil { // notify listener
-		s.l.closeSession(s.remote)
+		key := fmt.Sprintf("%s/%d", s.remote.String(), s.kcp.conv)
+		s.l.closeSession(key)
 	}

 	s.mu.Lock()
@ -312,7 +320,7 @@ func (s *UDPSession) Close() error {
 	close(s.die)
 	s.isClosed = true
 	atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0))
-	if s.l == nil { // client socket close
+	if s.l == nil && s.closeConn { // client socket close
 		return s.conn.Close()
 	}
 	return nil
@ -393,12 +401,19 @@ func (s *UDPSession) SetACKNoDelay(nodelay bool) {
 	s.ackNoDelay = nodelay
 }

+// SetDUP duplicates udp packets for kcp output, for testing purpose only
+func (s *UDPSession) SetDUP(dup int) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.dup = dup
+}
+
 // SetNoDelay calls nodelay() of kcp
+// https://github.com/skywind3000/kcp/blob/master/README.en.md#protocol-configuration
 func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.kcp.NoDelay(nodelay, interval, resend, nc)
-	s.updateInterval = time.Duration(interval) * time.Millisecond
 }

 // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
@ -406,8 +421,8 @@ func (s *UDPSession) SetDSCP(dscp int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	if s.l == nil {
-		if nc, ok := s.conn.(*ConnectedUDPConn); ok {
-			return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2)
+		if nc, ok := s.conn.(*connectedUDPConn); ok {
+			return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2)
 		} else if nc, ok := s.conn.(net.Conn); ok {
 			return ipv4.NewConn(nc).SetTOS(dscp << 2)
 		}
@ -449,26 +464,25 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
 func (s *UDPSession) output(buf []byte) {
 	var ecc [][]byte

-	// extend buf's header space
+	// 0. extend buf's header space(if necessary)
 	ext := buf
 	if s.headerSize > 0 {
 		ext = s.ext[:s.headerSize+len(buf)]
 		copy(ext[s.headerSize:], buf)
 	}

-	// FEC stage
+	// 1. FEC encoding
 	if s.fecEncoder != nil {
-		ecc = s.fecEncoder.Encode(ext)
+		ecc = s.fecEncoder.encode(ext)
 	}

-	// encryption stage
+	// 2&3. crc32 & encryption
 	if s.block != nil {
 		io.ReadFull(rand.Reader, ext[:nonceSize])
 		checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:])
 		binary.LittleEndian.PutUint32(ext[nonceSize:], checksum)
 		s.block.Encrypt(ext, ext)

-		if ecc != nil {
 		for k := range ecc {
 			io.ReadFull(rand.Reader, ecc[k][:nonceSize])
 			checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
@ -476,26 +490,23 @@ func (s *UDPSession) output(buf []byte) {
 			s.block.Encrypt(ecc[k], ecc[k])
 		}
 	}
-	}

-	// WriteTo kernel
+	// 4. WriteTo kernel
 	nbytes := 0
 	npkts := 0
-	// if mrand.Intn(100) < 50 {
+	for i := 0; i < s.dup+1; i++ {
 		if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
 			nbytes += n
 			npkts++
 		}
-	// }
+	}

-	if ecc != nil {
 	for k := range ecc {
 		if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
 			nbytes += n
 			npkts++
 		}
 	}
-	}
 	atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts))
 	atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(nbytes))
 }
@ -507,15 +518,13 @@ func (s *UDPSession) update() (interval time.Duration) {
 	if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
 		s.notifyWriteEvent()
 	}
-	interval = s.updateInterval
+	interval = time.Duration(s.kcp.interval) * time.Millisecond
 	s.mu.Unlock()
 	return
 }

 // GetConv gets conversation id of a session
-func (s *UDPSession) GetConv() uint32 {
-	return s.kcp.conv
-}
+func (s *UDPSession) GetConv() uint32 { return s.kcp.conv }

 func (s *UDPSession) notifyReadEvent() {
 	select {
@ -548,7 +557,7 @@ func (s *UDPSession) kcpInput(data []byte) {
 				fecParityShards++
 			}

-			if recovers := s.fecDecoder.Decode(f); recovers != nil {
+			recovers := s.fecDecoder.decode(f)
 			for _, r := range recovers {
 				if len(r) >= 2 { // must be larger than 2bytes
 					sz := binary.LittleEndian.Uint16(r)
@ -566,7 +575,6 @@ func (s *UDPSession) kcpInput(data []byte) {
 				}
 			}
 		}
-		}

 		// notify reader
 		if n := s.kcp.PeekSize(); n > 0 {
@ -601,7 +609,7 @@ func (s *UDPSession) kcpInput(data []byte) {
 	}
 }

-func (s *UDPSession) receiver(ch chan []byte) {
+func (s *UDPSession) receiver(ch chan<- []byte) {
 	for {
 		data := xmitBuf.Get().([]byte)[:mtuLimit]
 		if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
@ -611,6 +619,7 @@ func (s *UDPSession) receiver(ch chan []byte) {
 				return
 			}
 		} else if err != nil {
+			s.chErrorEvent <- err
 			return
 		} else {
 			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
@ -658,12 +667,12 @@ type (
 		block        BlockCrypt     // block encryption
 		dataShards   int            // FEC data shard
 		parityShards int            // FEC parity shard
-		fecDecoder   *FECDecoder    // FEC mock initialization
+		fecDecoder   *fecDecoder    // FEC mock initialization
 		conn         net.PacketConn // the underlying packet connection

 		sessions        map[string]*UDPSession // all sessions accepted by this Listener
 		chAccepts       chan *UDPSession       // Listen() backlog
-		chSessionClosed chan net.Addr          // session close queue
+		chSessionClosed chan string            // session close queue
 		headerSize      int                    // the overall header size added before KCP frame
 		die             chan struct{}          // notify the listener has closed
 		rd              atomic.Value           // read deadline for Accept()
@ -679,6 +688,10 @@ type (

 // monitor incoming data for all connections of server
 func (l *Listener) monitor() {
+	// cache last session
+	var lastKey string
+	var lastSession *UDPSession
+
 	chPacket := make(chan inPacket, qlen)
 	go l.receiver(chPacket)
 	for {
@ -703,10 +716,6 @@ func (l *Listener) monitor() {
 			}

 			if dataValid {
-				addr := from.String()
-				s, ok := l.sessions[addr]
-				if !ok { // new session
-					if len(l.chAccepts) < cap(l.chAccepts) { // do not let new session overwhelm accept queue
 				var conv uint32
 				convValid := false
 				if l.fecDecoder != nil {
@ -721,27 +730,46 @@ func (l *Listener) monitor() {
 				}

 				if convValid {
-							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
-							s.kcpInput(data)
-							l.sessions[addr] = s
-							l.chAccepts <- s
+					addr := from.String()
+					key := fmt.Sprintf("%s/%d", addr, conv)
+					var s *UDPSession
+					var ok bool
+
+					// packets received from an address always come in batch.
+					// cache the session for next packet, without querying map.
+					if key == lastKey {
+						s, ok = lastSession, true
+					} else if s, ok = l.sessions[key]; ok {
+						lastSession = s
+						lastKey = addr
 					}
+
+					if !ok { // new session
+						if len(l.chAccepts) < cap(l.chAccepts) && len(l.sessions) < 4096 { // do not let new session overwhelm accept queue and connection count
+							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block, false)
+							s.kcpInput(data)
+							l.sessions[key] = s
+							l.chAccepts <- s
 						}
 					} else {
 						s.kcpInput(data)
 					}
 				}
+			}

 			xmitBuf.Put(raw)
-		case deadlink := <-l.chSessionClosed:
-			delete(l.sessions, deadlink.String())
+		case key := <-l.chSessionClosed:
+			if key == lastKey {
+				lastKey = ""
+			}
+			delete(l.sessions, key)
 		case <-l.die:
 			return
 		}
 	}
 }

-func (l *Listener) receiver(ch chan inPacket) {
+func (l *Listener) receiver(ch chan<- inPacket) {
 	for {
 		data := xmitBuf.Get().([]byte)[:mtuLimit]
 		if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD {
@ -830,9 +858,9 @@ func (l *Listener) Close() error {
 }

 // closeSession notify the listener that a session has closed
-func (l *Listener) closeSession(remote net.Addr) bool {
+func (l *Listener) closeSession(key string) bool {
 	select {
-	case l.chSessionClosed <- remote:
+	case l.chSessionClosed <- key:
 		return true
 	case <-l.die:
 		return false
@ -840,14 +868,10 @@ func (l *Listener) closeSession(remote net.Addr) bool {
 }

 // Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it.
-func (l *Listener) Addr() net.Addr {
-	return l.conn.LocalAddr()
-}
+func (l *Listener) Addr() net.Addr { return l.conn.LocalAddr() }

 // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp",
-func Listen(laddr string) (net.Listener, error) {
-	return ListenWithOptions(laddr, nil, 0, 0)
-}
+func Listen(laddr string) (net.Listener, error) { return ListenWithOptions(laddr, nil, 0, 0) }

 // ListenWithOptions listens for incoming KCP packets addressed to the local address laddr on the network "udp" with packet encryption,
 // dataShards, parityShards defines Reed-Solomon Erasure Coding parameters
@ -870,7 +894,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
 	l.conn = conn
 	l.sessions = make(map[string]*UDPSession)
 	l.chAccepts = make(chan *UDPSession, acceptBacklog)
-	l.chSessionClosed = make(chan net.Addr)
+	l.chSessionClosed = make(chan string)
 	l.die = make(chan struct{})
 	l.dataShards = dataShards
 	l.parityShards = parityShards
@ -890,9 +914,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
 }

 // Dial connects to the remote address "raddr" on the network "udp"
-func Dial(raddr string) (net.Conn, error) {
-	return DialWithOptions(raddr, nil, 0, 0)
-}
+func Dial(raddr string) (net.Conn, error) { return DialWithOptions(raddr, nil, 0, 0) }

 // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
 func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
@ -906,11 +928,11 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in
 		return nil, errors.Wrap(err, "net.DialUDP")
 	}

-	return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn})
+	return NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn}, true)
 }

 // NewConn establishes a session and talks KCP protocol over a packet connection.
-func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) {
+func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn, closeConn bool) (*UDPSession, error) {
 	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
 	if err != nil {
 		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
@ -918,22 +940,16 @@ func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn

 	var convid uint32
 	binary.Read(rand.Reader, binary.LittleEndian, &convid)
-	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
+	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block, closeConn), nil
 }

-func currentMs() uint32 {
-	return uint32(time.Now().UnixNano() / int64(time.Millisecond))
-}
+// returns current time in milliseconds
+func currentMs() uint32 { return uint32(time.Now().UnixNano() / int64(time.Millisecond)) }

-// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
+// connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
 // to Write syscalls that are 4 times faster on some OS'es. This should only be
 // used for connections that were produced by a net.Dial* call.
-type ConnectedUDPConn struct {
-	*net.UDPConn
-	Conn net.Conn // underlying connection if any
-}
+type connectedUDPConn struct{ *net.UDPConn }

 // WriteTo redirects all writes to the Write syscall, which is 4 times faster.
-func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	return c.Write(b)
-}
+func (c *connectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { return c.Write(b) }
--- a/vendor/github.com/AudriusButkevicius/kcp-go/snmp.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/snmp.go
--- a/vendor/github.com/AudriusButkevicius/kcp-go/updater.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/updater.go
@ -15,7 +15,6 @@ func init() {

 // entry contains a session update info
 type entry struct {
-	sid uint32
 	ts time.Time
 	s  *UDPSession
 }
@ -23,7 +22,6 @@ type entry struct {
 // a global heap managed kcp.flush() caller
 type updateHeap struct {
 	entries  []entry
-	indices  map[uint32]int
 	mu       sync.Mutex
 	chWakeUp chan struct{}
 }
@ -32,41 +30,40 @@ func (h *updateHeap) Len() int           { return len(h.entries) }
 func (h *updateHeap) Less(i, j int) bool { return h.entries[i].ts.Before(h.entries[j].ts) }
 func (h *updateHeap) Swap(i, j int) {
 	h.entries[i], h.entries[j] = h.entries[j], h.entries[i]
-	h.indices[h.entries[i].sid] = i
-	h.indices[h.entries[j].sid] = j
+	h.entries[i].s.updaterIdx = i
+	h.entries[j].s.updaterIdx = j
 }

 func (h *updateHeap) Push(x interface{}) {
 	h.entries = append(h.entries, x.(entry))
 	n := len(h.entries)
-	h.indices[h.entries[n-1].sid] = n - 1
+	h.entries[n-1].s.updaterIdx = n - 1
 }

 func (h *updateHeap) Pop() interface{} {
 	n := len(h.entries)
 	x := h.entries[n-1]
+	h.entries[n-1].s.updaterIdx = -1
 	h.entries[n-1] = entry{} // manual set nil for GC
 	h.entries = h.entries[0 : n-1]
-	delete(h.indices, x.sid)
 	return x
 }

 func (h *updateHeap) init() {
-	h.indices = make(map[uint32]int)
 	h.chWakeUp = make(chan struct{}, 1)
 }

 func (h *updateHeap) addSession(s *UDPSession) {
 	h.mu.Lock()
-	heap.Push(h, entry{s.sid, time.Now(), s})
+	heap.Push(h, entry{time.Now(), s})
 	h.mu.Unlock()
 	h.wakeup()
 }

 func (h *updateHeap) removeSession(s *UDPSession) {
 	h.mu.Lock()
-	if idx, ok := h.indices[s.sid]; ok {
-		heap.Remove(h, idx)
+	if s.updaterIdx != -1 {
+		heap.Remove(h, s.updaterIdx)
 	}
 	h.mu.Unlock()
 }
@ -99,7 +96,8 @@ func (h *updateHeap) updateTask() {
 				break
 			}
 		}
-		if h.Len() > 0 {
+
+		if hlen > 0 {
 			timer = time.After(h.entries[0].ts.Sub(now))
 		}
 		h.mu.Unlock()
--- a/vendor/github.com/AudriusButkevicius/kcp-go/xor.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/xor.go
--- a/vendor/github.com/templexxx/xor/LICENSE
+++ b/vendor/github.com/templexxx/xor/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Temple3x
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/vendor/github.com/templexxx/xor/avx2_amd64.s
+++ b/vendor/github.com/templexxx/xor/avx2_amd64.s
@ -0,0 +1,442 @@
+#include "textflag.h"
+
+// addr of mem
+#define DST BX
+#define SRC SI
+#define SRC0 TMP4
+#define SRC1 TMP5
+
+// loop args
+// num of vect
+#define VECT CX
+#define LEN DX
+// pos of matrix
+#define POS R8
+
+// tmp store
+// num of vect or ...
+#define TMP1 R9
+// pos of matrix or ...
+#define TMP2 R10
+// store addr of data/parity or ...
+#define TMP3 R11
+#define TMP4 R12
+#define TMP5 R13
+#define TMP6 R14
+
+// func bytesAVX2mini(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $31, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop32b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VMOVDQU Y0, (DST)(POS*1)
+	ADDQ    $32, POS
+	CMPQ    LEN, POS
+	JNE     loop32b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $31, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $31, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $32
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesAVX2small(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2small(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VMOVDQU 32(SRC0)(POS*1), Y1
+	VMOVDQU 64(SRC0)(POS*1), Y2
+	VMOVDQU 96(SRC0)(POS*1), Y3
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VPXOR   32(SRC1)(POS*1), Y1, Y1
+	VPXOR   64(SRC1)(POS*1), Y2, Y2
+	VPXOR   96(SRC1)(POS*1), Y3, Y3
+	VMOVDQU Y0, (DST)(POS*1)
+	VMOVDQU Y1, 32(DST)(POS*1)
+	VMOVDQU Y2, 64(DST)(POS*1)
+	VMOVDQU Y3, 96(DST)(POS*1)
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $127, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesAVX2big(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2big(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VMOVDQU 32(SRC0)(POS*1), Y1
+	VMOVDQU 64(SRC0)(POS*1), Y2
+	VMOVDQU 96(SRC0)(POS*1), Y3
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VPXOR   32(SRC1)(POS*1), Y1, Y1
+	VPXOR   64(SRC1)(POS*1), Y2, Y2
+	VPXOR   96(SRC1)(POS*1), Y3, Y3
+	LONG    $0xe77da1c4; WORD $0x0304
+	LONG    $0xe77da1c4; WORD $0x034c; BYTE $0x20
+	LONG    $0xe77da1c4; WORD $0x0354; BYTE $0x40
+	LONG    $0xe77da1c4; WORD $0x035c; BYTE $0x60
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	SFENCE
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $127, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixAVX2small(dst []byte, src [][]byte)
+TEXT ·matrixAVX2small(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	MOVQ    VECT, TMP1
+	SUBQ    $2, TMP1
+	MOVQ    $0, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y0
+	VMOVDQU 32(TMP4)(POS*1), Y1
+	VMOVDQU 64(TMP3)(POS*1), Y2
+	VMOVDQU 96(TMP4)(POS*1), Y3
+
+next_vect:
+	ADDQ    $24, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y4
+	VMOVDQU 32(TMP4)(POS*1), Y5
+	VMOVDQU 64(TMP3)(POS*1), Y6
+	VMOVDQU 96(TMP4)(POS*1), Y7
+	VPXOR   Y4, Y0, Y0
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	SUBQ    $1, TMP1
+	JGE     next_vect
+
+	VMOVDQU Y0, (DST)(POS*1)
+	VMOVDQU Y1, 32(DST)(POS*1)
+	VMOVDQU Y2, 64(DST)(POS*1)
+	VMOVDQU Y3, 96(DST)(POS*1)
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $127, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixAVX2big(dst []byte, src [][]byte)
+TEXT ·matrixAVX2big(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	MOVQ    VECT, TMP1
+	SUBQ    $2, TMP1
+	MOVQ    $0, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y0
+	VMOVDQU 32(TMP4)(POS*1), Y1
+	VMOVDQU 64(TMP3)(POS*1), Y2
+	VMOVDQU 96(TMP4)(POS*1), Y3
+
+next_vect:
+	ADDQ    $24, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y4
+	VMOVDQU 32(TMP4)(POS*1), Y5
+	VMOVDQU 64(TMP3)(POS*1), Y6
+	VMOVDQU 96(TMP4)(POS*1), Y7
+	VPXOR   Y4, Y0, Y0
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	SUBQ    $1, TMP1
+	JGE     next_vect
+
+	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ  go1.8 has
+	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
+	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
+	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $127, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+TEXT ·hasAVX2(SB), NOSPLIT, $0
+	XORQ AX, AX
+	XORQ CX, CX
+	ADDL $7, AX
+	CPUID
+	SHRQ $5, BX
+	ANDQ $1, BX
+	MOVB BX, ret+0(FP)
+	RET
--- a/vendor/github.com/templexxx/xor/nosimd.go
+++ b/vendor/github.com/templexxx/xor/nosimd.go
@ -0,0 +1,116 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package xor
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
+
+// xor the bytes in a and b. The destination is assumed to have enough space.
+func bytesNoSIMD(dst, a, b []byte, size int) {
+	if supportsUnaligned {
+		fastXORBytes(dst, a, b, size)
+	} else {
+		// TODO(hanwen): if (dst, a, b) have common alignment
+		// we could still try fastXORBytes. It is not clear
+		// how often this happens, and it's only worth it if
+		// the block encryption itself is hardware
+		// accelerated.
+		safeXORBytes(dst, a, b, size)
+	}
+}
+
+// split slice for cache-friendly
+const unitSize = 16 * 1024
+
+func matrixNoSIMD(dst []byte, src [][]byte) {
+	size := len(src[0])
+	start := 0
+	do := unitSize
+	for start < size {
+		end := start + do
+		if end <= size {
+			partNoSIMD(start, end, dst, src)
+			start = start + do
+		} else {
+			partNoSIMD(start, size, dst, src)
+			start = size
+		}
+	}
+}
+
+// split vect will improve performance with big data by reducing cache pollution
+func partNoSIMD(start, end int, dst []byte, src [][]byte) {
+	bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
+	for i := 2; i < len(src); i++ {
+		bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
+	}
+}
+
+// fastXORBytes xor in bulk. It only works on architectures that
+// support unaligned read/writes.
+func fastXORBytes(dst, a, b []byte, n int) {
+	w := n / wordSize
+	if w > 0 {
+		wordBytes := w * wordSize
+		fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
+	}
+	for i := n - n%wordSize; i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+}
+
+func safeXORBytes(dst, a, b []byte, n int) {
+	ex := n % 8
+	for i := 0; i < ex; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+
+	for i := ex; i < n; i += 8 {
+		_dst := dst[i : i+8]
+		_a := a[i : i+8]
+		_b := b[i : i+8]
+		_dst[0] = _a[0] ^ _b[0]
+		_dst[1] = _a[1] ^ _b[1]
+		_dst[2] = _a[2] ^ _b[2]
+		_dst[3] = _a[3] ^ _b[3]
+
+		_dst[4] = _a[4] ^ _b[4]
+		_dst[5] = _a[5] ^ _b[5]
+		_dst[6] = _a[6] ^ _b[6]
+		_dst[7] = _a[7] ^ _b[7]
+	}
+}
+
+// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
+// The arguments are assumed to be of equal length.
+func fastXORWords(dst, a, b []byte) {
+	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+	aw := *(*[]uintptr)(unsafe.Pointer(&a))
+	bw := *(*[]uintptr)(unsafe.Pointer(&b))
+	n := len(b) / wordSize
+	ex := n % 8
+	for i := 0; i < ex; i++ {
+		dw[i] = aw[i] ^ bw[i]
+	}
+
+	for i := ex; i < n; i += 8 {
+		_dw := dw[i : i+8]
+		_aw := aw[i : i+8]
+		_bw := bw[i : i+8]
+		_dw[0] = _aw[0] ^ _bw[0]
+		_dw[1] = _aw[1] ^ _bw[1]
+		_dw[2] = _aw[2] ^ _bw[2]
+		_dw[3] = _aw[3] ^ _bw[3]
+		_dw[4] = _aw[4] ^ _bw[4]
+		_dw[5] = _aw[5] ^ _bw[5]
+		_dw[6] = _aw[6] ^ _bw[6]
+		_dw[7] = _aw[7] ^ _bw[7]
+	}
+}
--- a/vendor/github.com/templexxx/xor/sse2_amd64.s
+++ b/vendor/github.com/templexxx/xor/sse2_amd64.s
@ -0,0 +1,574 @@
+#include "textflag.h"
+
+// addr of mem
+#define DST BX
+#define SRC SI
+#define SRC0 TMP4
+#define SRC1 TMP5
+
+// loop args
+// num of vect
+#define VECT CX
+#define LEN DX
+// pos of matrix
+#define POS R8
+
+// tmp store
+// num of vect or ...
+#define TMP1 R9
+// pos of matrix or ...
+#define TMP2 R10
+// store addr of data/parity or ...
+#define TMP3 R11
+#define TMP4 R12
+#define TMP5 R13
+#define TMP6 R14
+
+// func bytesSrc0(dst, src0, src1 []byte)
+TEXT ·xorSrc0(SB), NOSPLIT, $0
+	MOVQ  len+32(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSrc1(dst, src0, src1 []byte)
+TEXT ·xorSrc1(SB), NOSPLIT, $0
+	MOVQ  len+56(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2mini(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+
+	// MOVOU (SRC1)(POS*1), X4
+	// PXOR X4, X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2small(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2small(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVOU (SRC0)(POS*1), X0
+	MOVOU 16(SRC0)(POS*1), X1
+	MOVOU 32(SRC0)(POS*1), X2
+	MOVOU 48(SRC0)(POS*1), X3
+
+	MOVOU (SRC1)(POS*1), X4
+	MOVOU 16(SRC1)(POS*1), X5
+	MOVOU 32(SRC1)(POS*1), X6
+	MOVOU 48(SRC1)(POS*1), X7
+
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+
+	MOVOU X0, (DST)(POS*1)
+	MOVOU X1, 16(DST)(POS*1)
+	MOVOU X2, 32(DST)(POS*1)
+	MOVOU X3, 48(DST)(POS*1)
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $63, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2big(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2big(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVOU (SRC0)(POS*1), X0
+	MOVOU 16(SRC0)(POS*1), X1
+	MOVOU 32(SRC0)(POS*1), X2
+	MOVOU 48(SRC0)(POS*1), X3
+
+	MOVOU (SRC1)(POS*1), X4
+	MOVOU 16(SRC1)(POS*1), X5
+	MOVOU 32(SRC1)(POS*1), X6
+	MOVOU 48(SRC1)(POS*1), X7
+
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+
+	LONG $0xe70f4266; WORD $0x0304             // MOVNTDQ
+	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
+	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
+	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $63, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixSSE2small(dst []byte, src [][]byte)
+TEXT ·matrixSSE2small(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVQ  VECT, TMP1
+	SUBQ  $2, TMP1
+	MOVQ  $0, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X0
+	MOVOU 16(TMP4)(POS*1), X1
+	MOVOU 32(TMP3)(POS*1), X2
+	MOVOU 48(TMP4)(POS*1), X3
+
+next_vect:
+	ADDQ  $24, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X4
+	MOVOU 16(TMP4)(POS*1), X5
+	MOVOU 32(TMP3)(POS*1), X6
+	MOVOU 48(TMP4)(POS*1), X7
+	PXOR  X4, X0
+	PXOR  X5, X1
+	PXOR  X6, X2
+	PXOR  X7, X3
+	SUBQ  $1, TMP1
+	JGE   next_vect
+
+	MOVOU X0, (DST)(POS*1)
+	MOVOU X1, 16(DST)(POS*1)
+	MOVOU X2, 32(DST)(POS*1)
+	MOVOU X3, 48(DST)(POS*1)
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $63, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixSSE2big(dst []byte, src [][]byte)
+TEXT ·matrixSSE2big(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVQ  VECT, TMP1
+	SUBQ  $2, TMP1
+	MOVQ  $0, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X0
+	MOVOU 16(TMP4)(POS*1), X1
+	MOVOU 32(TMP3)(POS*1), X2
+	MOVOU 48(TMP4)(POS*1), X3
+
+next_vect:
+	ADDQ  $24, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X4
+	MOVOU 16(TMP4)(POS*1), X5
+	MOVOU 32(TMP3)(POS*1), X6
+	MOVOU 48(TMP4)(POS*1), X7
+	PXOR  X4, X0
+	PXOR  X5, X1
+	PXOR  X6, X2
+	PXOR  X7, X3
+	SUBQ  $1, TMP1
+	JGE   next_vect
+
+	LONG $0xe70f4266; WORD $0x0304
+	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
+	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
+	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $63, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+TEXT ·hasSSE2(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $26, DX
+	ANDQ $1, DX
+	MOVB DX, ret+0(FP)
+	RET
+
--- a/vendor/github.com/templexxx/xor/xor.go
+++ b/vendor/github.com/templexxx/xor/xor.go
@ -0,0 +1,49 @@
+package xor
+
+// SIMD Extensions
+const (
+	none = iota
+	avx2
+	// first introduced by Intel with the initial version of the Pentium 4 in 2001
+	// so I think we can assume all amd64 has sse2
+	sse2
+)
+
+var extension = none
+
+// Bytes : chose the shortest one as xor size
+// it's better to use it for big data ( > 64bytes )
+func Bytes(dst, src0, src1 []byte) {
+	size := len(dst)
+	if size > len(src0) {
+		size = len(src0)
+	}
+	if size > len(src1) {
+		size = len(src1)
+	}
+	xorBytes(dst, src0, src1, size)
+}
+
+// BytesSameLen : all slice's length must be equal
+// cut size branch, save time for small data
+func BytesSameLen(dst, src0, src1 []byte) {
+	xorSrc1(dst, src0, src1)
+}
+
+// BytesSrc0 : src1 >= src0, dst >= src0
+// xor src0's len bytes
+func BytesSrc0(dst, src0, src1 []byte) {
+	xorSrc0(dst, src0, src1)
+}
+
+// BytesSrc1 : src0 >= src1, dst >= src1
+// xor src1's len bytes
+func BytesSrc1(dst, src0, src1 []byte) {
+	xorSrc1(dst, src0, src1)
+}
+
+// Matrix : all slice's length must be equal && != 0
+// len(src) must >= 2
+func Matrix(dst []byte, src [][]byte) {
+	xorMatrix(dst, src)
+}
--- a/vendor/github.com/templexxx/xor/xor_amd64.go
+++ b/vendor/github.com/templexxx/xor/xor_amd64.go
@ -0,0 +1,118 @@
+package xor
+
+func init() {
+	getEXT()
+}
+
+func getEXT() {
+	if hasAVX2() {
+		extension = avx2
+	} else {
+		extension = sse2
+	}
+	return
+}
+
+func xorBytes(dst, src0, src1 []byte, size int) {
+	switch extension {
+	case avx2:
+		bytesAVX2(dst, src0, src1, size)
+	default:
+		bytesSSE2(dst, src0, src1, size)
+	}
+}
+
+// non-temporal hint store
+const nontmp = 8 * 1024
+const avx2loopsize = 128
+
+func bytesAVX2(dst, src0, src1 []byte, size int) {
+	if size < avx2loopsize {
+		bytesAVX2mini(dst, src0, src1, size)
+	} else if size >= avx2loopsize && size <= nontmp {
+		bytesAVX2small(dst, src0, src1, size)
+	} else {
+		bytesAVX2big(dst, src0, src1, size)
+	}
+}
+
+const sse2loopsize = 64
+
+func bytesSSE2(dst, src0, src1 []byte, size int) {
+	if size < sse2loopsize {
+		bytesSSE2mini(dst, src0, src1, size)
+	} else if size >= sse2loopsize && size <= nontmp {
+		bytesSSE2small(dst, src0, src1, size)
+	} else {
+		bytesSSE2big(dst, src0, src1, size)
+	}
+}
+
+func xorMatrix(dst []byte, src [][]byte) {
+	switch extension {
+	case avx2:
+		matrixAVX2(dst, src)
+	default:
+		matrixSSE2(dst, src)
+	}
+}
+
+func matrixAVX2(dst []byte, src [][]byte) {
+	size := len(dst)
+	if size > nontmp {
+		matrixAVX2big(dst, src)
+	} else {
+		matrixAVX2small(dst, src)
+	}
+}
+
+func matrixSSE2(dst []byte, src [][]byte) {
+	size := len(dst)
+	if size > nontmp {
+		matrixSSE2big(dst, src)
+	} else {
+		matrixSSE2small(dst, src)
+	}
+}
+
+//go:noescape
+func xorSrc0(dst, src0, src1 []byte)
+
+//go:noescape
+func xorSrc1(dst, src0, src1 []byte)
+
+//go:noescape
+func bytesAVX2mini(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesAVX2big(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesAVX2small(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2mini(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2small(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2big(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func matrixAVX2small(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixAVX2big(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixSSE2small(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixSSE2big(dst []byte, src [][]byte)
+
+//go:noescape
+func hasAVX2() bool
+
+//go:noescape
+func hasSSE2() bool
--- a/vendor/github.com/templexxx/xor/xor_other.go
+++ b/vendor/github.com/templexxx/xor/xor_other.go
@ -0,0 +1,19 @@
+// +build !amd64 noasm
+
+package xor
+
+func xorBytes(dst, src0, src1 []byte, size int) {
+	bytesNoSIMD(dst, src0, src1, size)
+}
+
+func xorMatrix(dst []byte, src [][]byte) {
+	matrixNoSIMD(dst, src)
+}
+
+func xorSrc0(dst, src0, src1 []byte) {
+	bytesNoSIMD(dst, src0, src1, len(src0))
+}
+
+func xorSrc1(dst, src0, src1 []byte) {
+	bytesNoSIMD(dst, src0, src1, len(src1))
+}
--- a/vendor/manifest
+++ b/vendor/manifest
@ -17,6 +17,14 @@
 			"branch": "master",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/AudriusButkevicius/kcp-go",
+			"repository": "https://github.com/AudriusButkevicius/kcp-go",
+			"vcs": "git",
+			"revision": "0ccc04f3b8a7bdf53e2d4d6d0769adbc7cb3851a",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/AudriusButkevicius/pfilter",
 			"repository": "https://github.com/AudriusButkevicius/pfilter",
@ -378,6 +386,14 @@
 			"path": "/leveldb",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/templexxx/xor",
+			"repository": "https://github.com/templexxx/xor",
+			"vcs": "git",
+			"revision": "42f9c041c330b560afb991153bf183c25444bcdc",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/thejerf/suture",
 			"repository": "https://github.com/thejerf/suture",
@ -413,14 +429,6 @@
 			"path": "/qr",
 			"notests": true
 		},
-		{
-			"importpath": "github.com/xtaci/kcp-go",
-			"repository": "https://github.com/xtaci/kcp-go",
-			"vcs": "git",
-			"revision": "0b0731ef3f184a8985edcb4ca26a4b0598c6dc1a",
-			"branch": "master",
-			"notests": true
-		},
 		{
 			"importpath": "github.com/xtaci/smux",
 			"repository": "https://github.com/xtaci/smux",