lib/connections: Use our own fork of kcp (fixes #4063)

This updates kcp and switches to our own fork, which:

1. Keys sessions not just by remote address, but by remote address plus
   conversation ID (sketched below).
2. Allows connections that were passed directly to the library to be left
   open when the session is closed.
3. Resets the cached session key when a session is terminated.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4339
LGTM: calmh
Author: Audrius Butkevicius, 2017-09-02 06:04:35 +00:00 (committed by Jakob Borg)
Parent: ab132ff6fe
Commit: cbcc3ea132
19 changed files with 1551 additions and 188 deletions
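Point 1 is the heart of the sess.go changes further down: the listener now keys its session map by "remote address + conversation id" instead of remote address alone. A minimal sketch of that keying, using a simplified helper rather than the real listener internals (the key format mirrors the fmt.Sprintf("%s/%d", ...) calls in the diff):

package main

import (
	"fmt"
	"net"
)

// sessionKey combines the remote address with the KCP conversation id, so two
// conversations arriving from the same UDP endpoint no longer collide in the
// listener's session map.
func sessionKey(remote net.Addr, conv uint32) string {
	return fmt.Sprintf("%s/%d", remote.String(), conv)
}

func main() {
	addr := &net.UDPAddr{IP: net.IPv4(192, 0, 2, 1), Port: 22020}
	fmt.Println(sessionKey(addr, 7)) // 192.0.2.1:22020/7
}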


@ -11,9 +11,9 @@ import (
"net/url" "net/url"
"time" "time"
"github.com/AudriusButkevicius/kcp-go"
"github.com/syncthing/syncthing/lib/config" "github.com/syncthing/syncthing/lib/config"
"github.com/syncthing/syncthing/lib/protocol" "github.com/syncthing/syncthing/lib/protocol"
"github.com/xtaci/kcp-go"
"github.com/xtaci/smux" "github.com/xtaci/smux"
) )
@ -38,7 +38,7 @@ func (d *kcpDialer) Dial(id protocol.DeviceID, uri *url.URL) (internalConn, erro
// Try to dial via an existing listening connection // Try to dial via an existing listening connection
// giving better changes punching through NAT. // giving better changes punching through NAT.
if f := getDialingFilter(); f != nil { if f := getDialingFilter(); f != nil {
conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{})) conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}), false)
l.Debugf("dial %s using existing conn on %s", uri.String(), conn.LocalAddr()) l.Debugf("dial %s using existing conn on %s", uri.String(), conn.LocalAddr())
} else { } else {
conn, err = kcp.DialWithOptions(uri.Host, nil, 0, 0) conn, err = kcp.DialWithOptions(uri.Host, nil, 0, 0)
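The extra false argument to kcp.NewConn above is the new closeConn flag (point 2 of the commit message): the dialer hands the library a connection obtained from the packet filter, and the library must not close it. A hedged sketch of the same idea with a plain UDP socket, assuming the forked import path and the NewConn signature shown later in the sess.go diff:

package main

import (
	"log"
	"net"

	kcp "github.com/AudriusButkevicius/kcp-go"
)

func main() {
	pconn, err := net.ListenPacket("udp", ":0")
	if err != nil {
		log.Fatal(err)
	}
	defer pconn.Close() // closing stays the caller's responsibility

	// closeConn=false: sess.Close() will not close pconn, so it remains
	// usable for other traffic multiplexed over the same socket.
	sess, err := kcp.NewConn("192.0.2.1:22020", nil, 0, 0, pconn, false)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
}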


@ -14,11 +14,11 @@ import (
"sync" "sync"
"time" "time"
"github.com/AudriusButkevicius/kcp-go"
"github.com/AudriusButkevicius/pfilter" "github.com/AudriusButkevicius/pfilter"
"github.com/ccding/go-stun/stun" "github.com/ccding/go-stun/stun"
"github.com/syncthing/syncthing/lib/config" "github.com/syncthing/syncthing/lib/config"
"github.com/syncthing/syncthing/lib/nat" "github.com/syncthing/syncthing/lib/nat"
"github.com/xtaci/kcp-go"
"github.com/xtaci/smux" "github.com/xtaci/smux"
) )


@ -8,8 +8,8 @@ import (
"net" "net"
"testing" "testing"
"github.com/AudriusButkevicius/kcp-go"
"github.com/syncthing/syncthing/lib/dialer" "github.com/syncthing/syncthing/lib/dialer"
"github.com/xtaci/kcp-go"
) )
func BenchmarkRequestsRawTCP(b *testing.B) { func BenchmarkRequestsRawTCP(b *testing.B) {


@ -6,6 +6,8 @@ import (
"crypto/des" "crypto/des"
"crypto/sha1" "crypto/sha1"
"github.com/templexxx/xor"
"golang.org/x/crypto/blowfish" "golang.org/x/crypto/blowfish"
"golang.org/x/crypto/cast5" "golang.org/x/crypto/cast5"
"golang.org/x/crypto/pbkdf2" "golang.org/x/crypto/pbkdf2"
@ -218,8 +220,8 @@ func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) } func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }
func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) } func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }
type noneBlockCrypt struct{} type noneBlockCrypt struct{}
@ -239,11 +241,11 @@ func encrypt(block cipher.Block, dst, src, buf []byte) {
n := len(src) / blocksize n := len(src) / blocksize
base := 0 base := 0
for i := 0; i < n; i++ { for i := 0; i < n; i++ {
xorWords(dst[base:], src[base:], tbl) xor.BytesSrc1(dst[base:], src[base:], tbl)
block.Encrypt(tbl, dst[base:]) block.Encrypt(tbl, dst[base:])
base += blocksize base += blocksize
} }
xorBytes(dst[base:], src[base:], tbl) xor.BytesSrc0(dst[base:], src[base:], tbl)
} }
func decrypt(block cipher.Block, dst, src, buf []byte) { func decrypt(block cipher.Block, dst, src, buf []byte) {
@ -255,9 +257,9 @@ func decrypt(block cipher.Block, dst, src, buf []byte) {
base := 0 base := 0
for i := 0; i < n; i++ { for i := 0; i < n; i++ {
block.Encrypt(next, src[base:]) block.Encrypt(next, src[base:])
xorWords(dst[base:], src[base:], tbl) xor.BytesSrc1(dst[base:], src[base:], tbl)
tbl, next = next, tbl tbl, next = next, tbl
base += blocksize base += blocksize
} }
xorBytes(dst[base:], src[base:], tbl) xor.BytesSrc0(dst[base:], src[base:], tbl)
} }
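The crypt.go hunk above replaces the package-local xorBytes/xorWords helpers with the vendored github.com/templexxx/xor package (added at the bottom of this commit), which dispatches to AVX2/SSE2 implementations where available. A small sketch of the calls used here: xor.Bytes XORs up to the shortest slice, while BytesSrc0/BytesSrc1 XOR exactly len(src0)/len(src1) bytes.

package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa}
	b := []byte{0xff, 0xff, 0x55}
	dst := make([]byte, 3)

	xor.Bytes(dst, a, b)    // dst = a XOR b, up to the shortest slice
	fmt.Printf("%x\n", dst) // f00fff

	xor.BytesSrc1(dst, a, b) // XORs exactly len(b) bytes into dst
	fmt.Printf("%x\n", dst)  // f00fff
}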


@ -22,8 +22,8 @@ type (
data []byte data []byte
} }
// FECDecoder for decoding incoming packets // fecDecoder for decoding incoming packets
FECDecoder struct { fecDecoder struct {
rxlimit int // queue size limit rxlimit int // queue size limit
dataShards int dataShards int
parityShards int parityShards int
@ -39,7 +39,7 @@ type (
} }
) )
func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder { func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
if dataShards <= 0 || parityShards <= 0 { if dataShards <= 0 || parityShards <= 0 {
return nil return nil
} }
@ -47,7 +47,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
return nil return nil
} }
fec := new(FECDecoder) fec := new(fecDecoder)
fec.rxlimit = rxlimit fec.rxlimit = rxlimit
fec.dataShards = dataShards fec.dataShards = dataShards
fec.parityShards = parityShards fec.parityShards = parityShards
@ -63,7 +63,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
} }
// decodeBytes a fec packet // decodeBytes a fec packet
func (dec *FECDecoder) decodeBytes(data []byte) fecPacket { func (dec *fecDecoder) decodeBytes(data []byte) fecPacket {
var pkt fecPacket var pkt fecPacket
pkt.seqid = binary.LittleEndian.Uint32(data) pkt.seqid = binary.LittleEndian.Uint32(data)
pkt.flag = binary.LittleEndian.Uint16(data[4:]) pkt.flag = binary.LittleEndian.Uint16(data[4:])
@ -74,8 +74,8 @@ func (dec *FECDecoder) decodeBytes(data []byte) fecPacket {
return pkt return pkt
} }
// Decode a fec packet // decode a fec packet
func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) { func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
// insertion // insertion
n := len(dec.rx) - 1 n := len(dec.rx) - 1
insertIdx := 0 insertIdx := 0
@ -179,7 +179,7 @@ func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) {
} }
// free a range of fecPacket, and zero for GC recycling // free a range of fecPacket, and zero for GC recycling
func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket { func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
for i := first; i < first+n; i++ { // free for i := first; i < first+n; i++ { // free
xmitBuf.Put(q[i].data) xmitBuf.Put(q[i].data)
} }
@ -191,8 +191,8 @@ func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
} }
type ( type (
// FECEncoder for encoding outgoing packets // fecEncoder for encoding outgoing packets
FECEncoder struct { fecEncoder struct {
dataShards int dataShards int
parityShards int parityShards int
shardSize int shardSize int
@ -214,11 +214,11 @@ type (
} }
) )
func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder { func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
if dataShards <= 0 || parityShards <= 0 { if dataShards <= 0 || parityShards <= 0 {
return nil return nil
} }
fec := new(FECEncoder) fec := new(fecEncoder)
fec.dataShards = dataShards fec.dataShards = dataShards
fec.parityShards = parityShards fec.parityShards = parityShards
fec.shardSize = dataShards + parityShards fec.shardSize = dataShards + parityShards
@ -241,9 +241,9 @@ func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder {
return fec return fec
} }
// Encode the packet, output parity shards if we have enough datashards // encode the packet, output parity shards if we have enough datashards
// the content of returned parityshards will change in next Encode // the content of returned parityshards will change in next encode
func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) { func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
enc.markData(b[enc.headerOffset:]) enc.markData(b[enc.headerOffset:])
binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:]))) binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:])))
@ -290,13 +290,13 @@ func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) {
return return
} }
func (enc *FECEncoder) markData(data []byte) { func (enc *fecEncoder) markData(data []byte) {
binary.LittleEndian.PutUint32(data, enc.next) binary.LittleEndian.PutUint32(data, enc.next)
binary.LittleEndian.PutUint16(data[4:], typeData) binary.LittleEndian.PutUint16(data[4:], typeData)
enc.next++ enc.next++
} }
func (enc *FECEncoder) markFEC(data []byte) { func (enc *fecEncoder) markFEC(data []byte) {
binary.LittleEndian.PutUint32(data, enc.next) binary.LittleEndian.PutUint32(data, enc.next)
binary.LittleEndian.PutUint16(data[4:], typeFEC) binary.LittleEndian.PutUint16(data[4:], typeFEC)
enc.next = (enc.next + 1) % enc.paws enc.next = (enc.next + 1) % enc.paws
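The fec.go hunk only unexports identifiers, but it shows the wire layout the encoder and decoder agree on: a 6-byte header (little-endian uint32 sequence id plus uint16 flag written by markData/markFEC), and for data packets a uint16 length field immediately after it. A self-contained sketch of that framing; the concrete typeData/typeFEC values are assumptions, only the layout is taken from the diff:

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	typeData = 0xf1 // assumed constants; the real ones live in fec.go
	typeFEC  = 0xf2
)

func main() {
	// Mark a data packet the way fecEncoder.markData does: seqid, flag,
	// then a size field covering everything from the size field onwards.
	payload := []byte("hello")
	pkt := make([]byte, 6+2+len(payload))
	binary.LittleEndian.PutUint32(pkt, 42)                       // seqid
	binary.LittleEndian.PutUint16(pkt[4:], typeData)             // flag
	binary.LittleEndian.PutUint16(pkt[6:], uint16(len(pkt[6:]))) // sz = 7
	copy(pkt[8:], payload)

	// Decode it the way fecDecoder.decodeBytes / kcpInput do.
	seqid := binary.LittleEndian.Uint32(pkt)
	flag := binary.LittleEndian.Uint16(pkt[4:])
	sz := binary.LittleEndian.Uint16(pkt[6:])
	fmt.Println(seqid, flag == typeData) // 42 true
	fmt.Printf("%s\n", pkt[8:6+int(sz)]) // hello
}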


@ -30,8 +30,8 @@ const (
IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window
) )
// Output is a closure which captures conn and calls conn.Write // output_callback is a prototype which ought capture conn and call conn.Write
type Output func(buf []byte, size int) type output_callback func(buf []byte, size int)
/* encode 8 bits unsigned int */ /* encode 8 bits unsigned int */
func ikcp_encode8u(p []byte, c byte) []byte { func ikcp_encode8u(p []byte, c byte) []byte {
@ -91,8 +91,8 @@ func _itimediff(later, earlier uint32) int32 {
return (int32)(later - earlier) return (int32)(later - earlier)
} }
// Segment defines a KCP segment // segment defines a KCP segment
type Segment struct { type segment struct {
conv uint32 conv uint32
cmd uint8 cmd uint8
frg uint8 frg uint8
@ -108,11 +108,11 @@ type Segment struct {
} }
// encode a segment into buffer // encode a segment into buffer
func (seg *Segment) encode(ptr []byte) []byte { func (seg *segment) encode(ptr []byte) []byte {
ptr = ikcp_encode32u(ptr, seg.conv) ptr = ikcp_encode32u(ptr, seg.conv)
ptr = ikcp_encode8u(ptr, uint8(seg.cmd)) ptr = ikcp_encode8u(ptr, seg.cmd)
ptr = ikcp_encode8u(ptr, uint8(seg.frg)) ptr = ikcp_encode8u(ptr, seg.frg)
ptr = ikcp_encode16u(ptr, uint16(seg.wnd)) ptr = ikcp_encode16u(ptr, seg.wnd)
ptr = ikcp_encode32u(ptr, seg.ts) ptr = ikcp_encode32u(ptr, seg.ts)
ptr = ikcp_encode32u(ptr, seg.sn) ptr = ikcp_encode32u(ptr, seg.sn)
ptr = ikcp_encode32u(ptr, seg.una) ptr = ikcp_encode32u(ptr, seg.una)
@ -137,15 +137,15 @@ type KCP struct {
fastresend int32 fastresend int32
nocwnd, stream int32 nocwnd, stream int32
snd_queue []Segment snd_queue []segment
rcv_queue []Segment rcv_queue []segment
snd_buf []Segment snd_buf []segment
rcv_buf []Segment rcv_buf []segment
acklist []ackItem acklist []ackItem
buffer []byte buffer []byte
output Output output output_callback
} }
type ackItem struct { type ackItem struct {
@ -155,7 +155,7 @@ type ackItem struct {
// NewKCP create a new kcp control object, 'conv' must equal in two endpoint // NewKCP create a new kcp control object, 'conv' must equal in two endpoint
// from the same connection. // from the same connection.
func NewKCP(conv uint32, output Output) *KCP { func NewKCP(conv uint32, output output_callback) *KCP {
kcp := new(KCP) kcp := new(KCP)
kcp.conv = conv kcp.conv = conv
kcp.snd_wnd = IKCP_WND_SND kcp.snd_wnd = IKCP_WND_SND
@ -175,13 +175,13 @@ func NewKCP(conv uint32, output Output) *KCP {
} }
// newSegment creates a KCP segment // newSegment creates a KCP segment
func (kcp *KCP) newSegment(size int) (seg Segment) { func (kcp *KCP) newSegment(size int) (seg segment) {
seg.data = xmitBuf.Get().([]byte)[:size] seg.data = xmitBuf.Get().([]byte)[:size]
return return
} }
// delSegment recycles a KCP segment // delSegment recycles a KCP segment
func (kcp *KCP) delSegment(seg Segment) { func (kcp *KCP) delSegment(seg segment) {
xmitBuf.Put(seg.data) xmitBuf.Put(seg.data)
} }
@ -384,7 +384,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
if sn == seg.sn { if sn == seg.sn {
kcp.delSegment(*seg) kcp.delSegment(*seg)
copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:]) copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{} kcp.snd_buf[len(kcp.snd_buf)-1] = segment{}
kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1] kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
break break
} }
@ -430,7 +430,7 @@ func (kcp *KCP) ack_push(sn, ts uint32) {
kcp.acklist = append(kcp.acklist, ackItem{sn, ts}) kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
} }
func (kcp *KCP) parse_data(newseg Segment) { func (kcp *KCP) parse_data(newseg segment) {
sn := newseg.sn sn := newseg.sn
if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 || if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
_itimediff(sn, kcp.rcv_nxt) < 0 { _itimediff(sn, kcp.rcv_nxt) < 0 {
@ -458,7 +458,7 @@ func (kcp *KCP) parse_data(newseg Segment) {
if insert_idx == n+1 { if insert_idx == n+1 {
kcp.rcv_buf = append(kcp.rcv_buf, newseg) kcp.rcv_buf = append(kcp.rcv_buf, newseg)
} else { } else {
kcp.rcv_buf = append(kcp.rcv_buf, Segment{}) kcp.rcv_buf = append(kcp.rcv_buf, segment{})
copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:]) copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
kcp.rcv_buf[insert_idx] = newseg kcp.rcv_buf[insert_idx] = newseg
} }
@ -625,7 +625,7 @@ func (kcp *KCP) wnd_unused() uint16 {
// flush pending data // flush pending data
func (kcp *KCP) flush(ackOnly bool) { func (kcp *KCP) flush(ackOnly bool) {
var seg Segment var seg segment
seg.conv = kcp.conv seg.conv = kcp.conv
seg.cmd = IKCP_CMD_ACK seg.cmd = IKCP_CMD_ACK
seg.wnd = kcp.wnd_unused() seg.wnd = kcp.wnd_unused()
@ -989,10 +989,10 @@ func (kcp *KCP) WaitSnd() int {
} }
// remove front n elements from queue // remove front n elements from queue
func (kcp *KCP) remove_front(q []Segment, n int) []Segment { func (kcp *KCP) remove_front(q []segment, n int) []segment {
newn := copy(q, q[n:]) newn := copy(q, q[n:])
for i := newn; i < len(q); i++ { for i := newn; i < len(q); i++ {
q[i] = Segment{} // manual set nil for GC q[i] = segment{} // manual set nil for GC
} }
return q[:newn] return q[:newn]
} }


@ -3,6 +3,7 @@ package kcp
import ( import (
"crypto/rand" "crypto/rand"
"encoding/binary" "encoding/binary"
"fmt"
"hash/crc32" "hash/crc32"
"io" "io"
"net" "net"
@ -54,9 +55,6 @@ var (
// global packet buffer // global packet buffer
// shared among sending/receiving/FEC // shared among sending/receiving/FEC
xmitBuf sync.Pool xmitBuf sync.Pool
// monotonic session id
sid uint32
) )
func init() { func init() {
@ -68,11 +66,12 @@ func init() {
type ( type (
// UDPSession defines a KCP session implemented by UDP // UDPSession defines a KCP session implemented by UDP
UDPSession struct { UDPSession struct {
sid uint32 // session id(monotonic) updaterIdx int // record slice index in updater
conn net.PacketConn // the underlying packet connection conn net.PacketConn // the underlying packet connection
kcp *KCP // KCP ARQ protocol closeConn bool // Should we close the underlying conn once UDPSession is closed.
l *Listener // point to the Listener if it's accepted by Listener kcp *KCP // KCP ARQ protocol
block BlockCrypt // block encryption l *Listener // point to the Listener if it's accepted by Listener
block BlockCrypt // block encryption
// kcp receiving is based on packets // kcp receiving is based on packets
// recvbuf turns packets into stream // recvbuf turns packets into stream
@ -82,22 +81,23 @@ type (
ext []byte ext []byte
// FEC // FEC
fecDecoder *FECDecoder fecDecoder *fecDecoder
fecEncoder *FECEncoder fecEncoder *fecEncoder
// settings // settings
remote net.Addr // remote peer address remote net.Addr // remote peer address
rd time.Time // read deadline rd time.Time // read deadline
wd time.Time // write deadline wd time.Time // write deadline
headerSize int // the overall header size added before KCP frame headerSize int // the overall header size added before KCP frame
updateInterval time.Duration // interval in seconds to call kcp.flush() ackNoDelay bool // send ack immediately for each incoming packet
ackNoDelay bool // send ack immediately for each incoming packet writeDelay bool // delay kcp.flush() for Write() for bulk transfer
writeDelay bool // delay kcp.flush() for Write() for bulk transfer dup int // duplicate udp packets
// notifications // notifications
die chan struct{} // notify session has Closed die chan struct{} // notify session has Closed
chReadEvent chan struct{} // notify Read() can be called without blocking chReadEvent chan struct{} // notify Read() can be called without blocking
chWriteEvent chan struct{} // notify Write() can be called without blocking chWriteEvent chan struct{} // notify Write() can be called without blocking
chErrorEvent chan error // notify Read() have an error
isClosed bool // flag the session has Closed isClosed bool // flag the session has Closed
mu sync.Mutex mu sync.Mutex
@ -113,14 +113,15 @@ type (
) )
// newUDPSession create a new udp session for client or server // newUDPSession create a new udp session for client or server
func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession { func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt, closeConn bool) *UDPSession {
sess := new(UDPSession) sess := new(UDPSession)
sess.sid = atomic.AddUint32(&sid, 1)
sess.die = make(chan struct{}) sess.die = make(chan struct{})
sess.chReadEvent = make(chan struct{}, 1) sess.chReadEvent = make(chan struct{}, 1)
sess.chWriteEvent = make(chan struct{}, 1) sess.chWriteEvent = make(chan struct{}, 1)
sess.chErrorEvent = make(chan error, 1)
sess.remote = remote sess.remote = remote
sess.conn = conn sess.conn = conn
sess.closeConn = closeConn
sess.l = l sess.l = l
sess.block = block sess.block = block
sess.recvbuf = make([]byte, mtuLimit) sess.recvbuf = make([]byte, mtuLimit)
@ -232,6 +233,11 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
case <-s.chReadEvent: case <-s.chReadEvent:
case <-c: case <-c:
case <-s.die: case <-s.die:
case err = <-s.chErrorEvent:
if timeout != nil {
timeout.Stop()
}
return n, err
} }
if timeout != nil { if timeout != nil {
@ -299,9 +305,11 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
// Close closes the connection. // Close closes the connection.
func (s *UDPSession) Close() error { func (s *UDPSession) Close() error {
// remove this session from updater & listener(if necessary)
updater.removeSession(s) updater.removeSession(s)
if s.l != nil { // notify listener if s.l != nil { // notify listener
s.l.closeSession(s.remote) key := fmt.Sprintf("%s/%d", s.remote.String(), s.kcp.conv)
s.l.closeSession(key)
} }
s.mu.Lock() s.mu.Lock()
@ -312,7 +320,7 @@ func (s *UDPSession) Close() error {
close(s.die) close(s.die)
s.isClosed = true s.isClosed = true
atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0)) atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0))
if s.l == nil { // client socket close if s.l == nil && s.closeConn { // client socket close
return s.conn.Close() return s.conn.Close()
} }
return nil return nil
@ -393,12 +401,19 @@ func (s *UDPSession) SetACKNoDelay(nodelay bool) {
s.ackNoDelay = nodelay s.ackNoDelay = nodelay
} }
// SetDUP duplicates udp packets for kcp output, for testing purpose only
func (s *UDPSession) SetDUP(dup int) {
s.mu.Lock()
defer s.mu.Unlock()
s.dup = dup
}
// SetNoDelay calls nodelay() of kcp // SetNoDelay calls nodelay() of kcp
// https://github.com/skywind3000/kcp/blob/master/README.en.md#protocol-configuration
func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) { func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
s.kcp.NoDelay(nodelay, interval, resend, nc) s.kcp.NoDelay(nodelay, interval, resend, nc)
s.updateInterval = time.Duration(interval) * time.Millisecond
} }
// SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
@ -406,8 +421,8 @@ func (s *UDPSession) SetDSCP(dscp int) error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
if s.l == nil { if s.l == nil {
if nc, ok := s.conn.(*ConnectedUDPConn); ok { if nc, ok := s.conn.(*connectedUDPConn); ok {
return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2) return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2)
} else if nc, ok := s.conn.(net.Conn); ok { } else if nc, ok := s.conn.(net.Conn); ok {
return ipv4.NewConn(nc).SetTOS(dscp << 2) return ipv4.NewConn(nc).SetTOS(dscp << 2)
} }
@ -449,51 +464,47 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
func (s *UDPSession) output(buf []byte) { func (s *UDPSession) output(buf []byte) {
var ecc [][]byte var ecc [][]byte
// extend buf's header space // 0. extend buf's header space(if necessary)
ext := buf ext := buf
if s.headerSize > 0 { if s.headerSize > 0 {
ext = s.ext[:s.headerSize+len(buf)] ext = s.ext[:s.headerSize+len(buf)]
copy(ext[s.headerSize:], buf) copy(ext[s.headerSize:], buf)
} }
// FEC stage // 1. FEC encoding
if s.fecEncoder != nil { if s.fecEncoder != nil {
ecc = s.fecEncoder.Encode(ext) ecc = s.fecEncoder.encode(ext)
} }
// encryption stage // 2&3. crc32 & encryption
if s.block != nil { if s.block != nil {
io.ReadFull(rand.Reader, ext[:nonceSize]) io.ReadFull(rand.Reader, ext[:nonceSize])
checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:]) checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:])
binary.LittleEndian.PutUint32(ext[nonceSize:], checksum) binary.LittleEndian.PutUint32(ext[nonceSize:], checksum)
s.block.Encrypt(ext, ext) s.block.Encrypt(ext, ext)
if ecc != nil { for k := range ecc {
for k := range ecc { io.ReadFull(rand.Reader, ecc[k][:nonceSize])
io.ReadFull(rand.Reader, ecc[k][:nonceSize]) checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:]) binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum)
binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum) s.block.Encrypt(ecc[k], ecc[k])
s.block.Encrypt(ecc[k], ecc[k])
}
} }
} }
// WriteTo kernel // 4. WriteTo kernel
nbytes := 0 nbytes := 0
npkts := 0 npkts := 0
// if mrand.Intn(100) < 50 { for i := 0; i < s.dup+1; i++ {
if n, err := s.conn.WriteTo(ext, s.remote); err == nil { if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
nbytes += n nbytes += n
npkts++ npkts++
}
} }
// }
if ecc != nil { for k := range ecc {
for k := range ecc { if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil { nbytes += n
nbytes += n npkts++
npkts++
}
} }
} }
atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts)) atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts))
@ -507,15 +518,13 @@ func (s *UDPSession) update() (interval time.Duration) {
if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) { if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
s.notifyWriteEvent() s.notifyWriteEvent()
} }
interval = s.updateInterval interval = time.Duration(s.kcp.interval) * time.Millisecond
s.mu.Unlock() s.mu.Unlock()
return return
} }
// GetConv gets conversation id of a session // GetConv gets conversation id of a session
func (s *UDPSession) GetConv() uint32 { func (s *UDPSession) GetConv() uint32 { return s.kcp.conv }
return s.kcp.conv
}
func (s *UDPSession) notifyReadEvent() { func (s *UDPSession) notifyReadEvent() {
select { select {
@ -548,22 +557,21 @@ func (s *UDPSession) kcpInput(data []byte) {
fecParityShards++ fecParityShards++
} }
if recovers := s.fecDecoder.Decode(f); recovers != nil { recovers := s.fecDecoder.decode(f)
for _, r := range recovers { for _, r := range recovers {
if len(r) >= 2 { // must be larger than 2bytes if len(r) >= 2 { // must be larger than 2bytes
sz := binary.LittleEndian.Uint16(r) sz := binary.LittleEndian.Uint16(r)
if int(sz) <= len(r) && sz >= 2 { if int(sz) <= len(r) && sz >= 2 {
if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 { if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
fecRecovered++ fecRecovered++
} else {
kcpInErrors++
}
} else { } else {
fecErrs++ kcpInErrors++
} }
} else { } else {
fecErrs++ fecErrs++
} }
} else {
fecErrs++
} }
} }
} }
@ -601,7 +609,7 @@ func (s *UDPSession) kcpInput(data []byte) {
} }
} }
func (s *UDPSession) receiver(ch chan []byte) { func (s *UDPSession) receiver(ch chan<- []byte) {
for { for {
data := xmitBuf.Get().([]byte)[:mtuLimit] data := xmitBuf.Get().([]byte)[:mtuLimit]
if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD { if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
@ -611,6 +619,7 @@ func (s *UDPSession) receiver(ch chan []byte) {
return return
} }
} else if err != nil { } else if err != nil {
s.chErrorEvent <- err
return return
} else { } else {
atomic.AddUint64(&DefaultSnmp.InErrs, 1) atomic.AddUint64(&DefaultSnmp.InErrs, 1)
@ -658,12 +667,12 @@ type (
block BlockCrypt // block encryption block BlockCrypt // block encryption
dataShards int // FEC data shard dataShards int // FEC data shard
parityShards int // FEC parity shard parityShards int // FEC parity shard
fecDecoder *FECDecoder // FEC mock initialization fecDecoder *fecDecoder // FEC mock initialization
conn net.PacketConn // the underlying packet connection conn net.PacketConn // the underlying packet connection
sessions map[string]*UDPSession // all sessions accepted by this Listener sessions map[string]*UDPSession // all sessions accepted by this Listener
chAccepts chan *UDPSession // Listen() backlog chAccepts chan *UDPSession // Listen() backlog
chSessionClosed chan net.Addr // session close queue chSessionClosed chan string // session close queue
headerSize int // the overall header size added before KCP frame headerSize int // the overall header size added before KCP frame
die chan struct{} // notify the listener has closed die chan struct{} // notify the listener has closed
rd atomic.Value // read deadline for Accept() rd atomic.Value // read deadline for Accept()
@ -679,6 +688,10 @@ type (
// monitor incoming data for all connections of server // monitor incoming data for all connections of server
func (l *Listener) monitor() { func (l *Listener) monitor() {
// cache last session
var lastKey string
var lastSession *UDPSession
chPacket := make(chan inPacket, qlen) chPacket := make(chan inPacket, qlen)
go l.receiver(chPacket) go l.receiver(chPacket)
for { for {
@ -703,45 +716,60 @@ func (l *Listener) monitor() {
} }
if dataValid { if dataValid {
addr := from.String() var conv uint32
s, ok := l.sessions[addr] convValid := false
if !ok { // new session if l.fecDecoder != nil {
if len(l.chAccepts) < cap(l.chAccepts) { // do not let new session overwhelm accept queue isfec := binary.LittleEndian.Uint16(data[4:])
var conv uint32 if isfec == typeData {
convValid := false conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
if l.fecDecoder != nil { convValid = true
isfec := binary.LittleEndian.Uint16(data[4:])
if isfec == typeData {
conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
convValid = true
}
} else {
conv = binary.LittleEndian.Uint32(data)
convValid = true
}
if convValid {
s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
s.kcpInput(data)
l.sessions[addr] = s
l.chAccepts <- s
}
} }
} else { } else {
s.kcpInput(data) conv = binary.LittleEndian.Uint32(data)
convValid = true
}
if convValid {
addr := from.String()
key := fmt.Sprintf("%s/%d", addr, conv)
var s *UDPSession
var ok bool
// packets received from an address always come in batch.
// cache the session for next packet, without querying map.
if key == lastKey {
s, ok = lastSession, true
} else if s, ok = l.sessions[key]; ok {
lastSession = s
lastKey = addr
}
if !ok { // new session
if len(l.chAccepts) < cap(l.chAccepts) && len(l.sessions) < 4096 { // do not let new session overwhelm accept queue and connection count
s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block, false)
s.kcpInput(data)
l.sessions[key] = s
l.chAccepts <- s
}
} else {
s.kcpInput(data)
}
} }
} }
xmitBuf.Put(raw) xmitBuf.Put(raw)
case deadlink := <-l.chSessionClosed: case key := <-l.chSessionClosed:
delete(l.sessions, deadlink.String()) if key == lastKey {
lastKey = ""
}
delete(l.sessions, key)
case <-l.die: case <-l.die:
return return
} }
} }
} }
func (l *Listener) receiver(ch chan inPacket) { func (l *Listener) receiver(ch chan<- inPacket) {
for { for {
data := xmitBuf.Get().([]byte)[:mtuLimit] data := xmitBuf.Get().([]byte)[:mtuLimit]
if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD { if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD {
@ -830,9 +858,9 @@ func (l *Listener) Close() error {
} }
// closeSession notify the listener that a session has closed // closeSession notify the listener that a session has closed
func (l *Listener) closeSession(remote net.Addr) bool { func (l *Listener) closeSession(key string) bool {
select { select {
case l.chSessionClosed <- remote: case l.chSessionClosed <- key:
return true return true
case <-l.die: case <-l.die:
return false return false
@ -840,14 +868,10 @@ func (l *Listener) closeSession(remote net.Addr) bool {
} }
// Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it. // Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it.
func (l *Listener) Addr() net.Addr { func (l *Listener) Addr() net.Addr { return l.conn.LocalAddr() }
return l.conn.LocalAddr()
}
// Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp", // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp",
func Listen(laddr string) (net.Listener, error) { func Listen(laddr string) (net.Listener, error) { return ListenWithOptions(laddr, nil, 0, 0) }
return ListenWithOptions(laddr, nil, 0, 0)
}
// ListenWithOptions listens for incoming KCP packets addressed to the local address laddr on the network "udp" with packet encryption, // ListenWithOptions listens for incoming KCP packets addressed to the local address laddr on the network "udp" with packet encryption,
// dataShards, parityShards defines Reed-Solomon Erasure Coding parameters // dataShards, parityShards defines Reed-Solomon Erasure Coding parameters
@ -870,7 +894,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
l.conn = conn l.conn = conn
l.sessions = make(map[string]*UDPSession) l.sessions = make(map[string]*UDPSession)
l.chAccepts = make(chan *UDPSession, acceptBacklog) l.chAccepts = make(chan *UDPSession, acceptBacklog)
l.chSessionClosed = make(chan net.Addr) l.chSessionClosed = make(chan string)
l.die = make(chan struct{}) l.die = make(chan struct{})
l.dataShards = dataShards l.dataShards = dataShards
l.parityShards = parityShards l.parityShards = parityShards
@ -890,9 +914,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
} }
// Dial connects to the remote address "raddr" on the network "udp" // Dial connects to the remote address "raddr" on the network "udp"
func Dial(raddr string) (net.Conn, error) { func Dial(raddr string) (net.Conn, error) { return DialWithOptions(raddr, nil, 0, 0) }
return DialWithOptions(raddr, nil, 0, 0)
}
// DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) { func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
@ -906,11 +928,11 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in
return nil, errors.Wrap(err, "net.DialUDP") return nil, errors.Wrap(err, "net.DialUDP")
} }
return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn}) return NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn}, true)
} }
// NewConn establishes a session and talks KCP protocol over a packet connection. // NewConn establishes a session and talks KCP protocol over a packet connection.
func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) { func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn, closeConn bool) (*UDPSession, error) {
udpaddr, err := net.ResolveUDPAddr("udp", raddr) udpaddr, err := net.ResolveUDPAddr("udp", raddr)
if err != nil { if err != nil {
return nil, errors.Wrap(err, "net.ResolveUDPAddr") return nil, errors.Wrap(err, "net.ResolveUDPAddr")
@ -918,22 +940,16 @@ func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn
var convid uint32 var convid uint32
binary.Read(rand.Reader, binary.LittleEndian, &convid) binary.Read(rand.Reader, binary.LittleEndian, &convid)
return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block, closeConn), nil
} }
func currentMs() uint32 { // returns current time in milliseconds
return uint32(time.Now().UnixNano() / int64(time.Millisecond)) func currentMs() uint32 { return uint32(time.Now().UnixNano() / int64(time.Millisecond)) }
}
// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls // connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
// to Write syscalls that are 4 times faster on some OS'es. This should only be // to Write syscalls that are 4 times faster on some OS'es. This should only be
// used for connections that were produced by a net.Dial* call. // used for connections that were produced by a net.Dial* call.
type ConnectedUDPConn struct { type connectedUDPConn struct{ *net.UDPConn }
*net.UDPConn
Conn net.Conn // underlying connection if any
}
// WriteTo redirects all writes to the Write syscall, which is 4 times faster. // WriteTo redirects all writes to the Write syscall, which is 4 times faster.
func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { func (c *connectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { return c.Write(b) }
return c.Write(b)
}
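Taken together, the sess.go changes above give the listener per-conversation sessions, a bounded session table, an error channel for Read, and the closeConn behaviour used by the dialer. A minimal end-to-end sketch against the forked API (nil block cipher, no FEC shards), built from the constructors shown in this diff; Accept/Read/Write come from the standard net interfaces these types implement, and addresses and buffer sizes are only illustrative:

package main

import (
	"log"

	kcp "github.com/AudriusButkevicius/kcp-go"
)

func main() {
	ln, err := kcp.ListenWithOptions("127.0.0.1:12345", nil, 0, 0)
	if err != nil {
		log.Fatal(err)
	}
	go func() {
		conn, err := ln.Accept() // sessions are keyed by addr+conv internally
		if err != nil {
			return
		}
		buf := make([]byte, 16)
		n, _ := conn.Read(buf)
		conn.Write(buf[:n]) // echo
	}()

	sess, err := kcp.DialWithOptions("127.0.0.1:12345", nil, 0, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()

	sess.Write([]byte("ping"))
	buf := make([]byte, 16)
	n, _ := sess.Read(buf)
	log.Printf("echoed back: %q", buf[:n])
}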


@ -15,15 +15,13 @@ func init() {
// entry contains a session update info // entry contains a session update info
type entry struct { type entry struct {
sid uint32 ts time.Time
ts time.Time s *UDPSession
s *UDPSession
} }
// a global heap managed kcp.flush() caller // a global heap managed kcp.flush() caller
type updateHeap struct { type updateHeap struct {
entries []entry entries []entry
indices map[uint32]int
mu sync.Mutex mu sync.Mutex
chWakeUp chan struct{} chWakeUp chan struct{}
} }
@ -32,41 +30,40 @@ func (h *updateHeap) Len() int { return len(h.entries) }
func (h *updateHeap) Less(i, j int) bool { return h.entries[i].ts.Before(h.entries[j].ts) } func (h *updateHeap) Less(i, j int) bool { return h.entries[i].ts.Before(h.entries[j].ts) }
func (h *updateHeap) Swap(i, j int) { func (h *updateHeap) Swap(i, j int) {
h.entries[i], h.entries[j] = h.entries[j], h.entries[i] h.entries[i], h.entries[j] = h.entries[j], h.entries[i]
h.indices[h.entries[i].sid] = i h.entries[i].s.updaterIdx = i
h.indices[h.entries[j].sid] = j h.entries[j].s.updaterIdx = j
} }
func (h *updateHeap) Push(x interface{}) { func (h *updateHeap) Push(x interface{}) {
h.entries = append(h.entries, x.(entry)) h.entries = append(h.entries, x.(entry))
n := len(h.entries) n := len(h.entries)
h.indices[h.entries[n-1].sid] = n - 1 h.entries[n-1].s.updaterIdx = n - 1
} }
func (h *updateHeap) Pop() interface{} { func (h *updateHeap) Pop() interface{} {
n := len(h.entries) n := len(h.entries)
x := h.entries[n-1] x := h.entries[n-1]
h.entries[n-1].s.updaterIdx = -1
h.entries[n-1] = entry{} // manual set nil for GC h.entries[n-1] = entry{} // manual set nil for GC
h.entries = h.entries[0 : n-1] h.entries = h.entries[0 : n-1]
delete(h.indices, x.sid)
return x return x
} }
func (h *updateHeap) init() { func (h *updateHeap) init() {
h.indices = make(map[uint32]int)
h.chWakeUp = make(chan struct{}, 1) h.chWakeUp = make(chan struct{}, 1)
} }
func (h *updateHeap) addSession(s *UDPSession) { func (h *updateHeap) addSession(s *UDPSession) {
h.mu.Lock() h.mu.Lock()
heap.Push(h, entry{s.sid, time.Now(), s}) heap.Push(h, entry{time.Now(), s})
h.mu.Unlock() h.mu.Unlock()
h.wakeup() h.wakeup()
} }
func (h *updateHeap) removeSession(s *UDPSession) { func (h *updateHeap) removeSession(s *UDPSession) {
h.mu.Lock() h.mu.Lock()
if idx, ok := h.indices[s.sid]; ok { if s.updaterIdx != -1 {
heap.Remove(h, idx) heap.Remove(h, s.updaterIdx)
} }
h.mu.Unlock() h.mu.Unlock()
} }
@ -99,7 +96,8 @@ func (h *updateHeap) updateTask() {
break break
} }
} }
if h.Len() > 0 {
if hlen > 0 {
timer = time.After(h.entries[0].ts.Sub(now)) timer = time.After(h.entries[0].ts.Sub(now))
} }
h.mu.Unlock() h.mu.Unlock()
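The updater change above drops the sid-to-index map and instead stores each session's heap position directly on the session (updaterIdx), kept current in Swap/Push/Pop so that removal needs no lookup. A self-contained sketch of the same container/heap pattern with simplified stand-in types:

package main

import (
	"container/heap"
	"fmt"
)

type item struct {
	ts  int // what the heap orders by (time.Time in the real code)
	idx int // maintained by Swap/Push/Pop, like updaterIdx
}

type byTS []*item

func (h byTS) Len() int           { return len(h) }
func (h byTS) Less(i, j int) bool { return h[i].ts < h[j].ts }
func (h byTS) Swap(i, j int) {
	h[i], h[j] = h[j], h[i]
	h[i].idx = i
	h[j].idx = j
}

func (h *byTS) Push(x interface{}) {
	it := x.(*item)
	it.idx = len(*h)
	*h = append(*h, it)
}

func (h *byTS) Pop() interface{} {
	old := *h
	n := len(old)
	it := old[n-1]
	it.idx = -1 // mirrors setting updaterIdx = -1 in the diff
	*h = old[:n-1]
	return it
}

func main() {
	h := &byTS{}
	a, b := &item{ts: 3}, &item{ts: 1}
	heap.Push(h, a)
	heap.Push(h, b)
	heap.Remove(h, a.idx)               // no map lookup needed
	fmt.Println(heap.Pop(h).(*item).ts) // prints 1
}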

vendor/github.com/templexxx/xor/LICENSE (new vendored file, 21 lines)

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 Temple3x
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

vendor/github.com/templexxx/xor/avx2_amd64.s (new vendored file, 442 lines)

@ -0,0 +1,442 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $31, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop32b:
VMOVDQU (SRC0)(POS*1), Y0
VPXOR (SRC1)(POS*1), Y0, Y0
VMOVDQU Y0, (DST)(POS*1)
ADDQ $32, POS
CMPQ LEN, POS
JNE loop32b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $31, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $31, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $32
JGE aligned
RET
ret:
RET
// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
LONG $0xe77da1c4; WORD $0x0304
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
SFENCE
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ go1.8 has
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
TEXT ·hasAVX2(SB), NOSPLIT, $0
XORQ AX, AX
XORQ CX, CX
ADDL $7, AX
CPUID
SHRQ $5, BX
ANDQ $1, BX
MOVB BX, ret+0(FP)
RET

vendor/github.com/templexxx/xor/nosimd.go (new vendored file, 116 lines)

@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package xor
import (
"runtime"
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
// xor the bytes in a and b. The destination is assumed to have enough space.
func bytesNoSIMD(dst, a, b []byte, size int) {
if supportsUnaligned {
fastXORBytes(dst, a, b, size)
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastXORBytes. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeXORBytes(dst, a, b, size)
}
}
// split slice for cache-friendly
const unitSize = 16 * 1024
func matrixNoSIMD(dst []byte, src [][]byte) {
size := len(src[0])
start := 0
do := unitSize
for start < size {
end := start + do
if end <= size {
partNoSIMD(start, end, dst, src)
start = start + do
} else {
partNoSIMD(start, size, dst, src)
start = size
}
}
}
// split vect will improve performance with big data by reducing cache pollution
func partNoSIMD(start, end int, dst []byte, src [][]byte) {
bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
for i := 2; i < len(src); i++ {
bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
}
}
// fastXORBytes xor in bulk. It only works on architectures that
// support unaligned read/writes.
func fastXORBytes(dst, a, b []byte, n int) {
w := n / wordSize
if w > 0 {
wordBytes := w * wordSize
fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
}
for i := n - n%wordSize; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
func safeXORBytes(dst, a, b []byte, n int) {
ex := n % 8
for i := 0; i < ex; i++ {
dst[i] = a[i] ^ b[i]
}
for i := ex; i < n; i += 8 {
_dst := dst[i : i+8]
_a := a[i : i+8]
_b := b[i : i+8]
_dst[0] = _a[0] ^ _b[0]
_dst[1] = _a[1] ^ _b[1]
_dst[2] = _a[2] ^ _b[2]
_dst[3] = _a[3] ^ _b[3]
_dst[4] = _a[4] ^ _b[4]
_dst[5] = _a[5] ^ _b[5]
_dst[6] = _a[6] ^ _b[6]
_dst[7] = _a[7] ^ _b[7]
}
}
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
// The arguments are assumed to be of equal length.
func fastXORWords(dst, a, b []byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
ex := n % 8
for i := 0; i < ex; i++ {
dw[i] = aw[i] ^ bw[i]
}
for i := ex; i < n; i += 8 {
_dw := dw[i : i+8]
_aw := aw[i : i+8]
_bw := bw[i : i+8]
_dw[0] = _aw[0] ^ _bw[0]
_dw[1] = _aw[1] ^ _bw[1]
_dw[2] = _aw[2] ^ _bw[2]
_dw[3] = _aw[3] ^ _bw[3]
_dw[4] = _aw[4] ^ _bw[4]
_dw[5] = _aw[5] ^ _bw[5]
_dw[6] = _aw[6] ^ _bw[6]
_dw[7] = _aw[7] ^ _bw[7]
}
}

vendor/github.com/templexxx/xor/sse2_amd64.s (new vendored file, 574 lines)

@ -0,0 +1,574 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
MOVQ len+32(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
MOVQ len+56(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
// MOVOU (SRC1)(POS*1), X4
// PXOR X4, X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe70f4266; WORD $0x0304
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
TEXT ·hasSSE2(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX
CPUID
SHRQ $26, DX
ANDQ $1, DX
MOVB DX, ret+0(FP)
RET

vendor/github.com/templexxx/xor/xor.go (new vendored file, 49 lines)

@ -0,0 +1,49 @@
package xor
// SIMD Extensions
const (
none = iota
avx2
// first introduced by Intel with the initial version of the Pentium 4 in 2001
// so I think we can assume all amd64 has sse2
sse2
)
var extension = none
// Bytes : chose the shortest one as xor size
// it's better to use it for big data ( > 64bytes )
func Bytes(dst, src0, src1 []byte) {
size := len(dst)
if size > len(src0) {
size = len(src0)
}
if size > len(src1) {
size = len(src1)
}
xorBytes(dst, src0, src1, size)
}
// BytesSameLen : all slice's length must be equal
// cut size branch, save time for small data
func BytesSameLen(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// BytesSrc0 : src1 >= src0, dst >= src0
// xor src0's len bytes
func BytesSrc0(dst, src0, src1 []byte) {
xorSrc0(dst, src0, src1)
}
// BytesSrc1 : src0 >= src1, dst >= src1
// xor src1's len bytes
func BytesSrc1(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// Matrix : all slice's length must be equal && != 0
// len(src) must >= 2
func Matrix(dst []byte, src [][]byte) {
xorMatrix(dst, src)
}

vendor/github.com/templexxx/xor/xor_amd64.go (new vendored file, 118 lines)

@ -0,0 +1,118 @@
package xor
func init() {
getEXT()
}
func getEXT() {
if hasAVX2() {
extension = avx2
} else {
extension = sse2
}
return
}
func xorBytes(dst, src0, src1 []byte, size int) {
switch extension {
case avx2:
bytesAVX2(dst, src0, src1, size)
default:
bytesSSE2(dst, src0, src1, size)
}
}
// non-temporal hint store
const nontmp = 8 * 1024
const avx2loopsize = 128
func bytesAVX2(dst, src0, src1 []byte, size int) {
if size < avx2loopsize {
bytesAVX2mini(dst, src0, src1, size)
} else if size >= avx2loopsize && size <= nontmp {
bytesAVX2small(dst, src0, src1, size)
} else {
bytesAVX2big(dst, src0, src1, size)
}
}
const sse2loopsize = 64
func bytesSSE2(dst, src0, src1 []byte, size int) {
if size < sse2loopsize {
bytesSSE2mini(dst, src0, src1, size)
} else if size >= sse2loopsize && size <= nontmp {
bytesSSE2small(dst, src0, src1, size)
} else {
bytesSSE2big(dst, src0, src1, size)
}
}
func xorMatrix(dst []byte, src [][]byte) {
switch extension {
case avx2:
matrixAVX2(dst, src)
default:
matrixSSE2(dst, src)
}
}
func matrixAVX2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixAVX2big(dst, src)
} else {
matrixAVX2small(dst, src)
}
}
func matrixSSE2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixSSE2big(dst, src)
} else {
matrixSSE2small(dst, src)
}
}
//go:noescape
func xorSrc0(dst, src0, src1 []byte)
//go:noescape
func xorSrc1(dst, src0, src1 []byte)
//go:noescape
func bytesAVX2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2big(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2big(dst, src0, src1 []byte, size int)
//go:noescape
func matrixAVX2small(dst []byte, src [][]byte)
//go:noescape
func matrixAVX2big(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2small(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2big(dst []byte, src [][]byte)
//go:noescape
func hasAVX2() bool
//go:noescape
func hasSSE2() bool

vendor/github.com/templexxx/xor/xor_other.go (new vendored file, 19 lines)

@ -0,0 +1,19 @@
// +build !amd64 noasm
package xor
func xorBytes(dst, src0, src1 []byte, size int) {
bytesNoSIMD(dst, src0, src1, size)
}
func xorMatrix(dst []byte, src [][]byte) {
matrixNoSIMD(dst, src)
}
func xorSrc0(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src0))
}
func xorSrc1(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src1))
}

vendor/manifest (vendored, 24 lines changed)

@ -17,6 +17,14 @@
"branch": "master", "branch": "master",
"notests": true "notests": true
}, },
{
"importpath": "github.com/AudriusButkevicius/kcp-go",
"repository": "https://github.com/AudriusButkevicius/kcp-go",
"vcs": "git",
"revision": "0ccc04f3b8a7bdf53e2d4d6d0769adbc7cb3851a",
"branch": "master",
"notests": true
},
{ {
"importpath": "github.com/AudriusButkevicius/pfilter", "importpath": "github.com/AudriusButkevicius/pfilter",
"repository": "https://github.com/AudriusButkevicius/pfilter", "repository": "https://github.com/AudriusButkevicius/pfilter",
@ -378,6 +386,14 @@
"path": "/leveldb", "path": "/leveldb",
"notests": true "notests": true
}, },
{
"importpath": "github.com/templexxx/xor",
"repository": "https://github.com/templexxx/xor",
"vcs": "git",
"revision": "42f9c041c330b560afb991153bf183c25444bcdc",
"branch": "master",
"notests": true
},
{ {
"importpath": "github.com/thejerf/suture", "importpath": "github.com/thejerf/suture",
"repository": "https://github.com/thejerf/suture", "repository": "https://github.com/thejerf/suture",
@ -413,14 +429,6 @@
"path": "/qr", "path": "/qr",
"notests": true "notests": true
}, },
{
"importpath": "github.com/xtaci/kcp-go",
"repository": "https://github.com/xtaci/kcp-go",
"vcs": "git",
"revision": "0b0731ef3f184a8985edcb4ca26a4b0598c6dc1a",
"branch": "master",
"notests": true
},
{ {
"importpath": "github.com/xtaci/smux", "importpath": "github.com/xtaci/smux",
"repository": "https://github.com/xtaci/smux", "repository": "https://github.com/xtaci/smux",