lib/db: Use SipHash to deal with hash collision in GC (#6826)

If the GC finds a key k that it wants to keep, it records that in a
Bloom filter. If a key k' can be removed but its hash collides with that
of k, k' will be kept as well. Since the old Bloom filter code was
completely deterministic, the next run would encounter the same collision
and keep k' again, for as long as k itself must still be kept.

A randomized hash function that uses all the SHA-256 bits solves this
problem: each filter is keyed with fresh random bits, so the next run has
a non-zero probability of removing k', as long as the Bloom filter is not
completely full.
This commit is contained in:
greatroar 2020-07-11 09:36:09 +02:00 committed by GitHub
parent bbda58a29f
commit 9f92f8c609
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 40 additions and 16 deletions

1
go.mod
View File

@ -11,6 +11,7 @@ require (
github.com/certifi/gocertifi v0.0.0-20190905060710-a5e0173ced67 // indirect github.com/certifi/gocertifi v0.0.0-20190905060710-a5e0173ced67 // indirect
github.com/chmduquesne/rollinghash v0.0.0-20180912150627-a60f8e7142b5 github.com/chmduquesne/rollinghash v0.0.0-20180912150627-a60f8e7142b5
github.com/d4l3k/messagediff v1.2.1 github.com/d4l3k/messagediff v1.2.1
github.com/dchest/siphash v1.2.1
github.com/dgraph-io/badger/v2 v2.0.3 github.com/dgraph-io/badger/v2 v2.0.3
github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568 github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568
github.com/getsentry/raven-go v0.2.0 github.com/getsentry/raven-go v0.2.0

2
go.sum
View File

@ -70,6 +70,8 @@ github.com/d4l3k/messagediff v1.2.1/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkE
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dchest/siphash v1.2.1 h1:4cLinnzVJDKxTCl9B01807Yiy+W7ZzVHj/KIroQRvT4=
github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
github.com/dgraph-io/badger v1.6.1 h1:w9pSFNSdq/JPM1N12Fz/F/bzo993Is1W+Q7HjPzi7yg= github.com/dgraph-io/badger v1.6.1 h1:w9pSFNSdq/JPM1N12Fz/F/bzo993Is1W+Q7HjPzi7yg=
github.com/dgraph-io/badger/v2 v2.0.3 h1:inzdf6VF/NZ+tJ8RwwYMjJMvsOALTHYdozn0qSl6XJI= github.com/dgraph-io/badger/v2 v2.0.3 h1:inzdf6VF/NZ+tJ8RwwYMjJMvsOALTHYdozn0qSl6XJI=
github.com/dgraph-io/badger/v2 v2.0.3/go.mod h1:3KY8+bsP8wI0OEnQJAKpd4wIJW/Mm32yw2j/9FUVnIM= github.com/dgraph-io/badger/v2 v2.0.3/go.mod h1:3KY8+bsP8wI0OEnQJAKpd4wIJW/Mm32yw2j/9FUVnIM=

View File

@ -10,11 +10,14 @@ import (
"bytes" "bytes"
"context" "context"
"encoding/binary" "encoding/binary"
"io"
"time" "time"
"github.com/dchest/siphash"
"github.com/greatroar/blobloom" "github.com/greatroar/blobloom"
"github.com/syncthing/syncthing/lib/db/backend" "github.com/syncthing/syncthing/lib/db/backend"
"github.com/syncthing/syncthing/lib/protocol" "github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/rand"
"github.com/syncthing/syncthing/lib/sha256" "github.com/syncthing/syncthing/lib/sha256"
"github.com/syncthing/syncthing/lib/sync" "github.com/syncthing/syncthing/lib/sync"
"github.com/syncthing/syncthing/lib/util" "github.com/syncthing/syncthing/lib/util"
@ -679,10 +682,10 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
return err return err
} }
if len(hashes.BlocksHash) > 0 { if len(hashes.BlocksHash) > 0 {
blockFilter.Add(bloomHash(hashes.BlocksHash)) blockFilter.add(hashes.BlocksHash)
} }
if len(hashes.VersionHash) > 0 { if len(hashes.VersionHash) > 0 {
versionFilter.Add(bloomHash(hashes.VersionHash)) versionFilter.add(hashes.VersionHash)
} }
} }
it.Release() it.Release()
@ -707,7 +710,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
} }
key := blockListKey(it.Key()) key := blockListKey(it.Key())
if blockFilter.Has(bloomHash(key.Hash())) { if blockFilter.has(key.Hash()) {
matchedBlocks++ matchedBlocks++
continue continue
} }
@ -736,7 +739,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
} }
key := versionKey(it.Key()) key := versionKey(it.Key())
if versionFilter.Has(bloomHash(key.Hash())) { if versionFilter.has(key.Hash()) {
matchedVersions++ matchedVersions++
continue continue
} }
@ -762,21 +765,39 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
return db.Compact() return db.Compact()
} }
func newBloomFilter(capacity int) *blobloom.Filter { func newBloomFilter(capacity int) bloomFilter {
return blobloom.NewOptimized(blobloom.Config{ var buf [16]byte
Capacity: uint64(capacity), io.ReadFull(rand.Reader, buf[:])
FPRate: indirectGCBloomFalsePositiveRate,
MaxBits: 8 * indirectGCBloomMaxBytes, return bloomFilter{
}) f: blobloom.NewOptimized(blobloom.Config{
Capacity: uint64(capacity),
FPRate: indirectGCBloomFalsePositiveRate,
MaxBits: 8 * indirectGCBloomMaxBytes,
}),
k0: binary.LittleEndian.Uint64(buf[:8]),
k1: binary.LittleEndian.Uint64(buf[8:]),
}
} }
// Hash function for the bloomfilter: first eight bytes of the SHA-256. type bloomFilter struct {
// Big or little-endian makes no difference, as long as we're consistent. f *blobloom.Filter
func bloomHash(key []byte) uint64 { k0, k1 uint64 // Random key for SipHash.
if len(key) != sha256.Size { }
panic("bug: bloomHash passed something not a SHA256 hash")
func (b *bloomFilter) add(id []byte) { b.f.Add(b.hash(id)) }
func (b *bloomFilter) has(id []byte) bool { return b.f.Has(b.hash(id)) }
// Hash function for the bloomfilter: SipHash of the SHA-256.
//
// The randomization in SipHash means we get different collisions across
// runs and colliding keys are not kept indefinitely.
func (b *bloomFilter) hash(id []byte) uint64 {
if len(id) != sha256.Size {
panic("bug: bloomFilter.hash passed something not a SHA256 hash")
} }
return binary.BigEndian.Uint64(key) return siphash.Hash(b.k0, b.k1, id)
} }
// CheckRepair checks folder metadata and sequences for miscellaneous errors. // CheckRepair checks folder metadata and sequences for miscellaneous errors.