diff --git a/arraycontainer.go b/arraycontainer.go index 9541fd53..901138ae 100644 --- a/arraycontainer.go +++ b/arraycontainer.go @@ -875,6 +875,10 @@ func (ac *arrayContainer) clone() container { return &ptr } +func (ac *arrayContainer) clear() { + ac.content = ac.content[:0] +} + func (ac *arrayContainer) contains(x uint16) bool { return binarySearch(ac.content, x) >= 0 } diff --git a/benchmark_test.go b/benchmark_test.go index 55df2bcf..b92613b5 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -3,16 +3,16 @@ package roaring import ( "bytes" "fmt" - "github.com/stretchr/testify/require" "math/rand" "testing" + "github.com/stretchr/testify/require" + "github.com/bits-and-blooms/bitset" ) // BENCHMARKS, to run them type "go test -bench Benchmark -run -" - // go test -bench BenchmarkIteratorAlloc -benchmem -run - func BenchmarkIteratorAlloc(b *testing.B) { bm := NewBitmap() @@ -84,7 +84,6 @@ func BenchmarkIteratorAlloc(b *testing.B) { b.Fatalf("Cardinalities don't match: %d, %d", counter, expected_cardinality) } - b.Run("many iteration with alloc", func(b *testing.B) { for n := 0; n < b.N; n++ { counter = 0 @@ -117,7 +116,6 @@ func BenchmarkIteratorAlloc(b *testing.B) { } } - // go test -bench BenchmarkOrs -benchmem -run - func BenchmarkOrs(b *testing.B) { @@ -1134,3 +1132,21 @@ func BenchmarkAndAny(b *testing.B) { runSet("small-filters", genOne(r, largeSize, domain), genMulti(r, filtersNum, smallSize, domain)) runSet("equal", genOne(r, defaultSize, domain), genMulti(r, filtersNum, defaultSize, domain)) } + +func BenchmarkRepeatedSparseSerialization(b *testing.B) { + var ( + l = NewBitmap() + buf = bytes.NewBuffer(nil) + ) + for i := 0; i < b.N; i++ { + l.ClearRetainStructures() + for j := 0; j < 16; j++ { + l.Add(uint32(j)) + } + buf.Reset() + _, err := l.WriteTo(buf) + if err != nil { + panic(err) + } + } +} diff --git a/bitmapcontainer.go b/bitmapcontainer.go index 71029f4f..c252312c 100644 --- a/bitmapcontainer.go +++ b/bitmapcontainer.go @@ -360,6 +360,13 @@ func (bc *bitmapContainer) clone() container { return &ptr } +func (bc *bitmapContainer) clear() { + for i := range bc.bitmap { + bc.bitmap[i] = 0 + } + bc.cardinality = 0 +} + // add all values in range [firstOfRange,lastOfRange) func (bc *bitmapContainer) iaddRange(firstOfRange, lastOfRange int) container { bc.cardinality += setBitmapRangeAndCardinalityChange(bc.bitmap, firstOfRange, lastOfRange) diff --git a/parallel.go b/parallel.go index 9208e3e3..38693c8d 100644 --- a/parallel.go +++ b/parallel.go @@ -166,7 +166,9 @@ func appenderRoutine(bitmapChan chan<- *Bitmap, resultChan <-chan keyedContainer make([]container, 0, expectedKeys), make([]bool, 0, expectedKeys), false, + nil, }, + nil, } for i := range keys { if containers[i] != nil { // in case a resulting container was empty, see ParAnd function @@ -440,6 +442,7 @@ func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { keys: make([]uint16, containerCount), needCopyOnWrite: make([]bool, containerCount), }, + nil, } resultOffset := 0 diff --git a/prop_test.go b/prop_test.go new file mode 100644 index 00000000..9346a09d --- /dev/null +++ b/prop_test.go @@ -0,0 +1,216 @@ +package roaring + +import ( + "fmt" + "math/rand" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPropertyRepeatedSerializationWithClearRetainStructures(t *testing.T) { + var ( + // Make test deterministic. + rand = rand.New(rand.NewSource(0)) + reusable = NewBitmap() + ) + testFn := func(t *testing.T) { + roaring1, roaring2, reference1, reference2 := genPropTestInputs(rand) + + reusable.ClearRetainStructures() + roaring1.Iterate(func(x uint32) bool { + reusable.Add(x) + return true + }) + assertRoaringEqualsReference(t, roundTripRoaring(t, reusable), reference1) + + reusable.ClearRetainStructures() + roaring2.Iterate(func(x uint32) bool { + reusable.Add(x) + return true + }) + assertRoaringEqualsReference(t, roundTripRoaring(t, reusable), reference2) + } + + for i := 0; i < 1000; i++ { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + testFn(t) + }) + } +} + +func TestPropertyOr(t *testing.T) { + var ( + // Make test deterministic. + rand = rand.New(rand.NewSource(0)) + reusable = NewBitmap() + ) + testFn := func(t *testing.T) { + roaring1, roaring2, reference1, reference2 := genPropTestInputs(rand) + + reusable.ClearRetainStructures() + reusable.Or(roaring1) + reusable.Or(roaring2) + roaring1.Or(roaring2) + reference1.Or(reference2) + + assertRoaringEqualsReference(t, reusable, reference1) + assertRoaringEqualsReference(t, roaring1, reference1) + } + + for i := 0; i < 1000; i++ { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + testFn(t) + }) + } +} + +func TestPropertyAnd(t *testing.T) { + var ( + // Make test deterministic. + rand = rand.New(rand.NewSource(0)) + reusable = NewBitmap() + ) + testFn := func(t *testing.T) { + roaring1, roaring2, reference1, reference2 := genPropTestInputs(rand) + + reusable.ClearRetainStructures() + reusable.And(roaring1) + reusable.And(roaring2) + roaring1.And(roaring2) + reference1.And(reference2) + + assertRoaringEqualsReference(t, reusable, reference1) + assertRoaringEqualsReference(t, roaring1, reference1) + } + + for i := 0; i < 100; i++ { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + testFn(t) + }) + } +} + +func genPropTestInputs(rand *rand.Rand) (*Bitmap, *Bitmap, *reference, *reference) { + var ( + aSize = rand.Intn(1000) + bSize = rand.Intn(1000) + aValues = make([]uint32, 0, aSize) + bValues = make([]uint32, 0, bSize) + ) + for j := 0; j < aSize; j++ { + aValues = append(aValues, rand.Uint32()) + } + for j := 0; j < bSize; j++ { + bValues = append(bValues, rand.Uint32()) + } + + var ( + roaring1 = New() + roaring2 = New() + + reference1 = newReference() + reference2 = newReference() + ) + for _, v := range aValues { + if rand.Intn(20) == 0 { + rangeStart := rand.Uint32() + roaring1.AddRange(uint64(rangeStart), uint64(rangeStart+100)) + reference1.AddRange(uint64(rangeStart), uint64(rangeStart+100)) + continue + } + + roaring1.Add(v) + reference1.Add(v) + } + + for _, v := range bValues { + if rand.Intn(20) == 0 { + rangeStart := rand.Uint32() + roaring2.AddRange(uint64(rangeStart), uint64(rangeStart+100)) + reference2.AddRange(uint64(rangeStart), uint64(rangeStart+100)) + continue + } + + roaring2.Add(v) + reference2.Add(v) + } + + return roaring1, roaring2, reference1, reference2 +} + +// reference is a reference implementation that can be used in property tests +// to assert the correctness of the actual roaring implementation. +type reference struct { + m map[uint32]struct{} +} + +func newReference() *reference { + return &reference{ + m: make(map[uint32]struct{}), + } +} + +func (r *reference) Add(x uint32) { + r.m[x] = struct{}{} +} + +func (r *reference) AddRange(start, end uint64) { + for i := start; i < end; i++ { + r.m[uint32(i)] = struct{}{} + } +} + +func (r *reference) Contains(x uint32) bool { + _, ok := r.m[x] + return ok +} + +func (r *reference) Cardinality() uint64 { + return uint64(len(r.m)) +} + +func (r *reference) Or(other *reference) { + for v := range other.m { + r.m[v] = struct{}{} + } +} + +func (r *reference) And(other *reference) { + newM := map[uint32]struct{}{} + for v := range other.m { + if _, ok := r.m[v]; ok { + newM[v] = struct{}{} + } + } + r.m = newM +} + +func assertRoaringEqualsReference( + t *testing.T, + roaring *Bitmap, + reference *reference, +) { + // round-trip the roaring bitmap to ensure our property still holds + // true after a round of ser/der. + rounedTrippedRoaring := roundTripRoaring(t, roaring) + require.Equal(t, reference.Cardinality(), rounedTrippedRoaring.Stats().Cardinality) + roaring.Iterate(func(x uint32) bool { + require.True(t, reference.Contains(x)) + return true + }) +} + +func roundTripRoaring(t *testing.T, b *Bitmap) *Bitmap { + b.RunOptimize() + + marshaled, err := b.ToBytes() + require.NoError(t, err) + + unmarshaled := New() + p, err := unmarshaled.FromBuffer(marshaled) + require.NoError(t, err) + require.Equal(t, int64(len(marshaled)), p) + + return unmarshaled +} diff --git a/roaring.go b/roaring.go index 7220da27..1b59ce35 100644 --- a/roaring.go +++ b/roaring.go @@ -18,6 +18,16 @@ import ( // Bitmap represents a compressed bitmap where you can add integers. type Bitmap struct { highlowcontainer roaringArray + + // Used to cache reusable array containers. Usually empty and will be drained + // by a call to Clear(). Will be filled with any existing array containers by + // a call to ClearRetainStructures(). If len(arrayContainerPool) is > 0 + // then new array containers will be created by removing one from this cache + // instead of allocation. + // + // TODO: There is an obvious opportunity to extend this to bitmap containers + // as well, but leaving that out of the first implementation. + arrayContainerPool []container } // ToBase64 serializes a bitmap as Base64 @@ -63,7 +73,7 @@ func (rb *Bitmap) ToBytes() ([]byte, error) { func (rb *Bitmap) Checksum() uint64 { const ( offset = 14695981039346656037 - prime = 1099511628211 + prime = 1099511628211 ) var bytes []byte @@ -192,6 +202,24 @@ func New() *Bitmap { // some memory allocations that may speed up future operations func (rb *Bitmap) Clear() { rb.highlowcontainer.clear() + for i := range rb.arrayContainerPool { + rb.arrayContainerPool[i] = nil + } + rb.arrayContainerPool = rb.arrayContainerPool[:0] +} + +// ClearRetainStructures is the same as Clear(), but it is much more +// aggressive in how it will preserve existing datastructures, and it +// will also return any allocated array containers to the(bitmap-local) +// pool for subsequent reuse. +func (rb *Bitmap) ClearRetainStructures() { + for _, c := range rb.highlowcontainer.containers { + if c.containerType() == arrayContype { + c.clear() + rb.arrayContainerPool = append(rb.arrayContainerPool, c) + } + } + rb.highlowcontainer.clearRetainStructures() } // ToArray creates a new slice containing all of the integers stored in the Bitmap in sorted order @@ -276,9 +304,9 @@ type intIterator struct { // This way, instead of making up-to 64k allocations per full iteration // we get a single allocation and simply reinitialize the appropriate // iterator and point to it in the generic `iter` member on each key bound. - shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerShortIterator + shortIter shortIterator + runIter runIterator16 + bitmapIter bitmapContainerShortIterator } // HasNext returns true if there are more integers to iterate over @@ -341,7 +369,6 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) { // IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) type IntIterator = intIterator - // Initialize configures the existing iterator so that it can iterate through the values of // the provided bitmap. // The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). @@ -357,9 +384,9 @@ type intReverseIterator struct { iter shortIterable highlowcontainer *roaringArray - shortIter reverseIterator - runIter runReverseIterator16 - bitmapIter reverseBitmapContainerShortIterator + shortIter reverseIterator + runIter runReverseIterator16 + bitmapIter reverseBitmapContainerShortIterator } // HasNext returns true if there are more integers to iterate over @@ -434,9 +461,9 @@ type manyIntIterator struct { iter manyIterable highlowcontainer *roaringArray - shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerManyIterator + shortIter shortIterator + runIter runIterator16 + bitmapIter bitmapContainerManyIterator } func (ii *manyIntIterator) init() { @@ -495,7 +522,6 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int { return n } - // ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) type ManyIntIterator = manyIntIterator @@ -569,7 +595,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) { // Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order; // the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). func (rb *Bitmap) Iterator() IntPeekable { - p := new(intIterator) + p := new(intIterator) p.Initialize(rb) return p } @@ -716,11 +742,10 @@ func (rb *Bitmap) Add(x uint32) { ra := &rb.highlowcontainer i := ra.getIndex(hb) if i >= 0 { - var c container - c = ra.getWritableContainerAtIndex(i).iaddReturnMinimized(lowbits(x)) + c := ra.getWritableContainerAtIndex(i).iaddReturnMinimized(lowbits(x)) rb.highlowcontainer.setContainerAtIndex(i, c) } else { - newac := newArrayContainer() + newac := rb.getNewArrayContainer() rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) } } @@ -736,7 +761,7 @@ func (rb *Bitmap) addwithptr(x uint32) (int, container) { rb.highlowcontainer.setContainerAtIndex(i, c) return i, c } - newac := newArrayContainer() + newac := rb.getNewArrayContainer() c = newac.iaddReturnMinimized(lowbits(x)) rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, c) return -i - 1, c @@ -754,7 +779,7 @@ func (rb *Bitmap) CheckedAdd(x uint32) bool { rb.highlowcontainer.setContainerAtIndex(i, C) return C.getCardinality() > oldcard } - newac := newArrayContainer() + newac := rb.getNewArrayContainer() rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) return true @@ -1713,3 +1738,21 @@ func (rb *Bitmap) Stats() Statistics { } return stats } + +// getNewArrayContainer checks the arrayContainerPool for an existing array +// container, and if it doesn't find one it just allocates a new one. +func (rb *Bitmap) getNewArrayContainer() container { + if len(rb.arrayContainerPool) > 0 { + // Take the last item out of the pool. + newac := rb.arrayContainerPool[len(rb.arrayContainerPool)-1] + // Nil out the reference in the pool before resizing it so that it can be + // reclaimed by the G.C later if necessary (otherwise even if it is never + // returned to the pool, the G.C will be able to reach it via the underlying + // capacity of the slice even if it has been resized). + rb.arrayContainerPool[len(rb.arrayContainerPool)-1] = nil + rb.arrayContainerPool = rb.arrayContainerPool[:len(rb.arrayContainerPool)-1] + return newac + } else { + return newArrayContainer() + } +} diff --git a/roaringarray.go b/roaringarray.go index eeb3d313..43c2c598 100644 --- a/roaringarray.go +++ b/roaringarray.go @@ -4,8 +4,9 @@ import ( "bytes" "encoding/binary" "fmt" - "github.com/RoaringBitmap/roaring/internal" "io" + + "github.com/RoaringBitmap/roaring/internal" ) type container interface { @@ -15,6 +16,7 @@ type container interface { addOffset(uint16) (container, container) clone() container + clear() and(container) container andCardinality(container) int iand(container) container // i stands for inplace @@ -103,6 +105,11 @@ type roaringArray struct { containers []container `msg:"-"` // don't try to serialize directly. needCopyOnWrite []bool copyOnWrite bool + + // Used to buffer data in writeTo() calls. Will be reset to nil after a call + // to clear(), or retained (but resized to 0) after a call to + // clearRetainStructures(). + serializationBuf []byte } func newRoaringArray() *roaringArray { @@ -234,6 +241,13 @@ func (ra *roaringArray) resize(newsize int) { func (ra *roaringArray) clear() { ra.resize(0) ra.copyOnWrite = false + ra.serializationBuf = nil +} + +func (ra *roaringArray) clearRetainStructures() { + ra.resize(0) + ra.copyOnWrite = false + ra.serializationBuf = ra.serializationBuf[:0] } func (ra *roaringArray) clone() *roaringArray { @@ -469,19 +483,33 @@ func (ra *roaringArray) serializedSizeInBytes() uint64 { // spec: https://github.com/RoaringBitmap/RoaringFormatSpec // func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) { - hasRun := ra.hasRunCompression() - isRunSizeInBytes := 0 - cookieSize := 8 + var ( + hasRun = ra.hasRunCompression() + isRunSizeInBytes = 0 + cookieSize = 8 + ) if hasRun { cookieSize = 4 isRunSizeInBytes = (len(ra.keys) + 7) / 8 } - descriptiveHeaderSize := 4 * len(ra.keys) - preambleSize := cookieSize + isRunSizeInBytes + descriptiveHeaderSize - - buf := make([]byte, preambleSize+4*len(ra.keys)) - nw := 0 + var ( + descriptiveHeaderSize = 4 * len(ra.keys) + preambleSize = cookieSize + isRunSizeInBytes + descriptiveHeaderSize + buf []byte + bufSizeRequired = preambleSize + 4*len(ra.keys) + nw = 0 + ) + if cap(ra.serializationBuf) < bufSizeRequired { + buf = make([]byte, bufSizeRequired) + ra.serializationBuf = buf // Capture for next time. + } else { + buf = ra.serializationBuf[:bufSizeRequired] + for i := range buf { + // Memclear just to be safe. + buf[i] = 0 + } + } if hasRun { binary.LittleEndian.PutUint16(buf[0:], uint16(serialCookie)) diff --git a/runcontainer.go b/runcontainer.go index 4ce48a29..6890fe38 100644 --- a/runcontainer.go +++ b/runcontainer.go @@ -1008,6 +1008,10 @@ func (rc *runContainer16) Clone() *runContainer16 { return rc2 } +func (rc *runContainer16) clear() { + rc.iv = rc.iv[:0] +} + // newRunContainer16TakeOwnership returns a new runContainer16 // backed by the provided iv slice, which we will // assume exclusive control over from now on.