Skip to content

Commit e59be01

Browse files
author
John Mason
committed
Cache docMatchTree results for RepoIds and Meta
When constructing a complicated match tree, this cache lets zoekt reuse the result for a condition it has already seen instead of recomputing it.
1 parent 4e4a529 commit e59be01

File tree

3 files changed

+111
-5
lines changed

3 files changed

+111
-5
lines changed

index/indexdata.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,14 @@ type indexData struct {
103103

104104
// rawConfigMasks contains the encoded RawConfig for each repository
105105
rawConfigMasks []uint8
106+
107+
// Cache for docMatchTree objects
108+
docMatchTreeCache docMatchTreeCache
106109
}
107110

111+
// docMatchTreeCache is a cache for docMatchTree objects so they don't need to be recomputed
112+
// Keys are the strings built by queryMetaCacheKey and queryRepoIdsCacheKey.
// NOTE(review): this is a plain map and newMatchTree writes to it without any
// visible locking — confirm a single indexData is never queried concurrently.
type docMatchTreeCache map[string]*docMatchTree
113+
108114
type symbolData struct {
109115
// symContent stores Symbol.Sym and Symbol.Parent.
110116
// TODO we don't need to store Symbol.Sym.

index/matchtree.go

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"strings"
2424
"unicode/utf8"
2525

26+
"crypto/sha256"
27+
2628
"github.com/grafana/regexp"
2729

2830
"github.com/sourcegraph/zoekt"
@@ -971,6 +973,11 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
971973
if q == nil {
972974
return nil, fmt.Errorf("got nil (sub)query")
973975
}
976+
977+
if d.docMatchTreeCache == nil {
978+
d.docMatchTreeCache = make(docMatchTreeCache)
979+
}
980+
974981
switch s := q.(type) {
975982
case *query.Regexp:
976983
// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
@@ -1054,6 +1061,11 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
10541061
}, nil
10551062

10561063
case *query.Meta:
1064+
cacheKey := queryMetaCacheKey(s.Field, s.Value)
1065+
if cached, ok := d.docMatchTreeCache[cacheKey]; ok {
1066+
return cached, nil
1067+
}
1068+
10571069
reposWant := make([]bool, len(d.repoMetaData))
10581070
for repoIdx, r := range d.repoMetaData {
10591071
if r.Metadata != nil {
@@ -1063,7 +1075,7 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
10631075
}
10641076
}
10651077

1066-
return &docMatchTree{
1078+
mt := &docMatchTree{
10671079
reason: "Meta",
10681080
numDocs: d.numDocs(),
10691081
predicate: func(docID uint32) bool {
@@ -1073,7 +1085,9 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
10731085
}
10741086
return reposWant[repoIdx]
10751087
},
1076-
}, nil
1088+
}
1089+
d.docMatchTreeCache[cacheKey] = mt
1090+
return mt, nil
10771091

10781092
case *query.Substring:
10791093
return d.newSubstringMatchTree(s)
@@ -1201,19 +1215,27 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
12011215
}, nil
12021216

12031217
case *query.RepoIDs:
1218+
cacheKey := queryRepoIdsCacheKey(d.repoMetaData)
1219+
if cached, ok := d.docMatchTreeCache[cacheKey]; ok {
1220+
return cached, nil
1221+
}
1222+
12041223
reposWant := make([]bool, len(d.repoMetaData))
12051224
for repoIdx, r := range d.repoMetaData {
12061225
if s.Repos.Contains(r.ID) {
12071226
reposWant[repoIdx] = true
12081227
}
12091228
}
1210-
return &docMatchTree{
1229+
1230+
mt := &docMatchTree{
12111231
reason: "RepoIDs",
12121232
numDocs: d.numDocs(),
12131233
predicate: func(docID uint32) bool {
12141234
return reposWant[d.repos[docID]]
12151235
},
1216-
}, nil
1236+
}
1237+
d.docMatchTreeCache[cacheKey] = mt
1238+
return mt, nil
12171239

12181240
case *query.Repo:
12191241
reposWant := make([]bool, len(d.repoMetaData))
@@ -1435,3 +1457,18 @@ func isRegexpAll(r *syntax.Regexp) bool {
14351457
return false
14361458
}
14371459
}
1460+
1461+
// queryMetaCacheKey builds the docMatchTreeCache key for a Meta query.
//
// The key is the literal prefix "Meta:" followed by the lowercase hex SHA-256
// digest of "<field>:<pattern>", where <pattern> is value.String().
// value.String() is called unconditionally, so callers are expected to pass a
// non-nil pattern.
func queryMetaCacheKey(field string, value *regexp.Regexp) string {
	material := field + ":" + value.String()
	digest := sha256.Sum256([]byte(material))
	return fmt.Sprintf("Meta:%x", digest[:])
}
1465+
1466+
func queryRepoIdsCacheKey(repos []zoekt.Repository) string {
1467+
var b strings.Builder
1468+
for _, r := range repos {
1469+
b.WriteString(fmt.Sprint(r.ID))
1470+
b.WriteByte(',')
1471+
}
1472+
sum := sha256.Sum256([]byte(b.String()))
1473+
return fmt.Sprintf("RepoIDs:%x", sum[:])
1474+
}

index/matchtree_test.go

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,10 +372,18 @@ func TestRepoIDs(t *testing.T) {
372372
fileBranchMasks: []uint64{1, 1, 1, 1, 1, 1},
373373
repos: []uint16{0, 0, 1, 2, 3, 3},
374374
}
375-
mt, err := d.newMatchTree(&query.RepoIDs{Repos: roaring.BitmapOf(1, 3, 99)}, matchTreeOpt{})
375+
q := &query.RepoIDs{Repos: roaring.BitmapOf(1, 3, 99)}
376+
mt, err := d.newMatchTree(q, matchTreeOpt{})
376377
if err != nil {
377378
t.Fatal(err)
378379
}
380+
381+
// Check that the docMatchTree cache is populated correctly
382+
key := queryRepoIdsCacheKey(d.repoMetaData)
383+
if _, ok := d.docMatchTreeCache[key]; !ok {
384+
t.Errorf("expected docMatchTreeCache to be populated for key %q", key)
385+
}
386+
379387
want := []uint32{2, 4, 5}
380388
for i := range want {
381389
nextDoc := mt.nextDoc()
@@ -447,6 +455,12 @@ func TestMetaQueryMatchTree(t *testing.T) {
447455
t.Fatalf("failed to build matchTree: %v", err)
448456
}
449457

458+
// Check that the docMatchTree cache is populated correctly
459+
key := queryMetaCacheKey("license", regexp.MustCompile("M.T"))
460+
if _, ok := d.docMatchTreeCache[key]; !ok {
461+
t.Errorf("expected docMatchTreeCache to be populated for key %q", key)
462+
}
463+
450464
var matched []uint32
451465
for {
452466
doc := mt.nextDoc()
@@ -462,3 +476,52 @@ func TestMetaQueryMatchTree(t *testing.T) {
462476
t.Errorf("meta match failed: got %v, want %v", matched, want)
463477
}
464478
}
479+
480+
func Test_queryMetaCacheKey(t *testing.T) {
481+
cases := []struct {
482+
field string
483+
pattern string
484+
wantKey string
485+
}{
486+
// Generated via:
487+
// echo -n 'metaField:foo.*bar' | sha256sum
488+
{"metaField", "foo.*bar", "Meta:afc6e783c05767285e8657c92c6af09bd8c72d4c0cabe36614b0b2ba3b697724"},
489+
// echo -n 'metaField:foo.*baz' | sha256sum
490+
{"metaField", "foo.*baz", "Meta:7c5d6616ad2a00042e3ecb1d55cd4ef1907c5b3c232011e45a7f7ba7e8143b63"},
491+
// echo -n 'otherField:foo.*bar' | sha256sum
492+
{"otherField", "foo.*bar", "Meta:5761c1b19ae8b1c34c5933c8ddb4fe696d80918184547ad42e4953b15700f0ef"},
493+
}
494+
for _, tc := range cases {
495+
re := regexp.MustCompile(tc.pattern)
496+
key := queryMetaCacheKey(tc.field, re)
497+
if key != tc.wantKey {
498+
t.Errorf("unexpected key for field=%q pattern=%q: got %q, want %q", tc.field, tc.pattern, key, tc.wantKey)
499+
}
500+
}
501+
}
502+
503+
func Test_queryRepoIdsCacheKey(t *testing.T) {
504+
cases := []struct {
505+
repos []zoekt.Repository
506+
wantKey string
507+
}{
508+
// Generated via:
509+
// echo -n '123,456,' | sha256sum
510+
{[]zoekt.Repository{{ID: 123}, {ID: 456}}, "RepoIDs:a160b50b57496a46824c7e22f8c7047dbbec38752fa1b066d3f50d9f33baaddc"},
511+
// echo -n '456,123,' | sha256sum
512+
{[]zoekt.Repository{{ID: 456}, {ID: 123}}, "RepoIDs:1d899c857ed96d50e2ad5a9f1505a4a988a69375ec142c8bd29b1aaa545facfb"},
513+
// echo -n '123,456,789,' | sha256sum
514+
{[]zoekt.Repository{{ID: 123}, {ID: 456}, {ID: 789}}, "RepoIDs:d2c687720e021d3c3d3b8ae461451e144148d84deca4d45d40523f8501c72c39"},
515+
}
516+
for _, tc := range cases {
517+
key := queryRepoIdsCacheKey(tc.repos)
518+
if key != tc.wantKey {
519+
t.Errorf("unexpected key for repos=%v: got %q, want %q", tc.repos, key, tc.wantKey)
520+
}
521+
// Check determinism
522+
key2 := queryRepoIdsCacheKey(tc.repos)
523+
if key != key2 {
524+
t.Errorf("key not deterministic for repos=%v: %q vs %q", tc.repos, key, key2)
525+
}
526+
}
527+
}

0 commit comments

Comments
 (0)