Skip to content

Commit 816d2f3

Browse files
committed
Faster search on large files
Store blobs inline in the index. This way we can mmap them in, avoiding file open costs and using the page cache. Sadly, this rules out compression except at the file system level. Also restore the original codesearch Regexp. Unlike the Go regexp, this one has a DFA implementation that seems to perform much faster on case-insensitive searches (while having spikier memory usage per regex). See [Issue 11646] for details. # Future See the comment about [regex previews], which may be a good route for even faster parsing of large files. Notably, we spend a lot of time eliminating non-matching files, so this would help. Also, consider some basic query × file caching noting lack of matches. Perhaps a bloom filter? [Issue 11646]: golang/go#11646 (comment) [regex previews]: golang/go#11646 (comment)
1 parent 3d31d62 commit 816d2f3

File tree

11 files changed

+1458
-221
lines changed

11 files changed

+1458
-221
lines changed

cmd/csindex/main.go

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/go-git/go-git/v5/plumbing"
1818
"github.com/go-git/go-git/v5/plumbing/object"
1919

20-
"sgrankin.dev/cs/blobstore"
2120
"sgrankin.dev/cs/codesearch/index"
2221
"sgrankin.dev/cs/livegrep/server/config"
2322
)
@@ -61,7 +60,6 @@ func usage() {
6160

6261
var (
6362
gitPath = flag.String("git", "data", "The path to the git repo")
64-
dataPath = flag.String("data", "data/data.db", "The path to the blob data DB")
6563
indexPath = flag.String("index", "data/csindex", "The path to the index file")
6664
)
6765

@@ -75,11 +73,6 @@ func main() {
7573
Revisions: []string{"HEAD"},
7674
}}
7775

78-
blobs, err := blobstore.Open(*dataPath)
79-
if err != nil {
80-
log.Fatal(err)
81-
}
82-
8376
repo, err := git.PlainOpen(*gitPath)
8477
if err != nil {
8578
log.Fatal(err)
@@ -128,22 +121,6 @@ func main() {
128121
ix.Add(path, r)
129122
r.Close()
130123
}
131-
{
132-
// TODO: skip blobs that are already stored.
133-
134-
r, err := f.Blob.Reader()
135-
if err != nil {
136-
log.Fatal(err)
137-
}
138-
blob, err := blobs.Create(f.Hash[:], int(f.Blob.Size))
139-
if err != nil {
140-
log.Fatal(err)
141-
}
142-
if _, err := blob.ReadFrom(r); err != nil {
143-
log.Fatal(err)
144-
}
145-
blob.Close()
146-
}
147124

148125
return nil
149126
}); err != nil {

codesearch/index/merge.go

Lines changed: 77 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,36 +2,38 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
package index
5+
/*
6+
# Merging indexes
7+
8+
To merge two indexes A and B (newer) into a combined index C:
9+
10+
Load the path list from B and determine for each path the docid ranges that it will replace in A.
11+
12+
Read A's and B's name lists together, merging them into C's name list.
13+
Discard the identified ranges from A during the merge.
14+
Also during the merge, record the mapping from A's docids to C's docids, and also the mapping from B's docids to C's docids.
15+
Both mappings can be summarized in a table like
16+
17+
10-14 map to 20-24
18+
15-24 is deleted
19+
25-34 maps to 40-49
620
7-
// Merging indexes.
8-
//
9-
// To merge two indexes A and B (newer) into a combined index C:
10-
//
11-
// Load the path list from B and determine for each path the docid ranges
12-
// that it will replace in A.
13-
//
14-
// Read A's and B's name lists together, merging them into C's name list.
15-
// Discard the identified ranges from A during the merge. Also during the merge,
16-
// record the mapping from A's docids to C's docids, and also the mapping from
17-
// B's docids to C's docids. Both mappings can be summarized in a table like
18-
//
19-
// 10-14 map to 20-24
20-
// 15-24 is deleted
21-
// 25-34 maps to 40-49
22-
//
23-
// The number of ranges will be at most the combined number of paths.
24-
// Also during the merge, write the name index to a temporary file as usual.
25-
//
26-
// Now merge the posting lists (this is why they begin with the trigram).
27-
// During the merge, translate the docid numbers to the new C docid space.
28-
// Also during the merge, write the posting list index to a temporary file as usual.
29-
//
30-
// Copy the name index and posting list index into C's index and write the trailer.
31-
// Rename C's index onto the new index.
21+
The number of ranges will be at most the combined number of paths.
22+
Also during the merge, write the name index to a temporary file as usual.
23+
24+
Now merge the posting lists (this is why they begin with the trigram).
25+
During the merge, translate the docid numbers to the new C docid space.
26+
Also during the merge, write the posting list index to a temporary file as usual.
27+
28+
Copy the name index and posting list index into C's index and write the trailer.
29+
Rename C's index onto the new index.
30+
*/
31+
32+
package index
3233

3334
import (
3435
"encoding/binary"
36+
"log"
3537
"os"
3638
"strings"
3739
)
@@ -164,6 +166,44 @@ func Merge(dst, src1, src2 string) {
164166
}
165167
nameIndexFile.writeUint32(ix3.offset())
166168

169+
// To merge blobs, create an array of slices in order of the new posting IDs, and then write them to destination.
170+
blobs := [][]byte{}
171+
new = 0
172+
mi1 = 0
173+
mi2 = 0
174+
for new < numName {
175+
var blob []byte
176+
if mi1 < len(map1) && map1[mi1].new == new {
177+
for i := map1[mi1].lo; i < map1[mi1].hi; i++ {
178+
blob = ix1.Data(i)
179+
new++
180+
}
181+
mi1++
182+
} else if mi2 < len(map2) && map2[mi2].new == new {
183+
for i := map2[mi2].lo; i < map2[mi2].hi; i++ {
184+
blob = ix2.Data(i)
185+
new++
186+
}
187+
mi2++
188+
} else {
189+
panic("merge: inconsistent index")
190+
}
191+
blobs = append(blobs, blob)
192+
}
193+
194+
blobFile := bufCreate("")
195+
var blobOffsets, blobSizes []uint32 // TODO: just make a struct
196+
for _, blob := range blobs {
197+
off := blobFile.offset()
198+
size := len(blob)
199+
_, err := blobFile.file.Write(blob)
200+
if err != nil {
201+
log.Fatalf("writing %s: %v", blobFile.name, err)
202+
}
203+
blobOffsets = append(blobOffsets, off)
204+
blobSizes = append(blobSizes, uint32(size))
205+
}
206+
167207
// Merged list of posting lists.
168208
postData := ix3.offset()
169209
var r1 postMapReader
@@ -211,6 +251,9 @@ func Merge(dst, src1, src2 string) {
211251
}
212252
}
213253

254+
blobData := ix3.offset()
255+
copyFile(ix3, blobFile)
256+
214257
// Name index
215258
nameIndex := ix3.offset()
216259
copyFile(ix3, nameIndexFile)
@@ -219,11 +262,19 @@ func Merge(dst, src1, src2 string) {
219262
postIndex := ix3.offset()
220263
copyFile(ix3, w.postIndexFile)
221264

265+
blobIndex := ix3.offset()
266+
for i, off := range blobOffsets {
267+
ix3.writeUint32(off)
268+
ix3.writeUint32(blobSizes[i])
269+
}
270+
222271
ix3.writeUint32(pathData)
223272
ix3.writeUint32(nameData)
224273
ix3.writeUint32(postData)
274+
ix3.writeUint32(blobData)
225275
ix3.writeUint32(nameIndex)
226276
ix3.writeUint32(postIndex)
277+
ix3.writeUint32(blobIndex)
227278
ix3.writeString(trailerMagic)
228279
ix3.flush()
229280

0 commit comments

Comments
 (0)