Skip to content

Commit 95d766e

Browse files
authored
Merge pull request #243 from dbaggerman/new-bloom-filter
New bloom filter
2 parents c916c20 + d0205e5 commit 95d766e

File tree

4 files changed

+41
-16
lines changed

4 files changed

+41
-16
lines changed

processor/bloom.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package processor
2+
3+
// BloomPrime is the constant XORed with the input byte inside BloomHash.
// It is a prime number less than 256.
const BloomPrime = 251

// BloomTable holds the precomputed BloomHash of every possible byte value,
// indexed by that byte. It is populated once by init.
var BloomTable [256]uint64

// init fills BloomTable so lookups can index the table instead of
// recomputing the hash.
func init() {
	for i := range BloomTable {
		BloomTable[i] = BloomHash(byte(i))
	}
}

// BloomHash maps a byte to a 64-bit mask with between one and three bits set.
//
// Masks of several bytes can be ORed together into a single filter word; a
// byte is then possibly in the set only if every bit of its own mask is
// present in that word. The result is deterministic for a given input.
func BloomHash(b byte) uint64 {
	i := uint64(b)

	// Mix the byte with the prime; both 0 and BloomPrime itself yield k == 0.
	k := (i ^ BloomPrime) * i

	// Three overlapping 6-bit windows of k each pick a bit position in 0..63.
	k1 := k & 0x3f
	k2 := (k >> 1) & 0x3f
	k3 := (k >> 2) & 0x3f

	// The positions may coincide, in which case fewer than three bits are set.
	return uint64(1)<<k1 | uint64(1)<<k2 | uint64(1)<<k3
}

processor/processor.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,14 @@ func processLanguageFeature(name string, value Language) {
350350
stringTrie := &Trie{}
351351
tokenTrie := &Trie{}
352352

353-
complexityMask := byte(0)
354-
singleLineCommentMask := byte(0)
355-
multiLineCommentMask := byte(0)
356-
stringMask := byte(0)
357-
processMask := byte(0)
353+
var complexityMask uint64
354+
var singleLineCommentMask uint64
355+
var multiLineCommentMask uint64
356+
var stringMask uint64
357+
var processMask uint64
358358

359359
for _, v := range value.ComplexityChecks {
360-
complexityMask |= v[0]
360+
complexityMask |= BloomHash(v[0])
361361
complexityTrie.Insert(TComplexity, []byte(v))
362362
if !Complexity {
363363
tokenTrie.Insert(TComplexity, []byte(v))
@@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
368368
}
369369

370370
for _, v := range value.LineComment {
371-
singleLineCommentMask |= v[0]
371+
singleLineCommentMask |= BloomHash(v[0])
372372
slCommentTrie.Insert(TSlcomment, []byte(v))
373373
tokenTrie.Insert(TSlcomment, []byte(v))
374374
}
375375
processMask |= singleLineCommentMask
376376

377377
for _, v := range value.MultiLine {
378-
multiLineCommentMask |= v[0][0]
378+
multiLineCommentMask |= BloomHash(v[0][0])
379379
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
380380
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
381381
}
382382
processMask |= multiLineCommentMask
383383

384384
for _, v := range value.Quotes {
385-
stringMask |= v.Start[0]
385+
stringMask |= BloomHash(v.Start[0])
386386
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
387387
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
388388
}

processor/structs.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ type LanguageFeature struct {
4545
Strings *Trie
4646
Tokens *Trie
4747
Nested bool
48-
ComplexityCheckMask byte
49-
SingleLineCommentMask byte
50-
MultiLineCommentMask byte
51-
StringCheckMask byte
52-
ProcessMask byte
48+
ComplexityCheckMask uint64
49+
SingleLineCommentMask uint64
50+
MultiLineCommentMask uint64
51+
StringCheckMask uint64
52+
ProcessMask uint64
5353
Keywords []string
5454
Quotes []Quote
5555
}

processor/workers.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,9 @@ func isBinary(index int, currentByte byte) bool {
9999
return false
100100
}
101101

102-
func shouldProcess(currentByte, processBytesMask byte) bool {
103-
if currentByte&processBytesMask != currentByte {
102+
func shouldProcess(currentByte byte, processBytesMask uint64) bool {
103+
k := BloomTable[currentByte]
104+
if k&processBytesMask != k {
104105
return false
105106
}
106107
return true

0 commit comments

Comments
 (0)