Skip to content

Commit 8fbabe1

Browse files
committed
fix *ay* bug, add pattern-addition benchmark
1 parent 763a227 commit 8fbabe1

File tree

8 files changed

+151
-40
lines changed

8 files changed

+151
-40
lines changed

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,15 @@ The following patterns would match it:
6464
```json
6565
{"Foo": [ { "exists": false } ] }
6666
```
67-
67+
```json
68+
{
69+
"Image": {
70+
"Thumbnail": {
71+
"Url": [ { "shellstyle": "*.example.com/*" } ]
72+
}
73+
}
74+
}
75+
```
6876
The structure of the pattern, in terms of field names
6977
and nesting, must be the same as the structure of the event
7078
to be matched. The field values are always given
@@ -142,7 +150,7 @@ The performance of `MatchesForJSONEvent` is strongly
142150
sublinear in the number of patterns. It’s not quite `O(1)`
143151
as it varies somewhat as a function of the number of
144152
unique fields that appear in all the patterns that have
145-
been added to the machine, but remains sublinear in that
153+
been added to the matcher, but remains sublinear in that
146154
number.
147155

148156
A word of explanation is in order. Quamina compiles the

lib/benchmarks_test.go

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ func TestBigShellStyle(t *testing.T) {
253253
}
254254
elapsed := float64(time.Now().Sub(before).Milliseconds())
255255
perSecond := float64(lineCount) / (elapsed / 1000.0)
256-
fmt.Printf("%.2f matches/second with letter rules\n\n", perSecond)
256+
fmt.Printf("%.2f matches/second with letter patterns\n\n", perSecond)
257257

258258
for k, wc := range wanted {
259259
if lCounts[k] != wc {
@@ -266,3 +266,64 @@ func TestBigShellStyle(t *testing.T) {
266266
}
267267
}
268268
}
269+
270+
func TestPatternAddition(t *testing.T) {
271+
w := worder{0, readWWords(t)}
272+
273+
// now we're going to add 10K patterns.
274+
m := NewMatcher()
275+
before := time.Now()
276+
fieldCount := 0
277+
for x1 := 0; x1 < 10; x1++ {
278+
for x2 := 0; x2 < 20; x2++ {
279+
pat := fmt.Sprintf(`{"%s": { "%s": [ "%s"`, w.next(), w.next(), w.next())
280+
for x3 := 0; x3 < 99; x3++ {
281+
pat = pat + fmt.Sprintf(`, "%s"`, w.next())
282+
}
283+
fieldCount += 100
284+
pat = pat + `] } }`
285+
pName := string(w.next()) + string(w.next())
286+
err := m.AddPattern(pName, pat)
287+
if err != nil {
288+
t.Error("addPattern " + err.Error())
289+
}
290+
}
291+
}
292+
fmt.Println("stats:" + matcherStats(m))
293+
elapsed := float64(time.Now().Sub(before).Milliseconds())
294+
perSecond := float64(fieldCount) / (elapsed / 1000.0)
295+
fmt.Printf("%.2f fields/second\n\n", perSecond)
296+
}
297+
298+
type worder struct {
299+
index int
300+
lines [][]byte
301+
}
302+
303+
func (w *worder) next() []byte {
304+
w.index += 761 // relatively prime with the number of lines
305+
w.index = w.index % len(w.lines)
306+
return w.lines[w.index]
307+
}
308+
309+
func readWWords(t *testing.T) [][]byte {
310+
// that's a list from the Wordle source code with a few erased to get a prime number
311+
file, err := os.Open("../test_data/wwords.txt")
312+
if err != nil {
313+
t.Error("Can't open file: " + err.Error())
314+
}
315+
defer func(file *os.File) {
316+
_ = file.Close()
317+
}(file)
318+
scanner := bufio.NewScanner(file)
319+
buf := make([]byte, oneMeg)
320+
scanner.Buffer(buf, oneMeg)
321+
322+
lineCount := 0
323+
var lines [][]byte
324+
for scanner.Scan() {
325+
lineCount++
326+
lines = append(lines, []byte(scanner.Text()))
327+
}
328+
return lines
329+
}

lib/concurrency_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func updateTree(m *Matcher, use37 bool, t *testing.T, ch chan string) {
2929
func TestConcurrency(t *testing.T) {
3030
const UpdateLines = 250
3131

32-
// this is a cut/paste of TestCityLots, except for every few lines we add another rule to the matcher,
32+
// this is a cut/paste of TestCityLots, except for every few lines we add another pattern to the matcher,
3333
// focusing on the fields that are being used by the patterns. The idea is to exercise concurrent
3434
// update and use of the automaton
3535
// I was initially surprised that adding 860 or so changes to the automaton while it's running doesn't seem to

lib/shell_style.go

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -45,49 +45,54 @@ func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []type
4545
return
4646
}
4747

48-
// makeShellStyleAutomaton - recognize a "-delimited string containing one or more '*' globs.
49-
// TODO: Add “?”
50-
func makeShellStyleAutomaton(val []byte, useThisTansition *fieldMatcher) (start *smallTable, nextField *fieldMatcher) {
48+
// makeShellStyleAutomaton - recognize a "-delimited string containing one or more '*' globs.
49+
// This isn't quite as simple as you'd think. Consider matching "*abc". When you see an 'a' you move to a state
50+
// where you're looking for 'b'. So if it's not a 'b' you go back to the '*' state. But suppose you see "xaabc";
51+
// when you're in that looking-for-'b' state and you see that second 'a', you don't go back to the '*' state, you
52+
// have to stay in the looking-for-'b' state because you have seen the 'a'. Similarly, when you see 'xabac', when
53+
// you're looking for 'c' and you see the 'a', once again, you have to go to the looking-for-'b' state. Let's
54+
// call the 'a' the bounceBackByte and the looking-for-'b' state the bounceBackStep
55+
func makeShellStyleAutomaton(val []byte, useThisTransition *fieldMatcher) (start *smallTable, nextField *fieldMatcher) {
5156
table := newSmallTable()
5257
start = table
53-
if useThisTansition != nil {
54-
nextField = useThisTansition
58+
if useThisTransition != nil {
59+
nextField = useThisTransition
5560
} else {
5661
nextField = newFieldMatcher()
5762
}
5863

59-
// since this is provided as a string, the last byte will be '"'. In the special case where the pattern ends
60-
// with '*' (and thus the string ends with '*"', we will insert a successful transition as soon as we hit
61-
// that last '*', so that the reaching the transition doesn't require going through the trailing characters to
62-
// reach the '"'
63-
if val[len(val) - 2] == '*' {
64-
for i := 0; i < len(val) - 2; i++ {
65-
ch := val[i]
66-
if ch == '*' {
67-
table.addRangeSteps(0, ByteCeiling, table)
68-
} else {
69-
next := newSmallTable()
70-
table.addByteStep(ch, next)
71-
table = next
72-
}
73-
}
74-
table.addRangeSteps(0, ByteCeiling, newSmallTransition(nextField))
75-
return
76-
}
64+
var bounceBackByte byte
65+
var bounceBackStep smallStep = nil
66+
var globStep smallStep = nil
7767

78-
// loop through all but last byte
79-
for i := 0; i < len(val)-1; i++ {
68+
// loop through all but last byte
69+
i := 0
70+
for i < len(val)-1 {
8071
ch := val[i]
8172
if ch == '*' {
82-
// just loop back
83-
table.addRangeSteps(0, ByteCeiling, table)
73+
// special-case handling for string ending in '*"'
74+
if i == len(val)-2 {
75+
lastStep := newSmallTransition(nextField)
76+
table.addRangeSteps(0, ByteCeiling, lastStep)
77+
return
78+
}
79+
globStep = table
80+
i++
81+
bounceBackStep = newSmallTable()
82+
bounceBackByte = val[i]
83+
table.load(table, []byte{val[i]}, []smallStep{bounceBackStep})
84+
table = bounceBackStep.SmallTable()
8485
} else {
8586
next := newSmallTable()
87+
if globStep != nil {
88+
table.load(globStep, []byte{bounceBackByte}, []smallStep{bounceBackStep})
89+
}
8690
table.addByteStep(ch, next)
8791
table = next
8892
}
93+
i++
8994
}
90-
9195
table.addByteStep(val[len(val)-1], newSmallTransition(nextField))
96+
9297
return
9398
}

lib/small_table.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@ func (t *smallTable) step(utf8Byte byte) smallStep {
6161
panic("Malformed SmallTable")
6262
}
6363

64-
// mergeAutomata computes the union of two valueMatch automata
64+
// mergeAutomata computes the union of two valueMatch automata. If you look up the textbook theory about this,
65+
// they say to compute the set product for automata A and B and build A0B0, A0B1 … A1BN, A1B0 … but if you look
66+
// at that you realize that many of the product states aren't reachable. So you compute A0B0 and then keep
67+
// recursing on the transitions coming out of there, and I'm pretty sure you get a correct result. I don't know if it's
68+
// minimal or even avoids being wasteful.
6569
// INVARIANT: neither argument is nil
6670
// INVARIANT: To be thread-safe, no existing table can be updated
6771
func mergeAutomata(existing, newStep smallStep) *smallTable {
@@ -81,7 +85,8 @@ func mergeOne(existing, newStep smallStep, memoize map[string]smallStep) smallSt
8185
return combined
8286
}
8387

84-
// we always take the transition from the existing step, even if there's another in the merged-in step
88+
// TODO: this works, all the tests pass, but I'm not satisfied with it. My intuition is that you ought
89+
// to be able to come out of this with just one *fieldMatcher, perhaps with a merged matches list.
8590
switch {
8691
case !(existing.HasTransition() || newStep.HasTransition()):
8792
combined = newSmallTable()
@@ -115,6 +120,18 @@ func mergeOne(existing, newStep smallStep, memoize map[string]smallStep) smallSt
115120
return combined
116121
}
117122

123+
// load fills the smallTable with a default step plus one or more byte-specific steps, trying to be efficient about it
124+
func (t *smallTable) load(defaultStep smallStep, positions []byte, steps []smallStep) {
125+
var u unpackedTable
126+
for i := range u {
127+
u[i] = defaultStep
128+
}
129+
for i, position := range positions {
130+
u[position] = steps[i]
131+
}
132+
t.pack(&u)
133+
}
134+
118135
// unpackedTable replicates the data in the smallTable ceilings and steps arrays. It's quite hard to
119136
// update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update
120137
// a smallTable you unpack it, update, then re-pack it. Not gonna be the most efficient thing so at some future point…

lib/stats.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@ package quamina
33
import "fmt"
44

55
type stats struct {
6-
fmCount int
6+
fmCount int
77
fmVisited map[*fieldMatcher]bool
8-
vmCount int
8+
vmCount int
99
vmVisited map[*valueMatcher]bool
10-
stCount int
10+
stCount int
1111
stVisited map[*smallTable]bool
12-
siCount int
12+
siCount int
1313
}
1414

1515
func matcherStats(m *Matcher) string {

lib/value_matcher.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,8 @@ func (m *valueMatcher) transitionOn(val []byte) []*fieldMatcher {
9191

9292
// step through the smallTables, byte by byte
9393
table := m.startTable
94-
var step smallStep
9594
for _, utf8Byte := range val {
96-
step = table.step(utf8Byte)
95+
step := table.step(utf8Byte)
9796
if step == nil {
9897
return transitions
9998
}

lib/value_matcher_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,27 @@ func TestMultiTransitions(t *testing.T) {
9898
}
9999
}
100100

101+
func TestAY(t *testing.T) {
102+
m := NewMatcher()
103+
pat := `{"x": [ { "shellstyle": "*ay*"} ] }`
104+
err := m.AddPattern("AY", pat)
105+
if err != nil {
106+
t.Error("AY: " + err.Error())
107+
}
108+
shouldMatch := []string{"ay", "aay", "aaaayyyyy", "xyzay", "ayxxxx"}
109+
e := `{"x": "X"}`
110+
for _, sm := range shouldMatch {
111+
p := strings.ReplaceAll(e, "X", sm)
112+
matches, err := m.MatchesForJSONEvent([]byte(p))
113+
if err != nil {
114+
t.Error("bad JSON: " + err.Error())
115+
}
116+
if len(matches) != 1 || matches[0] != "AY" {
117+
t.Errorf("%s didn't match", sm)
118+
}
119+
}
120+
}
121+
101122
func TestOverlappingValues(t *testing.T) {
102123
m := NewMatcher()
103124
p1 := `{"a": ["foo"]}`

0 commit comments

Comments
 (0)