Skip to content

Commit bd2f980

Browse files
committed
NFA-based matching for "shellstyle" patterns.
1 parent c2004ea commit bd2f980

18 files changed

+691
-294
lines changed

README.md

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,22 @@ The following patterns would match it:
7070
"Thumbnail": { "Url": [ { "shellstyle": "*9943" } ] }
7171
}
7272
}
73+
```
74+
75+
```json
7376
{
7477
"Image": {
75-
"Thumbnail": { "Url": [ { "shellstyle": "http://www.example.com/*" } ] }
78+
"Thumbnail": { "Url":
79+
[ { "shellstyle": "http://www.example.com/*" } ] }
7680
}
7781
}
82+
```
83+
84+
```json
7885
{
7986
"Image": {
80-
"Thumbnail": { "Url": [ { "shellstyle": "http://www.example.com/*9943" } ] }
87+
"Thumbnail": { "Url":
88+
[ { "shellstyle": "http://www.example.com/*9943" } ] }
8189
}
8290
}
8391

@@ -95,11 +103,20 @@ be assumed to match, but all Fields mentioned must match. So the
95103
semantics are effectively an OR on each field's values,
96104
but an AND on the field names.
97105

106+
Note that the `shellstyle` patterns can include only
107+
one `*` character. The architecture probably allows
108+
support for a larger subset of regular expressions
109+
but currently, the testing for just the single-`*`
110+
patterns is a bit lacking.
111+
98112
Number matching is weak - the number has to appear
99113
exactly the same in the pattern and the event. I.e.,
100114
Quamina doesn't know that 35, 35.000, and 3.5e1 are the
101115
same number.
102116

117+
There's a fix for this in the code which is commented
118+
out because it causes a significant performance penalty.
119+
103120
## APIs
104121

105122
**Note**: In all the APIs below, field names and values in both
@@ -163,8 +180,8 @@ been added to the matcher, but remains sublinear in that
163180
number.
164181

165182
A word of explanation is in order. Quamina compiles the
166-
patterns into a somewhat-decorated DFA and uses that to
167-
find matches in events; that DFA-based matching process is
183+
patterns into a somewhat-decorated automaton and uses
184+
that to find matches in events; the matching process is
168185
O(1) in the number of patterns.
169186

170187
However, for this to work, the incoming event must be

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
module quamina
22

3-
go 1.17
3+
go 1.18

lib/benchmarks_test.go

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,10 @@ func TestCityLots(t *testing.T) {
125125
message1 := fmt.Sprintf("Events-per-second benchmark ran at %.0f events per second, below threshold of %.0f.",
126126
perSecond, thresholdPerformance)
127127
message2 := `
128-
It may be that re-running the benchmark test will address this, or it may be that you're running on a machine
129-
that is slower than the one the software was developed on, in which case you might want to readjust the
130-
"thresholdPerformance" constant. However, it may be that you made a change that reduced the throughput of the
131-
library, which would be unacceptable.`
128+
It may be that re-running the benchmark test will address this, or it may be that you're running on a machine
129+
that is slower than the one the software was developed on, in which case you might want to readjust the
130+
"thresholdPerformance" constant. However, it may be that you made a change that reduced the throughput of the
131+
library, which would be unacceptable.`
132132
t.Errorf(message1 + message2)
133133
}
134134

@@ -178,17 +178,7 @@ func TestMySoftwareHatesMe(t *testing.T) {
178178
t.Error("no match for EEE")
179179
}
180180
}
181-
*/
182-
183-
184-
func containsX(list []X, x X) bool {
185-
for _, in := range list {
186-
if in == x {
187-
return true
188-
}
189-
}
190-
return false
191-
}
181+
*/
192182

193183
// exercise shellstyle matching a little; this is much faster than TestCityLots because it's only working with one field
194184
func TestBigShellStyle(t *testing.T) {
@@ -218,7 +208,7 @@ func TestBigShellStyle(t *testing.T) {
218208
`{"properties": {"STREET":[ {"shellstyle": "N*P*"} ] } }`: 927,
219209
`{"properties": {"STREET":[ {"shellstyle": "*E*E*E*"} ] } }`: 1212,
220210
}
221-
*/
211+
*/
222212

223213
for letter := range wanted {
224214
pat := fmt.Sprintf(`{"properties": {"STREET":[ {"shellstyle": "%s*"} ] } }`, letter)
@@ -229,13 +219,13 @@ func TestBigShellStyle(t *testing.T) {
229219
}
230220

231221
/*
232-
for funk := range funky {
233-
err := m.AddPattern(funk, funk.(string))
234-
if err != nil {
235-
t.Errorf("err on %s: %s", funk, err.Error())
222+
for funk := range funky {
223+
err := m.AddPattern(funk, funk.(string))
224+
if err != nil {
225+
t.Errorf("err on %s: %s", funk, err.Error())
226+
}
236227
}
237-
}
238-
*/
228+
*/
239229
fmt.Println(matcherStats(m))
240230

241231
lineCount := 0
@@ -271,20 +261,21 @@ func TestBigShellStyle(t *testing.T) {
271261
}
272262
}
273263
/*
274-
for k, wc := range funky {
275-
if lCounts[k] != wc {
276-
t.Errorf("for %s wanted %d got %d", k, wc, lCounts[k])
264+
for k, wc := range funky {
265+
if lCounts[k] != wc {
266+
t.Errorf("for %s wanted %d got %d", k, wc, lCounts[k])
267+
}
277268
}
278-
}
279-
280-
*/
269+
270+
*/
281271
}
282272

283273
// TestPatternAddition adds a whole lot of string-only rules as fast as possible. The profiler says that the
284274
// performance is totally dominated by the garbage-collector thrashing; in particular it has to allocate
285275
// ~220K smallTables. Tried https://blog.twitch.tv/en/2019/04/10/go-memory-ballast-how-i-learnt-to-stop-worrying-and-love-the-heap/
286276
// but it doesn't seem to help.
287277
// TODO: Find a way to allocate fewer tables.
278+
// TODO: Add shellstyle patterns
288279
func TestPatternAddition(t *testing.T) {
289280
w := worder{0, readWWords(t)}
290281

@@ -312,7 +303,7 @@ func TestPatternAddition(t *testing.T) {
312303
}
313304
}
314305
runtime.ReadMemStats(&msAfter)
315-
delta := 1.0 / 1000000.0 * float64(msAfter.Alloc - msBefore.Alloc)
306+
delta := 1.0 / 1000000.0 * float64(msAfter.Alloc-msBefore.Alloc)
316307
fmt.Printf("before %d, after %d, delta %f\n", msBefore.Alloc, msAfter.Alloc, delta)
317308
fmt.Println("stats:" + matcherStats(m))
318309
elapsed := float64(time.Now().Sub(before).Milliseconds())

lib/concurrency_test.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ func updateTree(m *Matcher, use37 bool, t *testing.T, ch chan string) {
1818
} else {
1919
val = fmt.Sprintf(`"%d"`, rand.Int())
2020
pattern = fmt.Sprintf(`{ "properties": { "STREET": [ %s ] } }`, val)
21+
/* TODO: alternate literal and shellstyle addition
22+
val = fmt.Sprintf(`"*%d"`, rand.Int())
23+
pattern = fmt.Sprintf(`{ "properties": { "STREET": [ {"shellstyle": %s } ] } }`, val)
24+
*/
2125
}
2226
err := m.AddPattern(val, pattern)
2327
if err != nil {
@@ -45,21 +49,24 @@ func TestConcurrency(t *testing.T) {
4549

4650
patterns := []string{
4751
`{ "properties": { "STREET": [ "CRANLEIGH" ] } }`,
52+
`{ "properties": { "STREET": [ { "shellstyle": "B*K"} ] } }`,
4853
`{ "properties": { "STREET": [ "17TH" ], "ODD_EVEN": [ "E"] } }`,
4954
`{ "geometry": { "coordinates": [ 37.807807921694092 ] } }`,
5055
`{ "properties": { "MAPBLKLOT": ["0011008"], "BLKLOT": ["0011008"]}, "geometry": { "coordinates": [ 37.807807921694092 ] } } `,
5156
}
5257
names := []string{
5358
"CRANLEIGH",
59+
"shellstyle",
5460
"17TH Even",
5561
"Geometry",
5662
"0011008",
5763
}
5864
wanted := map[X]int{
59-
"CRANLEIGH": 7,
60-
"17TH Even": 836,
61-
"Geometry": 2,
62-
"0011008": 1,
65+
"CRANLEIGH": 7,
66+
"shellstyle": 746,
67+
"17TH Even": 836,
68+
"Geometry": 2,
69+
"0011008": 1,
6370
}
6471

6572
scanner := bufio.NewScanner(file)
@@ -85,6 +92,7 @@ func TestConcurrency(t *testing.T) {
8592
lineCount = 0
8693
before := time.Now()
8794
ch := make(chan string, 1000)
95+
sent := 0
8896
for _, line := range lines {
8997
matches, err := m.MatchesForJSONEvent(line)
9098
if err != nil {
@@ -93,6 +101,7 @@ func TestConcurrency(t *testing.T) {
93101
lineCount++
94102
if lineCount%UpdateLines == 0 {
95103
use37 = !use37
104+
sent++
96105
go updateTree(m, use37, t, ch)
97106
}
98107
for _, match := range matches {
@@ -103,11 +112,10 @@ func TestConcurrency(t *testing.T) {
103112
results[match] = count + 1
104113
}
105114
}
106-
fmt.Println()
107115

108116
elapsed := float64(time.Now().Sub(before).Milliseconds())
109117
perSecond := float64(lineCount) / (elapsed / 1000.0)
110-
fmt.Printf("%.2f matches/second with updates\n\n", perSecond)
118+
fmt.Printf("\n%.2f matches/second with updates\n\n", perSecond)
111119

112120
err = scanner.Err()
113121
if err != nil {
@@ -125,7 +133,8 @@ func TestConcurrency(t *testing.T) {
125133

126134
// now we go back and make sure that all those AddPattern calls actually made it into the Matcher
127135
close(ch)
128-
for val := range ch {
136+
for i := 0; i < sent; i++ {
137+
val := <-ch
129138
var event string
130139
if val[0] == '"' {
131140
event = fmt.Sprintf(`{"properties": { "STREET": %s} }`, val)

lib/fj.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
// the fields & values can be represented as []byte slices using a couple of offsets into the underlying event.
1717
// There is an exception, namely strings that contain \-prefixed JSON escapes; since we want to work with the
1818
// actual UTF-8 bytes, this requires re-writing such strings into memory we have to allocate.
19+
// TODO: There are gaps in the unit-test coverage, including nearly all the error conditions
1920
type FJ struct {
2021
event []byte // event being processed, treated as immutable
2122
eventIndex int // current byte index into the event

lib/list_maker.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package quamina
2+
3+
// this needs to exist so that all the lists containing a single step to X, or the triple step to X,Y,Z are the
4+
// same list, so that pack/unpack work properly
5+
6+
type listMaker struct {
7+
singletons map[*nfaStep]*nfaStepList
8+
plurals []*nfaStepList
9+
}
10+
11+
func newListMaker() *listMaker {
12+
return &listMaker{singletons: make(map[*nfaStep]*nfaStepList)}
13+
}
14+
15+
func (l *listMaker) getSingleton(step *nfaStep) *nfaStepList {
16+
already, ok := l.singletons[step]
17+
if ok {
18+
return already
19+
}
20+
list := &nfaStepList{steps: []*nfaStep{step}}
21+
l.singletons[step] = list
22+
return list
23+
}
24+
25+
func (l *listMaker) getList(steps ...*nfaStep) *nfaStepList {
26+
if len(steps) == 1 {
27+
return l.getSingleton(steps[0])
28+
}
29+
30+
for _, already := range l.plurals {
31+
if listsAreEqual(already.steps, steps) {
32+
return already
33+
}
34+
}
35+
list := &nfaStepList{steps: steps}
36+
l.plurals = append(l.plurals, list)
37+
return list
38+
}
39+
40+
func listsAreEqual(l1, l2 []*nfaStep) bool {
41+
if len(l1) != len(l2) {
42+
return false
43+
}
44+
for _, step := range l1 {
45+
if !listMakerContains(l2, step) {
46+
return false
47+
}
48+
}
49+
return true
50+
}
51+
52+
func listMakerContains(list []*nfaStep, step *nfaStep) bool {
53+
for _, fromList := range list {
54+
if step == fromList {
55+
return true
56+
}
57+
}
58+
return false
59+
}

lib/list_maker_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package quamina
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestListMaker(t *testing.T) {
8+
steps := []*nfaStep{
9+
{},
10+
{},
11+
{},
12+
}
13+
multi := [][]*nfaStep{
14+
{steps[0]},
15+
{steps[0], steps[1]},
16+
{steps[0], steps[1], steps[2]},
17+
{steps[0], steps[2]},
18+
{steps[1]},
19+
{steps[1], steps[2]},
20+
{steps[2]},
21+
}
22+
lm := newListMaker()
23+
lists := make(map[*nfaStepList]bool)
24+
for _, step := range steps {
25+
lists[lm.getSingleton(step)] = true
26+
}
27+
if len(lists) != 3 {
28+
t.Error("length should be 3")
29+
}
30+
for _, step := range steps {
31+
lists[lm.getSingleton(step)] = true
32+
}
33+
if len(lists) != 3 {
34+
t.Error("length STILL should be 3")
35+
}
36+
lm = newListMaker()
37+
lists = make(map[*nfaStepList]bool)
38+
for _, plural := range multi {
39+
lists[lm.getList(plural...)] = true
40+
}
41+
wanted := len(multi)
42+
if len(lists) != wanted {
43+
t.Errorf("Got %d wanted %d", len(lists), wanted)
44+
}
45+
for _, plural := range multi {
46+
lists[lm.getList(plural...)] = true
47+
}
48+
if len(lists) != wanted {
49+
t.Errorf("Got %d STILL wanted %d", len(lists), wanted)
50+
}
51+
52+
}

lib/matcher.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func (m *Matcher) AddPattern(x X, patternJSON string) error {
5555
// The matcher contains several map[this]that maps but Go maps aren't thread-safe. This could be solved
5656
// with a straightforward mutex or the fancy sync.Map, but I succumbed to premature optimization and decided
5757
// I didn't want any of that stuff in the Match* path. So in each case the map (or map-like structure in
58-
// smallTable) is copied, the copy updated, then the whole map updated atomically in the containing structure
58+
// smallDfaTable) is copied, the copy updated, then the whole map updated atomically in the containing structure
5959
// see: https://medium.com/@deckarep/the-new-kid-in-town-gos-sync-map-de24a6bf7c2c
6060
m.lock.Lock()
6161
defer m.lock.Unlock()

lib/numbers.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ import (
66
"strconv"
77
)
88

9+
// TODO: Make this more efficient and improve unit-test coverage
910
const (
10-
nineDigits = 1000000000.0
11+
nineDigits = 1000000000.0
1112
digitsOfPrecision = 18
1213
)
1314

@@ -24,6 +25,5 @@ func canonicalize(s []byte) (string, error) {
2425
if f >= nineDigits || f <= -nineDigits {
2526
return "", errors.New(fmt.Sprintf("number is outside of range [%f, %f]", -nineDigits, nineDigits))
2627
}
27-
return fmt.Sprintf("%019.0f", (f + nineDigits) * nineDigits), nil
28+
return fmt.Sprintf("%019.0f", (f+nineDigits)*nineDigits), nil
2829
}
29-

lib/numbers_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,3 @@ func TestOrdering(t *testing.T) {
5656
}
5757
}
5858
}
59-

0 commit comments

Comments
 (0)