Skip to content

Commit dd1e82b

Browse files
committed
Snapshot with all pieces in place before major refactor to support shell-style globbing
1 parent a211df7 commit dd1e82b

File tree

9 files changed

+389
-64
lines changed

9 files changed

+389
-64
lines changed

README.md

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ same number.
8484

8585
## APIs
8686

87+
**Note**: In all the APIs below, field names and values in both
88+
patterns and events must be valid UTF-8. Unescaped characters
89+
0x1F or smaller (illegal per JSON), and bytes with value
90+
greater than 0XF4 (can't occur in correctly composed UTF-8)
91+
will be rejected by the API.
92+
8793
```go
8894
func NewMatcher() *Matcher
8995
```
@@ -101,8 +107,8 @@ The pattern must be provided as a string which is a
101107
JSON object as exemplified above in this document.
102108
103109
The `error` return is used to signal invalid pattern
104-
structure, which could be malformed JSON or leaf values
105-
which are not provided as arrays.
110+
structure, which could be bad UTF-8 or malformed JSON
111+
or leaf values which are not provided as arrays.
106112
107113
As many patterns as desired can be added to a Matcher
108114
but at this time there is no capability of removing any.
@@ -114,11 +120,12 @@ threads call it, they will block and execute sequentially.
114120
func (m *Matcher) MatchesForJSONEvent(event []byte) ([]X, error)
115121
```
116122

117-
The `event` argument must be a JSON object. It would be
123+
The `event` argument must be a JSON object encoded in
124+
correct UTF-8. It would be
118125
easy to extend Matcher to handle other data formats; see the
119126
`Flattener` interface and its implementation in `FJ`.
120127

121-
The `error` return value is nil unless there was a syntax
128+
The `error` return value is nil unless there was an
122129
error in the event JSON.
123130

124131
The `[]X` return slice may be empty if none of the patterns
@@ -131,11 +138,11 @@ also executing.
131138
### Performance
132139

133140
The performance of `MatchesForJSONEvent` is strongly
134-
sublinear in the number of patterns. It’s not quite `O(1)`,
135-
it does vary somewhat as a function of the number of
141+
sublinear in the number of patterns. It’s not quite `O(1)`
142+
as it varies somewhat as a function of the number of
136143
unique fields that appear in all the patterns that have
137144
been added to the machine, but remains sublinear in that
138-
variation.
145+
number.
139146

140147
A word of explanation is in order. Quamina compiles the
141148
patterns into a somewhat-decorated DFA and uses that to

lib/fj.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,8 @@ func (fj *FJ) readStringValue() ([]byte, error) {
429429
} else if ch == '\\' {
430430
val, err := fj.readStringValWithEscapes(valStart)
431431
return val, err
432-
} else if ch <= 0x1f {
433-
return nil, fj.error(fmt.Sprintf("illegal character %x in string value", ch))
432+
} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
433+
return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in string value", ch))
434434
}
435435
if fj.step() != nil {
436436
return nil, fj.error("event truncated in mid-string")
@@ -459,8 +459,8 @@ func (fj *FJ) readStringValWithEscapes(nameStart int) ([]byte, error) {
459459
return nil, err
460460
}
461461
val = append(val, unescaped...)
462-
} else if ch <= 0x1f {
463-
return nil, fj.error(fmt.Sprintf("illegal character %x in string value", ch))
462+
} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
463+
return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in string value", ch))
464464
} else {
465465
val = append(val, ch)
466466
}
@@ -487,8 +487,8 @@ func (fj *FJ) readMemberName() ([]byte, error) {
487487
} else if ch == '\\' {
488488
name, err := fj.readMemberNameWithEscapes(nameStart)
489489
return name, err
490-
} else if ch <= 0x1f {
491-
return nil, fj.error(fmt.Sprintf("illegal character %x in field name", ch))
490+
} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
491+
return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in field name", ch))
492492
}
493493
if fj.step() != nil {
494494
return nil, fj.error("premature end of event")
@@ -505,8 +505,8 @@ func (fj *FJ) readMemberNameWithEscapes(nameStart int) ([]byte, error) {
505505
if ch == '"' {
506506
fj.eventIndex = from
507507
return memberName, nil
508-
} else if ch < 0x1f {
509-
return nil, fj.error(fmt.Sprintf("illegal character %x in field name", ch))
508+
} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
509+
return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in field name", ch))
510510
} else if ch == '\\' {
511511
var unescaped []byte
512512
unescaped, from, err = fj.readTextWithEscapes(from)

lib/pattern.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ const (
1717
literalType
1818
existsTrueType
1919
existsFalseType
20+
shellStyleType
2021
)
2122

2223
type typedVal struct {
@@ -184,6 +185,8 @@ func readSpecialPattern(pb *patternBuild, valsIn []typedVal) (pathVals []typedVa
184185
case "exists":
185186
containsExclusive = tt
186187
pathVals, err = readExistsSpecial(pb, pathVals)
188+
case "shellstyle":
189+
pathVals, err = readShellStyleSpecial(pb, pathVals)
187190
default:
188191
err = errors.New("unrecognized in special pattern: " + tt)
189192
}
@@ -207,7 +210,7 @@ func readExistsSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal
207210
pathVals = append(pathVals, typedVal{vType: existsFalseType})
208211
}
209212
default:
210-
err = errors.New("value for 'existsMatches' pattern must be true or false")
213+
err = errors.New("value for 'exists' pattern must be true or false")
211214
return
212215
}
213216

lib/pattern_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ func TestPatternFromJSON(t *testing.T) {
2121
`{"xxx": [ { "exists": 23 } ] }`,
2222
`{"xxx": [ { "exists": true, "a": 3 }] }`,
2323
`{"xxx": [ { "exists": false, "x": ["a", 3 ] }] }`,
24+
`{"abc": [ {"shellstyle":15} ] }`,
25+
`{"abc": [ {"shellstyle":"a**b"}, "foo" ] }`,
2426
}
2527
for _, b := range bads {
2628
_, _, err := patternFromJSON([]byte(b))
@@ -35,6 +37,8 @@ func TestPatternFromJSON(t *testing.T) {
3537
`{"x": { "a": [27, 28], "b": { "m": [ "a", "b" ] } } }`,
3638
`{"x": [ {"exists": true} ] }`,
3739
`{"x": { "y": [ {"exists": false} ] } }`,
40+
`{"abc": [ 3, {"shellstyle":"a*b"} ] }`,
41+
`{"abc": [ {"shellstyle":"a*b"}, "foo" ] }`,
3842
}
3943
w1 := []*patternField{{path: "x", vals: []typedVal{{numberType, "2"}}}}
4044
w2 := []*patternField{{path: "x", vals: []typedVal{
@@ -64,7 +68,19 @@ func TestPatternFromJSON(t *testing.T) {
6468
{vType: existsFalseType, val: ""},
6569
},
6670
}}
67-
wanted := [][]*patternField{w1, w2, w3, w4, w5}
71+
w6 := []*patternField{
72+
{path: "abc", vals: []typedVal{
73+
{vType: stringType, val: "3"},
74+
{vType: shellStyleType, val: `"a*b"`},
75+
},
76+
}}
77+
w7 := []*patternField{
78+
{path: "abc", vals: []typedVal{
79+
{vType: shellStyleType, val: `"a*b"`},
80+
{vType: stringType, val: `"foo"`},
81+
},
82+
}}
83+
wanted := [][]*patternField{w1, w2, w3, w4, w5, w6, w7}
6884

6985
for i, good := range goods {
7086
fields, _, err := patternFromJSON([]byte(good))

lib/shell_style.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package quamina
2+
3+
import (
4+
"encoding/json"
5+
"errors"
6+
"fmt"
7+
)
8+
9+
func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) {
10+
t, err := pb.jd.Token()
11+
if err != nil {
12+
return
13+
}
14+
pathVals = valsIn
15+
shellString, ok := t.(string)
16+
if !ok {
17+
err = errors.New("value for `shellstyle` must be a string")
18+
return
19+
}
20+
21+
// no adjacent wildcards
22+
valBytes := []byte(shellString)
23+
for i := 1; i < len(valBytes); i++ {
24+
if valBytes[i] == '*' && valBytes[i - 1] == '*' {
25+
err = errors.New("adjacent '*' characters not allowed in shellstyle pattern")
26+
return
27+
}
28+
}
29+
30+
pathVals = append(pathVals, typedVal{vType: shellStyleType, val: `"` + shellString + `"`})
31+
32+
t, err = pb.jd.Token()
33+
if err != nil {
34+
return
35+
}
36+
switch tt := t.(type) {
37+
case json.Delim:
38+
if tt != '}' {
39+
err = errors.New(fmt.Sprintf("invalid character %v in 'shellstyle' pattern", tt))
40+
}
41+
default:
42+
err = errors.New("trailing garbage in shellstyle pattern")
43+
}
44+
45+
return
46+
}
47+
48+
// makeShellStyleAutomaton - recognize a "-delimited string containing one or more '*' globs. It's useful that
49+
// the string ends with a '"' because we don't have to deal with the special case of '*' at end. Arguably, if
50+
// we ignored the '"' markers, we could be a little more efficient matching "foo*" but it'd add complexity
51+
func makeShellStyleAutomaton(val []byte, useThisTansition *fieldMatcher) (start smallStep, nextField *fieldMatcher) {
52+
table := newSmallTable()
53+
start = table
54+
55+
// loop through all but last byte
56+
for i := 0; i < len(val)-1; i++ {
57+
ch := val[i]
58+
if ch == '*' {
59+
// just loop back
60+
table.addRangeSteps(0, ByteCeiling, table)
61+
} else {
62+
next := newSmallTable()
63+
table.addByteStep(ch, next)
64+
table = next
65+
}
66+
}
67+
68+
// last byte, can't be '*'
69+
if useThisTansition != nil {
70+
nextField = useThisTansition
71+
} else {
72+
nextField = newFieldMatcher()
73+
}
74+
table.addByteStep(val[len(val)-1], newSmallTransition(nextField))
75+
return
76+
}

lib/small_table.go

Lines changed: 76 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package quamina
22

3+
import "fmt"
4+
35
// smallTable serves as a lookup table that encodes mappings between ranges of byte values and the SmallStep
46
// transition on any byte in the range.
57
// The way it works is exposed in the step() function just below. Logically, it's a slice of {byte, *smallStep}
@@ -9,12 +11,13 @@ package quamina
911
// steps: nil, &ss1, nil, &ss2, nil
1012
// invariant: The last element of ceilings is always ByteCeiling
1113
// The motivation is that we want to build a state machine on byte values to implement things like prefixes and
12-
// ranges of bytes. This could be done simply with a byte array of size Utf8ByteCeiling for each state in the machine,
14+
// ranges of bytes. This could be done simply with a byte array of size ByteCeiling for each state in the machine,
1315
// or a map[byte]smallStep, but both would be size-inefficient, particularly in the case where you're implementing
1416
// ranges. Now, the step function is O(N) in the number of entries, but empirically, the number of entries is
1517
// small even in large machines, so skipping through the ceilings list is measurably about the same speed as a map
1618
// or array construct
1719
type smallTable struct {
20+
name string
1821
slices *stSlices
1922
}
2023

@@ -25,14 +28,15 @@ type stSlices struct {
2528
steps []smallStep
2629
}
2730

28-
// Utf8ByteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go byte, which is uint8. The values
29-
// 0xF5-0xFF can't appear in UTF-8 strings, so anything can safely be assumed to be less than this value
30-
const Utf8ByteCeiling int = 0xf5
31+
// ByteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go byte, which is uint8. The values
32+
// 0xF5-0xFF can't appear in UTF-8 strings; we use 0xF5 as a value terminator, so bytes 0xF6 and higher
33+
// can't appear.
34+
const ByteCeiling int = 0xf6
3135

3236
func newSmallTable() *smallTable {
3337
return &smallTable{
3438
slices: &stSlices{
35-
ceilings: []byte{byte(Utf8ByteCeiling)},
39+
ceilings: []byte{byte(ByteCeiling)},
3640
steps: []smallStep{nil},
3741
},
3842
}
@@ -45,6 +49,9 @@ func (t *smallTable) SmallTable() *smallTable {
4549
// SmallTransition always returns nil for a smallTable.
func (t *smallTable) SmallTransition() *smallTransition {
	return nil
}
52+
func (t *smallTable) HasTransition() bool {
53+
return false
54+
}
4855

4956
func (t *smallTable) step(utf8Byte byte) smallStep {
5057
for index, ceiling := range t.slices.ceilings {
@@ -55,11 +62,57 @@ func (t *smallTable) step(utf8Byte byte) smallStep {
5562
panic("Malformed SmallTable")
5663
}
5764

65+
// mergeAutomata computes the union of two valueMatch automata
66+
// invariant: neither argument is nil
67+
// TODO: Make sure it's thread-safe, as in doesn't write into existing tables from either new or existing
68+
func mergeAutomata(existing, newStep smallStep, memoize map[string]smallStep) smallStep {
69+
var combined smallStep
70+
mKey := fmt.Sprintf("%v%v", existing, newStep)
71+
combined, ok := memoize[mKey]
72+
if ok {
73+
return combined
74+
}
75+
76+
// we always take the transition from the existing step
77+
// switch is easier than if/else
78+
switch {
79+
case (!(existing.HasTransition() || newStep.HasTransition())):
80+
combined = newSmallTable()
81+
case existing.HasTransition() && newStep.HasTransition():
82+
combined = newSmallTransition(existing.SmallTransition().fieldTransition)
83+
case existing.HasTransition() && (!newStep.HasTransition()):
84+
combined = newSmallTransition(existing.SmallTransition().fieldTransition)
85+
case (!existing.HasTransition()) && newStep.HasTransition():
86+
combined = newSmallTransition(newStep.SmallTransition().fieldTransition)
87+
}
88+
memoize[mKey] = combined
89+
combined.SmallTable().name = fmt.Sprintf("(%s/%s)", existing.SmallTable().name, newStep.SmallTable().name)
90+
91+
uExisting := unpack(existing.SmallTable())
92+
uNew := unpack(newStep.SmallTable())
93+
var uComb unpackedTable
94+
for i, stepExisting := range uExisting {
95+
stepNew := uNew[i]
96+
switch {
97+
case stepExisting == nil && stepNew == nil:
98+
uComb[i] = nil
99+
case stepExisting != nil && stepNew == nil:
100+
uComb[i] = stepExisting
101+
case stepExisting == nil && stepNew != nil:
102+
uComb[i] = stepNew
103+
case stepExisting != nil && stepNew != nil:
104+
uComb[i] = mergeAutomata(stepExisting, stepNew, memoize)
105+
}
106+
}
107+
combined.SmallTable().pack(&uComb)
108+
return combined
109+
}
110+
58111
// unpackedTable replicates the data in the smallTable ceilings and steps arrays. It's quite hard to
59112
// update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update
60113
// a smallTable you unpack it, update, then re-pack it. Not gonna be the most efficient thing so at some future point…
61114
// TODO: Figure out how to update a smallTable in place
62-
type unpackedTable [Utf8ByteCeiling]smallStep
115+
type unpackedTable [ByteCeiling]smallStep
63116

64117
func unpack(t *smallTable) *unpackedTable {
65118
var u unpackedTable
@@ -84,11 +137,26 @@ func (t *smallTable) pack(u *unpackedTable) {
84137
}
85138
lastStep = ss
86139
}
87-
slices.ceilings = append(slices.ceilings, byte(Utf8ByteCeiling))
140+
slices.ceilings = append(slices.ceilings, byte(ByteCeiling))
88141
slices.steps = append(slices.steps, lastStep)
89142
t.slices = &slices // atomic update
90143
}
91144

145+
func (t *smallTable) addByteStep(utf8Byte byte, step smallStep) {
146+
unpacked := unpack(t)
147+
unpacked[utf8Byte] = step
148+
t.pack(unpacked)
149+
}
150+
151+
func (t *smallTable) addRangeSteps(floor int, ceiling int, step smallStep) {
152+
unpacked := unpack(t)
153+
for i := floor; i < ceiling; i++ {
154+
unpacked[i] = step
155+
}
156+
t.pack(unpacked)
157+
}
158+
159+
/*
92160
func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
93161
// TODO update fuzz test to include this
94162
unpacked := unpack(t)
@@ -97,3 +165,4 @@ func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
97165
}
98166
t.pack(unpacked)
99167
}
168+
*/

0 commit comments

Comments
 (0)