Snapshot with all pieces in place before major refactor to support shell-style globbing

timbray · timbray · commit dd1e82ba3e3b · 2022-02-24T17:29:12.000-08:00
diff --git a/README.md b/README.md
@@ -84,6 +84,12 @@ same number.
 
 ## APIs
 
+**Note**: In all the APIs below, field names and values in both
+patterns and events must be valid UTF-8.  Unescaped characters
+with value 0x1F or smaller (illegal per JSON), and bytes with value
+greater than 0xF4 (can't occur in correctly composed UTF-8)
+will be rejected by the API.
+
 ```go
 func NewMatcher() *Matcher
 ```
@@ -101,8 +107,8 @@ The pattern must be provided as a string which is a
 JSON object as exemplified above in this document.
 
 The `error` return is used to signal invalid pattern
-structure, which could be malformed JSON or leaf values
-which are not provided as arrays.
+structure, which could be bad UTF-8 or malformed JSON 
+or leaf values which are not provided as arrays.
 
 As many patterns as desired can be added to a Matcher
 but at this time there is no capability of removing any.
@@ -114,11 +120,12 @@ threads call it, they will block and execute sequentially.
 func (m *Matcher) MatchesForJSONEvent(event []byte) ([]X, error)
 ```
 
-The `event` argument must be a JSON object. It would be 
+The `event` argument must be a JSON object encoded in
+correct UTF-8. It would be 
 easy to extend Matcher to handle other data formats; see the
 `Flattener` interface and its implementation in `FJ`.
 
-The `error` return value is nil unless there was a syntax
+The `error` return value is nil unless there was an
 error in the event JSON.
 
 The `[]X` return slice may be empty if none of the patterns
@@ -131,11 +138,11 @@ also executing.
 ### Performance
 
 The performance of `MatchesForJSONEvent` is strongly
-sublinear in the number of patterns. It’s not quite `O(1)`,
-it does vary somewhat as a function of the number of 
+sublinear in the number of patterns. It’s not quite `O(1)`
+as it varies somewhat as a function of the number of 
 unique fields that appear in all the patterns that have 
 been added to the machine, but remains sublinear in that 
-variation. 
+number. 
 
 A word of explanation is in order. Quamina compiles the
 patterns into a somewhat-decorated DFA and uses that to
diff --git a/lib/fj.go b/lib/fj.go
@@ -429,8 +429,8 @@ func (fj *FJ) readStringValue() ([]byte, error) {
 		} else if ch == '\\' {
 			val, err := fj.readStringValWithEscapes(valStart)
 			return val, err
-		} else if ch <= 0x1f {
-			return nil, fj.error(fmt.Sprintf("illegal character %x in string value", ch))
+		} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
+			return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in string value", ch))
 		}
 		if fj.step() != nil {
 			return nil, fj.error("event truncated in mid-string")
@@ -459,8 +459,8 @@ func (fj *FJ) readStringValWithEscapes(nameStart int) ([]byte, error) {
 				return nil, err
 			}
 			val = append(val, unescaped...)
-		} else if ch <= 0x1f {
-			return nil, fj.error(fmt.Sprintf("illegal character %x in string value", ch))
+		} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
+			return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in string value", ch))
 		} else {
 			val = append(val, ch)
 		}
@@ -487,8 +487,8 @@ func (fj *FJ) readMemberName() ([]byte, error) {
 		} else if ch == '\\' {
 			name, err := fj.readMemberNameWithEscapes(nameStart)
 			return name, err
-		} else if ch <= 0x1f {
-			return nil, fj.error(fmt.Sprintf("illegal character %x in field name", ch))
+		} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
+			return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in field name", ch))
 		}
 		if fj.step() != nil {
 			return nil, fj.error("premature end of event")
@@ -505,8 +505,8 @@ func (fj *FJ) readMemberNameWithEscapes(nameStart int) ([]byte, error) {
 		if ch == '"' {
 			fj.eventIndex = from
 			return memberName, nil
-		} else if ch < 0x1f {
-			return nil, fj.error(fmt.Sprintf("illegal character %x in field name", ch))
+		} else if ch <= 0x1f || ch >= byte(ByteCeiling) {
+			return nil, fj.error(fmt.Sprintf("illegal UTF-8 byte %x in field name", ch))
 		} else if ch == '\\' {
 			var unescaped []byte
 			unescaped, from, err = fj.readTextWithEscapes(from)
diff --git a/lib/pattern.go b/lib/pattern.go
@@ -17,6 +17,7 @@ const (
 	literalType
 	existsTrueType
 	existsFalseType
+	shellStyleType
 )
 
 type typedVal struct {
@@ -184,6 +185,8 @@ func readSpecialPattern(pb *patternBuild, valsIn []typedVal) (pathVals []typedVa
 		case "exists":
 			containsExclusive = tt
 			pathVals, err = readExistsSpecial(pb, pathVals)
+		case "shellstyle":
+			pathVals, err = readShellStyleSpecial(pb, pathVals)
 		default:
 			err = errors.New("unrecognized in special pattern: " + tt)
 		}
@@ -207,7 +210,7 @@ func readExistsSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal
 			pathVals = append(pathVals, typedVal{vType: existsFalseType})
 		}
 	default:
-		err = errors.New("value for 'existsMatches' pattern must be true or false")
+		err = errors.New("value for 'exists' pattern must be true or false")
 		return
 	}
 
diff --git a/lib/pattern_test.go b/lib/pattern_test.go
@@ -21,6 +21,8 @@ func TestPatternFromJSON(t *testing.T) {
 		`{"xxx": [ { "exists": 23 } ] }`,
 		`{"xxx": [ { "exists": true, "a": 3 }] }`,
 		`{"xxx": [ { "exists": false, "x": ["a", 3 ] }] }`,
+		`{"abc": [ {"shellstyle":15} ] }`,
+		`{"abc": [ {"shellstyle":"a**b"}, "foo" ] }`,
 	}
 	for _, b := range bads {
 		_, _, err := patternFromJSON([]byte(b))
@@ -35,6 +37,8 @@ func TestPatternFromJSON(t *testing.T) {
 		`{"x": { "a": [27, 28], "b": { "m": [ "a", "b" ] } } }`,
 		`{"x": [ {"exists": true} ] }`,
 		`{"x": { "y": [ {"exists": false} ] } }`,
+		`{"abc": [ 3, {"shellstyle":"a*b"} ] }`,
+		`{"abc": [ {"shellstyle":"a*b"}, "foo" ] }`,
 	}
 	w1 := []*patternField{{path: "x", vals: []typedVal{{numberType, "2"}}}}
 	w2 := []*patternField{{path: "x", vals: []typedVal{
@@ -64,7 +68,19 @@ func TestPatternFromJSON(t *testing.T) {
 			{vType: existsFalseType, val: ""},
 		},
 		}}
-	wanted := [][]*patternField{w1, w2, w3, w4, w5}
+	w6 := []*patternField{
+		{path: "abc", vals: []typedVal{
+			{vType: stringType, val: "3"},
+			{vType: shellStyleType, val: `"a*b"`},
+		},
+		}}
+	w7 := []*patternField{
+		{path: "abc", vals: []typedVal{
+			{vType: shellStyleType, val: `"a*b"`},
+			{vType: stringType, val: `"foo"`},
+		},
+		}}
+	wanted := [][]*patternField{w1, w2, w3, w4, w5, w6, w7}
 
 	for i, good := range goods {
 		fields, _, err := patternFromJSON([]byte(good))
diff --git a/lib/shell_style.go b/lib/shell_style.go
@@ -0,0 +1,76 @@
+package quamina
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+)
+
+// readShellStyleSpecial reads the value of a "shellstyle" special pattern from pb.jd and appends it
+//  to valsIn as a shellStyleType typedVal whose val is the string wrapped in '"' characters.
+//  The value must be a JSON string containing no adjacent '*' characters, and the token following
+//  it must be a '}' delimiter; anything else is reported through err.
+func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) {
+	t, err := pb.jd.Token()
+	if err != nil {
+		return
+	}
+	pathVals = valsIn
+	shellString, ok := t.(string)
+	if !ok {
+		err = errors.New("value for `shellstyle` must be a string")
+		return
+	}
+
+	// no adjacent wildcards; '*' is a single ASCII byte, so scanning the string bytewise is safe
+	for i := 1; i < len(shellString); i++ {
+		if shellString[i] == '*' && shellString[i-1] == '*' {
+			err = errors.New("adjacent '*' characters not allowed in shellstyle pattern")
+			return
+		}
+	}
+
+	pathVals = append(pathVals, typedVal{vType: shellStyleType, val: `"` + shellString + `"`})
+
+	// the only token accepted after the value is a closing '}'
+	t, err = pb.jd.Token()
+	if err != nil {
+		return
+	}
+	switch tt := t.(type) {
+	case json.Delim:
+		if tt != '}' {
+			err = fmt.Errorf("invalid character %v in 'shellstyle' pattern", tt)
+		}
+	default:
+		err = errors.New("trailing garbage in shellstyle pattern")
+	}
+
+	return
+}
+
+// makeShellStyleAutomaton - build an automaton recognizing a "-delimited string that may contain '*' globs.
+//  val is the pattern including its '"' delimiters. If useThisTransition is non-nil, the automaton's final step
+//  transitions to it; otherwise a new fieldMatcher is created. start is the first table of the automaton and
+//  nextField is the fieldMatcher reached by the final step.
+//  It's useful that the string ends with a '"' because we don't have to deal with the special case of '*' at
+//  end.  Arguably, if we ignored the '"' markers, we could be a little more efficient matching "foo*" but it'd
+//  add complexity.
+//  NOTE(review): a table reached after leaving a '*' table has no step back to it, so it looks like a pattern
+//  such as "a*bc" would not match "abbc" - confirm whether this is handled elsewhere.
+func makeShellStyleAutomaton(val []byte, useThisTransition *fieldMatcher) (start smallStep, nextField *fieldMatcher) {
+	table := newSmallTable()
+	start = table
+
+	// loop through all but last byte
+	for i := 0; i < len(val)-1; i++ {
+		ch := val[i]
+		if ch == '*' {
+			// just loop back: every byte value steps to this same table
+			table.addRangeSteps(0, ByteCeiling, table)
+		} else {
+			// a literal byte steps to a fresh table, which becomes the current one
+			next := newSmallTable()
+			table.addByteStep(ch, next)
+			table = next
+		}
+	}
+
+	// last byte, can't be '*'
+	if useThisTransition != nil {
+		nextField = useThisTransition
+	} else {
+		nextField = newFieldMatcher()
+	}
+	table.addByteStep(val[len(val)-1], newSmallTransition(nextField))
+	return
+}
diff --git a/lib/small_table.go b/lib/small_table.go
@@ -1,5 +1,7 @@
 package quamina
 
+import "fmt"
+
 // smallTable serves as a lookup table that encodes mappings between ranges of byte values and the SmallStep
 //  transition on any byte in the range.
 //  The way it works is exposed in the step() function just below.  Logically, it's a slice of {byte, *smallStep}
@@ -9,12 +11,13 @@ package quamina
 //     steps: nil, &ss1, nil,  &ss2, nil
 //  invariant: The last element of ceilings is always ByteCeiling
 // The motivation is that we want to build a state machine on byte values to implement things like prefixes and
-//  ranges of bytes.  This could be done simply with a byte array of size Utf8ByteCeiling for each state in the machine,
+//  ranges of bytes.  This could be done simply with a byte array of size ByteCeiling for each state in the machine,
 //  or a map[byte]smallStep, but both would be size-inefficient, particularly in the case where you're implementing
 //  ranges.  Now, the step function is O(N) in the number of entries, but empirically, the number of entries is
 //  small even in large machines, so skipping through the ceilings list is measurably about the same speed as a map
 //  or array construct
 type smallTable struct {
+	name   string
 	slices *stSlices
 }
 
@@ -25,14 +28,15 @@ type stSlices struct {
 	steps    []smallStep
 }
 
-// Utf8ByteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go byte, which is uint8. The values
-//  0xF5-0xFF can't appear in UTF-8 strings, so anything can safely be assumed to be less than this value
-const Utf8ByteCeiling int = 0xf5
+// ByteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go byte, which is uint8. The values
+//  0xF5-0xFF can't appear in UTF-8 strings, so we use 0xF5 as a value terminator; bytes 0xF6 and higher
+//  can never appear.
+const ByteCeiling int = 0xf6
 
 func newSmallTable() *smallTable {
 	return &smallTable{
 		slices: &stSlices{
-			ceilings: []byte{byte(Utf8ByteCeiling)},
+			ceilings: []byte{byte(ByteCeiling)},
 			steps:    []smallStep{nil},
 		},
 	}
@@ -45,6 +49,9 @@ func (t *smallTable) SmallTable() *smallTable {
 // SmallTransition always returns nil for a smallTable.
 func (t *smallTable) SmallTransition() *smallTransition {
 	return nil
 }
+// HasTransition always reports false for a smallTable.
+func (t *smallTable) HasTransition() bool {
+	return false
+}
 
 func (t *smallTable) step(utf8Byte byte) smallStep {
 	for index, ceiling := range t.slices.ceilings {
@@ -55,11 +62,57 @@ func (t *smallTable) step(utf8Byte byte) smallStep {
 	panic("Malformed SmallTable")
 }
 
+// mergeAutomata computes the union of two valueMatch automata
+//  invariant: neither argument is nil
+//  memoize maps a key built from the formatted (existing, newStep) pair to the step already created for that
+//  pair; callers pass the same map through the whole recursion.
+//  Returns a newly created step whose table combines, byte value by byte value, the steps of both arguments.
+//  TODO: Make sure it's thread-safe, as in doesn't write into existing tables from either new or existing
+func mergeAutomata(existing, newStep smallStep, memoize map[string]smallStep) smallStep {
+	var combined smallStep
+	// if this pair has been seen before, return the step that was created for it
+	mKey := fmt.Sprintf("%v%v", existing, newStep)
+	combined, ok := memoize[mKey]
+	if ok {
+		return combined
+	}
+
+	// when both steps have a transition, the one from the existing step wins;
+	// when only one of them has a transition, that one is used
+	// switch is easier than if/else
+	switch {
+	case (!(existing.HasTransition() || newStep.HasTransition())):
+		combined = newSmallTable()
+	case existing.HasTransition() && newStep.HasTransition():
+		combined = newSmallTransition(existing.SmallTransition().fieldTransition)
+	case existing.HasTransition() && (!newStep.HasTransition()):
+		combined = newSmallTransition(existing.SmallTransition().fieldTransition)
+	case (!existing.HasTransition()) && newStep.HasTransition():
+		combined = newSmallTransition(newStep.SmallTransition().fieldTransition)
+	}
+	// record the result before recursing below, so a recursive call that reaches this same pair again
+	// gets this step back from memoize instead of recursing further
+	memoize[mKey] = combined
+	combined.SmallTable().name = fmt.Sprintf("(%s/%s)", existing.SmallTable().name, newStep.SmallTable().name)
+
+	uExisting := unpack(existing.SmallTable())
+	uNew := unpack(newStep.SmallTable())
+	var uComb unpackedTable
+	for i, stepExisting := range uExisting {
+		stepNew := uNew[i]
+		switch {
+		case stepExisting == nil && stepNew == nil:
+			uComb[i] = nil
+		case stepExisting != nil && stepNew == nil:
+			// only one side has a step for this byte value: reuse that step as-is (it is not copied)
+			uComb[i] = stepExisting
+		case stepExisting == nil && stepNew != nil:
+			uComb[i] = stepNew
+		case stepExisting != nil && stepNew != nil:
+			// both sides have a step for this byte value: merge them recursively
+			uComb[i] = mergeAutomata(stepExisting, stepNew, memoize)
+		}
+	}
+	combined.SmallTable().pack(&uComb)
+	return combined
+}
+
 // unpackedTable replicates the data in the smallTable ceilings and steps arrays.  It's quite hard to
 //  update the list structure in a smallTable, but trivial in an unpackedTable.  The idea is that to update
 //  a smallTable you unpack it, update, then re-pack it.  Not gonna be the most efficient thing so at some future point…
 // TODO: Figure out how to update a smallTable in place
-type unpackedTable [Utf8ByteCeiling]smallStep
+type unpackedTable [ByteCeiling]smallStep
 
 func unpack(t *smallTable) *unpackedTable {
 	var u unpackedTable
@@ -84,11 +137,26 @@ func (t *smallTable) pack(u *unpackedTable) {
 		}
 		lastStep = ss
 	}
-	slices.ceilings = append(slices.ceilings, byte(Utf8ByteCeiling))
+	slices.ceilings = append(slices.ceilings, byte(ByteCeiling))
 	slices.steps = append(slices.steps, lastStep)
 	t.slices = &slices // atomic update
 }
 
+// addByteStep sets step as the table's entry for the single byte value utf8Byte, by unpacking the
+//  table, updating that one slot and re-packing.
+func (t *smallTable) addByteStep(utf8Byte byte, step smallStep) {
+	u := unpack(t)
+	u[utf8Byte] = step
+	t.pack(u)
+}
+
+// addRangeSteps sets step as the table's entry for every byte value b with floor <= b < ceiling,
+//  by unpacking the table, updating those slots and re-packing.
+func (t *smallTable) addRangeSteps(floor int, ceiling int, step smallStep) {
+	u := unpack(t)
+	for b := floor; b < ceiling; b++ {
+		u[b] = step
+	}
+	t.pack(u)
+}
+
+/*
 func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
 	// TODO update fuzz test to include this
 	unpacked := unpack(t)
@@ -97,3 +165,4 @@ func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
 	}
 	t.pack(unpacked)
 }
+*/
diff --git a/lib/small_table_test.go b/lib/small_table_test.go
diff --git a/lib/value_matcher.go b/lib/value_matcher.go
diff --git a/lib/value_matcher_test.go b/lib/value_matcher_test.go

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ const (`
`17`	`17`	`literalType`
`18`	`18`	`existsTrueType`
`19`	`19`	`existsFalseType`
	`20`	`+ shellStyleType`
`20`	`21`	`)`
`21`	`22`
`22`	`23`	`type typedVal struct {`
`@@ -184,6 +185,8 @@ func readSpecialPattern(pb *patternBuild, valsIn []typedVal) (pathVals []typedVa`
`184`	`185`	`case "exists":`
`185`	`186`	`containsExclusive = tt`
`186`	`187`	`pathVals, err = readExistsSpecial(pb, pathVals)`
	`188`	`+ case "shellstyle":`
	`189`	`+ pathVals, err = readShellStyleSpecial(pb, pathVals)`
`187`	`190`	`default:`
`188`	`191`	`err = errors.New("unrecognized in special pattern: " + tt)`
`189`	`192`	`}`
`@@ -207,7 +210,7 @@ func readExistsSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal`
`207`	`210`	`pathVals = append(pathVals, typedVal{vType: existsFalseType})`
`208`	`211`	`}`
`209`	`212`	`default:`
`210`		`- err = errors.New("value for 'existsMatches' pattern must be true or false")`
	`213`	`+ err = errors.New("value for 'exists' pattern must be true or false")`
`211`	`214`	`return`
`212`	`215`	`}`
`213`	`216`