fix *ay* bug, add pattern-addition benchmark

timbray · timbray · commit 8fbabe17956e · 2022-03-04T08:17:39.000-08:00
diff --git a/README.md b/README.md
@@ -64,7 +64,15 @@ The following patterns would match it:
 ```json
 {"Foo": [ { "exists": false } ] }
 ```
-
+```json
+{ 
+  "Image": {
+    "Thumbnail": {
+      "Url": [ { "shellstyle": "*.example.com/*" } ]
+    }
+  }
+}
+```
 The structure of the pattern, in terms of field names
 and nesting, must be the same as the structure of the event 
 to be matched.  The field values are always given
@@ -142,7 +150,7 @@ The performance of `MatchesForJSONEvent` is strongly
 sublinear in the number of patterns. It’s not quite `O(1)`
 as it varies somewhat as a function of the number of 
 unique fields that appear in all the patterns that have 
-been added to the machine, but remains sublinear in that 
+been added to the matcher, but remains sublinear in that 
 number. 
 
 A word of explanation is in order. Quamina compiles the
diff --git a/lib/benchmarks_test.go b/lib/benchmarks_test.go
@@ -253,7 +253,7 @@ func TestBigShellStyle(t *testing.T) {
 	}
 	elapsed := float64(time.Now().Sub(before).Milliseconds())
 	perSecond := float64(lineCount) / (elapsed / 1000.0)
-	fmt.Printf("%.2f matches/second with letter rules\n\n", perSecond)
+	fmt.Printf("%.2f matches/second with letter patterns\n\n", perSecond)
 
 	for k, wc := range wanted {
 		if lCounts[k] != wc {
@@ -266,3 +266,64 @@ func TestBigShellStyle(t *testing.T) {
 		}
 	}
 }
+
+func TestPatternAddition(t *testing.T) {
+	w := worder{0, readWWords(t)}
+
+	// now we're going to add 200 patterns of 100 values each, i.e. 20K values in total.
+	m := NewMatcher()
+	before := time.Now()
+	fieldCount := 0
+	for x1 := 0; x1 < 10; x1++ {
+		for x2 := 0; x2 < 20; x2++ {
+			pat := fmt.Sprintf(`{"%s": { "%s": [ "%s"`, w.next(), w.next(), w.next())
+			for x3 := 0; x3 < 99; x3++ {
+				pat = pat + fmt.Sprintf(`, "%s"`, w.next())
+			}
+			fieldCount += 100
+			pat = pat + `] } }`
+			pName := string(w.next()) + string(w.next())
+			err := m.AddPattern(pName, pat)
+			if err != nil {
+				t.Error("addPattern " + err.Error())
+			}
+		}
+	}
+	fmt.Println("stats:" + matcherStats(m))
+	elapsed := float64(time.Now().Sub(before).Milliseconds())
+	perSecond := float64(fieldCount) / (elapsed / 1000.0)
+	fmt.Printf("%.2f fields/second\n\n", perSecond)
+}
+
+type worder struct {
+	index int
+	lines [][]byte
+}
+
+func (w *worder) next() []byte {
+	w.index += 761 // relatively prime with the number of lines
+	w.index = w.index % len(w.lines)
+	return w.lines[w.index]
+}
+
+func readWWords(t *testing.T) [][]byte {
+	// that's a list from the Wordle source code with a few erased to get a prime number
+	file, err := os.Open("../test_data/wwords.txt")
+	if err != nil {
+		t.Error("Can't open file: " + err.Error())
+	}
+	defer func(file *os.File) {
+		_ = file.Close()
+	}(file)
+	scanner := bufio.NewScanner(file)
+	buf := make([]byte, oneMeg)
+	scanner.Buffer(buf, oneMeg)
+
+	lineCount := 0
+	var lines [][]byte
+	for scanner.Scan() {
+		lineCount++
+		lines = append(lines, []byte(scanner.Text()))
+	}
+	return lines
+}
diff --git a/lib/concurrency_test.go b/lib/concurrency_test.go
@@ -29,7 +29,7 @@ func updateTree(m *Matcher, use37 bool, t *testing.T, ch chan string) {
 func TestConcurrency(t *testing.T) {
 	const UpdateLines = 250
 
-	// this is a cut/paste of TestCityLots, except for every few lines we add another rule to the matcher,
+	// this is a cut/paste of TestCityLots, except for every few lines we add another pattern to the matcher,
 	//  focusing on the fields that are being used by the patterns. The idea is to exercise concurrent
 	//  update and use of the automaton
 	// I was initially surprised that adding 860 or so changes to the automaton while it's running doesn't seem to
diff --git a/lib/shell_style.go b/lib/shell_style.go
@@ -45,49 +45,54 @@ func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []type
 	return
 }
 
-// makeShellStyleAutomaton - recognize a "-delimited string containing one or more '*' globs.
-// TODO: Add “?”
-func makeShellStyleAutomaton(val []byte, useThisTansition *fieldMatcher) (start *smallTable, nextField *fieldMatcher) {
+// makeShellStyleAutomaton - recognize a "-delimited string containing one or more '*' globs.
+//  This isn't quite as simple as you'd think.  Consider matching "*abc". When you see an 'a' you move to a state
+//  where you're looking for 'b'. So if it's not a 'b' you go back to the '*' state. But suppose you see "xaabc";
+//  when you're in that looking-for-'b' state and you see that second 'a', you don't go back to the '*' state, you
+//  have to stay in the looking-for-'b' state because you have seen the 'a'.  Similarly, when you see 'xabac', when
+//  you're looking for 'c' and you see the 'a', once again, you have to go to the looking-for-'b' state.  Let's
+//  call the 'a' the bounceBackByte and the looking-for-'b' state the bounceBackStep.
+func makeShellStyleAutomaton(val []byte, useThisTransition *fieldMatcher) (start *smallTable, nextField *fieldMatcher) {
 	table := newSmallTable()
 	start = table
-	if useThisTansition != nil {
-		nextField = useThisTansition
+	if useThisTransition != nil {
+		nextField = useThisTransition
 	} else {
 		nextField = newFieldMatcher()
 	}
 
-	// since this is provided as a string, the last byte will be '"'. In the special case where the pattern ends
-	//  with '*' (and thus the string ends with '*"', we will insert a successful transition as soon as we hit
-	//  that last '*', so that the reaching the transition doesn't require going through the trailing characters to
-	//  reach the '"'
-	if val[len(val) - 2] == '*' {
-		for i := 0; i < len(val) - 2; i++ {
-			ch := val[i]
-			if ch == '*' {
-				table.addRangeSteps(0, ByteCeiling, table)
-			} else {
-				next := newSmallTable()
-				table.addByteStep(ch, next)
-				table = next
-			}
-		}
-		table.addRangeSteps(0, ByteCeiling, newSmallTransition(nextField))
-		return
-	}
+	var bounceBackByte byte
+	var bounceBackStep smallStep = nil
+	var globStep smallStep = nil
 
-	// loop through all but last byte
-	for i := 0; i < len(val)-1; i++ {
+	// loop through all but last byte
+	i := 0
+	for i < len(val)-1 {
 		ch := val[i]
 		if ch == '*' {
-			// just loop back
-			table.addRangeSteps(0, ByteCeiling, table)
+			// special-case handling for string ending in '*"'
+			if i == len(val)-2 {
+				lastStep := newSmallTransition(nextField)
+				table.addRangeSteps(0, ByteCeiling, lastStep)
+				return
+			}
+			globStep = table
+			i++
+			bounceBackStep = newSmallTable()
+			bounceBackByte = val[i]
+			table.load(table, []byte{val[i]}, []smallStep{bounceBackStep})
+			table = bounceBackStep.SmallTable()
 		} else {
 			next := newSmallTable()
+			if globStep != nil {
+				table.load(globStep, []byte{bounceBackByte}, []smallStep{bounceBackStep})
+			}
 			table.addByteStep(ch, next)
 			table = next
 		}
+		i++
 	}
-
 	table.addByteStep(val[len(val)-1], newSmallTransition(nextField))
+
 	return
 }
diff --git a/lib/small_table.go b/lib/small_table.go
@@ -61,7 +61,11 @@ func (t *smallTable) step(utf8Byte byte) smallStep {
 	panic("Malformed SmallTable")
 }
 
-// mergeAutomata computes the union of two valueMatch automata
+// mergeAutomata computes the union of two valueMatch automata.  If you look up the textbook theory about this,
+//  they say to compute the set product for automata A and B and build A0B0, A0B1 … A1BN, A1B0 … but if you look
+//  at that you realize that many of the product states aren't reachable. So if you compute A0B0 and then keep
+//  recursing on the transitions coming out of it, I'm pretty sure you get a correct result. I don't know if it's
+//  minimal or even avoids being wasteful.
 //  INVARIANT: neither argument is nil
 //  INVARIANT: To be thread-safe, no existing table can be updated
 func mergeAutomata(existing, newStep smallStep) *smallTable {
@@ -81,7 +85,8 @@ func mergeOne(existing, newStep smallStep, memoize map[string]smallStep) smallSt
 		return combined
 	}
 
-	// we always take the transition from the existing step, even if there's another in the merged-in step
+	// TODO: this works, all the tests pass, but I'm not satisfied with it. My intuition is that you ought
+	//  to be able to come out of this with just one *fieldMatcher, perhaps with a merged matches list.
 	switch {
 	case !(existing.HasTransition() || newStep.HasTransition()):
 		combined = newSmallTable()
@@ -115,6 +120,18 @@ func mergeOne(existing, newStep smallStep, memoize map[string]smallStep) smallSt
 	return combined
 }
 
+// load fills the smallTable with a default step and one or more per-byte steps, trying to be efficient about it
+func (t *smallTable) load(defaultStep smallStep, positions []byte, steps []smallStep) {
+	var u unpackedTable
+	for i := range u {
+		u[i] = defaultStep
+	}
+	for i, position := range positions {
+		u[position] = steps[i]
+	}
+	t.pack(&u)
+}
+
 // unpackedTable replicates the data in the smallTable ceilings and steps arrays.  It's quite hard to
 //  update the list structure in a smallTable, but trivial in an unpackedTable.  The idea is that to update
 //  a smallTable you unpack it, update, then re-pack it.  Not gonna be the most efficient thing so at some future point…
diff --git a/lib/stats.go b/lib/stats.go
@@ -3,13 +3,13 @@ package quamina
 import "fmt"
 
 type stats struct {
-	fmCount int
+	fmCount   int
 	fmVisited map[*fieldMatcher]bool
-	vmCount int
+	vmCount   int
 	vmVisited map[*valueMatcher]bool
-	stCount int
+	stCount   int
 	stVisited map[*smallTable]bool
-	siCount int
+	siCount   int
 }
 
 func matcherStats(m *Matcher) string {
diff --git a/lib/value_matcher.go b/lib/value_matcher.go
@@ -91,9 +91,8 @@ func (m *valueMatcher) transitionOn(val []byte) []*fieldMatcher {
 
 	// step through the smallTables, byte by byte
 	table := m.startTable
-	var step smallStep
 	for _, utf8Byte := range val {
-		step = table.step(utf8Byte)
+		step := table.step(utf8Byte)
 		if step == nil {
 			return transitions
 		}
diff --git a/lib/value_matcher_test.go b/lib/value_matcher_test.go
@@ -98,6 +98,27 @@ func TestMultiTransitions(t *testing.T) {
 	}
 }
 
+func TestAY(t *testing.T) {
+	m := NewMatcher()
+	pat := `{"x": [ { "shellstyle": "*ay*"} ] }`
+	err := m.AddPattern("AY", pat)
+	if err != nil {
+		t.Error("AY: " + err.Error())
+	}
+	shouldMatch := []string{"ay", "aay", "aaaayyyyy", "xyzay", "ayxxxx"}
+	e := `{"x": "X"}`
+	for _, sm := range shouldMatch {
+		p := strings.ReplaceAll(e, "X", sm)
+		matches, err := m.MatchesForJSONEvent([]byte(p))
+		if err != nil {
+			t.Error("bad JSON: " + err.Error())
+		}
+		if len(matches) != 1 || matches[0] != "AY" {
+			t.Errorf("%s didn't match", sm)
+		}
+	}
+}
+
 func TestOverlappingValues(t *testing.T) {
 	m := NewMatcher()
 	p1 := `{"a": ["foo"]}`