timbray
diff --git a/‎README.md‎
Lines changed: 21 additions & 4 deletions b/‎README.md‎
Lines changed: 21 additions & 4 deletions
diff --git a/‎go.mod‎
Lines changed: 1 addition & 1 deletion b/‎go.mod‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/benchmarks_test.go‎
Lines changed: 20 additions & 29 deletions b/‎lib/benchmarks_test.go‎
Lines changed: 20 additions & 29 deletions
diff --git a/‎lib/concurrency_test.go‎
Lines changed: 16 additions & 7 deletions b/‎lib/concurrency_test.go‎
Lines changed: 16 additions & 7 deletions
diff --git a/‎lib/fj.go‎
Lines changed: 1 addition & 0 deletions b/‎lib/fj.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/list_maker.go‎
Lines changed: 59 additions & 0 deletions b/‎lib/list_maker.go‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎lib/list_maker_test.go‎
Lines changed: 52 additions & 0 deletions b/‎lib/list_maker_test.go‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎lib/matcher.go‎
Lines changed: 1 addition & 1 deletion b/‎lib/matcher.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/numbers.go‎
Lines changed: 3 additions & 3 deletions b/‎lib/numbers.go‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎lib/numbers_test.go‎
Lines changed: 0 additions & 1 deletion b/‎lib/numbers_test.go‎
Lines changed: 0 additions & 1 deletion
@@ -70,14 +70,22 @@ The following patterns would match it:
     "Thumbnail": { "Url": [ { "shellstyle": "*9943" } ] }
   }
 }
+```
+        
+```json
 {
   "Image": {
-    "Thumbnail": { "Url": [ { "shellstyle": "http://www.example.com/*" } ] }
+    "Thumbnail": { "Url": 
+      [ { "shellstyle": "http://www.example.com/*" } ] }
   }
 }
+```
+
+```json
 {
   "Image": {
-    "Thumbnail": { "Url": [ { "shellstyle": "http://www.example.com/*9943" } ] }
+    "Thumbnail": { "Url": 
+      [ { "shellstyle": "http://www.example.com/*9943" } ] }
   }
 }
 
@@ -95,11 +103,20 @@ be assumed to match, but all Fields mentioned must match. So the
 semantics are effectively an OR on each field's values, 
 but an AND on the field names.
 
+Note that the `shellstyle` patterns can include only
+one `*` character. The architecture probably allows
+support for a larger subset of regular expressions,
+but currently the testing for just the single-`*`
+patterns is a bit lacking.
+
 Number matching is weak - the number has to appear 
 exactly the same in the pattern and the event. I.e.,
 Quamina doesn't know that 35, 35.000, and 3.5e1 are the
 same number.
 
+There's a fix for this in the code which is commented
+out because it causes a significant performance penalty.
+
 ## APIs
 
 **Note**: In all the APIs below, field names and values in both
@@ -163,8 +180,8 @@ been added to the matcher, but remains sublinear in that
 number. 
 
 A word of explanation is in order. Quamina compiles the
-patterns into a somewhat-decorated DFA and uses that to
-find matches in events; that DFA-based matching process is 
+patterns into a somewhat-decorated automaton and uses 
+that to find matches in events; the matching process is 
 O(1) in the number of patterns.
 
 However, for this to work, the incoming event must be
 
@@ -1,3 +1,3 @@
 module quamina
 
-go 1.17
+go 1.18
@@ -125,10 +125,10 @@ func TestCityLots(t *testing.T) {
 		message1 := fmt.Sprintf("Events-per-second benchmark ran at %.0f events per second, below threshold of %.0f.",
 			perSecond, thresholdPerformance)
 		message2 := `
-It may be that re-running the benchmark test will address this, or it may be that you're running on a machine
-that is slower than the one the software was developed on, in which case you might want to readjust the 
-"thresholdPerformance" constant. However, it may be that you made a change that reduced the throughput of the 
-library, which would be unacceptable.`
+		It may be that re-running the benchmark test will address this, or it may be that you're running on a machine
+		that is slower than the one the software was developed on, in which case you might want to readjust the
+		"thresholdPerformance" constant. However, it may be that you made a change that reduced the throughput of the
+		library, which would be unacceptable.`
 		t.Errorf(message1 + message2)
 	}
 
@@ -178,17 +178,7 @@ func TestMySoftwareHatesMe(t *testing.T) {
 		t.Error("no match for EEE")
 	}
 }
- */
-
-
-func containsX(list []X, x X) bool {
-	for _, in := range list {
-		if in == x {
-			return true
-		}
-	}
-	return false
-}
+*/
 
 // exercise shellstyle matching a little, is much faster than TestCityLots because it's only working with one field
 func TestBigShellStyle(t *testing.T) {
@@ -218,7 +208,7 @@ func TestBigShellStyle(t *testing.T) {
 		`{"properties": {"STREET":[ {"shellstyle": "N*P*"} ] } }`:    927,
 		`{"properties": {"STREET":[ {"shellstyle": "*E*E*E*"} ] } }`: 1212,
 	}
-	 */
+	*/
 
 	for letter := range wanted {
 		pat := fmt.Sprintf(`{"properties": {"STREET":[ {"shellstyle": "%s*"} ] } }`, letter)
@@ -229,13 +219,13 @@ func TestBigShellStyle(t *testing.T) {
 	}
 
 	/*
-	for funk := range funky {
-		err := m.AddPattern(funk, funk.(string))
-		if err != nil {
-			t.Errorf("err on %s: %s", funk, err.Error())
+		for funk := range funky {
+			err := m.AddPattern(funk, funk.(string))
+			if err != nil {
+				t.Errorf("err on %s: %s", funk, err.Error())
+			}
 		}
-	}
-	 */
+	*/
 	fmt.Println(matcherStats(m))
 
 	lineCount := 0
@@ -271,20 +261,21 @@ func TestBigShellStyle(t *testing.T) {
 		}
 	}
 	/*
-	for k, wc := range funky {
-		if lCounts[k] != wc {
-			t.Errorf("for %s wanted %d got %d", k, wc, lCounts[k])
+		for k, wc := range funky {
+			if lCounts[k] != wc {
+				t.Errorf("for %s wanted %d got %d", k, wc, lCounts[k])
+			}
 		}
-	}
-	
-	 */
+
+	*/
 }
 
 // TestPatternAddition adds a whole lot of string-only rules as fast as possible.  The profiler says that the
 //  performance is totally dominated by the garbage-collector thrashing, in particular it has to allocate
 //  ~220K smallTables.  Tried https://blog.twitch.tv/en/2019/04/10/go-memory-ballast-how-i-learnt-to-stop-worrying-and-love-the-heap/
 //  but it doesn't seem to help.
 //  TODO: Find a way to allocate fewer tables.
+//  TODO: Add shellstyle patterns
 func TestPatternAddition(t *testing.T) {
 	w := worder{0, readWWords(t)}
 
@@ -312,7 +303,7 @@ func TestPatternAddition(t *testing.T) {
 		}
 	}
 	runtime.ReadMemStats(&msAfter)
-	delta := 1.0 / 1000000.0 * float64(msAfter.Alloc - msBefore.Alloc)
+	delta := 1.0 / 1000000.0 * float64(msAfter.Alloc-msBefore.Alloc)
 	fmt.Printf("before %d, after %d, delta %f\n", msBefore.Alloc, msAfter.Alloc, delta)
 	fmt.Println("stats:" + matcherStats(m))
 	elapsed := float64(time.Now().Sub(before).Milliseconds())
 
@@ -18,6 +18,10 @@ func updateTree(m *Matcher, use37 bool, t *testing.T, ch chan string) {
 	} else {
 		val = fmt.Sprintf(`"%d"`, rand.Int())
 		pattern = fmt.Sprintf(`{ "properties": { "STREET": [ %s ] } }`, val)
+		/* TODO: alternate literal and shellstyle addition
+		val = fmt.Sprintf(`"*%d"`, rand.Int())
+		pattern = fmt.Sprintf(`{ "properties": { "STREET": [ {"shellstyle": %s } ] } }`, val)
+		*/
 	}
 	err := m.AddPattern(val, pattern)
 	if err != nil {
@@ -45,21 +49,24 @@ func TestConcurrency(t *testing.T) {
 
 	patterns := []string{
 		`{ "properties": { "STREET": [ "CRANLEIGH" ] } }`,
+		`{ "properties": { "STREET": [ { "shellstyle": "B*K"} ] } }`,
 		`{ "properties": { "STREET": [ "17TH" ], "ODD_EVEN": [ "E"] } }`,
 		`{ "geometry": { "coordinates": [ 37.807807921694092 ] } }`,
 		`{ "properties": { "MAPBLKLOT": ["0011008"], "BLKLOT": ["0011008"]},  "geometry": { "coordinates": [ 37.807807921694092 ] } } `,
 	}
 	names := []string{
 		"CRANLEIGH",
+		"shellstyle",
 		"17TH Even",
 		"Geometry",
 		"0011008",
 	}
 	wanted := map[X]int{
-		"CRANLEIGH": 7,
-		"17TH Even": 836,
-		"Geometry":  2,
-		"0011008":   1,
+		"CRANLEIGH":  7,
+		"shellstyle": 746,
+		"17TH Even":  836,
+		"Geometry":   2,
+		"0011008":    1,
 	}
 
 	scanner := bufio.NewScanner(file)
@@ -85,6 +92,7 @@ func TestConcurrency(t *testing.T) {
 	lineCount = 0
 	before := time.Now()
 	ch := make(chan string, 1000)
+	sent := 0
 	for _, line := range lines {
 		matches, err := m.MatchesForJSONEvent(line)
 		if err != nil {
@@ -93,6 +101,7 @@ func TestConcurrency(t *testing.T) {
 		lineCount++
 		if lineCount%UpdateLines == 0 {
 			use37 = !use37
+			sent++
 			go updateTree(m, use37, t, ch)
 		}
 		for _, match := range matches {
@@ -103,11 +112,10 @@ func TestConcurrency(t *testing.T) {
 			results[match] = count + 1
 		}
 	}
-	fmt.Println()
 
 	elapsed := float64(time.Now().Sub(before).Milliseconds())
 	perSecond := float64(lineCount) / (elapsed / 1000.0)
-	fmt.Printf("%.2f matches/second with updates\n\n", perSecond)
+	fmt.Printf("\n%.2f matches/second with updates\n\n", perSecond)
 
 	err = scanner.Err()
 	if err != nil {
@@ -125,7 +133,8 @@ func TestConcurrency(t *testing.T) {
 
 	// now we go back and make sure that all those AddPattern calls actually made it into the Matcher
 	close(ch)
-	for val := range ch {
+	for i := 0; i < sent; i++ {
+		val := <-ch
 		var event string
 		if val[0] == '"' {
 			event = fmt.Sprintf(`{"properties": { "STREET": %s} }`, val)
 
@@ -16,6 +16,7 @@ import (
 //  the fields & values can be represented as []byte slices using a couple of offsets into the underlying event.
 //  There is an exception, namely strings that contain \-prefixed JSON escapes; since we want to work with the
 //  actual UTF-8 bytes, this requires re-writing such strings into memory we have to allocate.
+// TODO: There are gaps in the unit-test coverage, including nearly all the error conditions
 type FJ struct {
 	event       []byte      // event being processed, treated as immutable
 	eventIndex  int         // current byte index into the event
 
@@ -0,0 +1,59 @@
+package quamina
+
+// listMaker needs to exist so that all the lists containing a single step to X, or the triple step to X,Y,Z, are
+//  the same list, so that pack/unpack work properly.
+// It hands back an already-built *nfaStepList whenever it is asked again for the same steps.
+type listMaker struct {
+	singletons map[*nfaStep]*nfaStepList // the list already built for each step that was asked for on its own
+	plurals    []*nfaStepList            // every other list built so far; getList searches these one by one
+}
+
+// newListMaker returns a listMaker that is ready to use: its singleton map is
+//  allocated and it holds no plural lists yet.
+func newListMaker() *listMaker {
+	lm := new(listMaker)
+	lm.singletons = map[*nfaStep]*nfaStepList{}
+	return lm
+}
+
+// getSingleton returns the list whose only member is step.  The list is built and
+//  remembered the first time a step is seen; after that the same pointer comes back.
+func (l *listMaker) getSingleton(step *nfaStep) *nfaStepList {
+	if cached, found := l.singletons[step]; found {
+		return cached
+	}
+	single := &nfaStepList{steps: []*nfaStep{step}}
+	l.singletons[step] = single
+	return single
+}
+
+// getList returns the shared list for the given steps, so that asking again for the same
+//  steps yields the same *nfaStepList.  A single step is handed to getSingleton; any other
+//  group is compared against the plurals with listsAreEqual and, if none matches, a new
+//  list is created, remembered and returned.
+func (l *listMaker) getList(steps ...*nfaStep) *nfaStepList {
+	if len(steps) == 1 {
+		return l.getSingleton(steps[0])
+	}
+
+	for _, already := range l.plurals {
+		if listsAreEqual(already.steps, steps) {
+			return already
+		}
+	}
+
+	// a caller that writes getList(s...) passes s itself rather than a copy, so keep a private
+	//  copy; otherwise a later change to s would silently alter a list that is meant to be shared.
+	//  (appending to a nil slice keeps an empty argument list nil, as it was before)
+	own := append([]*nfaStep(nil), steps...)
+	list := &nfaStepList{steps: own}
+	l.plurals = append(l.plurals, list)
+	return list
+}
+
+// listsAreEqual reports whether l1 and l2 have the same length and the same members,
+//  without regard to the order the members appear in.
+func listsAreEqual(l1, l2 []*nfaStep) bool {
+	if len(l1) != len(l2) {
+		return false
+	}
+	for _, step := range l1 {
+		if !listMakerContains(l2, step) {
+			return false
+		}
+	}
+	// check the other direction too: when l1 repeats a step, everything in l1 can be found
+	//  in l2 while l2 still holds a step that l1 lacks, e.g. [a,a] against [a,b]
+	for _, step := range l2 {
+		if !listMakerContains(l1, step) {
+			return false
+		}
+	}
+	return true
+}
+
+// listMakerContains reports whether step appears in list; steps are compared by pointer.
+func listMakerContains(list []*nfaStep, step *nfaStep) bool {
+	for i := 0; i < len(list); i++ {
+		if list[i] == step {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,52 @@
+package quamina
+
+import (
+	"testing"
+)
+
+// TestListMaker checks that a listMaker returns the identical *nfaStepList each time it is
+//  asked for the same steps, first through getSingleton and then through getList.
+func TestListMaker(t *testing.T) {
+	steps := []*nfaStep{{}, {}, {}}
+	s0, s1, s2 := steps[0], steps[1], steps[2]
+	multi := [][]*nfaStep{
+		{s0},
+		{s0, s1},
+		{s0, s1, s2},
+		{s0, s2},
+		{s1},
+		{s1, s2},
+		{s2},
+	}
+
+	// singletons: collecting the returned pointers in a set counts the distinct lists handed out
+	lm := newListMaker()
+	seen := make(map[*nfaStepList]bool)
+	for i := range steps {
+		seen[lm.getSingleton(steps[i])] = true
+	}
+	if len(seen) != 3 {
+		t.Error("length should be 3")
+	}
+	// asking a second time must not produce any new lists
+	for i := range steps {
+		seen[lm.getSingleton(steps[i])] = true
+	}
+	if len(seen) != 3 {
+		t.Error("length STILL should be 3")
+	}
+
+	// plurals: the same two passes on a fresh listMaker, expecting one distinct list per entry in multi
+	lm = newListMaker()
+	seen = make(map[*nfaStepList]bool)
+	wanted := len(multi)
+	for _, group := range multi {
+		seen[lm.getList(group...)] = true
+	}
+	if got := len(seen); got != wanted {
+		t.Errorf("Got %d wanted %d", got, wanted)
+	}
+	for _, group := range multi {
+		seen[lm.getList(group...)] = true
+	}
+	if got := len(seen); got != wanted {
+		t.Errorf("Got %d STILL wanted %d", got, wanted)
+	}
+}
@@ -55,7 +55,7 @@ func (m *Matcher) AddPattern(x X, patternJSON string) error {
 	// The matcher contains several map[this]that maps but Go maps aren't thread-safe.  This could be solved
 	//  with a straightforward mutex or the fancy sync.Map, but I succumbed to premature optimization and decided
 	//  I didn't want any of that stuff in the Match* path.  So in each case the map (or map-like structure in
-	//  smallTable) is copied, the copy updated, then the whole map updated atomically in the containing structure
+	//  smallDfaTable) is copied, the copy updated, then the whole map updated atomically in the containing structure
 	//  see: https://medium.com/@deckarep/the-new-kid-in-town-gos-sync-map-de24a6bf7c2c
 	m.lock.Lock()
 	defer m.lock.Unlock()
 
@@ -6,8 +6,9 @@ import (
 	"strconv"
 )
 
+// TODO: Make this more efficient and improve unit-test coverage
 const (
-	nineDigits = 1000000000.0
+	nineDigits        = 1000000000.0
 	digitsOfPrecision = 18
 )
 
@@ -24,6 +25,5 @@ func canonicalize(s []byte) (string, error) {
 	if f >= nineDigits || f <= -nineDigits {
 		return "", errors.New(fmt.Sprintf("number is outside of range [%f, %f]", -nineDigits, nineDigits))
 	}
-	return fmt.Sprintf("%019.0f", (f + nineDigits) * nineDigits), nil
+	return fmt.Sprintf("%019.0f", (f+nineDigits)*nineDigits), nil
 }
-
@@ -56,4 +56,3 @@ func TestOrdering(t *testing.T) {
 		}
 	}
 }
-
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`module quamina`
`2`	`2`
`3`		`-go 1.17`
	`3`	`+go 1.18`
Original file line number	Diff line number	Diff line change
`@@ -6,8 +6,9 @@ import (`
`6`	`6`	`"strconv"`
`7`	`7`	`)`
`8`	`8`
	`9`	`+// TODO: Make this more efficient and improve unit-test coverage`
`9`	`10`	`const (`
`10`		`- nineDigits = 1000000000.0`
	`11`	`+ nineDigits = 1000000000.0`
`11`	`12`	`digitsOfPrecision = 18`
`12`	`13`	`)`
`13`	`14`
`@@ -24,6 +25,5 @@ func canonicalize(s []byte) (string, error) {`
`24`	`25`	`if f >= nineDigits \|\| f <= -nineDigits {`
`25`	`26`	`return "", errors.New(fmt.Sprintf("number is outside of range [%f, %f]", -nineDigits, nineDigits))`
`26`	`27`	`}`
`27`		`- return fmt.Sprintf("%019.0f", (f + nineDigits) * nineDigits), nil`
	`28`	`+ return fmt.Sprintf("%019.0f", (f+nineDigits)*nineDigits), nil`
`28`	`29`	`}`
`29`		`-`
Original file line number	Diff line number	Diff line change
`@@ -56,4 +56,3 @@ func TestOrdering(t *testing.T) {`
`56`	`56`	`}`
`57`	`57`	`}`
`58`	`58`	`}`
`59`		`-`