11package quamina
22
3+ import "fmt"
4+
35// smallTable serves as a lookup table that encodes mappings between ranges of byte values and the SmallStep
46// transition on any byte in the range.
57// The way it works is exposed in the step() function just below. Logically, it's a slice of {byte, *smallStep}
@@ -9,12 +11,13 @@ package quamina
911// steps: nil, &ss1, nil, &ss2, nil
1012// invariant: The last element of ceilings is always ByteCeiling
1113// The motivation is that we want to build a state machine on byte values to implement things like prefixes and
12- // ranges of bytes. This could be done simply with a byte array of size Utf8ByteCeiling for each state in the machine,
14+ // ranges of bytes. This could be done simply with a byte array of size ByteCeiling for each state in the machine,
1315// or a map[byte]smallStep, but both would be size-inefficient, particularly in the case where you're implementing
1416// ranges. Now, the step function is O(N) in the number of entries, but empirically, the number of entries is
1517// small even in large machines, so skipping through the ceilings list is measurably about the same speed as a map
1618// or array construct
1719type smallTable struct {
20+ name string
1821 slices * stSlices
1922}
2023
@@ -25,14 +28,15 @@ type stSlices struct {
2528 steps []smallStep
2629}
2730
// ByteCeiling is the exclusive upper bound on byte values the automaton handles.
// The automaton runs on UTF-8 bytes, which map nicely to Go byte (uint8). The
// values 0xF5-0xFF can't appear in UTF-8 strings; 0xF5 is used as a value
// terminator, so the ceiling sits one above it and bytes 0xF6 and higher
// can't appear.
const ByteCeiling int = 0xf5 + 1
3135
3236func newSmallTable () * smallTable {
3337 return & smallTable {
3438 slices : & stSlices {
35- ceilings : []byte {byte (Utf8ByteCeiling )},
39+ ceilings : []byte {byte (ByteCeiling )},
3640 steps : []smallStep {nil },
3741 },
3842 }
@@ -45,6 +49,9 @@ func (t *smallTable) SmallTable() *smallTable {
4549func (t * smallTable ) SmallTransition () * smallTransition {
4650 return nil
4751}
52+ func (t * smallTable ) HasTransition () bool {
53+ return false
54+ }
4855
4956func (t * smallTable ) step (utf8Byte byte ) smallStep {
5057 for index , ceiling := range t .slices .ceilings {
@@ -55,11 +62,57 @@ func (t *smallTable) step(utf8Byte byte) smallStep {
5562 panic ("Malformed SmallTable" )
5663}
5764
65+ // mergeAutomata computes the union of two valueMatch automata
66+ // invariant: neither argument is nil
67+ // TODO: Make sure it's thread-safe, as in doesn't write into existing tables from either new or existing
68+ func mergeAutomata (existing , newStep smallStep , memoize map [string ]smallStep ) smallStep {
69+ var combined smallStep
70+ mKey := fmt .Sprintf ("%v%v" , existing , newStep )
71+ combined , ok := memoize [mKey ]
72+ if ok {
73+ return combined
74+ }
75+
76+ // we always take the transition from the existing step
77+ // switch is easier than if/else
78+ switch {
79+ case (! (existing .HasTransition () || newStep .HasTransition ())):
80+ combined = newSmallTable ()
81+ case existing .HasTransition () && newStep .HasTransition ():
82+ combined = newSmallTransition (existing .SmallTransition ().fieldTransition )
83+ case existing .HasTransition () && (! newStep .HasTransition ()):
84+ combined = newSmallTransition (existing .SmallTransition ().fieldTransition )
85+ case (! existing .HasTransition ()) && newStep .HasTransition ():
86+ combined = newSmallTransition (newStep .SmallTransition ().fieldTransition )
87+ }
88+ memoize [mKey ] = combined
89+ combined .SmallTable ().name = fmt .Sprintf ("(%s/%s)" , existing .SmallTable ().name , newStep .SmallTable ().name )
90+
91+ uExisting := unpack (existing .SmallTable ())
92+ uNew := unpack (newStep .SmallTable ())
93+ var uComb unpackedTable
94+ for i , stepExisting := range uExisting {
95+ stepNew := uNew [i ]
96+ switch {
97+ case stepExisting == nil && stepNew == nil :
98+ uComb [i ] = nil
99+ case stepExisting != nil && stepNew == nil :
100+ uComb [i ] = stepExisting
101+ case stepExisting == nil && stepNew != nil :
102+ uComb [i ] = stepNew
103+ case stepExisting != nil && stepNew != nil :
104+ uComb [i ] = mergeAutomata (stepExisting , stepNew , memoize )
105+ }
106+ }
107+ combined .SmallTable ().pack (& uComb )
108+ return combined
109+ }
110+
58111// unpackedTable replicates the data in the smallTable ceilings and steps arrays. It's quite hard to
59112// update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update
60113// a smallTable you unpack it, update, then re-pack it. Not gonna be the most efficient thing so at some future point…
61114// TODO: Figure out how to update a smallTable in place
62- type unpackedTable [Utf8ByteCeiling ]smallStep
115+ type unpackedTable [ByteCeiling ]smallStep
63116
64117func unpack (t * smallTable ) * unpackedTable {
65118 var u unpackedTable
@@ -84,11 +137,26 @@ func (t *smallTable) pack(u *unpackedTable) {
84137 }
85138 lastStep = ss
86139 }
87- slices .ceilings = append (slices .ceilings , byte (Utf8ByteCeiling ))
140+ slices .ceilings = append (slices .ceilings , byte (ByteCeiling ))
88141 slices .steps = append (slices .steps , lastStep )
89142 t .slices = & slices // atomic update
90143}
91144
145+ func (t * smallTable ) addByteStep (utf8Byte byte , step smallStep ) {
146+ unpacked := unpack (t )
147+ unpacked [utf8Byte ] = step
148+ t .pack (unpacked )
149+ }
150+
151+ func (t * smallTable ) addRangeSteps (floor int , ceiling int , step smallStep ) {
152+ unpacked := unpack (t )
153+ for i := floor ; i < ceiling ; i ++ {
154+ unpacked [i ] = step
155+ }
156+ t .pack (unpacked )
157+ }
158+
159+ /*
92160func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
93161 // TODO update fuzz test to include this
94162 unpacked := unpack(t)
@@ -97,3 +165,4 @@ func (t *smallTable) addRange(utf8Bytes []byte, step smallStep) {
97165 }
98166 t.pack(unpacked)
99167}
168+ */
0 commit comments