Skip to content

Commit 61eabea

Browse files
committed
Introduce "Segments Tree Tracker"
Currently the flattener, and more specifically "NameTracker", is not hierarchy aware. To illustrate, consider this case: Pattern: `{ "context": { "user_id": [1] } }` Event: `{ "context": { "user_id": 1 }, "payload": { /* lots of fields, large payload */ } }` In this case the flattener reads context and then user_id; all the other values (strings, objects and arrays) are skipped, but they still get traversed. By replacing the "NameTracker" with "SegmentsTreeTracker", which is aware of the hierarchy, we can: * Know that once we read "user_id" we can stop consuming the event and exit * Cache the paths - so `pathForChild` is not needed and we reduce allocations. ``` name old time/op new time/op delta CityLots-10 4.42µs ± 0% 4.28µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattener_ContextFields-10 9.06µs ± 0% 0.21µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattener_MiddleNestedField-10 10.9µs ± 0% 1.8µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattener_LastField-10 9.65µs ± 0% 9.68µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_ContextFields-10 9.42µs ± 0% 0.50µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_MiddleNestedField-10 11.0µs ± 0% 2.0µs ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_LastField-10 9.81µs ± 0% 9.87µs ± 0% ~ (p=1.000 n=1+1) name old alloc/op new alloc/op delta CityLots-10 985B ± 0% 832B ± 0% ~ (p=1.000 n=1+1) _JsonFlattener_ContextFields-10 48.0B ± 0% 0.0B ~ (p=1.000 n=1+1) _JsonFlattener_MiddleNestedField-10 64.0B ± 0% 0.0B ~ (p=1.000 n=1+1) _JsonFlattener_LastField-10 32.0B ± 0% 0.0B ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_ContextFields-10 232B ± 0% 184B ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_MiddleNestedField-10 240B ± 0% 48B ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_LastField-10 80.0B ± 0% 48.0B ± 0% ~ (p=1.000 n=1+1) name old allocs/op new allocs/op delta CityLots-10 38.0 ± 0% 31.0 ± 0% ~ (p=1.000 n=1+1) _JsonFlattener_ContextFields-10 3.00 ± 0% 0.00 ~ (p=1.000 n=1+1) _JsonFlattener_MiddleNestedField-10 4.00 ± 0% 0.00 ~ (p=1.000 n=1+1) 
_JsonFlattener_LastField-10 2.00 ± 0% 0.00 ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_ContextFields-10 9.00 ± 0% 6.00 ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_MiddleNestedField-10 9.00 ± 0% 3.00 ± 0% ~ (p=1.000 n=1+1) _JsonFlattner_Evaluate_LastField-10 5.00 ± 0% 3.00 ± 0% ~ (p=1.000 n=1+1) ``` Signed-off-by: Yosi Attias <[email protected]>
1 parent b04e6d5 commit 61eabea

23 files changed

+511
-202
lines changed

anything_but_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func TestParseAnythingButPattern(t *testing.T) {
129129
}
130130

131131
for i, good := range goods {
132-
fields, _, err := patternFromJSON([]byte(good))
132+
fields, err := patternFromJSON([]byte(good))
133133
if err != nil {
134134
t.Errorf("parse anything-but i=%d: "+err.Error(), i)
135135
}
@@ -139,7 +139,7 @@ func TestParseAnythingButPattern(t *testing.T) {
139139
}
140140

141141
for _, bad := range bads {
142-
_, _, err := patternFromJSON([]byte(bad))
142+
_, err := patternFromJSON([]byte(bad))
143143
if err == nil {
144144
t.Errorf(`accepted anything-but "%s"`, bad)
145145
}

arrays_test.go

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -70,30 +70,29 @@ const bands = `{
7070
}`
7171

7272
func TestArrayCorrectness(t *testing.T) {
73-
// only pattern3 should match
74-
pattern1 := `{"bands": { "members": { "given": [ "Mick" ], "surname": [ "Strummer" ] } } }`
75-
pattern2 := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "drums" ] } } }`
76-
pattern3 := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "guitar" ] } } }`
73+
// only wataGuiterPattern should match
74+
mickStrummerPattern := `{"bands": { "members": { "given": [ "Mick" ], "surname": [ "Strummer" ] } } }`
75+
wataDrumsPattern := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "drums" ] } } }`
76+
wataGuiterPattern := `{"bands": { "members": { "given": [ "Wata" ], "role": [ "guitar" ] } } }`
77+
7778
m := newCoreMatcher()
78-
err := m.addPattern("Mick strummer", pattern1)
79-
if err != nil {
80-
t.Error(err.Error())
79+
if err := m.addPattern("Mick strummer", mickStrummerPattern); err != nil {
80+
t.Errorf("Failed adding pattern: %s: %s", mickStrummerPattern, err)
8181
}
82-
err = m.addPattern("Wata drums", pattern2)
83-
if err != nil {
84-
t.Error(err.Error())
82+
83+
if err := m.addPattern("Wata drums", wataDrumsPattern); err != nil {
84+
t.Errorf("Failed adding pattern: %s: %s", wataDrumsPattern, err)
8585
}
86-
err = m.addPattern("Wata guitar", pattern3)
87-
if err != nil {
88-
t.Error(err.Error())
86+
if err := m.addPattern("Wata guitar", wataGuiterPattern); err != nil {
87+
t.Errorf("Failed adding pattern: %s: %s", wataGuiterPattern, err)
8988
}
9089

9190
matches, err := m.matchesForJSONEvent([]byte(bands))
9291
if err != nil {
93-
t.Error(err.Error())
92+
t.Errorf("Failed 'matchesForJSONEvent': %s", err)
9493
}
9594

9695
if len(matches) != 1 || matches[0].(string) != "Wata guitar" {
97-
t.Error("Matches across array boundaries")
96+
t.Errorf("Expected to get a single of 'Wata guiter', but got %d matches: %+v", len(matches), matches)
9897
}
9998
}

benchmarks_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ func TestBigShellStyle(t *testing.T) {
160160
before := time.Now()
161161
fj := newJSONFlattener()
162162
for _, line := range lines {
163-
fields, err := fj.Flatten(line, m)
163+
fields, err := fj.Flatten(line, m.getSegmentsTreeTracker())
164164
if err != nil {
165165
t.Error("Flatten: " + err.Error())
166166
}

core_matcher.go

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,12 @@ type coreMatcher struct {
3131

3232
// coreFields groups the updateable fields in coreMatcher.
3333
// state is the start of the automaton.
34-
// namesUsed is a map of field names that are used in any of the patterns that this automaton encodes. Typically,
34+
// segmentsTree is a tree of segments that are used in any of the patterns that this automaton encodes. Typically,
3535
// patterns only consider a subset of the fields in an incoming data object, and there is no reason to consider
3636
// fields that do not appear in patterns when using the automaton for matching.
37-
// fakeField is used when the flattener for an event returns no fields, because it could still match if
38-
// there were patterns with "exists":false. So in this case we run one fake field through the matcher
39-
// which will cause it to notice that any "exists":false patterns should match.
4037
type coreFields struct {
41-
state *fieldMatcher
42-
namesUsed map[string]bool
38+
state *fieldMatcher
39+
segmentsTree *segmentsTree
4340
}
4441

4542
func newCoreMatcher() *coreMatcher {
@@ -49,8 +46,8 @@ func newCoreMatcher() *coreMatcher {
4946
// user-supplied path-name because it's not valid in UTF-8
5047
m := coreMatcher{}
5148
m.updateable.Store(&coreFields{
52-
state: newFieldMatcher(),
53-
namesUsed: make(map[string]bool),
49+
state: newFieldMatcher(),
50+
segmentsTree: newSegmentsIndex(),
5451
})
5552
return &m
5653
}
@@ -62,7 +59,7 @@ func (m *coreMatcher) start() *coreFields {
6259
// addPattern - the patternBytes is a JSON object. The X is what the matcher returns to indicate that the
6360
// provided pattern has been matched. In many applications it might be a string which is the pattern's name.
6461
func (m *coreMatcher) addPattern(x X, patternJSON string) error {
65-
patternFields, patternNamesUsed, err := patternFromJSON([]byte(patternJSON))
62+
patternFields, err := patternFromJSON([]byte(patternJSON))
6663
if err != nil {
6764
return err
6865
}
@@ -75,15 +72,13 @@ func (m *coreMatcher) addPattern(x X, patternJSON string) error {
7572

7673
// we build up the new coreMatcher state in freshStart so we can atomically switch it in once complete
7774
freshStart := &coreFields{}
78-
freshStart.namesUsed = make(map[string]bool)
7975
current := m.start()
76+
freshStart.segmentsTree = current.segmentsTree.copy()
8077
freshStart.state = current.state
8178

82-
for k := range current.namesUsed {
83-
freshStart.namesUsed[k] = true
84-
}
85-
for used := range patternNamesUsed {
86-
freshStart.namesUsed[used] = true
79+
// Add paths to the segments tree index.
80+
for _, field := range patternFields {
81+
freshStart.segmentsTree.add(field.path)
8782
}
8883

8984
// now we add each of the name/value pairs in fields slice to the automaton, starting with the start state -
@@ -132,7 +127,7 @@ func (m *coreMatcher) deletePatterns(_ X) error {
132127
// This is a leftover from previous times, is only used by tests, but it's used by a *lot*
133128
// so removing it would require a lot of tedious work
134129
func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
135-
fields, err := newJSONFlattener().Flatten(event, m)
130+
fields, err := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
136131
if err != nil {
137132
return nil, err
138133
}
@@ -251,7 +246,6 @@ func noArrayTrailConflict(from []ArrayPos, to []ArrayPos) bool {
251246
return true
252247
}
253248

254-
func (m *coreMatcher) IsNameUsed(label []byte) bool {
255-
_, ok := m.start().namesUsed[string(label)]
256-
return ok
249+
func (m *coreMatcher) getSegmentsTreeTracker() SegmentsTreeTracker {
250+
return m.start().segmentsTree
257251
}

core_matcher_test.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,15 +248,6 @@ func TestSimpleaddPattern(t *testing.T) {
248248
if err != nil {
249249
t.Error(err.Error())
250250
}
251-
if len(m.start().namesUsed) != 2 {
252-
t.Errorf("nameUsed = %d", len(m.start().namesUsed))
253-
}
254-
if !m.IsNameUsed([]byte("a")) {
255-
t.Error("'a' not showing as used")
256-
}
257-
if !m.IsNameUsed([]byte("b")) {
258-
t.Error("'b' not showing as used")
259-
}
260251
s0 := m.start().state
261252
if len(s0.fields().transitions) != 1 {
262253
t.Errorf("s0 trans len %d", len(s0.fields().transitions))

escaping_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ func TestReadMemberName(t *testing.T) {
88
j := `{"😀💋😺": 1, "x\u0078\ud83d\udc8by": "2"}`
99
m := fakeMatcher("😀💋😺", `xx💋y`)
1010
f := newJSONFlattener()
11-
fields, err := f.Flatten([]byte(j), m)
11+
fields, err := f.Flatten([]byte(j), m.getSegmentsTreeTracker())
1212
if err != nil {
1313
t.Error("TRMN: " + err.Error())
1414
}
@@ -26,7 +26,7 @@ func TestReadMemberName(t *testing.T) {
2626
func TestStringValuesWithEscapes(t *testing.T) {
2727
j := `{"a": "x\u0078\ud83d\udc8by", "b": "\ud83d\ude00\ud83d\udc8b\ud83d\ude3a"}`
2828
f := newJSONFlattener()
29-
fields, err := f.Flatten([]byte(j), fakeMatcher("a", "b"))
29+
fields, err := f.Flatten([]byte(j), fakeMatcher("a", "b").getSegmentsTreeTracker())
3030
if err != nil {
3131
t.Error("TSVWE: " + err.Error())
3232
}

external_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ type fakeFlattener struct {
1010
r []quamina.Field
1111
}
1212

13-
func (f *fakeFlattener) Flatten(_ []byte, _ quamina.NameTracker) ([]quamina.Field, error) {
13+
func (f *fakeFlattener) Flatten(_ []byte, _ quamina.SegmentsTreeTracker) ([]quamina.Field, error) {
1414
return f.r, nil
1515
}
1616

0 commit comments

Comments
 (0)