Skip to content

Commit cd8d31a

Browse files
authored
numbits+base128 8-byte full-precision numbers (#349)
* numbits+base128 8-byte full-precision numbers Signed-off-by: Tim Bray <[email protected]> * fix up comments and README Signed-off-by: Tim Bray <[email protected]> * make UTF-8 version of numbits variable-length Signed-off-by: Tim Bray <[email protected]> * Address feedback from Arne, rewrite varwidth numbits Signed-off-by: Tim Bray <[email protected]> * fix lint, use latest numbits.go Signed-off-by: Tim Bray <[email protected]> --------- Signed-off-by: Tim Bray <[email protected]>
1 parent be1752d commit cd8d31a

File tree

12 files changed

+756
-703
lines changed

12 files changed

+756
-703
lines changed

PATTERNS.md

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,16 +58,8 @@ Thus, the following Pattern would match both JSON events above:
5858

5959
### Numeric Values
6060

61-
It would be convenient if Quamina knew, for matching purposes, that 35,
62-
35.00, and 3.5e1 were all the same number.
63-
64-
In many cases, Quamina can manage this. Specifically, for numbers that:
65-
66-
* are between -5.0e9 and 5.0e9 inclusive.
67-
* have five or fewer fractional digits.
68-
69-
Numbers which do not meet these criteria will be treated as strings, which
70-
usually produces good results.
61+
Quamina can match numeric values with precision and range exactly the same as that provided by
62+
Go's `float64` data type, which is said to conform to IEEE 754 `binary64`.
7163

7264
## Extended Patterns
7365
An **Extended Pattern** **MUST** be a JSON object containing

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,6 @@ The `"exists":true` and `"exists":false` patterns
150150
have corner cases; details are covered in
151151
[Patterns in Quamina](PATTERNS.md).
152152

153-
Quamina can match numeric values correctly, subject to
154-
certain limits; details are in [Patterns in Quamina](PATTERNS.md).
155-
156153
## Flattening and Matching
157154

158155
The first step in finding matches for an Event is
@@ -386,3 +383,5 @@ colonies before slavery was abolished.
386383
@embano1: CI/CD and project structure.
387384

388385
@yosiat: Flattening optimization.
386+
387+
@arnehormann: compact high-precision number representation.

case_folding.go

Lines changed: 471 additions & 471 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

core_matcher.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,13 @@ func (m *coreMatcher) deletePatterns(_ X) error {
149149
// This is a leftover from previous times, is only used by tests, but it's used by a *lot*
150150
// and it's a convenient API for testing.
151151
func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
152-
fields, _ := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
152+
return m.matchesForJSONWithFlattener(event, newJSONFlattener())
153+
}
154+
155+
// if your test is a benchmark, call newJSONFlattener and pass it to this routine, matchesForJSONWithFlattener
156+
// because newJSONFlattener() is fairly heavyweight and you want it out of the benchmark loop
157+
func (m *coreMatcher) matchesForJSONWithFlattener(event []byte, f Flattener) ([]X, error) {
158+
fields, _ := f.Flatten(event, m.getSegmentsTreeTracker())
153159
return m.matchesForFields(fields)
154160
}
155161

flatten_json.go

Lines changed: 27 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
214214
}
215215

216216
var val []byte
217-
isQNumber := false
217+
isNumber := false
218218
switch ch {
219219
case '"':
220220
if fj.skipping > 0 || !memberIsUsed {
@@ -233,7 +233,10 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
233233
val, err = fj.readLiteral(nullBytes)
234234
isLeaf = true
235235
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
236-
val, isQNumber, err = fj.readNumber()
236+
val, err = fj.readNumber()
237+
if err == nil {
238+
isNumber = true
239+
}
237240
isLeaf = true
238241
case '[':
239242
if !pathNode.IsSegmentUsed(memberName) {
@@ -296,7 +299,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
296299
}
297300
if val != nil {
298301
if memberIsUsed {
299-
fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isQNumber)
302+
fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isNumber)
300303
fieldsCount--
301304
}
302305
}
@@ -340,7 +343,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
340343
for {
341344
ch := fj.ch()
342345
var val []byte // resets on each loop
343-
isQNumber := false
346+
isNumber := false
344347
switch state {
345348
case fjInArrayState:
346349
// bypass space before element value. A bit klunky but allows for immense simplification
@@ -365,7 +368,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
365368
val, err = fj.readLiteral(nullBytes)
366369
isLeaf = true
367370
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
368-
val, isQNumber, err = fj.readNumber()
371+
val, err = fj.readNumber()
372+
if err == nil {
373+
isNumber = true
374+
}
369375
isLeaf = true
370376
case '{':
371377
if fj.skipping == 0 {
@@ -398,7 +404,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
398404
if val != nil {
399405
if fj.skipping == 0 {
400406
fj.stepOneArrayElement()
401-
fj.storeArrayElementField(pathName, val, isQNumber)
407+
fj.storeArrayElementField(pathName, val, isNumber)
402408
}
403409
}
404410
state = fjAfterValueState
@@ -427,13 +433,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
427433
* these higher-level funcs are going to advance the pointer after each invocation
428434
*/
429435

430-
func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
436+
func (fj *flattenJSON) readNumber() ([]byte, error) {
431437
// points at the first character in the number
432438
numStart := fj.eventIndex
433439
state := fjNumberStartState
434-
isQNumber := false
435-
fracStart := 0
436-
expStart := 0
437440
for {
438441
ch := fj.ch()
439442
switch state {
@@ -450,38 +453,33 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
450453
// no-op
451454
case '.':
452455
state = fjNumberFracState
453-
fracStart = fj.eventIndex + 1
454456
case 'e', 'E':
455457
state = fjNumberAfterEState
456-
expStart = fj.eventIndex + 1
457458
case ',', ']', '}', ' ', '\t', '\n', '\r':
458459
fj.eventIndex--
459-
return fj.event[numStart : fj.eventIndex+1], true, nil
460+
return fj.event[numStart : fj.eventIndex+1], nil
460461
default:
461-
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
462+
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
462463
}
463464
case fjNumberFracState:
464465
switch ch {
465466
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
466467
// no-op
467468
case ',', ']', '}', ' ', '\t', '\n', '\r':
468-
fractionalDigits := (expStart - 1) - fracStart
469-
isQNumber = fractionalDigits <= MaxFractionalDigits
470469
fj.eventIndex--
471470
bytes := fj.event[numStart : fj.eventIndex+1]
472-
return bytes, isQNumber, nil
471+
return bytes, nil
473472
case 'e', 'E':
474473
state = fjNumberAfterEState
475-
expStart = fj.eventIndex + 1
476474
default:
477-
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
475+
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
478476
}
479477
case fjNumberAfterEState:
480478
switch ch {
481479
case '-', '1', '2', '3', '4', '5', '6', '7', '8', '9':
482480
// no-op
483481
default:
484-
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
482+
return nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
485483
}
486484
state = fjNumberExpState
487485

@@ -490,27 +488,14 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
490488
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
491489
// no-op
492490
case ',', ']', '}', ' ', '\t', '\n', '\r':
493-
fractionalDigits := 0
494-
if fracStart != 0 {
495-
fractionalDigits = (expStart - 1) - fracStart
496-
if fractionalDigits > MaxFractionalDigits {
497-
if expStart != 0 {
498-
exp, err := strconv.ParseInt(string(fj.event[expStart:fj.eventIndex]), 10, 32)
499-
if err == nil {
500-
fractionalDigits -= int(exp)
501-
}
502-
}
503-
}
504-
}
505-
isQNumber = fractionalDigits <= MaxFractionalDigits
506491
fj.eventIndex--
507-
return fj.event[numStart : fj.eventIndex+1], isQNumber, nil
492+
return fj.event[numStart : fj.eventIndex+1], nil
508493
default:
509-
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
494+
return nil, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
510495
}
511496
}
512497
if fj.step() != nil {
513-
return nil, false, fj.error("event truncated in number")
498+
return nil, fj.error("event truncated in number")
514499
}
515500
}
516501
}
@@ -811,6 +796,8 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
811796
if hexDigitCount == 4 {
812797
hexString := string(fj.event[from-3 : from+1])
813798
r, _ := strconv.ParseUint(hexString, 16, 16)
799+
// parsing 4 hex digits can't overflow a uint16
800+
//nolint:gosec
814801
codepoints = append(codepoints, uint16(r))
815802
state = fjStartEscapeState
816803
}
@@ -831,14 +818,14 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
831818
// its own snapshot of the array-trail data, because it'll be different for each array element
832819
// NOTE: The profiler says this is the most expensive function in the whole matchesForJSONEvent universe, presumably
833820
// because of the necessity to construct a new arrayTrail for each element.
834-
func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isQNumber bool) {
835-
f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsQNumber: isQNumber}
821+
func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isNumber bool) {
822+
f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsNumber: isNumber}
836823
copy(f.ArrayTrail, fj.arrayTrail)
837824
fj.fields = append(fj.fields, f)
838825
}
839826

840-
func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isQNumber bool) {
841-
fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsQNumber: isQNumber})
827+
func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isNumber bool) {
828+
fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsNumber: isNumber})
842829
}
843830

844831
func (fj *flattenJSON) enterArray() {

flattener.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,5 @@ type Field struct {
5858
Path []byte
5959
Val []byte
6060
ArrayTrail []ArrayPos
61-
IsQNumber bool
61+
IsNumber bool
6262
}

numbers.go

Lines changed: 23 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,121 +1,54 @@
11
package quamina
22

33
import (
4-
"encoding/binary"
54
"errors"
5+
"fmt"
66
"strconv"
77
)
88

99
// You can't easily build automata to compare numbers based on either the decimal notation found
10-
// in text data or the internal floating-point bits. Therefore, for a restricted subset of numbers,
11-
// we define a 7-byte (14 hex digit) representation that facilitates building automata to support
12-
// equality and ordering comparison.
13-
//
14-
// The representation supports 10**15 numbers. The first three are:
15-
// decimal: -5_000_000_000, -4_999_999_999.99999, -4_999_999_999.99998, ...
16-
// 14-byte: 00000000000000, 00000000000009, 00000000000014
17-
// and the last three are
18-
// decimal: .., 4_999_999_999.99998, 4_999_999_999.99999, 5_000_000_000
19-
// 14-byte: 2386F26FC0FFEC, 2386F26FC0FFF6, 2386F26FC10000
20-
//
21-
// In English: all numbers that are between negative and positive 5 billion inclusive, with up to five
22-
// digits after the decimal point.
23-
// These numbers have fifteen decimal digits of precision, which is what double floats can offer.
24-
// They include most numbers that are used in practice, including prices, occurrence counts, size
25-
// measurements, and so on.
26-
// Examples of numbers that do NOT meet these criteria include AWS account numbers, some telephone
27-
// numbers, and cryptographic keys/signatures. For these, treatment as strings seems to produce
28-
// satisfactory results for equality testing.
10+
// in text data or the internal floating-point bits. Therefore, we map floating-point numbers
11+
// (which is what JSON numbers basically are) to comparable slices of 7-bit bytes which preserve the
12+
// numbers' ordering. Versions of Quamina up to 1.3 used a home-grown format which used 14 hex digits
13+
// to represent a subset of numbers. This has now been replaced by Arne Hormann's "numbits"
14+
// construct, see numbits.go. It uses up to 10 base128 bytes to represent the entire range of float64 numbers.
15+
// Both this file and numbits.go are very short, but I'm keeping them separated because someone might
16+
// figure out a still-better serialization of numbers and then this part wouldn't have to change.
2917
// In Quamina these are called "Q numbers".
30-
// How It's Done
18+
3119
// There is considerable effort to track, at the NFA level, which NFAs are built to match field values
32-
// that are Q numbers; see vmFields.hasQNumbers. Similarly, the JSONFlattener, since it has to
20+
// that are Q numbers; see vmFields.hasNumbers. Similarly, the JSONFlattener, since it has to
3321
// look at all the digits in a number in order to parse it, can keep track of whether it can be made
3422
// a Q number. The key benefit of this is in valueMatcher.transitionOn, which incurs the cost of
3523
// making a Q number only if it is known that the valueMatcher's NFA can benefit from it and
3624
// that the number in the incoming event can in fact be made a Q number.
3725

38-
const (
39-
TenE6 = 1e6
40-
FiveBillion = 5e9
41-
Hexes = "0123456789ABCDEF"
42-
MaxFractionalDigits = 5
43-
)
44-
4526
type qNumber []byte
4627

4728
// qNumFromBytes works out whether a string representing a number falls within the
4829
// limits imposed for Q numbers. It is heavily optimized and relies on the form
4930
// of the number already having been validated, e.g. by flattenJSON().
5031
func qNumFromBytes(bytes []byte) (qNumber, error) {
51-
// shortcut: The shortest number with more than 5 fractional digits is like 0.123456
52-
if len(bytes) < 8 {
53-
numeric, err := strconv.ParseFloat(string(bytes), 64)
54-
if err != nil {
55-
return nil, errors.New("not a float") // should never happen, json parser upstream
56-
}
57-
return qNumFromFloat(numeric)
58-
}
59-
// compute number of fractional digits. The loop below relies on the fact that anything between '.' and either
60-
// 'e' or the end of the string must be a digit, as must anything between 'e' and the end of the string.
61-
//. NOTE: This will be fooled by "35.000000"
62-
fracStart := 0
63-
expStart := 0
64-
index := 0
65-
var utf8Byte byte
66-
fractionalDigits := 0
67-
ForEachByte:
68-
for index, utf8Byte = range bytes {
69-
switch utf8Byte {
70-
case '.':
71-
fracStart = index + 1
72-
case 'e', 'E':
73-
expStart = index + 1
74-
break ForEachByte
75-
}
76-
}
77-
if fracStart != 0 {
78-
fractionalDigits = index - fracStart
79-
}
80-
// if too many fractional digits, perhaps the exponent will push the '.' to the right
81-
if fractionalDigits > MaxFractionalDigits {
82-
if expStart != 0 {
83-
exp, err := strconv.ParseInt(string(bytes[expStart:]), 10, 32)
84-
if err == nil {
85-
fractionalDigits -= int(exp)
86-
}
87-
}
88-
}
89-
if fractionalDigits > MaxFractionalDigits {
90-
return nil, errors.New("more than 5 fractional digits")
91-
}
92-
9332
numeric, err := strconv.ParseFloat(string(bytes), 64)
9433
if err != nil {
95-
return nil, errors.New("not a float") // shouldn't happen, upstream parser should prevent
34+
return nil, errors.New("not a float") // should never happen, json parser upstream
9635
}
97-
return qNumFromFloat(numeric)
36+
return qNumFromFloat(numeric), nil
9837
}
9938

100-
func qNumFromFloat(f float64) (qNumber, error) {
101-
if f < -FiveBillion || f > FiveBillion {
102-
return nil, errors.New("value must be between -5e9 and +5e9 inclusive")
103-
}
104-
value := uint64(TenE6 * (FiveBillion + f))
105-
return toHexStringSkippingFirstByte(value), nil
39+
// qNumFromFloat is here mostly to support testing
40+
func qNumFromFloat(f float64) qNumber {
41+
return numbitsFromFloat64(f).toQNumber()
10642
}
10743

108-
func toHexStringSkippingFirstByte(value uint64) []byte {
109-
var buf [8]byte
110-
binary.BigEndian.PutUint64(buf[:], value)
111-
var outputChars [14]byte
112-
for i, utf8Byte := range buf {
113-
if i == 0 {
114-
continue
44+
// for debugging
45+
func (q qNumber) String() string {
46+
ret := ""
47+
for i, b := range q {
48+
if i != 0 {
49+
ret += "-"
11550
}
116-
pos := (i - 1) * 2
117-
outputChars[pos] = Hexes[utf8Byte>>4]
118-
outputChars[pos+1] = Hexes[buf[i]&0xf]
51+
ret += fmt.Sprintf("%02x", b)
11952
}
120-
return outputChars[:]
53+
return ret
12154
}

0 commit comments

Comments
 (0)