Skip to content

Commit 2b1fe99

Browse files
randall77 authored and tomocy committed
bytes/hash: add hashing package for bytes and strings
Fixes golang#28322 R=go1.14 RELNOTE=yes Change-Id: Ic29f8b587c8c77472260836a5c3e13edaded13fa Reviewed-on: https://go-review.googlesource.com/c/go/+/186877 Reviewed-by: Alan Donovan <[email protected]>
1 parent 9e5db40 commit 2b1fe99

File tree

4 files changed

+764
-50
lines changed

4 files changed

+764
-50
lines changed

src/bytes/hash/hash.go

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
// Copyright 2019 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Package hash provides hash functions on byte sequences. These
6+
// hash functions are intended to be used to implement hash tables or
7+
// other data structures that need to map arbitrary strings or byte
8+
// sequences to a uniform distribution of integers. The hash functions
9+
// are collision-resistant but are not cryptographically secure (use
10+
// one of the hash functions in crypto/* if you need that).
11+
//
12+
// The produced hashes depend only on the sequence of bytes provided
13+
// to the Hash object, not on the way in which they are provided. For
14+
// example, the calls
15+
// h.AddString("foo")
16+
// h.AddBytes([]byte{'f','o','o'})
17+
// h.AddByte('f'); h.AddByte('o'); h.AddByte('o')
18+
// will all have the same effect.
19+
//
20+
// Two Hash instances in the same process using the same seed
21+
// behave identically.
22+
//
23+
// Two Hash instances with the same seed in different processes are
24+
// not guaranteed to behave identically, even if the processes share
25+
// the same binary.
26+
//
27+
// Hashes are intended to be collision-resistant, even for situations
28+
// where an adversary controls the byte sequences being hashed.
29+
// All bits of the Hash result are close to uniformly and
30+
// independently distributed, so can be safely restricted to a range
31+
// using bit masking, shifting, or modular arithmetic.
32+
package hash
33+
34+
import (
35+
"unsafe"
36+
)
37+
38+
// A Seed controls the behavior of a Hash. Within one process, two Hash
// objects given the same seed behave identically; two Hash objects given
// different seeds will very likely behave differently.
type Seed struct {
	s uint64 // raw seed bits
}
44+
45+
// A Hash object is used to compute the hash of a byte sequence.
46+
type Hash struct {
47+
seed Seed // initial seed used for this hash
48+
state Seed // current hash of all flushed bytes
49+
buf [64]byte // unflushed byte buffer
50+
n int // number of unflushed bytes
51+
}
52+
53+
// AddByte adds b to the sequence of bytes hashed by h.
54+
func (h *Hash) AddByte(b byte) {
55+
if h.n == len(h.buf) {
56+
h.flush()
57+
}
58+
h.buf[h.n] = b
59+
h.n++
60+
}
61+
62+
// AddBytes adds b to the sequence of bytes hashed by h.
63+
func (h *Hash) AddBytes(b []byte) {
64+
for h.n+len(b) > len(h.buf) {
65+
k := copy(h.buf[h.n:], b)
66+
h.n = len(h.buf)
67+
b = b[k:]
68+
h.flush()
69+
}
70+
h.n += copy(h.buf[h.n:], b)
71+
}
72+
73+
// AddString adds the bytes of s to the sequence of bytes hashed by h.
74+
func (h *Hash) AddString(s string) {
75+
for h.n+len(s) > len(h.buf) {
76+
k := copy(h.buf[h.n:], s)
77+
h.n = len(h.buf)
78+
s = s[k:]
79+
h.flush()
80+
}
81+
h.n += copy(h.buf[h.n:], s)
82+
}
83+
84+
// Seed returns the seed value specified in the most recent call to
85+
// SetSeed, or the initial seed if SetSeed was never called.
86+
func (h *Hash) Seed() Seed {
87+
return h.seed
88+
}
89+
90+
// SetSeed sets the seed used by h. Two Hash objects with the same
91+
// seed in the same process will behave identically. Two Hash objects
92+
// with different seeds will very likely behave differently. Any
93+
// bytes added to h previous to this call will be discarded.
94+
func (h *Hash) SetSeed(seed Seed) {
95+
h.seed = seed
96+
h.state = seed
97+
h.n = 0
98+
}
99+
100+
// Reset discards all bytes added to h.
101+
// (The seed remains the same.)
102+
func (h *Hash) Reset() {
103+
h.state = h.seed
104+
h.n = 0
105+
}
106+
107+
// precondition: buffer is full.
108+
func (h *Hash) flush() {
109+
if h.n != len(h.buf) {
110+
panic("flush of partially full buffer")
111+
}
112+
h.state.s = rthash(h.buf[:], h.state.s)
113+
h.n = 0
114+
}
115+
116+
// Hash returns a value which depends on h's seed and the sequence of
117+
// bytes added to h (since the last call to Reset or SetSeed).
118+
func (h *Hash) Hash() uint64 {
119+
return rthash(h.buf[:h.n], h.state.s)
120+
}
121+
122+
// MakeSeed returns a Seed initialized using the bits in s.
123+
// Two seeds generated with the same s are guaranteed to be equal.
124+
// Two seeds generated with different s are very likely to be different.
125+
// TODO: disallow this? See Alan's comment in the issue.
126+
func MakeSeed(s uint64) Seed {
127+
return Seed{s: s}
128+
}
129+
130+
// New returns a new Hash object. Different hash objects allocated by
131+
// this function will very likely have different seeds.
132+
func New() *Hash {
133+
seed := Seed{s: uint64(runtime_fastrand())}
134+
return &Hash{
135+
seed: seed,
136+
state: seed,
137+
}
138+
}
139+
140+
//go:linkname runtime_fastrand runtime.fastrand
141+
func runtime_fastrand() uint32
142+
143+
// rthash hashes b with the runtime's memory hasher, starting from seed, and
// returns a 64-bit result. An empty b returns seed unchanged.
func rthash(b []byte, seed uint64) uint64 {
	n := len(b)
	if n == 0 {
		return seed
	}
	data := unsafe.Pointer(&b[0])
	size := uintptr(n)
	// The runtime hasher only works on uintptr. Where uintptr is not 64
	// bits wide, run two hashers over the same bytes, seeded with the lower
	// and upper 32 bits of seed, and concatenate their results.
	if unsafe.Sizeof(uintptr(0)) != 8 {
		lo := runtime_memhash(data, uintptr(seed), size)
		hi := runtime_memhash(data, uintptr(seed>>32), size)
		// TODO: mix lo/hi? Get 64 bits some other way?
		return uint64(hi)<<32 | uint64(lo)
	}
	// On 64-bit architectures the hasher is used directly.
	return uint64(runtime_memhash(data, uintptr(seed), size))
}

// runtime_memhash is bound to runtime.memhash through go:linkname; it has
// no Go body in this file.
//
//go:linkname runtime_memhash runtime.memhash
func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
161+
162+
// Wrapper functions so that a bytes/hash.Hash implements
163+
// the hash.Hash and hash.Hash64 interfaces.
164+
165+
func (h *Hash) Write(b []byte) (int, error) {
166+
h.AddBytes(b)
167+
return len(b), nil
168+
}
169+
func (h *Hash) Sum(b []byte) []byte {
170+
x := h.Hash()
171+
return append(b,
172+
byte(x>>0),
173+
byte(x>>8),
174+
byte(x>>16),
175+
byte(x>>24),
176+
byte(x>>32),
177+
byte(x>>40),
178+
byte(x>>48),
179+
byte(x>>56))
180+
}
181+
func (h *Hash) Sum64() uint64 {
182+
return h.Hash()
183+
}
184+
// Size returns the number of bytes Sum appends, which is always 8.
func (h *Hash) Size() int {
	const sumBytes = 8
	return sumBytes
}
185+
// BlockSize returns the length in bytes of h's internal buffer.
func (h *Hash) BlockSize() int {
	return len(h.buf)
}

src/bytes/hash/hash_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Copyright 2019 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package hash_test
6+
7+
import (
8+
"bytes/hash"
9+
basehash "hash"
10+
"testing"
11+
)
12+
13+
func TestUnseededHash(t *testing.T) {
14+
m := map[uint64]struct{}{}
15+
for i := 0; i < 1000; i++ {
16+
h := hash.New()
17+
m[h.Hash()] = struct{}{}
18+
}
19+
if len(m) < 900 {
20+
t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
21+
}
22+
}
23+
24+
func TestSeededHash(t *testing.T) {
25+
s := hash.MakeSeed(1234)
26+
m := map[uint64]struct{}{}
27+
for i := 0; i < 1000; i++ {
28+
h := hash.New()
29+
h.SetSeed(s)
30+
m[h.Hash()] = struct{}{}
31+
}
32+
if len(m) != 1 {
33+
t.Errorf("seeded hash is random: got %d, want 1", len(m))
34+
}
35+
}
36+
37+
func TestHashGrouping(t *testing.T) {
38+
b := []byte("foo")
39+
h1 := hash.New()
40+
h2 := hash.New()
41+
h2.SetSeed(h1.Seed())
42+
h1.AddBytes(b)
43+
for _, x := range b {
44+
h2.AddByte(x)
45+
}
46+
if h1.Hash() != h2.Hash() {
47+
t.Errorf("hash of \"foo\" and \"f\",\"o\",\"o\" not identical")
48+
}
49+
}
50+
51+
func TestHashBytesVsString(t *testing.T) {
52+
s := "foo"
53+
b := []byte(s)
54+
h1 := hash.New()
55+
h2 := hash.New()
56+
h2.SetSeed(h1.Seed())
57+
h1.AddString(s)
58+
h2.AddBytes(b)
59+
if h1.Hash() != h2.Hash() {
60+
t.Errorf("hash of string and byts not identical")
61+
}
62+
}
63+
64+
// Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
65+
var _ basehash.Hash = &hash.Hash{}
66+
var _ basehash.Hash64 = &hash.Hash{}

0 commit comments

Comments
 (0)