Skip to content

Commit e6007e4

Browse files
committed
Add HashCompressor to combine hashes together into a single, fixed-width hash. Fixes #7.
1 parent 4f92dd5 commit e6007e4

9 files changed

+188
-55
lines changed

Manifest.toml

+23-22
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,18 @@ version = "0.4.0"
1414

1515
[[Arpack_jll]]
1616
deps = ["Libdl", "OpenBLAS_jll", "Pkg"]
17-
git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195"
17+
git-tree-sha1 = "e214a9b9bd1b4e1b4f15b22c0994862b66af7ff7"
1818
uuid = "68821587-b530-5797-8361-c406ea357684"
19-
version = "3.5.0+2"
19+
version = "3.5.0+3"
2020

2121
[[Base64]]
2222
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
2323

2424
[[CompilerSupportLibraries_jll]]
2525
deps = ["Libdl", "Pkg"]
26-
git-tree-sha1 = "aa832564f930a7fc9290972526908d01a35aefac"
26+
git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612"
2727
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
28-
version = "0.3.0+0"
28+
version = "0.3.3+0"
2929

3030
[[DataAPI]]
3131
git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
@@ -34,9 +34,9 @@ version = "1.1.0"
3434

3535
[[DataStructures]]
3636
deps = ["InteractiveUtils", "OrderedCollections"]
37-
git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8"
37+
git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773"
3838
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
39-
version = "0.17.10"
39+
version = "0.17.11"
4040

4141
[[Dates]]
4242
deps = ["Printf"]
@@ -48,9 +48,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
4848

4949
[[Distributions]]
5050
deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"]
51-
git-tree-sha1 = "46e716ee1f5d14e64cbf55b7db5621a860e7fc32"
51+
git-tree-sha1 = "c4ed10355637fcb0725dc6a27060f74df24f13cd"
5252
uuid = "31c24e10-a181-5473-b8eb-7969acd0382f"
53-
version = "0.23.0"
53+
version = "0.23.2"
5454

5555
[[DocStringExtensions]]
5656
deps = ["LibGit2", "Markdown", "Pkg", "Test"]
@@ -60,9 +60,9 @@ version = "0.8.1"
6060

6161
[[Documenter]]
6262
deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
63-
git-tree-sha1 = "3bacd94d853a6bccaee1d0104d8b06d29a7506ac"
63+
git-tree-sha1 = "646ebc3db49889ffeb4c36f89e5d82c6a26295ff"
6464
uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
65-
version = "0.24.6"
65+
version = "0.24.7"
6666

6767
[[FFTW]]
6868
deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
@@ -72,15 +72,15 @@ version = "1.2.0"
7272

7373
[[FFTW_jll]]
7474
deps = ["Libdl", "Pkg"]
75-
git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c"
75+
git-tree-sha1 = "6c975cd606128d45d1df432fb812d6eb10fee00b"
7676
uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
77-
version = "3.3.9+4"
77+
version = "3.3.9+5"
7878

7979
[[FillArrays]]
8080
deps = ["LinearAlgebra", "Random", "SparseArrays"]
81-
git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66"
81+
git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
8282
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
83-
version = "0.8.5"
83+
version = "0.8.7"
8484

8585
[[IntelOpenMP_jll]]
8686
deps = ["Libdl", "Pkg"]
@@ -99,6 +99,7 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
9999
version = "0.21.0"
100100

101101
[[LibGit2]]
102+
deps = ["Printf"]
102103
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
103104

104105
[[Libdl]]
@@ -131,10 +132,10 @@ version = "0.4.3"
131132
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
132133

133134
[[OpenBLAS_jll]]
134-
deps = ["Libdl", "Pkg"]
135-
git-tree-sha1 = "858f107d79a016d9511e34186fe2af11566ba762"
135+
deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
136+
git-tree-sha1 = "2ee3e636e94b9fd95fa8364d5cba2e20dae16609"
136137
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
137-
version = "0.3.7+7"
138+
version = "0.3.9+2"
138139

139140
[[OpenSpecFun_jll]]
140141
deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
@@ -156,12 +157,12 @@ version = "0.9.12"
156157

157158
[[Parsers]]
158159
deps = ["Dates", "Test"]
159-
git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553"
160+
git-tree-sha1 = "75d07cb840c300084634b4991761886d0d762724"
160161
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
161-
version = "0.3.12"
162+
version = "1.0.1"
162163

163164
[[Pkg]]
164-
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
165+
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
165166
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
166167

167168
[[Printf]]
@@ -231,9 +232,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
231232

232233
[[StatsBase]]
233234
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
234-
git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1"
235+
git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45"
235236
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
236-
version = "0.32.2"
237+
version = "0.33.0"
237238

238239
[[StatsFuns]]
239240
deps = ["Rmath", "SpecialFunctions"]

Project.toml

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1111
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
1212
QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
1313
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
14+
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
1415
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1516
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1617

src/LSHBase.jl

+5-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,11 @@ function LSHFunction end
5858
function lsh_family end
5959

6060
@doc """
61-
collision_probability(hashfn::H, sim;
62-
n_hashes::Union{Symbol,Integer}=:auto) where {H <: LSHFunction}
61+
collision_probability(
62+
hashfn::H,
63+
sim;
64+
n_hashes::Union{Symbol,Integer}=:auto
65+
) where {H <: LSHFunction}
6366
6467
Compute the probability of hash collision between two inputs with similarity `sim` for an [`LSHFunction`](@ref) of type `H`. This function returns the probability that `n_hashes` hashes simultaneously collide.
6568

src/LSHFunctions.jl

+4-3
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@ module LSHFunctions
33
using Distributions, LinearAlgebra, SparseArrays
44

55
#========================
6-
Common types/utilities used through the LSH module
6+
Common types/utilities used throughout the module
77
========================#
88

9-
include("utils.jl")
9+
include(joinpath("utils", "hash_compression.jl"))
10+
include(joinpath("utils", "vecops.jl"))
1011
include("LSHBase.jl")
1112
include("intervals.jl")
1213
include("similarities.jl")
@@ -47,6 +48,6 @@ export SimHash, L1Hash, L2Hash, MIPSHash, SignALSH, MinHash,
4748

4849
# Helper / utility functions for LSHFunctions
4950
export index_hash, query_hash, n_hashes, hashtype, similarity, lsh_family,
50-
embedded_similarity, collision_probability, @interval
51+
embedded_similarity, collision_probability, @interval, HashCompressor
5152

5253
end # module

src/utils.jl

-28
This file was deleted.

src/utils/hash_compression.jl

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#================================================================
2+
3+
Utilities for compressing hashes into fixed-size hashes.
4+
5+
================================================================#
6+
7+
using SHA
8+
9+
#========================
10+
HashCompressor definition and constructors
11+
========================#
12+
13+
"""
    struct HashCompressor

A compressor for converting variable-width hashes generated by LSHFunctions
into fixed-width hashes. HashCompressor works by taking an array of hashes
generated by an LSHFunction, and using SHA-256 to convert it into a fixed-width
hash.

# Fields
- `n_bytes::Int64`: the number of bytes kept from the SHA-256 digest.
- `salt::Vector{UInt8}`: bytes prepended to the input before it is digested.
  An empty vector means that no salt is applied.
"""
struct HashCompressor
    n_bytes :: Int64
    salt :: Vector{UInt8}
end

"""
    HashCompressor(;
        n_bytes :: Integer = 32,
        salt :: Union{Nothing,Vector{UInt8}} = Vector{UInt8}(undef,0)
    )

Construct a new HashCompressor that compresses a hash returned by an LSHFunction
into `n_bytes` bytes.

# Keyword arguments
- `n_bytes::Integer` (default: `32`): the number of bytes to compress hashes into.
  Must satisfy `0 <= n_bytes <= 32`.
- `salt::Union{Nothing,Vector{UInt8}}` (default: `Vector{UInt8}(undef,0)`): a salt
  to prepend to hashes before compression using SHA-256. Passing `nothing` is
  equivalent to passing an empty salt.

# Throws
- `ErrorException` if `n_bytes` lies outside of `0:32`.

# Examples
```jldoctest; setup = :(using LSHFunctions)
julia> compressor = HashCompressor(n_bytes=4);

julia> compressor([1, 4, 2, 9])
4-element Array{UInt8,1}:
 0xb8
 0xdd
 0x5a
 0x5e
```
"""
function HashCompressor(
    ;
    n_bytes :: Integer = 32,
    salt :: Union{Nothing,Vector{UInt8}} = Vector{UInt8}(undef,0)
)
    if !(0 <= n_bytes <= 32)
        "n_bytes must satisfy 0 <= n_bytes <= 32" |>
            ErrorException |>
            throw
    end

    # The signature admits `salt = nothing`, but the struct field is a
    # Vector{UInt8}; normalize `nothing` to an empty salt so that the call
    # succeeds instead of failing on conversion.
    salt_bytes = salt === nothing ? Vector{UInt8}(undef, 0) : salt

    return HashCompressor(Int64(n_bytes), salt_bytes)
end
64+
65+
#========================
66+
Compression functions
67+
========================#
68+
# Bit-vector hashes: reinterpret the BitArray's backing `chunks` storage as
# bytes and hand them to the byte-vector method.
function (compressor::HashCompressor)(hashes::BitArray{1})
    return compressor(reinterpret(UInt8, hashes.chunks))
end

# Integer hashes of any width: view the elements as raw bytes and hand them to
# the byte-vector method.
function (compressor::HashCompressor)(hashes::AbstractVector{I}) where {I <: Integer}
    return compressor(reinterpret(UInt8, hashes))
end

# Byte-vector hashes: digest the (optionally salted) bytes with SHA-256, then
# keep only the first `n_bytes` bytes of the digest.
function (compressor::HashCompressor)(hashes::AbstractVector{UInt8})
    salt = compressor.salt

    # Only build the concatenated buffer when there is a salt to prepend.
    digest = length(salt) == 0 ? sha2_256(hashes) : sha2_256([salt; hashes])

    # The digest is returned whole unless fewer than 32 bytes were requested.
    compressor.n_bytes < 32 || return digest
    return digest[1:compressor.n_bytes]
end
89+

src/utils/vecops.jl

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#================================================================
2+
3+
Common matrix and vector operations used in multiple locations
4+
throughout the module.
5+
6+
================================================================#
7+
8+
# Compute the norms of vectors and columns of matrices
9+
# Generic fallback: works for any vector or matrix type by taking the norm of
# each column view. A vector is treated as a single column.
col_norms(x::Union{AbstractVector,AbstractMatrix}) =
    map(norm, eachcol(x))

# Dense inputs with a BLAS-compatible element type go straight to BLAS. The
# element type is restricted because BLAS.nrm2 has no methods for other types
# (e.g. integers); those inputs use the generic method above instead of
# raising a MethodError.
col_norms(x::Union{Vector{T},Matrix{T}}) where {T <: LinearAlgebra.BlasFloat} =
    map(BLAS.nrm2, eachcol(x))

# For a sparse vector only the stored entries contribute to the norm. The same
# BLAS element type restriction applies as for the dense method.
col_norms(x::SparseVector{T}) where {T <: LinearAlgebra.BlasFloat} =
    [BLAS.nrm2(x.nzval)]
17+
18+
# Column norms of a sparse matrix, computed directly from the stored entries of
# each column (entries of column `j` occupy `nzval[colptr[j]:colptr[j+1]-1]`).
#
# Returns a vector with one Euclidean norm per column. The element type is the
# floating-point type associated with `T`, so that integer-valued matrices can
# hold a square root and complex-valued matrices yield real norms.
function col_norms(x::SparseMatrixCSC{T}) where {T}
    R = float(real(T))
    colptr, nzval = x.colptr, x.nzval
    norms = Vector{R}(undef, size(x, 2))

    @inbounds for col in axes(x, 2)
        sumsq = zero(R)
        @simd for idx in colptr[col]:(colptr[col+1] - 1)
            # abs2 (rather than ^2) so that complex entries contribute |z|^2
            sumsq += abs2(nzval[idx])
        end
        # Take the square root so that the result is a norm, in agreement with
        # the dense and sparse-vector methods, rather than a squared norm.
        norms[col] = sqrt(sumsq)
    end

    return norms
end

test/runtests.jl

+2
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@ include(joinpath("hashes", "test_lshfunction.jl"))
2828

2929
include(joinpath("function_hashing", "test_monte_carlo.jl"))
3030
include(joinpath("function_hashing", "test_chebhash.jl"))
31+
32+
include(joinpath("utils", "test_hash_compression.jl"))

test/utils/test_hash_compression.jl

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
using Test, LSHFunctions
2+
3+
#==================
4+
Tests
5+
==================#
6+
@testset "HashCompressor tests" begin
    # Each case compresses a fixed input and compares the truncated digest
    # against a known reference value.

    @testset "Can compress Vector{UInt8} hashes" begin
        input = UInt8[0x01, 0x04, 0x02, 0x08, 0x06, 0x07, 0x08, 0x04]
        compress = HashCompressor(n_bytes=6)

        @test compress(input) == UInt8[0xce, 0xd8, 0x24, 0x1c, 0xc0, 0x48]
    end

    @testset "Can compress Vector{Integer} hashes" begin
        input = [-1, 8, -6, 3, -5, -9, 9, 0]
        compress = HashCompressor(n_bytes=4)

        @test compress(input) == UInt8[0xb2, 0x7f, 0x8e, 0xb4]
    end

    @testset "Can compress BitArray{1} hashes" begin
        input = BitArray([1, 1, 1, 0, 0, 1, 0, 0, 1, 0])
        compress = HashCompressor(n_bytes=5)

        @test compress(input) == UInt8[0xa2, 0x99, 0xd7, 0x9f, 0x67]
    end

    @testset "Can salt hashes" begin
        # Same integer input as the unsalted integer case above, with a salt.
        input = [-1, 8, -6, 3, -5, -9, 9, 0]
        compress = HashCompressor(n_bytes=6, salt=UInt8[0xcb, 0xe7, 0x12])

        @test compress(input) == UInt8[0x9f, 0x5c, 0xf4, 0x3a, 0x29, 0x22]
    end
end

0 commit comments

Comments
 (0)