
Commit

Add efficient SparseVector method for some metrics (#235)
jlapeyre authored Dec 2, 2021
1 parent 265a363 commit 91f51b5
Showing 5 changed files with 135 additions and 38 deletions.
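
In practice, the change gives parameter-free metrics (and the Bhattacharyya coefficient) a SparseVector-aware evaluation path that iterates over stored entries only. A minimal usage sketch, with illustrative values, mirroring the round-trip checks added to the tests below:

using Distances, SparseArrays
a = sprand(1_000, 0.05)   # sparse vectors with roughly 5% stored entries
b = sprand(1_000, 0.05)
# The sparse path should agree with the dense computation on the same data.
euclidean(a, b) ≈ euclidean(Vector(a), Vector(b))
bhattacharyya(a, b) ≈ bhattacharyya(Vector(a), Vector(b))
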
3 changes: 2 additions & 1 deletion Project.toml
@@ -1,9 +1,10 @@
name = "Distances"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.10.6"
version = "0.10.7"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"

1 change: 1 addition & 0 deletions src/Distances.jl
@@ -2,6 +2,7 @@ module Distances

using LinearAlgebra
using Statistics
using SparseArrays: SparseVectorUnion, nonzeroinds, nonzeros, nnz
import StatsAPI: pairwise, pairwise!

export
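
SparseVectorUnion is an internal SparseArrays union that covers SparseVector together with sparse vector and column views, which is why the new methods dispatch on it rather than on SparseVector alone; nonzeroinds, nonzeros, and nnz expose a sparse vector's stored indices, stored values, and their count. A small illustration of those helpers (not part of the commit; nonzeroinds is unexported, hence the qualified call):

using SparseArrays
v = sparsevec([2, 5], [1.0, 3.0], 6)
SparseArrays.nonzeroinds(v)   # [2, 5]     — indices of stored entries
nonzeros(v)                   # [1.0, 3.0] — stored values
nnz(v)                        # 2
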
26 changes: 26 additions & 0 deletions src/bhattacharyya.jl
@@ -50,6 +50,32 @@ end
return sqab, asum, bsum
end

@inline function _bhattacharyya_coeff(a::SparseVectorUnion{<:Number}, b::SparseVectorUnion{<:Number})
anzind = nonzeroinds(a)
bnzind = nonzeroinds(b)
anzval = nonzeros(a)
bnzval = nonzeros(b)
ma = nnz(a)
mb = nnz(b)

ia = 1; ib = 1
s = zero(typeof(sqrt(oneunit(eltype(a))*oneunit(eltype(b)))))
@inbounds while ia <= ma && ib <= mb
ja = anzind[ia]
jb = bnzind[ib]
if ja == jb
s += sqrt(anzval[ia] * bnzval[ib])
ia += 1; ib += 1
elseif ja < jb
ia += 1
else
ib += 1
end
end
# efficient method for sum for SparseVectorView is missing
return s, sum(anzval), sum(bnzval)
end

# Faster pair- and column-wise versions TBD...


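
The sparse coefficient walks the two sorted nonzero-index lists in lockstep: a sqrt(anzval[ia] * bnzval[ib]) term is accumulated only when both vectors store a value at the same index, and an index stored in only one of them is skipped, since it would contribute the square root of a product with zero. A small check of that intent, not part of the commit, going through the internal bhattacharyya_coeff entry point that the tests below exercise:

using Distances, SparseArrays
x = sparsevec([1, 4, 7], [0.2, 0.5, 0.3], 10)
y = sparsevec([2, 4, 9], [0.1, 0.6, 0.3], 10)
# Only index 4 overlaps, so the merge loop accumulates a single sqrt(0.5 * 0.6) term;
# the sparse result should match the dense computation on the same data.
Distances.bhattacharyya_coeff(x, y) ≈ Distances.bhattacharyya_coeff(Vector(x), Vector(y))
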
49 changes: 49 additions & 0 deletions src/metrics.jl
@@ -308,6 +308,55 @@ Base.@propagate_inbounds function _evaluate(d::UnionMetrics, a::AbstractArray, b
end
end

eval_op_a(d, ai, b) = eval_op(d, ai, zero(eltype(b)))
eval_op_b(d, bi, a) = eval_op(d, zero(eltype(a)), bi)

# It is assumed that eval_reduce(d, s, eval_op(d, zero(eltype(a)), zero(eltype(b)))) == s
# This justifies ignoring all terms where both inputs are zero.
Base.@propagate_inbounds function _evaluate(d::UnionMetrics, a::SparseVectorUnion, b::SparseVectorUnion, ::Nothing)
@boundscheck if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
end
if length(a) == 0
return zero(result_type(d, a, b))
end
anzind = nonzeroinds(a)
bnzind = nonzeroinds(b)
anzval = nonzeros(a)
bnzval = nonzeros(b)
ma = nnz(a)
mb = nnz(b)
ia = 1; ib = 1
s = eval_start(d, a, b)
@inbounds while ia <= ma && ib <= mb
ja = anzind[ia]
jb = bnzind[ib]
if ja == jb
v = eval_op(d, anzval[ia], bnzval[ib])
ia += 1; ib += 1
elseif ja < jb
v = eval_op_a(d, anzval[ia], b)
ia += 1
else
v = eval_op_b(d, bnzval[ib], a)
ib += 1
end
s = eval_reduce(d, s, v)
end
@inbounds while ia <= ma
v = eval_op_a(d, anzval[ia], b)
s = eval_reduce(d, s, v)
ia += 1
end
@inbounds while ib <= mb
v = eval_op_b(d, bnzval[ib], a)
s = eval_reduce(d, s, v)
ib += 1
end
return eval_end(d, s)
end


_evaluate(dist::UnionMetrics, a::Number, b::Number, ::Nothing) = eval_end(dist, eval_op(dist, a, b))
function _evaluate(dist::UnionMetrics, a::Number, b::Number, p)
length(p) != 1 && throw(DimensionMismatch("inputs are scalars but parameters have length $(length(p))."))
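
The zero-skipping above rests on the stated assumption about eval_reduce and eval_op. Taking SqEuclidean as a concrete case (using the package's internal, unexported generic-metric helpers, shown only to illustrate the comment): eval_op computes abs2(ai - bi) and eval_reduce accumulates by addition, so a position where both entries are zero folds a 0 into the accumulator and leaves it unchanged, while a one-sided entry reduces to eval_op against zero, which is exactly what eval_op_a and eval_op_b encode. A quick sketch:

using Distances
d = SqEuclidean()
s = 3.7   # an arbitrary partial accumulator value
Distances.eval_reduce(d, s, Distances.eval_op(d, 0.0, 0.0)) == s   # both-zero terms are no-ops
Distances.eval_op(d, 2.0, 0.0) == abs2(2.0)                        # one-sided term, as in eval_op_a
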
94 changes: 57 additions & 37 deletions test/test_dists.jl
@@ -1,5 +1,7 @@
# Unit tests for Distances

using SparseArrays: sparsevec, sprand

struct FooDist <: PreMetric end # Julia 1.0 Compat: struct definition must be put in global scope

@testset "result_type" begin
@@ -217,7 +219,7 @@ end
for (_x, _y) in (([4.0, 5.0, 6.0, 7.0], [3.0, 9.0, 8.0, 1.0]),
([4.0, 5.0, 6.0, 7.0], [3. 8.; 9. 1.0]))
x, y = T.(_x), T.(_y)
for (x, y) in ((x, y),
for (x, y) in ((x, y), (sparsevec(x), sparsevec(y)),
(convert(Array{Union{Missing, T}}, x), convert(Array{Union{Missing, T}}, y)),
((Iterators.take(x, 4), Iterators.take(y, 4))), # iterator
(((x[i] for i in 1:length(x)), (y[i] for i in 1:length(y)))), # generator
@@ -331,7 +333,8 @@ end # testset
end #testset

@testset "empty vector" begin
for T in (Float64, F64), (a, b) in ((T[], T[]), (Iterators.take(T[], 0), Iterators.take(T[], 0)))
for T in (Float64, F64), (a, b) in ((T[], T[]), (Iterators.take(T[], 0), Iterators.take(T[], 0)),
(sprand(T, 0, .1), sprand(T, 0, .1)))
@test sqeuclidean(a, b) == 0.0
@test isa(sqeuclidean(a, b), T)
@test euclidean(a, b) == 0.0
@@ -391,6 +394,10 @@ end # testset
@test_throws DimensionMismatch colwise!(mat23, Bregman(x -> sqeuclidean(x, zero(x)), x -> 2*x), mat23, mat22)
@test_throws DimensionMismatch Bregman(x -> sqeuclidean(x, zero(x)), x -> 2*x)([1, 2, 3], [1, 2])
@test_throws DimensionMismatch Bregman(x -> sqeuclidean(x, zero(x)), x -> [1, 2])([1, 2, 3], [1, 2, 3])
sv1 = sprand(10, .2)
sv2 = sprand(20, .2)
@test_throws DimensionMismatch euclidean(sv1, sv2)
@test_throws DimensionMismatch bhattacharyya(sv1, sv2)
end # testset

@testset "Different input types" begin
@@ -504,41 +511,43 @@ end

@testset "bhattacharyya / hellinger" begin
for T in (Int, Float64, F64)
x, y = T.([4, 5, 6, 7]), T.([3, 9, 8, 1])
a = T.([1, 2, 1, 3, 2, 1])
b = T.([1, 3, 0, 2, 2, 0])
p = T == Int ? rand(0:10, 12) : rand(T, 12)
p[p .< median(p)] .= 0
q = T == Int ? rand(0:10, 12) : rand(T, 12)

# Bhattacharyya and Hellinger distances are defined for discrete
# probability distributions so to calculate the expected values
# we need to normalize vectors.
px = x ./ sum(x)
py = y ./ sum(y)
expected_bc_x_y = sum(sqrt.(px .* py))
for (x, y) in ((x, y), (Iterators.take(x, 12), Iterators.take(y, 12)))
@test Distances.bhattacharyya_coeff(x, y) ≈ expected_bc_x_y
@test bhattacharyya(x, y) ≈ (-log(expected_bc_x_y))
@test hellinger(x, y) ≈ sqrt(1 - expected_bc_x_y)
end
_x, _y = T.([4, 5, 6, 7]), T.([3, 9, 8, 1])
_a = T.([1, 2, 1, 3, 2, 1])
_b = T.([1, 3, 0, 2, 2, 0])
_p = T == Int ? rand(0:10, 12) : rand(T, 12)
_p[_p .< median(_p)] .= 0
_q = T == Int ? rand(0:10, 12) : rand(T, 12)

for (x, y, a, b, p, q) in ((_x, _y, _a, _b, _p, _q), sparsevec.((_x, _y, _a, _b, _p, _q)))
# Bhattacharyya and Hellinger distances are defined for discrete
# probability distributions so to calculate the expected values
# we need to normalize vectors.
px = x ./ sum(x)
py = y ./ sum(y)
expected_bc_x_y = sum(sqrt.(px .* py))
for (x, y) in ((x, y), (Iterators.take(x, 12), Iterators.take(y, 12)))
@test Distances.bhattacharyya_coeff(x, y) ≈ expected_bc_x_y
@test bhattacharyya(x, y) ≈ (-log(expected_bc_x_y))
@test hellinger(x, y) ≈ sqrt(1 - expected_bc_x_y)
end

pa = a ./ sum(a)
pb = b ./ sum(b)
expected_bc_a_b = sum(sqrt.(pa .* pb))
@test Distances.bhattacharyya_coeff(a, b) ≈ expected_bc_a_b
@test bhattacharyya(a, b) ≈ (-log(expected_bc_a_b))
@test hellinger(a, b) ≈ sqrt(1 - expected_bc_a_b)

pp = p ./ sum(p)
pq = q ./ sum(q)
expected_bc_p_q = sum(sqrt.(pp .* pq))
@test Distances.bhattacharyya_coeff(p, q) ≈ expected_bc_p_q
@test bhattacharyya(p, q) ≈ (-log(expected_bc_p_q))
@test hellinger(p, q) ≈ sqrt(1 - expected_bc_p_q)

# Ensure it is semimetric
@test bhattacharyya(x, y) ≈ bhattacharyya(y, x)
pa = a ./ sum(a)
pb = b ./ sum(b)
expected_bc_a_b = sum(sqrt.(pa .* pb))
@test Distances.bhattacharyya_coeff(a, b) ≈ expected_bc_a_b
@test bhattacharyya(a, b) ≈ (-log(expected_bc_a_b))
@test hellinger(a, b) ≈ sqrt(1 - expected_bc_a_b)

pp = p ./ sum(p)
pq = q ./ sum(q)
expected_bc_p_q = sum(sqrt.(pp .* pq))
@test Distances.bhattacharyya_coeff(p, q) ≈ expected_bc_p_q
@test bhattacharyya(p, q) ≈ (-log(expected_bc_p_q))
@test hellinger(p, q) ≈ sqrt(1 - expected_bc_p_q)

# Ensure it is semimetric
@test bhattacharyya(x, y) ≈ bhattacharyya(y, x)
end
end
end #testset

@@ -769,7 +778,7 @@ end

X = rand(ComplexF64, m, nx)
Y = rand(ComplexF64, m, ny)

test_pairwise(SqEuclidean(), X, Y, Float64)
test_pairwise(Euclidean(), X, Y, Float64)

@@ -946,6 +955,17 @@ end
@test pairwise(PeriodicEuclidean(p), X, Y, dims=2)[1,2] == 0m
end

@testset "SparseVector, nnz(a) != nnz(b)" begin
for (n, densa, densb) in ((100, .1, .8), (200, .8, .1))
a = sprand(n, densa)
b = sprand(n, densb)
for d in (bhattacharyya, euclidean, sqeuclidean, jaccard, cityblock, totalvariation,
chebyshev, braycurtis, hamming)
@test d(a, b) ≈ d(Vector(a), Vector(b))
end
end
end

#=
@testset "zero allocation colwise!" begin
d = Euclidean()

2 comments on commit 91f51b5

@dkarrasch
Member


@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/49787

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.10.7 -m "<description of version>" 91f51b543ea6c54936d3e6183acdf7da50bf1f9e
git push origin v0.10.7
