Skip to content

Add multivariate hypergeometric distribution #1963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
4 changes: 4 additions & 0 deletions src/Distributions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ export
MatrixTDist,
MixtureModel,
Multinomial,
MvHypergeometric,
MultivariateNormal,
MvLogNormal,
MvNormal,
Expand Down Expand Up @@ -244,6 +245,7 @@ export
ncategories, # the number of categories in a Categorical distribution
ncomponents, # the number of components in a mixture model
ntrials, # the number of trials being performed in the experiment
nelements, # the number of elements of each type in a finite population
params, # get the tuple of parameters
params!, # provide storage space to calculate the tuple of parameters for a multivariate distribution like mvlognormal
partype, # returns a type large enough to hold all of a distribution's parameters' element types
Expand Down Expand Up @@ -281,6 +283,8 @@ export
# type system
include("common.jl")



# implementation helpers
include("utils.jl")
include("eachvariate.jl")
Expand Down
127 changes: 127 additions & 0 deletions src/multivariate/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""
The [Multivariate hypergeometric distribution](https://en.wikipedia.org/wiki/Hypergeometric_distribution#Multivariate_hypergeometric_distribution)
generalizes the *hypergeometric distribution*. Consider ``n`` draws from a finite population containing ``k`` types of elements. Suppose that the population has size `M` and there are ``m_i`` elements of type ``i`` for ``i = 1, .., k`` with ``m_1+...m_k = M``. Let ``X = (X_1, ..., X_k)`` where ``X_i`` represents the number of elements of type ``i`` drawn, then the distribution of ``X`` is a multivariate hypergeometric distribution. Each sample of a multivariate hypergeometric distribution is a ``k``-dimensional integer vector that sums to ``n`` and satisfies ``0 \\le X_i \\le m_i``.


The probability mass function is given by

```math
f(x; m, n) = {{{m_1 \\choose x_1}{m_2 \\choose x_2}\\cdots {m_k \\choose x_k}}\\over {N \\choose n}},
\\quad x_1 + \\cdots + x_k = n, \\quad x_i \\le m_i
```

```julia
MvHypergeometric(m, n) # Multivariate hypergeometric distribution for a population with
# m = (m_1, ..., m_k) elements of type 1 to k and n draws

params(d) # Get the parameters, i.e. (m, n)
```
"""
struct MvHypergeometric <: DiscreteMultivariateDistribution
m::AbstractVector{Int} # number of elements of each type
n::Int # number of draws
function MvHypergeometric(m::AbstractVector{Int}, n::Int; check_args::Bool=true)
@check_args(
MvHypergeometric,
(m, all(m .>= zero.(n))),
zero(n) <= n <= sum(m),
)
new(m, n)
end
end


# Parameters

ncategories(d::MvHypergeometric) = length(d.m)
length(d::MvHypergeometric) = ncategories(d)
nelements(d::MvHypergeometric) = d.m
ntrials(d::MvHypergeometric) = d.n

params(d::MvHypergeometric) = (d.m, d.n)

# Statistics

mean(d::MvHypergeometric) = d.n .* d.m ./ sum(d.m)

function var(d::MvHypergeometric)
m = nelements(d)
k = length(m)
n = ntrials(d)
M = sum(m)
p = m / M
f = n * (M - n) / (M-1)

v = Vector{Real}(undef, k)
for i = 1:k
@inbounds p_i = p[i]
v[i] = f * p_i * (1 - p_i)
end
v
end

function cov(d::MvHypergeometric)
m = nelements(d)
k = length(m)
n = ntrials(d)
M = sum(m)
p = m / M
f = n * (M - n) / (M-1)

C = Matrix{Real}(undef, k, k)
for j = 1:k
pj = p[j]
for i = 1:j-1
@inbounds C[i,j] = - f * p[i] * pj
end

@inbounds C[j,j] = f * pj * (1-pj)
end

for j = 1:k-1
for i = j+1:k
@inbounds C[i,j] = C[j,i]
end
end
C
end


# Evaluation
function insupport(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real
k = length(d)
m = nelements(d)
length(x) == k || return false
s = 0.0
for i = 1:k
@inbounds xi = x[i]
if !(isinteger(xi) && xi >= 0 && xi <= m[i])
return false
end
s += xi
end
return s == ntrials(d) # integer computation would not yield truncation errors
end

function _logpdf(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real
m = nelements(d)
M = sum(m)
n = ntrials(d)
insupport(d,x) || return -Inf
s = -logabsbinomial(M, n)[1]
for i = 1:length(m)
@inbounds xi = x[i]
@inbounds m_i = m[i]
s += logabsbinomial(m_i, xi)[1]
end
return s
end

# Sampling is performed by sequentially sampling each entry from the
# hypergeometric distribution
_rand!(rng::AbstractRNG, d::MvHypergeometric, x::AbstractVector{Int}) =
mvhypergeom_rand!(rng, nelements(d), ntrials(d), x)




3 changes: 2 additions & 1 deletion src/multivariates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ for fname in ["dirichlet.jl",
"mvlognormal.jl",
"mvtdist.jl",
"product.jl", # deprecated
"vonmisesfisher.jl"]
"vonmisesfisher.jl",
"mvhypergeometric.jl"]
include(joinpath("multivariate", fname))
end
1 change: 1 addition & 0 deletions src/samplers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ for fname in ["aliastable.jl",
"exponential.jl",
"gamma.jl",
"multinomial.jl",
"mvhypergeometric.jl",
"vonmises.jl",
"vonmisesfisher.jl",
"discretenonparametric.jl",
Expand Down
41 changes: 41 additions & 0 deletions src/samplers/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
function mvhypergeom_rand!(rng::AbstractRNG, m::AbstractVector{Int}, n::Int,
x::AbstractVector{<:Real})
k = length(m)
length(x) == k || throw(DimensionMismatch("Invalid argument dimension."))

z = zero(eltype(m))
M = sum(m) # remaining population
i = 0
km1 = k - 1

while i < km1 && n > 0
i += 1
@inbounds mi = m[i]
if mi < M
# Sample from hypergeometric distribution
# element of type i are considered successes
# all other elements are considered failures
xi = rand(rng, Hypergeometric(mi, M-mi, n))
@inbounds x[i] = xi
# Remove elements of type i from population
n -= xi
M -= mi
else
# In this case, we don't even have to sample
# from Hypergeometric. Just assign remaining counts
@inbounds x[i] = n
n = 0
end
end

if i == km1
@inbounds x[k] = n
else # n must have been zero
z = zero(eltype(x))
for j = i+1 : k
@inbounds x[j] = z
end
end

return x
end
122 changes: 122 additions & 0 deletions test/multivariate/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Tests for Multivariate Hypergeometric

using Distributions
using Test

@testset "Multivariate Hypergeometric" begin
@test_throws DomainError MvHypergeometric([5, 3, -2], 4)
@test_throws ArgumentError MvHypergeometric([5, 3], 10)

m = [5, 3, 2]
n = 4
d = MvHypergeometric(m, n)
@test length(d) == 3
@test d.n == n
@test nelements(d) == m
@test ncategories(d) == length(m)
@test params(d) == (m, n)

@test mean(d) ≈ [2.0, 1.2, 0.8]
@test var(d) ≈ [2/3, 56/100, 32/75]

covmat = cov(d)
@test covmat ≈ (8/3) .* [1/4 -3/20 -1/10; -3/20 21/100 -3/50; -1/10 -3/50 4/25]

@test insupport(d, [2, 1, 1])
@test !insupport(d, [3, 2, 1])
@test !insupport(d, [0, 0, 4])


# random sampling
x = rand(d)
@test isa(x, Vector{Int})
@test sum(x) == n
@test all(x .>= 0)
@test all(x .<= m)
@test insupport(d, x)

x = rand(d, 100)
@test isa(x, Matrix{Int})
@test all(sum(x, dims=1) .== n)
@test all(x .>= 0)
@test all(x .<= m)
@test all(insupport(d, x))

# random sampling with many catergories
m = [20, 2, 2, 2, 1, 1, 1]
n = 5
d2 = MvHypergeometric(m, n)
x = rand(d2)
@test isa(x, Vector{Int})
@test sum(x) == n
@test all(x .>= 0)
@test all(x .<= m)
@test insupport(d2, x)

# log pdf
x = [2, 1, 1]
@test pdf(d, x) ≈ 2/7
@test logpdf(d, x) ≈ log(2/7)
@test logpdf(d, x) ≈ log(pdf(d, x))

x = rand(d, 100)
pv = pdf(d, x)
lp = logpdf(d, x)
for i in 1 : size(x, 2)
@test pv[i] ≈ pdf(d, x[:,i])
@test lp[i] ≈ logpdf(d, x[:,i])
end

# test degenerate cases
d1 = MvHypergeometric([1], 1)
@test logpdf(d1, [1]) ≈ 0
@test logpdf(d1, [0]) == -Inf
d2 = MvHypergeometric([2, 0], 1)
@test logpdf(d2, [1, 0]) ≈ 0
@test logpdf(d2, [0, 1]) == -Inf

d3 = MvHypergeometric([5, 0, 0, 0], 3)
@test logpdf(d3, [3, 0, 0, 0]) ≈ 0
@test logpdf(d3, [2, 1, 0, 0]) == -Inf
@test logpdf(d3, [2, 0, 0, 0]) == -Inf

# behavior with n = 0
d0 = MvHypergeometric([5, 3, 2], 0)
@test logpdf(d0, [0, 0, 0]) ≈ 0
@test logpdf(d0, [1, 0, 0]) == -Inf

@test rand(d0) == [0, 0, 0]
@test mean(d0) == [0.0, 0.0, 0.0]
@test var(d0) == [0.0, 0.0, 0.0]
@test insupport(d0, [0, 0, 0])
@test !insupport(d0, [1, 0, 0])
@test length(d0) == 3

# compare with hypergeometric
ns = 3
nf = 5
n = 4
dh1 = MvHypergeometric([ns, nf], n)
dh2 = Hypergeometric(ns, nf, n)

x = 2
@test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x)
x = 3
@test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x)

# comparing marginals to hypergeometric
m = [5, 3, 2]
n = 4
d = MvHypergeometric(m, n)
dh = Hypergeometric(m[1], sum(m[2:end]), n)
x1 = 2
@test pdf(dh, x1) ≈ sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]])

# comparing conditionals to hypergeometric
x1 = 2
dh = Hypergeometric(m[2], m[3], n-x1)
q = sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]])
for x2 = 0:m[2]
@test pdf(dh, x2) ≈ pdf(d, [x1, x2, n-x1-x2])/q
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ const tests = [
"univariate/continuous/triangular",
"statsapi",
"univariate/continuous/inversegaussian",
"multivariate/mvhypergeometric",

### missing files compared to /src:
# "common",
Expand Down
Loading