diff --git a/src/Distributions.jl b/src/Distributions.jl index fcf6994347..bc3a4d159a 100644 --- a/src/Distributions.jl +++ b/src/Distributions.jl @@ -133,6 +133,7 @@ export MatrixTDist, MixtureModel, Multinomial, + MvHypergeometric, MultivariateNormal, MvLogNormal, MvNormal, @@ -244,6 +245,7 @@ export ncategories, # the number of categories in a Categorical distribution ncomponents, # the number of components in a mixture model ntrials, # the number of trials being performed in the experiment + nelements, # the number of elements of each type in a finite population params, # get the tuple of parameters params!, # provide storage space to calculate the tuple of parameters for a multivariate distribution like mvlognormal partype, # returns a type large enough to hold all of a distribution's parameters' element types @@ -281,6 +283,8 @@ export # type system include("common.jl") + + # implementation helpers include("utils.jl") include("eachvariate.jl") diff --git a/src/multivariate/mvhypergeometric.jl b/src/multivariate/mvhypergeometric.jl new file mode 100644 index 0000000000..3ac4733172 --- /dev/null +++ b/src/multivariate/mvhypergeometric.jl @@ -0,0 +1,127 @@ +""" +The [Multivariate hypergeometric distribution](https://en.wikipedia.org/wiki/Hypergeometric_distribution#Multivariate_hypergeometric_distribution) +generalizes the *hypergeometric distribution*. Consider ``n`` draws from a finite population containing ``k`` types of elements. Suppose that the population has size `M` and there are ``m_i`` elements of type ``i`` for ``i = 1, .., k`` with ``m_1+...m_k = M``. Let ``X = (X_1, ..., X_k)`` where ``X_i`` represents the number of elements of type ``i`` drawn, then the distribution of ``X`` is a multivariate hypergeometric distribution. Each sample of a multivariate hypergeometric distribution is a ``k``-dimensional integer vector that sums to ``n`` and satisfies ``0 \\le X_i \\le m_i``. + + +The probability mass function is given by + +```math +f(x; m, n) = {{{m_1 \\choose x_1}{m_2 \\choose x_2}\\cdots {m_k \\choose x_k}}\\over {N \\choose n}}, +\\quad x_1 + \\cdots + x_k = n, \\quad x_i \\le m_i +``` + +```julia +MvHypergeometric(m, n) # Multivariate hypergeometric distribution for a population with + # m = (m_1, ..., m_k) elements of type 1 to k and n draws + +params(d) # Get the parameters, i.e. (m, n) +``` +""" +struct MvHypergeometric <: DiscreteMultivariateDistribution + m::AbstractVector{Int} # number of elements of each type + n::Int # number of draws + function MvHypergeometric(m::AbstractVector{Int}, n::Int; check_args::Bool=true) + @check_args( + MvHypergeometric, + (m, all(m .>= zero.(n))), + zero(n) <= n <= sum(m), + ) + new(m, n) + end +end + + +# Parameters + +ncategories(d::MvHypergeometric) = length(d.m) +length(d::MvHypergeometric) = ncategories(d) +nelements(d::MvHypergeometric) = d.m +ntrials(d::MvHypergeometric) = d.n + +params(d::MvHypergeometric) = (d.m, d.n) + +# Statistics + +mean(d::MvHypergeometric) = d.n .* d.m ./ sum(d.m) + +function var(d::MvHypergeometric) + m = nelements(d) + k = length(m) + n = ntrials(d) + M = sum(m) + p = m / M + f = n * (M - n) / (M-1) + + v = Vector{Real}(undef, k) + for i = 1:k + @inbounds p_i = p[i] + v[i] = f * p_i * (1 - p_i) + end + v +end + +function cov(d::MvHypergeometric) + m = nelements(d) + k = length(m) + n = ntrials(d) + M = sum(m) + p = m / M + f = n * (M - n) / (M-1) + + C = Matrix{Real}(undef, k, k) + for j = 1:k + pj = p[j] + for i = 1:j-1 + @inbounds C[i,j] = - f * p[i] * pj + end + + @inbounds C[j,j] = f * pj * (1-pj) + end + + for j = 1:k-1 + for i = j+1:k + @inbounds C[i,j] = C[j,i] + end + end + C +end + + +# Evaluation +function insupport(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real + k = length(d) + m = nelements(d) + length(x) == k || return false + s = 0.0 + for i = 1:k + @inbounds xi = x[i] + if !(isinteger(xi) && xi >= 0 && xi <= m[i]) + return false + end + s += xi + end + return s == ntrials(d) # integer computation would not yield truncation errors +end + +function _logpdf(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real + m = nelements(d) + M = sum(m) + n = ntrials(d) + insupport(d,x) || return -Inf + s = -logabsbinomial(M, n)[1] + for i = 1:length(m) + @inbounds xi = x[i] + @inbounds m_i = m[i] + s += logabsbinomial(m_i, xi)[1] + end + return s +end + +# Sampling is performed by sequentially sampling each entry from the +# hypergeometric distribution +_rand!(rng::AbstractRNG, d::MvHypergeometric, x::AbstractVector{Int}) = + mvhypergeom_rand!(rng, nelements(d), ntrials(d), x) + + + + diff --git a/src/multivariates.jl b/src/multivariates.jl index 56d91233cf..8639e4caed 100644 --- a/src/multivariates.jl +++ b/src/multivariates.jl @@ -119,6 +119,7 @@ for fname in ["dirichlet.jl", "mvlognormal.jl", "mvtdist.jl", "product.jl", # deprecated - "vonmisesfisher.jl"] + "vonmisesfisher.jl", + "mvhypergeometric.jl"] include(joinpath("multivariate", fname)) end diff --git a/src/samplers.jl b/src/samplers.jl index fa667fff2b..bff267f189 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -21,6 +21,7 @@ for fname in ["aliastable.jl", "exponential.jl", "gamma.jl", "multinomial.jl", + "mvhypergeometric.jl", "vonmises.jl", "vonmisesfisher.jl", "discretenonparametric.jl", diff --git a/src/samplers/mvhypergeometric.jl b/src/samplers/mvhypergeometric.jl new file mode 100644 index 0000000000..43e684ed97 --- /dev/null +++ b/src/samplers/mvhypergeometric.jl @@ -0,0 +1,41 @@ +function mvhypergeom_rand!(rng::AbstractRNG, m::AbstractVector{Int}, n::Int, + x::AbstractVector{<:Real}) + k = length(m) + length(x) == k || throw(DimensionMismatch("Invalid argument dimension.")) + + z = zero(eltype(m)) + M = sum(m) # remaining population + i = 0 + km1 = k - 1 + + while i < km1 && n > 0 + i += 1 + @inbounds mi = m[i] + if mi < M + # Sample from hypergeometric distribution + # element of type i are considered successes + # all other elements are considered failures + xi = rand(rng, Hypergeometric(mi, M-mi, n)) + @inbounds x[i] = xi + # Remove elements of type i from population + n -= xi + M -= mi + else + # In this case, we don't even have to sample + # from Hypergeometric. Just assign remaining counts + @inbounds x[i] = n + n = 0 + end + end + + if i == km1 + @inbounds x[k] = n + else # n must have been zero + z = zero(eltype(x)) + for j = i+1 : k + @inbounds x[j] = z + end + end + + return x + end diff --git a/test/multivariate/mvhypergeometric.jl b/test/multivariate/mvhypergeometric.jl new file mode 100644 index 0000000000..be49fcbfd4 --- /dev/null +++ b/test/multivariate/mvhypergeometric.jl @@ -0,0 +1,122 @@ +# Tests for Multivariate Hypergeometric + +using Distributions +using Test + +@testset "Multivariate Hypergeometric" begin + @test_throws DomainError MvHypergeometric([5, 3, -2], 4) + @test_throws ArgumentError MvHypergeometric([5, 3], 10) + + m = [5, 3, 2] + n = 4 + d = MvHypergeometric(m, n) + @test length(d) == 3 + @test d.n == n + @test nelements(d) == m + @test ncategories(d) == length(m) + @test params(d) == (m, n) + + @test mean(d) ≈ [2.0, 1.2, 0.8] + @test var(d) ≈ [2/3, 56/100, 32/75] + + covmat = cov(d) + @test covmat ≈ (8/3) .* [1/4 -3/20 -1/10; -3/20 21/100 -3/50; -1/10 -3/50 4/25] + + @test insupport(d, [2, 1, 1]) + @test !insupport(d, [3, 2, 1]) + @test !insupport(d, [0, 0, 4]) + + + # random sampling + x = rand(d) + @test isa(x, Vector{Int}) + @test sum(x) == n + @test all(x .>= 0) + @test all(x .<= m) + @test insupport(d, x) + + x = rand(d, 100) + @test isa(x, Matrix{Int}) + @test all(sum(x, dims=1) .== n) + @test all(x .>= 0) + @test all(x .<= m) + @test all(insupport(d, x)) + + # random sampling with many catergories + m = [20, 2, 2, 2, 1, 1, 1] + n = 5 + d2 = MvHypergeometric(m, n) + x = rand(d2) + @test isa(x, Vector{Int}) + @test sum(x) == n + @test all(x .>= 0) + @test all(x .<= m) + @test insupport(d2, x) + + # log pdf + x = [2, 1, 1] + @test pdf(d, x) ≈ 2/7 + @test logpdf(d, x) ≈ log(2/7) + @test logpdf(d, x) ≈ log(pdf(d, x)) + + x = rand(d, 100) + pv = pdf(d, x) + lp = logpdf(d, x) + for i in 1 : size(x, 2) + @test pv[i] ≈ pdf(d, x[:,i]) + @test lp[i] ≈ logpdf(d, x[:,i]) + end + + # test degenerate cases + d1 = MvHypergeometric([1], 1) + @test logpdf(d1, [1]) ≈ 0 + @test logpdf(d1, [0]) == -Inf + d2 = MvHypergeometric([2, 0], 1) + @test logpdf(d2, [1, 0]) ≈ 0 + @test logpdf(d2, [0, 1]) == -Inf + + d3 = MvHypergeometric([5, 0, 0, 0], 3) + @test logpdf(d3, [3, 0, 0, 0]) ≈ 0 + @test logpdf(d3, [2, 1, 0, 0]) == -Inf + @test logpdf(d3, [2, 0, 0, 0]) == -Inf + + # behavior with n = 0 + d0 = MvHypergeometric([5, 3, 2], 0) + @test logpdf(d0, [0, 0, 0]) ≈ 0 + @test logpdf(d0, [1, 0, 0]) == -Inf + + @test rand(d0) == [0, 0, 0] + @test mean(d0) == [0.0, 0.0, 0.0] + @test var(d0) == [0.0, 0.0, 0.0] + @test insupport(d0, [0, 0, 0]) + @test !insupport(d0, [1, 0, 0]) + @test length(d0) == 3 + + # compare with hypergeometric + ns = 3 + nf = 5 + n = 4 + dh1 = MvHypergeometric([ns, nf], n) + dh2 = Hypergeometric(ns, nf, n) + + x = 2 + @test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x) + x = 3 + @test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x) + + # comparing marginals to hypergeometric + m = [5, 3, 2] + n = 4 + d = MvHypergeometric(m, n) + dh = Hypergeometric(m[1], sum(m[2:end]), n) + x1 = 2 + @test pdf(dh, x1) ≈ sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]]) + + # comparing conditionals to hypergeometric + x1 = 2 + dh = Hypergeometric(m[2], m[3], n-x1) + q = sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]]) + for x2 = 0:m[2] + @test pdf(dh, x2) ≈ pdf(d, [x1, x2, n-x1-x2])/q + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index f63a3d5e8e..e08eda167f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -100,6 +100,7 @@ const tests = [ "univariate/continuous/triangular", "statsapi", "univariate/continuous/inversegaussian", + "multivariate/mvhypergeometric", ### missing files compared to /src: # "common",