Skip to content

Commit ecd526a

Browse files
Merge pull request #44 from yuta-nakahara/develop-hiddenmarkovnormal
Develop hiddenmarkovnormal
2 parents c74e32d + e9df92d commit ecd526a

File tree

7 files changed

+1849
-2
lines changed

7 files changed

+1849
-2
lines changed

bayesml/_check.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,30 @@ def nonneg_ints(val,val_name,exception_class):
4747
return val
4848
raise(exception_class(val_name + " must be int or a numpy.ndarray whose dtype is int. Its values must be non-negative (including 0)."))
4949

50+
def int_vec(val,val_name,exception_class):
51+
if type(val) is np.ndarray:
52+
if np.issubdtype(val.dtype,np.integer) and val.ndim == 1:
53+
return val
54+
raise(exception_class(val_name + " must be a 1-dimensional numpy.ndarray whose dtype is int."))
55+
5056
def nonneg_int_vec(val,val_name,exception_class):
5157
if type(val) is np.ndarray:
5258
if np.issubdtype(val.dtype,np.integer) and val.ndim == 1 and np.all(val>=0):
5359
return val
5460
raise(exception_class(val_name + " must be a 1-dimensional numpy.ndarray whose dtype is int. Its values must be non-negative (including 0)."))
5561

62+
def nonneg_int_vecs(val,val_name,exception_class):
63+
if type(val) is np.ndarray:
64+
if np.issubdtype(val.dtype,np.integer) and val.ndim >= 1 and np.all(val>=0):
65+
return val
66+
raise(exception_class(val_name + " must be a numpy.ndarray whose ndim >= 1 and dtype is int. Its values must be non-negative (including 0)."))
67+
68+
def nonneg_float_vec(val,val_name,exception_class):
69+
if type(val) is np.ndarray:
70+
if np.issubdtype(val.dtype,np.floating) and val.ndim == 1 and np.all(val>=0):
71+
return val
72+
raise(exception_class(val_name + " must be a 1-dimensional numpy.ndarray whose dtype is float. Its values must be non-negative (including 0)."))
73+
5674
def int_of_01(val,val_name,exception_class):
5775
if np.issubdtype(type(val),np.integer):
5876
if val == 0 or val ==1:
@@ -173,6 +191,14 @@ def float_vecs(val,val_name,exception_class):
173191
return val
174192
raise(exception_class(val_name + " must be a numpy.ndarray whose ndim >= 1."))
175193

194+
def pos_float_vecs(val,val_name,exception_class):
195+
if type(val) is np.ndarray:
196+
if np.issubdtype(val.dtype,np.integer) and val.ndim >= 1 and np.all(val>0):
197+
return val.astype(float)
198+
if np.issubdtype(val.dtype,np.floating) and val.ndim >= 1 and np.all(val>0.0):
199+
return val
200+
raise(exception_class(val_name + " must be a 1-dimensional numpy.ndarray. Its values must be positive (not including 0)"))
201+
176202
def float_vec_sum_1(val,val_name,exception_class):
177203
if type(val) is np.ndarray:
178204
if np.issubdtype(val.dtype,np.integer) and val.ndim == 1 and abs(val.sum() - 1.) <= _EPSILON:

bayesml/autoregressive/_autoregressive.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ class GenModel(base.Generative):
3030
h_mu_vec : numpy ndarray, optional
3131
a vector of real numbers, by default [0.0, 0.0, ... , 0.0]
3232
h_lambda_mat : numpy ndarray, optional
33-
a positibe definate matrix, by default the identity matrix
33+
a positive definate matrix, by default the identity matrix
3434
h_alpha : float, optional
3535
a positive real number, by default 1.0
3636
h_beta : float, optional
37-
a positibe real number, by default 1.0
37+
a positive real number, by default 1.0
3838
seed : {None, int}, optional
3939
A seed to initialize numpy.random.default_rng(),
4040
by default None
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Document Author
2+
# Ryohei Oka <o.ryohei07@gmail.com>
3+
r"""
4+
The hidden Markov model with the Gauss-Wishart prior distribution and the Dirichlet prior distribution.
5+
6+
The stochastic data generative model is as follows:
7+
8+
* :math:`K \in \mathbb{N}`: number of latent classes
9+
* :math:`\boldsymbol{z} \in \{ 0, 1 \}^K`: a one-hot vector representing the latent class (latent variable)
10+
* :math:`\boldsymbol{\pi} \in [0, 1]^K`: a parameter for latent classes, (:math:`\sum_{k=1}^K \pi_k=1`)
11+
* :math:`a_{j,k} \in [0,1]` : transition probability to latent state k under latent state j
12+
* :math:`\boldsymbol{a}_j = [a_{j,1}, a_{j,2}, \dots , a_{j,K}]\in [0,1]^K`, a vector of the transition probability (:math:`\sum_{k=1}^K a_{j,k}=1`)
13+
* :math:`\boldsymbol{A}=(a_{j,k})_{1\leq j,k\leq K} \in [0, 1]^{K\times K}`: a matrix of the transition probability
14+
* :math:`D \in \mathbb{N}`: a dimension of data
15+
* :math:`\boldsymbol{x} \in \mathbb{R}^D`: a data point
16+
* :math:`\boldsymbol{\mu}_k \in \mathbb{R}^D`: a parameter
17+
* :math:`\boldsymbol{\mu} = \{ \boldsymbol{\mu}_k \}_{k=1}^K`
18+
* :math:`\boldsymbol{\Lambda}_k \in \mathbb{R}^{D\times D}` : a parameter (a positive definite matrix)
19+
* :math:`\boldsymbol{\Lambda} = \{ \boldsymbol{\Lambda}_k \}_{k=1}^K`
20+
* :math:`| \boldsymbol{\Lambda}_k | \in \mathbb{R}`: the determinant of :math:`\boldsymbol{\Lambda}_k`
21+
22+
.. math::
23+
p(\boldsymbol{z}_{1} | \boldsymbol{\pi}) &= \mathrm{Cat}(\boldsymbol{z}_{1}|\boldsymbol{\pi}) = \prod_{k=1}^K \pi_k^{z_{1,k}},\\
24+
p(\boldsymbol{z}_{n} |\boldsymbol{z}_{n-1} ,\boldsymbol{A}) &= \prod_{k=1}^K \prod_{j=1}^K a_{j,k}^{z_{n-1,j}z_{n,k}},\\
25+
p(\boldsymbol{x}_{n} | \boldsymbol{\mu}, \boldsymbol{\Lambda}, \boldsymbol{z}_{n}) &= \prod_{k=1}^K \mathcal{N}(\boldsymbol{x}|\boldsymbol{\mu}_k,\boldsymbol{\Lambda}_k^{-1})^{z_{n,k}} \\
26+
&= \prod_{k=1}^K \left( \frac{| \boldsymbol{\Lambda}_{k} |^{1/2}}{(2\pi)^{D/2}} \exp \left\{ -\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_{k})^\top \boldsymbol{\Lambda}_{k} (\boldsymbol{x}-\boldsymbol{\mu}_{k}) \right\} \right)^{z_{n,k}},
27+
28+
The prior distribution is as follows:
29+
30+
* :math:`\boldsymbol{m}_0 \in \mathbb{R}^{D}`: a hyperparameter
31+
* :math:`\kappa_0 \in \mathbb{R}_{>0}`: a hyperparameter
32+
* :math:`\nu_0 \in \mathbb{R}`: a hyperparameter (:math:`\nu_0 > D-1`)
33+
* :math:`\boldsymbol{W}_0 \in \mathbb{R}^{D\times D}`: a hyperparameter (a positive definite matrix)
34+
* :math:`\boldsymbol{\eta}_0 \in \mathbb{R}_{> 0}^K`: a hyperparameter
35+
* :math:`\boldsymbol{\zeta}_{0,j} \in \mathbb{R}_{> 0}^K`: a hyperparameter
36+
* :math:`\mathrm{Tr} \{ \cdot \}`: a trace of a matrix
37+
* :math:`\Gamma (\cdot)`: the gamma function
38+
39+
.. math::
40+
p(\boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi},\boldsymbol{A}) &= \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_0,(\kappa_0 \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_0, \nu_0) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_0) \prod_{j=1}^{K}\mathrm{Dir}(\boldsymbol{a}_{j}|\boldsymbol{\zeta}_{0,j}), \\
41+
&= \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_0}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_0}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_0)^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_0) \right\} \\
42+
&\qquad \times B(\boldsymbol{W}_0, \nu_0) | \boldsymbol{\Lambda}_k |^{(\nu_0 - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ \boldsymbol{W}_0^{-1} \boldsymbol{\Lambda}_k \} \right\}\biggl] \\
43+
&\qquad \times \Biggl[ \prod_{k=1}^KC(\boldsymbol{\eta}_0)\pi_k^{\eta_{0,k}-1}\biggl]\\
44+
&\qquad \times \biggl[\prod_{j=1}^KC(\boldsymbol{\zeta}_{0,j})\prod_{k=1}^K a_{j,k}^{\zeta_{0,j,k}-1}\Biggr],\\
45+
46+
where :math:`B(\boldsymbol{W}_0, \nu_0)` and :math:`C(\boldsymbol{\eta}_0)` are defined as follows:
47+
48+
.. math::
49+
B(\boldsymbol{W}_0, \nu_0) &= | \boldsymbol{W}_0 |^{-\nu_0 / 2} \left( 2^{\nu_0 D / 2} \pi^{D(D-1)/4} \prod_{i=1}^D \Gamma \left( \frac{\nu_0 + 1 - i}{2} \right) \right)^{-1}, \\
50+
C(\boldsymbol{\eta}_0) &= \frac{\Gamma(\sum_{k=1}^K \eta_{0,k})}{\Gamma(\eta_{0,1})\cdots\Gamma(\eta_{0,K})},\\
51+
C(\boldsymbol{\zeta}_{0,j}) &= \frac{\Gamma(\sum_{k=1}^K \zeta_{0,j,k})}{\Gamma(\zeta_{0,j,1})\cdots\Gamma(\zeta_{0,j,K})}.
52+
53+
The apporoximate posterior distribution in the :math:`t`-th iteration of a variational Bayesian method is as follows:
54+
55+
* :math:`\boldsymbol{x}^n = (\boldsymbol{x}_1, \boldsymbol{x}_2, \dots , \boldsymbol{x}_n) \in \mathbb{R}^{D \times n}`: given data
56+
* :math:`\boldsymbol{z}^n = (\boldsymbol{z}_1, \boldsymbol{z}_2, \dots , \boldsymbol{z}_n) \in \{ 0, 1 \}^{K \times n}`: latent classes of given data
57+
* :math:`\boldsymbol{m}_{n,k}^{(t)} \in \mathbb{R}^{D}`: a hyperparameter
58+
* :math:`\kappa_{n,k}^{(t)} \in \mathbb{R}_{>0}`: a hyperparameter
59+
* :math:`\nu_{n,k}^{(t)} \in \mathbb{R}`: a hyperparameter :math:`(\nu_n > D-1)`
60+
* :math:`\boldsymbol{W}_{n,k}^{(t)} \in \mathbb{R}^{D\times D}`: a hyperparameter (a positive definite matrix)
61+
* :math:`\boldsymbol{\eta}_n^{(t)} \in \mathbb{R}_{> 0}^K`: a hyperparameter
62+
* :math:`\boldsymbol{\zeta}_{n,j}^{(t)} \in \mathbb{R}_{> 0}^K`: a hyperparameter
63+
64+
.. math::
65+
&q(\boldsymbol{z}^n, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi},\boldsymbol{A}) \nonumber \\
66+
&= q^{(t)}(\boldsymbol{z}^n) \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_n^{(t)})\left\{\prod_{j=1}^K\mathrm{Dir}(\boldsymbol{a}_j|\boldsymbol{\zeta}_{n,j}^{(t)})\right\}, \\
67+
&= q^{(t)}(\boldsymbol{z}^n) \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \\
68+
&\qquad \times B(\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) | \boldsymbol{\Lambda}_k |^{(\nu_{n,k}^{(t)} - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ ( \boldsymbol{W}_{n,k}^{(t)} )^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \\
69+
&\qquad \times C(\boldsymbol{\eta}_n^{(t)})\prod_{k=1}^K \pi_k^{\eta_{n,k}^{(t)}-1}\left[\prod_{j=1}^K C(\boldsymbol{\zeta}_{n,j}^{(t)})\prod_{k=1}^K a_{j,k}^{\zeta_{n,j,k}^{(t)}-1}\right],\\
70+
71+
where the updating rule of the hyperparameters is as follows.
72+
73+
.. math::
74+
N_k^{(t)} &= \sum_{i=1}^n \gamma^{(t)}_{i,k}, \\
75+
M_{j,k}^{(t)} &= \sum_{i=2}^n \xi^{(t)}_{i,j,k},\\
76+
\bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n \gamma^{(t)}_{i,k} \boldsymbol{x}_i, \\
77+
S_k^{(t)} &= \frac{1}{N_k^{(t)}}\sum_{i=1}^n \gamma^{(t)}_{i,k} (x_i-\bar{\boldsymbol{x}}_k^{(t)})(x_i-\bar{\boldsymbol{x}}_k^{(t)})^{\top},\\
78+
\boldsymbol{m}_{n,k}^{(t+1)} &= \frac{\kappa_0\boldsymbol{\mu}_0 + N_k^{(t)} \bar{\boldsymbol{x}}_k^{(t)}}{\kappa_0 + N_k^{(t)}}, \\
79+
\kappa_{n,k}^{(t+1)} &= \kappa_0 + N_k^{(t)}, \\
80+
(\boldsymbol{W}_{n,k}^{(t+1)})^{-1} &= \boldsymbol{W}_0^{-1} + N_k^{(t)}S_k^{(t)} + \frac{\kappa_0 N_k^{(t)}}{\kappa_0 + N_k^{(t)}}(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{\mu}_0)(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{\mu}_0)^\top, \\
81+
\nu_{n,k}^{(t+1)} &= \nu_0 + N_k^{(t)},\\
82+
\eta_{n,k}^{(t+1)} &= \eta_{0,k} + \gamma^{(t)}_{1,k}, \\
83+
\zeta_{n,j,k}^{(t+1)} &= \zeta_{0,j,k}+M_{j,k}^{(t)}.
84+
85+
The approximate posterior distribution of the latent variable :math:`q^{(t+1)}(z^n)` is calculated by the forward-backward algorithm as follows.
86+
87+
.. math::
88+
\ln \rho_{i,k}^{(t+1)} &= \frac{1}{2} \Biggl[\, \sum_{d=1}^D \psi \left( \frac{\nu_{n,k}^{(t+1)} + 1 - d}{2} \right) + D \ln 2 + \ln | \boldsymbol{W}_{n,k}^{(t+1)} | \notag \\
89+
&\qquad - D \ln (2 \pi ) - \frac{D}{\kappa_{n,k}^{(t+1)}} - \nu_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)})^\top \boldsymbol{W}_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)}) \Biggr], \\
90+
\ln \tilde{\pi}_k^{(t+1)} &= \psi (\eta_{n,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k=1}^K \eta_{n,k}^{(t+1)} \right) \\
91+
\ln \tilde{a}_{j,k}^{(t+1)} &= \psi (\zeta_{n,j,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k=1}^K \zeta_{n,j,k}^{(t+1)} \right) \\
92+
\alpha^{(t+1)} (\boldsymbol{z}_i) &\propto
93+
\begin{cases}
94+
\prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \sum_{\boldsymbol{z}_{i-1}} \left[\prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}}\alpha^{(t+1)}(\boldsymbol{z}_{i-1})\right] & (i>1)\\
95+
\prod_{k=1}^{K}\left( \rho_{1,k}^{(t+1)} \tilde{\pi}_k^{(t+1)} \right)^{z_{1,k}} & (i=1)
96+
\end{cases} \\
97+
\beta^{(t+1)} (\boldsymbol{z}_i) &\propto
98+
\begin{cases}
99+
\sum_{\boldsymbol{z}_{i+1}} \left[ \prod_{k=1}^{K} \left( \rho_{i+1,k}^{(t+1)}\right)^{z_{i+1,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i,j}z_{i+1,k}}\beta^{(t+1)}(\boldsymbol{z}_{i+1})\right] & (i<n)\\
100+
1 & (i=n)
101+
\end{cases} \\
102+
q^{(t+1)}(\boldsymbol{z}_i) &\propto \alpha^{(t+1)}(\boldsymbol{z}_i)\beta^{(t+1)}(\boldsymbol{z}_i) \\
103+
\gamma^{(t+1)}_{i,k} &= \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_i) z_{i,k}\\
104+
q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) &\propto \alpha^{(t+1)}(\boldsymbol{z}_{i-1}) \prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}} \beta^{(t+1)}(\boldsymbol{z}_i) \\
105+
\xi^{(t+1)}_{i,j,k} &= \sum_{\boldsymbol{z}_{i-1}} \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) z_{i-1,j} z_{i,k}
106+
107+
The approximate predictive distribution is as follows:
108+
109+
* :math:`\boldsymbol{x}_{n+1} \in \mathbb{R}^D`: a new data point
110+
* :math:`(a_{\mathrm{p},j,k})_{1\leq j,k\leq K} \in [0, 1]^{K\times K}`: the parameters of the predictive transition probability of latent classes, (:math:`\sum_{k=1}^K a_{\mathrm{p},j,k}=1`)
111+
* :math:`\boldsymbol{\mu}_{\mathrm{p},k} \in \mathbb{R}^D`: the parameter of the predictive distribution
112+
* :math:`\boldsymbol{\Lambda}_{\mathrm{p},k} \in \mathbb{R}^{D \times D}`: the parameter of the predictive distribution (a positive definite matrix)
113+
* :math:`\nu_{\mathrm{p},k} \in \mathbb{R}_{>0}`: the parameter of the predictive distribution
114+
115+
.. math::
116+
&p(x_{n+1}|x^n) \\
117+
&\approx \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right) \mathrm{St}(x_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
118+
&= \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right)\Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \nonumber \\
119+
&\qquad \qquad \qquad \qquad \qquad \times \left( 1 + \frac{1}{\nu_{\mathrm{p},k}} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k})^\top \boldsymbol{\Lambda}_{\mathrm{p},k} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k}) \right)^{-\nu_{\mathrm{p},k}/2 - D/2} \Biggr],
120+
121+
where the parameters are obtained from the hyperparameters of the predictive distribution as follows:
122+
123+
.. math::
124+
a_{\mathrm{p},j,k} &= \frac{\zeta_{n,j,k}^{(t)}}{\sum_{k=1}^K \zeta_{n,j,k}^{(t)}}, \\
125+
\boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)}, \\
126+
\boldsymbol{\Lambda}_{\mathrm{p},k} &= \frac{\kappa_{n,k}^{(t)} (\nu_{n,k}^{(t)} - D + 1)}{\kappa_{n,k}^{(t)} + 1} \boldsymbol{W}_{n,k}^{(t)}, \\
127+
\nu_{\mathrm{p},k} &= \nu_{n,k}^{(t)} - D + 1.
128+
"""
129+
from ._hiddenmarkovnormal import GenModel
130+
from ._hiddenmarkovnormal import LearnModel
131+
132+
__all__ = ["GenModel","LearnModel"]

0 commit comments

Comments
 (0)