Skip to content

Commit a13ad15

Browse files
committed
Revise posterior and predictive distribution
1 parent 76667f1 commit a13ad15

File tree

1 file changed

+39
-27
lines changed

1 file changed

+39
-27
lines changed

bayesml/hiddenmarkovnormal/hiddenmarkovnormal.md

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ The stochastic data generative model is as follows:
99
* $K \in \mathbb{N}$: number of latent classes
1010
* $\boldsymbol{z} \in \{ 0, 1 \}^K$: a one-hot vector representing the latent class (latent variable)
1111
* $\boldsymbol{\pi} \in [0, 1]^K$: a parameter for latent classes, ($\sum_{k=1}^K \pi_k=1$)
12-
* $a_{jk}$ : transition probability to latent state k under latent state j
13-
* $\boldsymbol{A}=(a_{jk})_{0\leq j,k\leq K} \in [0, 1]^{K\times K}$: a parameter for latent classes, ($\sum_{k=1}^K a_{jk}=1$)
12+
* $a_{j,k} \in [0,1]$: transition probability to latent state $k$ under latent state $j$
13+
* $\boldsymbol{a}_j = [a_{j,1}, a_{j,2}, \dots , a_{j,K}]\in [0,1]^K$: a vector of the transition probabilities from latent state $j$ ($\sum_{k=1}^K a_{j,k}=1$)
14+
* $\boldsymbol{A}=(a_{j,k})_{1\leq j,k\leq K} \in [0, 1]^{K\times K}$: a matrix of the transition probability
1415
* $D \in \mathbb{N}$: a dimension of data
1516
* $\boldsymbol{x} \in \mathbb{R}^D$: a data point
1617
* $\boldsymbol{\mu}_k \in \mathbb{R}^D$: a parameter
@@ -22,7 +23,7 @@ The stochastic data generative model is as follows:
2223
$$
2324
\begin{align}
2425
p(\boldsymbol{z}_{1} | \boldsymbol{\pi}) &= \mathrm{Cat}(\boldsymbol{z}_{1}|\boldsymbol{\pi}) = \prod_{k=1}^K \pi_k^{z_{1,k}},\\
25-
p(\boldsymbol{z}_{n} |\boldsymbol{z}_{n-1} ,\boldsymbol{A}) &= \prod_{k=1}^K \prod_{j=1}^K a_{jk}^{z_{n-1,j}z_{n,k}},\\
26+
p(\boldsymbol{z}_{n} |\boldsymbol{z}_{n-1} ,\boldsymbol{A}) &= \prod_{k=1}^K \prod_{j=1}^K a_{j,k}^{z_{n-1,j}z_{n,k}},\\
2627
p(\boldsymbol{x}_{n} | \boldsymbol{\mu}, \boldsymbol{\Lambda}, \boldsymbol{z}_{n}) &= \prod_{k=1}^K \mathcal{N}(\boldsymbol{x}_{n}|\boldsymbol{\mu}_k,\boldsymbol{\Lambda}_k^{-1})^{z_{n,k}} \\
2728
&= \prod_{k=1}^K \left( \frac{| \boldsymbol{\Lambda}_{k} |^{1/2}}{(2\pi)^{D/2}} \exp \left\{ -\frac{1}{2}(\boldsymbol{x}_{n}-\boldsymbol{\mu}_{k})^\top \boldsymbol{\Lambda}_{k} (\boldsymbol{x}_{n}-\boldsymbol{\mu}_{k}) \right\} \right)^{z_{n,k}},
2829
\end{align}
@@ -45,7 +46,7 @@ $$
4546
&= \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_0}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_0}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_0)^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_0) \right\} \\
4647
&\qquad \times B(\boldsymbol{W}_0, \nu_0) | \boldsymbol{\Lambda}_k |^{(\nu_0 - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ \boldsymbol{W}_0^{-1} \boldsymbol{\Lambda}_k \} \right\}\Biggr] \\
4748
&\qquad \times \Biggl[ C(\boldsymbol{\eta}_0)\prod_{k=1}^K\pi_k^{\eta_{0,k}-1}\Biggr]\\
48-
&\qquad \times \biggl[\prod_{j=1}^KC(\boldsymbol{\zeta}_{0,j})\prod_{k=1}^K a_{jk}^{\zeta_{0,j,k}-1}\Biggr],\\
49+
&\qquad \times \Biggl[\prod_{j=1}^KC(\boldsymbol{\zeta}_{0,j})\prod_{k=1}^K a_{j,k}^{\zeta_{0,j,k}-1}\Biggr],\\
4950
\end{align}
5051
$$
5152

@@ -71,8 +72,9 @@ The approximate posterior distribution in the $t$-th iteration of a variational
7172
* $\boldsymbol{\zeta}_{n,j}^{(t)} \in \mathbb{R}_{> 0}^K$: a hyperparameter
7273
$$
7374
\begin{align}
74-
q(\boldsymbol{z}^{n+1}, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi}) &= q(\boldsymbol{z}^{n+1}) \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_n^{(t)})\left\{\prod_{j=1}^K\mathrm{Dir}(\boldsymbol{a}_j|\boldsymbol{\zeta}_{n,j}^{(t)})\right\}, \\
75-
&= q(\boldsymbol{z}^{n+1}) \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \\
75+
&q(\boldsymbol{z}^n, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi},\boldsymbol{A}) \nonumber \\
76+
&= q^{(t)}(\boldsymbol{z}^n) \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_n^{(t)})\left\{\prod_{j=1}^K\mathrm{Dir}(\boldsymbol{a}_j|\boldsymbol{\zeta}_{n,j}^{(t)})\right\}, \\
77+
&= q^{(t)}(\boldsymbol{z}^n) \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \\
7678
&\qquad \times B(\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) | \boldsymbol{\Lambda}_k |^{(\nu_{n,k}^{(t)} - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ ( \boldsymbol{W}_{n,k}^{(t)} )^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \\
7779
&\qquad \times C(\boldsymbol{\eta}_n^{(t)})\prod_{k=1}^K \pi_k^{\eta_{n,k}^{(t)}-1}\left[\prod_{j=1}^K C(\boldsymbol{\zeta}_{n,j}^{(t)})\prod_{k=1}^K a_{j,k}^{\zeta_{n,j,k}^{(t)}-1}\right],\\
7880
\end{align}
@@ -82,47 +84,57 @@ where the updating rule of the hyperparameters is as follows.
8284

8385
$$
8486
\begin{align}
85-
N_k^{(t)} &= \sum_{i=1}^n q(z_{i,k})^{(t)}, \\
86-
M_{j,k}^{(t)} &= \sum_{i=2}^{n+1}q(z_{i-1,j}z_{i,k})^{(t)},\\
87-
\bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n q(z_{i,k})^{(t)} \boldsymbol{x}_i, \\
88-
S_k^{(t)} &= \frac{1}{N_k}\sum_{i=1}^nq(z_{i,k})^{(t)}(x_i-\bar{\boldsymbol{x}}_k^{(t)})(x_i-\bar{\boldsymbol{x}}_k^{(t)})^{\top},\\
87+
N_k^{(t)} &= \sum_{i=1}^n \gamma^{(t)}_{i,k}, \\
88+
M_{j,k}^{(t)} &= \sum_{i=2}^n \xi^{(t)}_{i,j,k},\\
89+
\bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n \gamma^{(t)}_{i,k} \boldsymbol{x}_i, \\
90+
S_k^{(t)} &= \frac{1}{N_k^{(t)}}\sum_{i=1}^n \gamma^{(t)}_{i,k} (\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})(\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})^{\top},\\
8991
\boldsymbol{m}_{n,k}^{(t+1)} &= \frac{\kappa_0\boldsymbol{m}_0 + N_k^{(t)} \bar{\boldsymbol{x}}_k^{(t)}}{\kappa_0 + N_k^{(t)}}, \\
9092
\kappa_{n,k}^{(t+1)} &= \kappa_0 + N_k^{(t)}, \\
9193
(\boldsymbol{W}_{n,k}^{(t+1)})^{-1} &= \boldsymbol{W}_0^{-1} + N_k^{(t)}S_k^{(t)} + \frac{\kappa_0 N_k^{(t)}}{\kappa_0 + N_k^{(t)}}(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)^\top, \\
9294
\nu_{n,k}^{(t+1)} &= \nu_0 + N_k^{(t)},\\
93-
\eta_{n,k}^{(t+1)} &= \eta_{0,k} + q(z_{0,k})^{(t)}, \\
95+
\eta_{n,k}^{(t+1)} &= \eta_{0,k} + \gamma^{(t)}_{1,k}, \\
9496
\zeta_{n,j,k}^{(t+1)} &= \zeta_{0,j,k}+M_{j,k}^{(t)}.
9597
\end{align}
9698
$$
9799

98-
The forward-backward algorithm is used to calculate the approximate posterior distribution of the latent variable.
100+
The approximate posterior distribution of the latent variable $q^{(t+1)}(\boldsymbol{z}^n)$ is calculated by the forward-backward algorithm as follows.
99101

100102
$$
101-
q(z_n)^{(t)} = \frac{1}{\sum_{\bm{z}_{N+1}}\alpha(\bm{z}_{N+1})^{(t)}}\alpha(\bm{z}_n)^{(t)}\beta(\bm{z}_n)^{(t)}
102-
$$
103-
104-
where
105-
106-
$$
107-
\begin{cases}
108-
\alpha(\bm{z}_n)^{(t)}&=\prod_{k=1}^{K} \{\rho_{\text{out},n,k}^{(t)}\}^{z_{1,k}}\sum_{\bm{z}_{n-1}}\left[\prod_{k=1}^{K}\prod_{j=1}^{K}\{\rho^{(t)}_{\text{trans},j,k}\}^{z_{n-1,j}z_{n,k}}\right]\alpha(\bm{z}_{n-1})^{(t)}\\
109-
\alpha(\bm{z}_1)^{(t)}&=\prod_{k=1}^{K}\left\{\rho_{\text{init},k}^{(t)}\rho_{\text{out},1,k}^{(t)}\right\}^{z_{1,k}}\\
110-
\beta(\bm{z}_n)^{(t)}&=\sum_{\bm{z}_{t+1}}\left[\prod_{k=1}^{K} \{\rho_{\text{out},n+1,k}^{(t)}\}^{z_{n+1,k}}\sum_{\bm{z}_{n+1}}\left[\prod_{k=1}^{K}\prod_{j=1}^{K}\{\rho^{(t)}_{\text{trans},j,k}\}^{z_{n,j}z_{n+1,k}}\right]\beta(\bm{z}_{n+1})^{(t)}\right]
111-
\end{cases}
103+
\begin{align}
104+
\ln \rho_{i,k}^{(t+1)} &= \frac{1}{2} \Biggl[\, \sum_{d=1}^D \psi \left( \frac{\nu_{n,k}^{(t+1)} + 1 - d}{2} \right) + D \ln 2 + \ln | \boldsymbol{W}_{n,k}^{(t+1)} | \notag \\
105+
&\qquad - D \ln (2 \pi ) - \frac{D}{\kappa_{n,k}^{(t+1)}} - \nu_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)})^\top \boldsymbol{W}_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)}) \Biggr], \\
106+
\ln \tilde{\pi}_k^{(t+1)} &= \psi (\eta_{n,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k'=1}^K \eta_{n,k'}^{(t+1)} \right), \\
107+
\ln \tilde{a}_{j,k}^{(t+1)} &= \psi (\zeta_{n,j,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k'=1}^K \zeta_{n,j,k'}^{(t+1)} \right), \\
108+
\alpha^{(t+1)} (\boldsymbol{z}_i) &\propto
109+
\begin{cases}
110+
\prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \sum_{\boldsymbol{z}_{i-1}} \left[\prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}}\alpha^{(t+1)}(\boldsymbol{z}_{i-1})\right] & (i>1)\\
111+
\prod_{k=1}^{K}\left( \rho_{1,k}^{(t+1)} \tilde{\pi}_k^{(t+1)} \right)^{z_{1,k}} & (i=1)
112+
\end{cases} \\
113+
\beta^{(t+1)} (\boldsymbol{z}_i) &\propto
114+
\begin{cases}
115+
\sum_{\boldsymbol{z}_{i+1}} \left[ \prod_{k=1}^{K} \left( \rho_{i+1,k}^{(t+1)}\right)^{z_{i+1,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i,j}z_{i+1,k}}\beta^{(t+1)}(\boldsymbol{z}_{i+1})\right] & (i<n)\\
116+
1 & (i=n)
117+
\end{cases} \\
118+
q^{(t+1)}(\boldsymbol{z}_i) &\propto \alpha^{(t+1)}(\boldsymbol{z}_i)\beta^{(t+1)}(\boldsymbol{z}_i) \\
119+
\gamma^{(t+1)}_{i,k} &= \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_i) z_{i,k}\\
120+
q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) &\propto \alpha^{(t+1)}(\boldsymbol{z}_{i-1}) \prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}} \beta^{(t+1)}(\boldsymbol{z}_i) \\
121+
\xi^{(t+1)}_{i,j,k} &= \sum_{\boldsymbol{z}_{i-1}} \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) z_{i-1,j} z_{i,k}
122+
\end{align}
112123
$$
113124

114125
The approximate predictive distribution is as follows:
115126

116127
* $\boldsymbol{x}_{n+1} \in \mathbb{R}^D$: a new data point
128+
* $(a_{\mathrm{p},j,k})_{1\leq j,k\leq K} \in [0, 1]^{K\times K}$: the parameters of the predictive transition probability of latent classes, ($\sum_{k=1}^K a_{\mathrm{p},j,k}=1$)
117129
* $\boldsymbol{\mu}_{\mathrm{p},k} \in \mathbb{R}^D$: the parameter of the predictive distribution
118130
* $\boldsymbol{\Lambda}_{\mathrm{p},k} \in \mathbb{R}^{D \times D}$: the parameter of the predictive distribution (a positive definite matrix)
119131
* $\nu_{\mathrm{p},k} \in \mathbb{R}_{>0}$: the parameter of the predictive distribution
120132

121133
$$
122134
\begin{align}
123135
&p(\boldsymbol{x}_{n+1}|\boldsymbol{x}^n) \\
124-
&\simeq \frac{1}{\sum_{k=1}^K \alpha(z_{n+1,k})^{(t)}} \sum_{k=1}^K \alpha(z_{n+1,k})^{(t)} \mathrm{St}(x_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
125-
&= \frac{1}{\sum_{k=1}^K \alpha(z_{n+1,k})^{(t)}} \sum_{k=1}^K \alpha(z_{n+1,k})^{(t)}\Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \nonumber \\
136+
&\approx \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right) \mathrm{St}(\boldsymbol{x}_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
137+
&= \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right)\Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \nonumber \\
126138
&\qquad \qquad \qquad \qquad \qquad \times \left( 1 + \frac{1}{\nu_{\mathrm{p},k}} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k})^\top \boldsymbol{\Lambda}_{\mathrm{p},k} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k}) \right)^{-\nu_{\mathrm{p},k}/2 - D/2} \Biggr],
127139
\end{align}
128140
$$
@@ -131,8 +143,8 @@ where the parameters are obtained from the hyperparameters of the predictive dis
131143

132144
$$
133145
\begin{align}
134-
\alpha(\boldsymbol{z}_{n+1})^{(t)}&=\sum_{\boldsymbol{z}^{n}}q(\boldsymbol{z}^{n+1})^{(t)}\\
135-
\boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)} \\
146+
a_{\mathrm{p},j,k} &= \frac{\zeta_{n,j,k}^{(t)}}{\sum_{k'=1}^K \zeta_{n,j,k'}^{(t)}}, \\
147+
\boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)}, \\
136148
\boldsymbol{\Lambda}_{\mathrm{p},k} &= \frac{\kappa_{n,k}^{(t)} (\nu_{n,k}^{(t)} - D + 1)}{\kappa_{n,k}^{(t)} + 1} \boldsymbol{W}_{n,k}^{(t)}, \\
137149
\nu_{\mathrm{p},k} &= \nu_{n,k}^{(t)} - D + 1.
138150
\end{align}

0 commit comments

Comments
 (0)