
Commit cd82991

Merge pull request #15 from yuta-nakahara/develop-gaussianmixture-resume
Develop gaussianmixture resume
2 parents 0ff39a8 + 13fced1 commit cd82991

2 files changed, +236 −0 lines changed
<!-- Document Author
Yuta Nakahara <yuta.nakahara@aoni.waseda.jp>
-->

The Gaussian mixture model with the Gauss-Wishart prior distribution and the Dirichlet prior distribution.

The stochastic data generative model is as follows:

* $K \in \mathbb{N}$: the number of latent classes
* $\boldsymbol{z} \in \{ 0, 1 \}^K$: a one-hot vector representing the latent class (latent variable)
* $\boldsymbol{\pi} \in [0, 1]^K$: a parameter for latent classes ($\sum_{k=1}^K \pi_k=1$)
* $D \in \mathbb{N}$: the dimension of a data point
* $\boldsymbol{x} \in \mathbb{R}^D$: a data point
* $\boldsymbol{\mu}_k \in \mathbb{R}^D$: a parameter
* $\boldsymbol{\mu} = \{ \boldsymbol{\mu}_k \}_{k=1}^K$
* $\boldsymbol{\Lambda}_k \in \mathbb{R}^{D\times D}$: a parameter (a positive definite matrix)
* $\boldsymbol{\Lambda} = \{ \boldsymbol{\Lambda}_k \}_{k=1}^K$
* $| \boldsymbol{\Lambda}_k | \in \mathbb{R}$: the determinant of $\boldsymbol{\Lambda}_k$

$$
\begin{align}
p(\boldsymbol{z} | \boldsymbol{\pi}) &= \mathrm{Cat}(\boldsymbol{z}|\boldsymbol{\pi}) = \prod_{k=1}^K \pi_k^{z_k},\\
p(\boldsymbol{x} | \boldsymbol{\mu}, \boldsymbol{\Lambda}, \boldsymbol{z}) &= \prod_{k=1}^K \mathcal{N}(\boldsymbol{x}|\boldsymbol{\mu}_k,\boldsymbol{\Lambda}_k^{-1})^{z_k} \\
&= \prod_{k=1}^K \left( \frac{| \boldsymbol{\Lambda}_k |^{1/2}}{(2\pi)^{D/2}} \exp \left\{ -\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu}_k)^\top \boldsymbol{\Lambda}_k (\boldsymbol{x}-\boldsymbol{\mu}_k) \right\} \right)^{z_k}.
\end{align}
$$
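
For concreteness, the following is a minimal NumPy sketch of this generative process. The values of `pi`, `mu`, and `Lam` are arbitrary illustrative choices (they are parameters of the model, not part of its definition), and the helper `sample` is hypothetical, not part of the library this document accompanies.

```python
import numpy as np

rng = np.random.default_rng(0)

# Illustrative parameter values: K = 2 latent classes, D = 2 dimensions.
pi = np.array([0.4, 0.6])                     # mixing proportions, sum to 1
mu = np.array([[-2.0, 0.0], [3.0, 1.0]])      # mu_k, shape (K, D)
Lam = np.stack([np.eye(2), 0.5 * np.eye(2)])  # Lambda_k (precision matrices), shape (K, D, D)

def sample(n):
    """Draw n data points from p(z | pi) p(x | mu, Lambda, z)."""
    z = rng.choice(len(pi), size=n, p=pi)     # latent class index (one-hot z collapsed to an index)
    cov = np.linalg.inv(Lam)                  # covariance = Lambda_k^{-1}, computed for every k
    x = np.array([rng.multivariate_normal(mu[k], cov[k]) for k in z])
    return x, z

x, z = sample(500)
print(x.shape, np.bincount(z))                # (500, 2) and the number of draws per class
```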

The prior distribution is as follows:

* $\boldsymbol{m}_0 \in \mathbb{R}^{D}$: a hyperparameter
* $\kappa_0 \in \mathbb{R}_{>0}$: a hyperparameter
* $\nu_0 \in \mathbb{R}$: a hyperparameter ($\nu_0 > D-1$)
* $\boldsymbol{W}_0 \in \mathbb{R}^{D\times D}$: a hyperparameter (a positive definite matrix)
* $\boldsymbol{\alpha}_0 \in \mathbb{R}_{> 0}^K$: a hyperparameter
* $\mathrm{Tr} \{ \cdot \}$: the trace of a matrix
* $\Gamma (\cdot)$: the gamma function

$$
\begin{align}
p(\boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi}) &= \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_0,(\kappa_0 \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_0, \nu_0) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\alpha}_0) \\
&= \Biggl[\, \prod_{k=1}^K \left( \frac{\kappa_0}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_0}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_0)^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_0) \right\} \notag \\
&\qquad \times B(\boldsymbol{W}_0, \nu_0) | \boldsymbol{\Lambda}_k |^{(\nu_0 - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ \boldsymbol{W}_0^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \notag \\
&\qquad \times C(\boldsymbol{\alpha}_0)\prod_{k=1}^K \pi_k^{\alpha_{0,k}-1},
\end{align}
$$

where $B(\boldsymbol{W}_0, \nu_0)$ and $C(\boldsymbol{\alpha}_0)$ are defined as follows:

$$
\begin{align}
B(\boldsymbol{W}_0, \nu_0) &= | \boldsymbol{W}_0 |^{-\nu_0 / 2} \left( 2^{\nu_0 D / 2} \pi^{D(D-1)/4} \prod_{i=1}^D \Gamma \left( \frac{\nu_0 + 1 - i}{2} \right) \right)^{-1}, \\
C(\boldsymbol{\alpha}_0) &= \frac{\Gamma(\sum_{k=1}^K \alpha_{0,k})}{\Gamma(\alpha_{0,1})\cdots\Gamma(\alpha_{0,K})}.
\end{align}
$$
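
Both normalizing constants are easiest to handle in log form. The sketch below is illustrative, assuming $\nu_0 > D-1$ and a positive definite $\boldsymbol{W}_0$; the helper names `log_B` and `log_C` are hypothetical, not library functions.

```python
import numpy as np
from scipy.special import gammaln, multigammaln

def log_B(W, nu):
    """log B(W, nu): the Wishart normalizing constant in log form."""
    D = W.shape[0]
    # multigammaln(nu/2, D) = log[ pi^{D(D-1)/4} * prod_{i=1}^D Gamma((nu + 1 - i)/2) ]
    return (-0.5 * nu * np.linalg.slogdet(W)[1]
            - 0.5 * nu * D * np.log(2.0)
            - multigammaln(0.5 * nu, D))

def log_C(alpha):
    """log C(alpha): the Dirichlet normalizing constant in log form."""
    return gammaln(np.sum(alpha)) - np.sum(gammaln(alpha))

# Illustrative hyperparameter values for D = 2 and K = 3.
print(log_B(np.eye(2), 2.0), log_C(np.full(3, 0.5)))
```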

The approximate posterior distribution in the $t$-th iteration of a variational Bayesian method is as follows:

* $\boldsymbol{x}^n = (\boldsymbol{x}_1, \boldsymbol{x}_2, \dots , \boldsymbol{x}_n) \in \mathbb{R}^{D \times n}$: given data
* $\boldsymbol{z}^n = (\boldsymbol{z}_1, \boldsymbol{z}_2, \dots , \boldsymbol{z}_n) \in \{ 0, 1 \}^{K \times n}$: latent classes of given data
* $\boldsymbol{r}_i^{(t)} = (r_{i,1}^{(t)}, r_{i,2}^{(t)}, \dots , r_{i,K}^{(t)}) \in [0, 1]^K$: a parameter for the latent class of the $i$-th data point ($\sum_{k=1}^K r_{i, k}^{(t)} = 1$)
* $\boldsymbol{m}_{n,k}^{(t)} \in \mathbb{R}^{D}$: a hyperparameter
* $\kappa_{n,k}^{(t)} \in \mathbb{R}_{>0}$: a hyperparameter
* $\nu_{n,k}^{(t)} \in \mathbb{R}$: a hyperparameter ($\nu_{n,k}^{(t)} > D-1$)
* $\boldsymbol{W}_{n,k}^{(t)} \in \mathbb{R}^{D\times D}$: a hyperparameter (a positive definite matrix)
* $\boldsymbol{\alpha}_n^{(t)} \in \mathbb{R}_{> 0}^K$: a hyperparameter
* $\psi (\cdot)$: the digamma function

$$
\begin{align}
q(\boldsymbol{z}^n, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi}) &= \left\{ \prod_{i=1}^n \mathrm{Cat} (\boldsymbol{z}_i | \boldsymbol{r}_i^{(t)}) \right\} \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\alpha}_n^{(t)}) \\
&= \Biggl[\, \prod_{i=1}^n \prod_{k=1}^K (r_{i,k}^{(t)})^{z_{i,k}} \Biggr] \Biggl[\, \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \notag \\
&\qquad \times B(\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) | \boldsymbol{\Lambda}_k |^{(\nu_{n,k}^{(t)} - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ ( \boldsymbol{W}_{n,k}^{(t)} )^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \notag \\
&\qquad \times C(\boldsymbol{\alpha}_n^{(t)})\prod_{k=1}^K \pi_k^{\alpha_{n,k}^{(t)}-1},
\end{align}
$$

where the updating rules for the hyperparameters are as follows:

$$
\begin{align}
N_k^{(t)} &= \sum_{i=1}^n r_{i,k}^{(t)}, \\
\bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n r_{i,k}^{(t)} \boldsymbol{x}_i, \\
\boldsymbol{m}_{n,k}^{(t+1)} &= \frac{\kappa_0\boldsymbol{m}_0 + N_k^{(t)} \bar{\boldsymbol{x}}_k^{(t)}}{\kappa_0 + N_k^{(t)}}, \\
\kappa_{n,k}^{(t+1)} &= \kappa_0 + N_k^{(t)}, \\
(\boldsymbol{W}_{n,k}^{(t+1)})^{-1} &= \boldsymbol{W}_0^{-1} + \sum_{i=1}^{n} r_{i,k}^{(t)} (\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})(\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})^\top + \frac{\kappa_0 N_k^{(t)}}{\kappa_0 + N_k^{(t)}}(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)^\top, \\
\nu_{n,k}^{(t+1)} &= \nu_0 + N_k^{(t)},\\
\alpha_{n,k}^{(t+1)} &= \alpha_{0,k} + N_k^{(t)}, \\
\ln \rho_{i,k}^{(t+1)} &= \psi (\alpha_{n,k}^{(t+1)}) - \psi ( {\textstyle \sum_{j=1}^K \alpha_{n,j}^{(t+1)}} ) \notag \\
&\qquad + \frac{1}{2} \Biggl[\, \sum_{d=1}^D \psi \left( \frac{\nu_{n,k}^{(t+1)} + 1 - d}{2} \right) + D \ln 2 + \ln | \boldsymbol{W}_{n,k}^{(t+1)} | \notag \\
&\qquad - D \ln (2 \pi ) - \frac{D}{\kappa_{n,k}^{(t+1)}} - \nu_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)})^\top \boldsymbol{W}_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)}) \Biggr], \\
r_{i,k}^{(t+1)} &= \frac{\rho_{i,k}^{(t+1)}}{\sum_{j=1}^K \rho_{i,j}^{(t+1)}}.
\end{align}
$$
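
As a sketch of how these rules translate into code, the following NumPy/SciPy functions perform one pass of the hyperparameter updates and then recompute the responsibilities under the notation above. This is an illustrative implementation, not the library's own; all function and variable names are hypothetical.

```python
import numpy as np
from scipy.special import digamma

def update(X, r, m0, kappa0, nu0, W0, alpha0):
    """One pass of the hyperparameter updates, given data X (n, D) and responsibilities r (n, K)."""
    n, D = X.shape
    K = r.shape[1]
    Nk = r.sum(axis=0)                                        # N_k
    xbar = (r.T @ X) / Nk[:, None]                            # \bar{x}_k
    kappa = kappa0 + Nk                                       # kappa_{n,k}
    m = (kappa0 * m0 + Nk[:, None] * xbar) / kappa[:, None]   # m_{n,k}
    nu = nu0 + Nk                                             # nu_{n,k}
    alpha = alpha0 + Nk                                       # alpha_{n,k}
    W0_inv = np.linalg.inv(W0)
    W = np.empty((K, D, D))
    for k in range(K):
        diff = X - xbar[k]                                    # x_i - \bar{x}_k, shape (n, D)
        Sk = (r[:, k, None] * diff).T @ diff                  # sum_i r_{i,k} (x_i - xbar_k)(x_i - xbar_k)^T
        dm = (xbar[k] - m0)[:, None]
        W_inv = W0_inv + Sk + (kappa0 * Nk[k] / (kappa0 + Nk[k])) * (dm @ dm.T)
        W[k] = np.linalg.inv(W_inv)                           # W_{n,k}
    return m, kappa, nu, W, alpha

def responsibilities(X, m, kappa, nu, W, alpha):
    """Recompute r_{i,k} from the updated hyperparameters (the ln rho and normalization steps)."""
    n, D = X.shape
    K = alpha.shape[0]
    ln_rho = np.empty((n, K))
    for k in range(K):
        e_ln_pi = digamma(alpha[k]) - digamma(alpha.sum())    # E[ln pi_k]
        e_ln_det = (digamma(0.5 * (nu[k] + 1 - np.arange(1, D + 1))).sum()
                    + D * np.log(2.0) + np.linalg.slogdet(W[k])[1])   # E[ln |Lambda_k|]
        diff = X - m[k]
        quad = nu[k] * np.einsum('ij,jk,ik->i', diff, W[k], diff)     # nu (x - m)^T W (x - m)
        ln_rho[:, k] = e_ln_pi + 0.5 * (e_ln_det - D * np.log(2.0 * np.pi)
                                        - D / kappa[k] - quad)
    ln_rho -= ln_rho.max(axis=1, keepdims=True)               # subtract the row maximum for numerical stability
    rho = np.exp(ln_rho)
    return rho / rho.sum(axis=1, keepdims=True)
```

In a full variational loop one would alternate `update` and `responsibilities` until the responsibilities (or a variational lower bound) stop changing.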

The approximate predictive distribution is as follows:

* $\boldsymbol{x}_{n+1} \in \mathbb{R}^D$: a new data point
* $\boldsymbol{\mu}_{\mathrm{p},k} \in \mathbb{R}^D$: a parameter of the predictive distribution
* $\boldsymbol{\Lambda}_{\mathrm{p},k} \in \mathbb{R}^{D \times D}$: a parameter of the predictive distribution (a positive definite matrix)
* $\nu_{\mathrm{p},k} \in \mathbb{R}_{>0}$: a parameter of the predictive distribution

$$
\begin{align}
&p(\boldsymbol{x}_{n+1}|\boldsymbol{x}^n) \\
&= \frac{1}{\sum_{k=1}^K \alpha_{n,k}^{(t)}} \sum_{k=1}^K \alpha_{n,k}^{(t)} \mathrm{St}(\boldsymbol{x}_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
&= \frac{1}{\sum_{k=1}^K \alpha_{n,k}^{(t)}} \sum_{k=1}^K \alpha_{n,k}^{(t)} \Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \notag \\
&\qquad \qquad \qquad \qquad \qquad \times \left( 1 + \frac{1}{\nu_{\mathrm{p},k}} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k})^\top \boldsymbol{\Lambda}_{\mathrm{p},k} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k}) \right)^{-\nu_{\mathrm{p},k}/2 - D/2} \Biggr],
\end{align}
$$

where $\mathrm{St}(\cdot|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k})$ denotes the multivariate Student's $t$ distribution, and the parameters are obtained from the hyperparameters of the posterior distribution as follows:

$$
\begin{align}
\boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)}, \\
\nu_{\mathrm{p},k} &= \nu_{n,k}^{(t)} - D + 1,\\
\boldsymbol{\Lambda}_{\mathrm{p},k} &= \frac{\kappa_{n,k}^{(t)} \nu_{\mathrm{p},k}}{\kappa_{n,k}^{(t)} + 1} \boldsymbol{W}_{n,k}^{(t)}.
\end{align}
$$
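
The predictive density is most safely evaluated as a log-sum-exp over the $K$ Student's $t$ components. The sketch below is illustrative (the helper `log_predictive` is hypothetical) and takes the posterior hyperparameters in the same shapes as the update sketch above.

```python
import numpy as np
from scipy.special import gammaln

def log_predictive(x_new, m, kappa, nu, W, alpha):
    """Approximate log p(x_{n+1} | x^n): a mixture of multivariate Student's t densities."""
    D = x_new.shape[0]
    K = alpha.shape[0]
    log_terms = np.empty(K)
    for k in range(K):
        nu_p = nu[k] - D + 1                                   # nu_{p,k}
        Lam_p = (kappa[k] * nu_p / (kappa[k] + 1.0)) * W[k]    # Lambda_{p,k}
        diff = x_new - m[k]                                    # mu_{p,k} = m_{n,k}
        quad = diff @ Lam_p @ diff
        log_st = (gammaln(0.5 * (nu_p + D)) - gammaln(0.5 * nu_p)
                  + 0.5 * np.linalg.slogdet(Lam_p)[1]
                  - 0.5 * D * np.log(nu_p * np.pi)
                  - 0.5 * (nu_p + D) * np.log1p(quad / nu_p))  # log St(x_{n+1} | mu_p, Lam_p, nu_p)
        log_terms[k] = np.log(alpha[k]) + log_st
    # log-sum-exp over the components, then normalize by sum_k alpha_{n,k}
    return np.logaddexp.reduce(log_terms) - np.log(alpha.sum())
```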
