@@ -9,8 +9,9 @@ The stochastic data generative model is as follows:
* $K \in \mathbb{N}$: number of latent classes
* $\boldsymbol{z} \in \{ 0, 1 \} ^K$: a one-hot vector representing the latent class (latent variable)
* $\boldsymbol{\pi} \in [ 0, 1] ^K$: a parameter for the initial latent class, ($\sum_ {k=1}^K \pi_k=1$)
- * $a_ {jk}$ : transition probability to latent state k under latent state j
- * $\boldsymbol{A}=(a_ {jk})_ {0\leq j,k\leq K} \in [ 0, 1] ^{K\times K}$: a parameter for latent classes, ($\sum_ {k=1}^K a_ {jk}=1$)
+ * $a_ {j,k} \in [ 0,1] $: the transition probability from latent state $j$ to latent state $k$
+ * $\boldsymbol{a}_ j = [ a_ {j,1}, a_ {j,2}, \dots , a_ {j,K}] \in [ 0,1] ^K$: a vector of the transition probabilities from latent state $j$, ($\sum_ {k=1}^K a_ {j,k}=1$)
+ * $\boldsymbol{A}=(a_ {j,k})_ {1\leq j,k\leq K} \in [ 0, 1] ^{K\times K}$: the transition probability matrix
* $D \in \mathbb{N}$: the dimension of the data
* $\boldsymbol{x} \in \mathbb{R}^D$: a data point
* $\boldsymbol{\mu}_ k \in \mathbb{R}^D$: a parameter
@@ -22,7 +23,7 @@ The stochastic data generative model is as follows:
$$
\begin{align}
p(\boldsymbol{z}_{1} | \boldsymbol{\pi}) &= \mathrm{Cat}(\boldsymbol{z}_{1}|\boldsymbol{\pi}) = \prod_{k=1}^K \pi_k^{z_{1,k}},\\
- p(\boldsymbol{z}_{n} |\boldsymbol{z}_{n-1} ,\boldsymbol{A}) &= \prod_{k=1}^K \prod_{j=1}^K a_{jk}^{z_{n-1,j}z_{n,k}},\\
+ p(\boldsymbol{z}_{n} |\boldsymbol{z}_{n-1} ,\boldsymbol{A}) &= \prod_{k=1}^K \prod_{j=1}^K a_{j,k}^{z_{n-1,j}z_{n,k}},\\
p(\boldsymbol{x}_{n} | \boldsymbol{\mu}, \boldsymbol{\Lambda}, \boldsymbol{z}_{n}) &= \prod_{k=1}^K \mathcal{N}(\boldsymbol{x}_n|\boldsymbol{\mu}_k,\boldsymbol{\Lambda}_k^{-1})^{z_{n,k}} \\
&= \prod_{k=1}^K \left( \frac{| \boldsymbol{\Lambda}_{k} |^{1/2}}{(2\pi)^{D/2}} \exp \left\{ -\frac{1}{2}(\boldsymbol{x}_n-\boldsymbol{\mu}_{k})^\top \boldsymbol{\Lambda}_{k} (\boldsymbol{x}_n-\boldsymbol{\mu}_{k}) \right\} \right)^{z_{n,k}},
\end{align}
$$
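
To make the generative model concrete, the following is a minimal NumPy sketch; the function and variable names are illustrative assumptions, not part of any library API. It draws $\boldsymbol{z}_1$ from $\mathrm{Cat}(\boldsymbol{\pi})$, each subsequent state from the corresponding row of $\boldsymbol{A}$, and each observation from $\mathcal{N}(\boldsymbol{\mu}_k, \boldsymbol{\Lambda}_k^{-1})$.

```python
# Illustrative sketch: sample (z_1, ..., z_n) and (x_1, ..., x_n) from the
# generative model above, given fixed parameters pi, A, mu, Lambda.
import numpy as np

def sample_sequence(n, pi, A, mu, Lambda, seed=None):
    """Return state indices z in {0, ..., K-1}^n and data x in R^{n x D}."""
    rng = np.random.default_rng(seed)
    K, D = mu.shape
    Sigma = np.linalg.inv(Lambda)              # emission covariances Lambda_k^{-1}
    z = np.empty(n, dtype=int)
    x = np.empty((n, D))
    z[0] = rng.choice(K, p=pi)                 # z_1 ~ Cat(pi)
    x[0] = rng.multivariate_normal(mu[z[0]], Sigma[z[0]])
    for i in range(1, n):
        z[i] = rng.choice(K, p=A[z[i - 1]])    # z_i | z_{i-1} ~ Cat(a_{z_{i-1}})
        x[i] = rng.multivariate_normal(mu[z[i]], Sigma[z[i]])
    return z, x

# Example with K = 2 latent states and D = 2 dimensional data.
pi = np.array([0.6, 0.4])
A = np.array([[0.9, 0.1],
              [0.2, 0.8]])
mu = np.array([[0.0, 0.0], [3.0, 3.0]])
Lambda = np.stack([np.eye(2), 2.0 * np.eye(2)])
z, x = sample_sequence(100, pi, A, mu, Lambda, seed=0)
```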
p(\boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi},\boldsymbol{A}) &= \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_0,(\kappa_0 \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_0, \nu_0) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_0) \prod_{j=1}^{K}\mathrm{Dir}(\boldsymbol{a}_{j}|\boldsymbol{\zeta}_{0,j}) \\
&= \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_0}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_0}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_0)^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_0) \right\} \\
&\qquad \times B(\boldsymbol{W}_0, \nu_0) | \boldsymbol{\Lambda}_k |^{(\nu_0 - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ \boldsymbol{W}_0^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \\
- &\qquad \times \Biggl[ \prod_{k=1}^KC(\boldsymbol{\eta}_0)\pi_k^{\eta_{0,k}-1}\biggl]\times \biggl[\prod_{j=1}^KC(\boldsymbol{\zeta}_{0,j})\prod_{k=1}^K a_{jk}^{\zeta_{0,j,k}-1}\Biggr],\\
+ &\qquad \times \Biggl[ \prod_{k=1}^K C(\boldsymbol{\eta}_0)\pi_k^{\eta_{0,k}-1}\Biggr]\\
+ &\qquad \times \Biggl[\prod_{j=1}^K C(\boldsymbol{\zeta}_{0,j})\prod_{k=1}^K a_{j,k}^{\zeta_{0,j,k}-1}\Biggr],\\
\end{align}
$$

C(\boldsymbol{\zeta}_{0,j}) &= \frac{\Gamma(\sum_{k=1}^K \zeta_{0,j,k})}{\Gamma(\zeta_{0,j,1})\cdots\Gamma(\zeta_{0,j,K})}.
\end{align}
$$
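
The following sketch draws one parameter set $(\boldsymbol{\pi}, \boldsymbol{A}, \boldsymbol{\mu}, \boldsymbol{\Lambda})$ from the prior above. It assumes SciPy for the Wishart draw, and `sample_prior` together with its arguments are hypothetical names used only for illustration.

```python
# Illustrative sketch: one draw from the Gauss-Wishart x Dirichlet prior above.
import numpy as np
from scipy.stats import wishart

def sample_prior(K, D, m0, kappa0, nu0, W0, eta0, zeta0, seed=None):
    rng = np.random.default_rng(seed)
    pi = rng.dirichlet(eta0)                                   # pi ~ Dir(eta_0)
    A = np.stack([rng.dirichlet(zeta0[j]) for j in range(K)])  # a_j ~ Dir(zeta_{0,j})
    mu = np.empty((K, D))
    Lam = np.empty((K, D, D))
    for k in range(K):
        Lam[k] = wishart.rvs(df=nu0, scale=W0, random_state=rng)                # Lambda_k ~ W(W_0, nu_0)
        mu[k] = rng.multivariate_normal(m0, np.linalg.inv(kappa0 * Lam[k]))     # mu_k | Lambda_k
    return pi, A, mu, Lam

# Weakly informative example hyperparameters for K = 2, D = 2.
K, D = 2, 2
pi, A, mu, Lam = sample_prior(K, D, m0=np.zeros(D), kappa0=1.0, nu0=float(D),
                              W0=np.eye(D), eta0=np.ones(K), zeta0=np.ones((K, K)), seed=1)
```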
- <!--
+
The approximate posterior distribution in the $t$-th iteration of a variational Bayesian method is as follows:

* $\boldsymbol{x}^n = (\boldsymbol{x}_ 1, \boldsymbol{x}_ 2, \dots , \boldsymbol{x}_ n) \in \mathbb{R}^{D \times n}$: given data
* $\boldsymbol{z}^n = (\boldsymbol{z}_ 1, \boldsymbol{z}_ 2, \dots , \boldsymbol{z}_ n) \in \{ 0, 1 \} ^{K \times n}$: latent classes of given data
- * $\boldsymbol{r}_i^{(t)} = (r_{i,1}^{(t)}, r_{i,2}^{(t)}, \dots , r_{i,K}^{(t)}) \in [0, 1]^K$: a parameter for the $i$-th latent class. ($\sum_{k=1}^K r_{i, k}^{(t)} = 1$)
* $\boldsymbol{m}_ {n,k}^{(t)} \in \mathbb{R}^{D}$: a hyperparameter
* $\kappa_ {n,k}^{(t)} \in \mathbb{R}_ {>0}$: a hyperparameter
* $\nu_ {n,k}^{(t)} \in \mathbb{R}$: a hyperparameter $(\nu_ {n,k}^{(t)} > D-1)$
* $\boldsymbol{W}_ {n,k}^{(t)} \in \mathbb{R}^{D\times D}$: a hyperparameter (a positive definite matrix)
* $\boldsymbol{\eta}_ n^{(t)} \in \mathbb{R}_ {> 0}^K$: a hyperparameter
-
+ * $\boldsymbol{\zeta} _ {n,j}^{(t)} \in \mathbb{R} _ {> 0}^K$: a hyperparameter
$$
\begin{align}
- q(\boldsymbol{z}^n, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi}) &= \left\{ \prod_{i=1}^n \mathrm{Cat} (\boldsymbol{z}_i | \boldsymbol{r}_i^{(t)}) \right\} \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_n^{(t)}) \\
- &= \Biggl[ \prod_{i=1}^n \prod_{k=1}^K (r_{i,k}^{(t)})^{z_{i,k}} \Biggr] \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \\
+ &q(\boldsymbol{z}^n, \boldsymbol{\mu},\boldsymbol{\Lambda},\boldsymbol{\pi},\boldsymbol{A}) \nonumber \\
+ &= q^{(t)}(\boldsymbol{z}^n) \left\{ \prod_{k=1}^K \mathcal{N}(\boldsymbol{\mu}_k|\boldsymbol{m}_{n,k}^{(t)},(\kappa_{n,k}^{(t)} \boldsymbol{\Lambda}_k)^{-1})\mathcal{W}(\boldsymbol{\Lambda}_k|\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) \right\} \mathrm{Dir}(\boldsymbol{\pi}|\boldsymbol{\eta}_n^{(t)})\left\{\prod_{j=1}^K\mathrm{Dir}(\boldsymbol{a}_j|\boldsymbol{\zeta}_{n,j}^{(t)})\right\} \\
+ &= q^{(t)}(\boldsymbol{z}^n) \Biggl[ \prod_{k=1}^K \left( \frac{\kappa_{n,k}^{(t)}}{2\pi} \right)^{D/2} |\boldsymbol{\Lambda}_k|^{1/2} \exp \left\{ -\frac{\kappa_{n,k}^{(t)}}{2}(\boldsymbol{\mu}_k -\boldsymbol{m}_{n,k}^{(t)})^\top \boldsymbol{\Lambda}_k (\boldsymbol{\mu}_k - \boldsymbol{m}_{n,k}^{(t)}) \right\} \\
&\qquad \times B(\boldsymbol{W}_{n,k}^{(t)}, \nu_{n,k}^{(t)}) | \boldsymbol{\Lambda}_k |^{(\nu_{n,k}^{(t)} - D - 1) / 2} \exp \left\{ -\frac{1}{2} \mathrm{Tr} \{ ( \boldsymbol{W}_{n,k}^{(t)} )^{-1} \boldsymbol{\Lambda}_k \} \right\} \Biggr] \\
- &\qquad \times C(\boldsymbol{\eta}_n^{(t)})\prod_{k=1}^K \pi_k^{\eta_{n,k}^{(t)}-1},\\
+ &\qquad \times C(\boldsymbol{\eta}_n^{(t)})\prod_{k=1}^K \pi_k^{\eta_{n,k}^{(t)}-1}\left[\prod_{j=1}^K C(\boldsymbol{\zeta}_{n,j}^{(t)})\prod_{k=1}^K a_{j,k}^{\zeta_{n,j,k}^{(t)}-1}\right],\\
\end{align}
$$

where the updating rule of the hyperparameters is as follows.

$$
\begin{align}
- N_k^{(t)} &= \sum_{i=1}^n r_{i,k}^{(t)} \\
- \bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n r_{i,k}^{(t)} \boldsymbol{x}_i \\
+ N_k^{(t)} &= \sum_{i=1}^n \gamma^{(t)}_{i,k}, \\
+ M_{j,k}^{(t)} &= \sum_{i=2}^n \xi^{(t)}_{i,j,k},\\
+ \bar{\boldsymbol{x}}_k^{(t)} &= \frac{1}{N_k^{(t)}} \sum_{i=1}^n \gamma^{(t)}_{i,k} \boldsymbol{x}_i, \\
+ S_k^{(t)} &= \frac{1}{N_k^{(t)}}\sum_{i=1}^n \gamma^{(t)}_{i,k} (\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})(\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})^{\top},\\
\boldsymbol{m}_{n,k}^{(t+1)} &= \frac{\kappa_0\boldsymbol{m}_0 + N_k^{(t)} \bar{\boldsymbol{x}}_k^{(t)}}{\kappa_0 + N_k^{(t)}}, \\
\kappa_{n,k}^{(t+1)} &= \kappa_0 + N_k^{(t)}, \\
- (\boldsymbol{W}_{n,k}^{(t+1)})^{-1} &= \boldsymbol{W}_0^{-1} + \sum_{i=1}^{n} r_{i,k}^{(t)} (\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})(\boldsymbol{x}_i-\bar{\boldsymbol{x}}_k^{(t)})^\top + \frac{\kappa_0 N_k^{(t)}}{\kappa_0 + N_k^{(t)}}(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)^\top, \\
+ (\boldsymbol{W}_{n,k}^{(t+1)})^{-1} &= \boldsymbol{W}_0^{-1} + N_k^{(t)} S_k^{(t)} + \frac{\kappa_0 N_k^{(t)}}{\kappa_0 + N_k^{(t)}}(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)(\bar{\boldsymbol{x}}_k^{(t)}-\boldsymbol{m}_0)^\top, \\
\nu_{n,k}^{(t+1)} &= \nu_0 + N_k^{(t)},\\
- \eta_{n,k}^{(t+1)} &= \eta_{0,k} + N_k^{(t)} \\
- \ln \rho_{i,k}^{(t+1)} &= \psi (\eta_{n,k}^{(t+1)}) - \psi ( {\textstyle \sum_{k=1}^K \eta_{n,k}^{(t+1)}} ) \nonumber \\
- &\qquad + \frac{1}{2} \Biggl[ \sum_{d=1}^D \psi \left( \frac{\nu_{n,k}^{(t+1)} + 1 - d}{2} \right) + D \ln 2 + \ln | \boldsymbol{W}_{n,k}^{(t+1)} | \nonumber \\
- &\qquad - D \ln (2 \pi ) - \frac{D}{\kappa_{n,k}^{(t+1)}} - \nu_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)})^\top \boldsymbol{W}_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)}) \Biggr] \\
- r_{i,k}^{(t+1)} &= \frac{\rho_{i,k}^{(t+1)}}{\sum_{k=1}^K \rho_{i,k}^{(t+1)}}
+ \eta_{n,k}^{(t+1)} &= \eta_{0,k} + \gamma^{(t)}_{1,k}, \\
+ \zeta_{n,j,k}^{(t+1)} &= \zeta_{0,j,k}+M_{j,k}^{(t)}.
+ \end{align}
+ $$
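
Assuming the responsibilities $\gamma^{(t)}_{i,k}$ and $\xi^{(t)}_{i,j,k}$ are already available from the forward-backward step below, the updates can be written compactly with NumPy. This is a hedged sketch with illustrative names, not library code.

```python
# Illustrative sketch of the hyperparameter updates above, given data x (n x D),
# responsibilities gamma (n x K), and pairwise responsibilities xi (n x K x K;
# xi[0] is unused). A trailing "0" marks prior values.
import numpy as np

def update_hyperparameters(x, gamma, xi, m0, kappa0, nu0, W0, eta0, zeta0):
    n, D = x.shape
    N = gamma.sum(axis=0)                                   # N_k
    M = xi[1:].sum(axis=0)                                  # M_{j,k}: transitions use i >= 2
    x_bar = (gamma.T @ x) / N[:, None]                      # bar{x}_k
    diff = x[None, :, :] - x_bar[:, None, :]                # shape (K, n, D)
    S = np.einsum('ki,kid,kie->kde', gamma.T, diff, diff) / N[:, None, None]  # S_k
    kappa_n = kappa0 + N
    m_n = (kappa0 * m0 + N[:, None] * x_bar) / kappa_n[:, None]
    nu_n = nu0 + N
    W_n_inv = (np.linalg.inv(W0)[None] + N[:, None, None] * S
               + (kappa0 * N / (kappa0 + N))[:, None, None]
               * np.einsum('kd,ke->kde', x_bar - m0, x_bar - m0))
    W_n = np.linalg.inv(W_n_inv)
    eta_n = eta0 + gamma[0]                                 # only z_1 informs pi
    zeta_n = zeta0 + M                                      # row j of A collects M_{j,:}
    return m_n, kappa_n, nu_n, W_n, eta_n, zeta_n
```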
+
+ The approximate posterior distribution of the latent variable $q^{(t+1)}(\boldsymbol{z}^n)$ is calculated by the forward-backward algorithm as follows.
+
+ $$
+ \begin{align}
+ \ln \rho_{i,k}^{(t+1)} &= \frac{1}{2} \Biggl[\, \sum_{d=1}^D \psi \left( \frac{\nu_{n,k}^{(t+1)} + 1 - d}{2} \right) + D \ln 2 + \ln | \boldsymbol{W}_{n,k}^{(t+1)} | \notag \\
+ &\qquad - D \ln (2 \pi ) - \frac{D}{\kappa_{n,k}^{(t+1)}} - \nu_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)})^\top \boldsymbol{W}_{n,k}^{(t+1)} (\boldsymbol{x}_i - \boldsymbol{m}_{n,k}^{(t+1)}) \Biggr], \\
+ \ln \tilde{\pi}_k^{(t+1)} &= \psi (\eta_{n,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k=1}^K \eta_{n,k}^{(t+1)} \right) \\
+ \ln \tilde{a}_{j,k}^{(t+1)} &= \psi (\zeta_{n,j,k}^{(t+1)}) - \psi \left( \textstyle \sum_{k=1}^K \zeta_{n,j,k}^{(t+1)} \right) \\
+ \alpha^{(t+1)} (\boldsymbol{z}_i) &\propto
+ \begin{cases}
+ \prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \sum_{\boldsymbol{z}_{i-1}} \left[\prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}}\alpha^{(t+1)}(\boldsymbol{z}_{i-1})\right] & (i>1)\\
+ \prod_{k=1}^{K}\left( \rho_{1,k}^{(t+1)} \tilde{\pi}_k^{(t+1)} \right)^{z_{1,k}} & (i=1)
+ \end{cases} \\
+ \beta^{(t+1)} (\boldsymbol{z}_i) &\propto
+ \begin{cases}
+ \sum_{\boldsymbol{z}_{i+1}} \left[ \prod_{k=1}^{K} \left( \rho_{i+1,k}^{(t+1)}\right)^{z_{i+1,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i,j}z_{i+1,k}}\beta^{(t+1)}(\boldsymbol{z}_{i+1})\right] & (i<n)\\
+ 1 & (i=n)
+ \end{cases} \\
+ q^{(t+1)}(\boldsymbol{z}_i) &\propto \alpha^{(t+1)}(\boldsymbol{z}_i)\beta^{(t+1)}(\boldsymbol{z}_i) \\
+ \gamma^{(t+1)}_{i,k} &= \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_i) z_{i,k}\\
+ q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) &\propto \alpha^{(t+1)}(\boldsymbol{z}_{i-1}) \prod_{k=1}^{K} \left( \rho_{i,k}^{(t+1)}\right)^{z_{i,k}} \prod_{k=1}^{K}\prod_{j=1}^{K}\left(\tilde{a}^{(t+1)}_{j,k}\right)^{z_{i-1,j}z_{i,k}} \beta^{(t+1)}(\boldsymbol{z}_i) \\
+ \xi^{(t+1)}_{i,j,k} &= \sum_{\boldsymbol{z}_{i-1}} \sum_{\boldsymbol{z}_i} q^{(t+1)}(\boldsymbol{z}_{i-1}, \boldsymbol{z}_{i}) z_{i-1,j} z_{i,k}.
\end{align}
$$
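
The recursions above can be implemented with the usual scaled forward-backward pass. The sketch below is an illustrative assumption about how one might code it, not an excerpt from any library: `rho[i, k]` holds $\rho_{i,k}^{(t+1)}$ (possibly rescaled per time step for numerical stability), `pi_t` and `a_t` hold $\tilde{\pi}^{(t+1)}$ and $\tilde{\boldsymbol{A}}^{(t+1)}$, and the per-step normalizers make the returned $\gamma$ and $\xi$ sum to one.

```python
# Illustrative scaled forward-backward pass for the recursions above.
# Returns gamma[i, k] = E[z_{i,k}] and xi[i, j, k] = E[z_{i-1,j} z_{i,k}] (xi[0] unused).
import numpy as np

def forward_backward(rho, pi_t, a_t):
    n, K = rho.shape
    alpha = np.empty((n, K))
    beta = np.empty((n, K))
    c = np.empty(n)                              # per-step normalizers
    alpha[0] = rho[0] * pi_t
    c[0] = alpha[0].sum()
    alpha[0] /= c[0]
    for i in range(1, n):                        # forward pass
        alpha[i] = rho[i] * (alpha[i - 1] @ a_t)
        c[i] = alpha[i].sum()
        alpha[i] /= c[i]
    beta[n - 1] = 1.0
    for i in range(n - 2, -1, -1):               # backward pass
        beta[i] = (a_t @ (rho[i + 1] * beta[i + 1])) / c[i + 1]
    gamma = alpha * beta                         # marginals q(z_i); rows sum to 1
    xi = np.zeros((n, K, K))
    for i in range(1, n):                        # pairwise marginals q(z_{i-1}, z_i)
        xi[i] = (alpha[i - 1][:, None] * a_t * (rho[i] * beta[i])[None, :]) / c[i]
    return gamma, xi
```

In practice $\ln \rho_{i,k}^{(t+1)}$ would be shifted before exponentiation to avoid underflow; the returned `gamma` and `xi` then supply the statistics $N_k^{(t)}$, $M_{j,k}^{(t)}$, $\bar{\boldsymbol{x}}_k^{(t)}$, and $S_k^{(t)}$ in the update rules above.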

The approximate predictive distribution is as follows:

* $\boldsymbol{x}_ {n+1} \in \mathbb{R}^D$: a new data point
+ * $(a_ {\mathrm{p},j,k})_ {1\leq j,k\leq K} \in [ 0, 1] ^{K\times K}$: the parameters of the predictive transition probability of latent classes, ($\sum_ {k=1}^K a_ {\mathrm{p},j,k}=1$)
* $\boldsymbol{\mu}_ {\mathrm{p},k} \in \mathbb{R}^D$: the parameter of the predictive distribution
* $\boldsymbol{\Lambda}_ {\mathrm{p},k} \in \mathbb{R}^{D \times D}$: the parameter of the predictive distribution (a positive definite matrix)
* $\nu_ {\mathrm{p},k} \in \mathbb{R}_ {>0}$: the parameter of the predictive distribution

$$
\begin{align}
&p(\boldsymbol{x}_{n+1}|\boldsymbol{x}^n) \\
- &= \frac{1}{\sum_{k=1}^K \eta_{n,k}^{(t)}} \sum_{k=1}^K \eta_{n,k}^{(t)} \mathrm{St}(\boldsymbol{x}_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
- &= \frac{1}{\sum_{k=1}^K \eta_{n,k}^{(t)}} \sum_{k=1}^K \eta_{n,k}^{(t)} \Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \nonumber \\
+ &\approx \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right) \mathrm{St}(\boldsymbol{x}_{n+1}|\boldsymbol{\mu}_{\mathrm{p},k},\boldsymbol{\Lambda}_{\mathrm{p},k}, \nu_{\mathrm{p},k}) \\
+ &= \sum_{k=1}^K \left( \sum_{j=1}^K \gamma_{n,j}^{(t)} a_{\mathrm{p},j,k} \right) \Biggl[ \frac{\Gamma (\nu_{\mathrm{p},k} / 2 + D / 2)}{\Gamma (\nu_{\mathrm{p},k} / 2)} \frac{|\boldsymbol{\Lambda}_{\mathrm{p},k}|^{1/2}}{(\nu_{\mathrm{p},k} \pi)^{D/2}} \nonumber \\
&\qquad \qquad \qquad \qquad \qquad \times \left( 1 + \frac{1}{\nu_{\mathrm{p},k}} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k})^\top \boldsymbol{\Lambda}_{\mathrm{p},k} (\boldsymbol{x}_{n+1} - \boldsymbol{\mu}_{\mathrm{p},k}) \right)^{-\nu_{\mathrm{p},k}/2 - D/2} \Biggr],
\end{align}
$$

- where the parameters are obtained from the hyperparameters of the posterior distribution as follows:
+ where the parameters of the predictive distribution are obtained from the hyperparameters of the posterior distribution as follows:

$$
\begin{align}
- \boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)} \\
+ a_{\mathrm{p},j,k} &= \frac{\zeta_{n,j,k}^{(t)}}{\sum_{k=1}^K \zeta_{n,j,k}^{(t)}}, \\
+ \boldsymbol{\mu}_{\mathrm{p},k} &= \boldsymbol{m}_{n,k}^{(t)}, \\
\boldsymbol{\Lambda}_{\mathrm{p},k} &= \frac{\kappa_{n,k}^{(t)} (\nu_{n,k}^{(t)} - D + 1)}{\kappa_{n,k}^{(t)} + 1} \boldsymbol{W}_{n,k}^{(t)}, \\
\nu_{\mathrm{p},k} &= \nu_{n,k}^{(t)} - D + 1.
\end{align}
$$
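
As an illustration (assumed helper names and argument shapes, not library code), the predictive density can be evaluated as a mixture of multivariate Student-t components whose weights combine the final-step responsibilities $\gamma^{(t)}_{n,\cdot}$ with the expected transition probabilities $a_{\mathrm{p},j,k}$:

```python
# Illustrative sketch: evaluate the approximate predictive density at x_new.
# gamma_n has shape (K,); zeta_n, W_n, m_n, kappa_n, nu_n are posterior hyperparameters
# with shapes (K, K), (K, D, D), (K, D), (K,), (K,) respectively.
import numpy as np
from scipy.special import gammaln

def predictive_density(x_new, gamma_n, zeta_n, m_n, kappa_n, nu_n, W_n):
    K, D = m_n.shape
    a_p = zeta_n / zeta_n.sum(axis=1, keepdims=True)         # a_{p,j,k}
    weights = gamma_n @ a_p                                   # sum_j gamma_{n,j} a_{p,j,k}
    density = 0.0
    for k in range(K):
        nu_p = nu_n[k] - D + 1.0                              # degrees of freedom
        Lambda_p = (kappa_n[k] * nu_p / (kappa_n[k] + 1.0)) * W_n[k]
        delta = x_new - m_n[k]
        quad = delta @ Lambda_p @ delta
        log_st = (gammaln((nu_p + D) / 2.0) - gammaln(nu_p / 2.0)
                  + 0.5 * np.linalg.slogdet(Lambda_p)[1]
                  - 0.5 * D * np.log(nu_p * np.pi)
                  - 0.5 * (nu_p + D) * np.log1p(quad / nu_p))
        density += weights[k] * np.exp(log_st)                # Student-t mixture component
    return density
```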
- -->
+