From 1de110b2789f20cf463f6fe9dbaba845e9af96a4 Mon Sep 17 00:00:00 2001 From: Sharganov Artem Date: Sun, 6 Jun 2021 21:32:02 +0300 Subject: [PATCH 1/7] Cythonized Python code for sslm and LdaPost classes --- .gitignore | 2 + gensim/models/__init__.py | 1 + gensim/models/ldaseq_posterior_inner.pyx | 317 +++++++ gensim/models/ldaseq_sslm_inner.pxd | 27 + gensim/models/ldaseq_sslm_inner.pyx | 740 ++++++++++++++++ gensim/models/ldaseqmodel_optimized.py | 1007 ++++++++++++++++++++++ setup.py | 2 + 7 files changed, 2096 insertions(+) create mode 100644 gensim/models/ldaseq_posterior_inner.pyx create mode 100644 gensim/models/ldaseq_sslm_inner.pxd create mode 100644 gensim/models/ldaseq_sslm_inner.pyx create mode 100644 gensim/models/ldaseqmodel_optimized.py diff --git a/.gitignore b/.gitignore index 019e1812f7..570f4130d3 100644 --- a/.gitignore +++ b/.gitignore @@ -89,5 +89,7 @@ gensim/models/fasttext_inner.c gensim/models/nmf_pgd.c gensim/models/word2vec_corpusfile.cpp gensim/models/word2vec_inner.c +gensim/models/ldaseq_sslm_inner.c +gensim/models/ldaseq_posterior_inner.c .ipynb_checkpoints diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index a2d0af279f..ec47f03715 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -22,6 +22,7 @@ from .fasttext import FastText # noqa:F401 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from .nmf import Nmf # noqa:F401 +from .ldaseqmodel_optimized import LdaSeqModel_V4 from gensim import interfaces, utils diff --git a/gensim/models/ldaseq_posterior_inner.pyx b/gensim/models/ldaseq_posterior_inner.pyx new file mode 100644 index 0000000000..2caa1b0d0c --- /dev/null +++ b/gensim/models/ldaseq_posterior_inner.pyx @@ -0,0 +1,317 @@ +from scipy.special.cython_special cimport gammaln +from scipy.special.cython_special cimport psi + +from libc.math cimport exp, log, abs +from libc.string cimport memcpy +from libc.stdlib cimport malloc, free + +cimport numpy as np + +import numpy as np + +ctypedef np.float64_t REAL_t + + +cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, + int * word_ids, REAL_t * lda_topics, const int num_topics, + const int doc_length): + + """Update variational multinomial parameters, based on a document and a time-slice. + + This is done based on the original Blei-LDA paper, where: + log_phi := beta * exp(Ψ(gamma)), over every topic for every word. + + TODO: incorporate lee-sueng trick used in + **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. + + Parameters + ---------- + gamma : size of num_topics. in-parameter + + phi : size of (max_doc_len, num_topics). int/out-parameter + + log_phi: size of (max_doc_len, num_topics). in/out-parameter + + word_ids: size of doc_length. in-parameter + + lda_topics: size of (vocab_len, num_topics). in-parameter + + num_topics: number of topics in the model. in-parameter + + doc_length: length of the document (number of words in the document). 
in-parameter + + """ + + cdef int k, i + + # digamma values + cdef REAL_t * dig = malloc(num_topics * sizeof(REAL_t)) + if dig == NULL: + raise + if dig == NULL: + print("NULL pointer") + + for k in range(num_topics): + dig[k] = psi(gamma[k]) + + cdef REAL_t *log_phi_row = NULL + cdef REAL_t *phi_row = NULL + + for i in range(doc_length): + for k in range(num_topics): + # TODO check if proper indexing + log_phi[i * num_topics + k] = \ + dig[k] + lda_topics[word_ids[i] * num_topics + k] + + log_phi_row = log_phi + i * num_topics + + phi_row = phi + i * num_topics + + # log normalize + v = log_phi_row[0] + for i in range(1, num_topics): + v = log(exp(v) + exp(log_phi_row[i])) + + # subtract every element by v + # TODO blas + for i in range(num_topics): + log_phi_row[i] = log_phi_row[i] - v + + for i in range(num_topics): + phi_row[i] = exp(log_phi_row[i]) + + free(dig) + + +cdef update_phi_fixed(): + return + + +cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, + const int *word_counts, const int num_topics, const int doc_length): + """Update variational dirichlet parameters. + + This operations is described in the original Blei LDA paper: + gamma = alpha + sum(phi), over every topic for every word. + + Parameters + ---------- + gamma: size of num_topics. int/out-parameter + + phi: size of (max_doc_len, num_topics). in-parameter + + lda_alpha: size of num_topics. in-parameter + + word_counts: size of doc_length. in-parameter + + num_topics: number of topics in the model + + doc_length: length of the document + + """ + memcpy(gamma, lda_alpha, num_topics * sizeof(REAL_t)) + + cdef int i, k + # TODO BLAS matrix*vector + for i in range(doc_length): + for k in range(num_topics): + gamma[k] += phi[i * num_topics + k] * word_counts[i] + + +cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi, + const REAL_t * lda_alpha, const REAL_t *lda_topics, + int *word_counts, int *word_ids, + const int num_topics, const int doc_length): + """Compute the log likelihood bound. + Parameters + ---------- + gamma: size of num_topics + + lhood: size of num_topics + 1 + + phi: size of (max_doc_len, num_topics). in-parameter + + log_phi: size of (max_doc_len, num_topics). in-parameter + + lda_alpha: size of num_topics. in-parameter + + lda_topics: size of (vocab_len, num_topics). in-parameter + + word_counts: size of doc_len + + word_ids: size of doc_len + + num_topics: number of topics in the model + + doc_length: length of the document + + Returns + ------- + float + The optimal lower bound for the true posterior using the approximate distribution. 
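        As a side effect the per-topic bound terms are also written into `lhood`, one entry per topic,
        with the Dirichlet normalizer term stored at index `num_topics`.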
+ + """ + cdef int i + + cdef REAL_t gamma_sum = 0.0 + + for i in range(num_topics): + gamma_sum += gamma[i] + + cdef REAL_t alpha_sum = 0.0 + for i in range(num_topics): + alpha_sum += lda_alpha[i] + + cdef REAL_t lhood_v = gammaln(alpha_sum) - gammaln(gamma_sum) + lhood[num_topics] = lhood_v + + cdef REAL_t digsum = psi(gamma_sum) + cdef REAL_t lhood_term, e_log_theta_k + + for k in range(num_topics): + + e_log_theta_k = psi(gamma[k]) - digsum + + lhood_term = \ + (lda_alpha[k] - gamma[k]) * e_log_theta_k + \ + gammaln(gamma[k]) - gammaln(lda_alpha[k]) + + # TODO: check why there's an IF + for i in range(doc_length): + if phi[i * num_topics + k] > 0: + lhood_term += \ + word_counts[i] * phi[i * num_topics + k] \ + * (e_log_theta_k + lda_topics[word_ids[i] * num_topics + k] + - log_phi[i * num_topics + k]) + + lhood[k] = lhood_term + lhood_v += lhood_term + + return lhood_v + +cdef init_lda_post(REAL_t *gamma, REAL_t * phi, const int *word_counts, + const REAL_t *lda_alpha, + const int doc_length, const int num_topics): + + """Initialize variational posterior. """ + + cdef int i, j + + cdef int total = 0 + + # BLAS sum of absolute numbers + for i in range(doc_length): + total += word_counts[i] + + cdef REAL_t init_value = lda_alpha[0] + float(total) / num_topics + + for i in range(num_topics): + gamma[i] = init_value + + init_value = 1.0 / num_topics + + for i in range(doc_length): + phi_doc = phi + i * num_topics + for j in range(num_topics): + phi_doc[j] = init_value + + +def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, + lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): + """Posterior inference for lda. + + Parameters + ---------- + + Returns + ------- + float + The optimal lower bound for the true posterior using the approximate distribution. + """ + + cdef int i + + ############### + # Setup C structures + ############### + + cdef int num_topics = self.lda.num_topics + cdef int vocab_len = len(self.lda.id2word) + + cdef int doc_length = len(self.doc) + cdef int max_doc_len = doc_length + + # TODO adopt implementation to avoid memory allocation for every document. + # E.g. create numpy array field of Python class. Be careful with the array size, it should be at least + # the length of the longest document + cdef int * word_ids = malloc(doc_length * sizeof(int)) + if word_ids == NULL: + raise + cdef int * word_counts = malloc(doc_length * sizeof(int)) + if word_counts == NULL: + raise + # TODO Would it be better to create numpy array first? + for i in range(doc_length): + word_ids[i] = self.doc[i][0] + word_counts[i] = self.doc[i][0] + + cdef REAL_t * gamma = (np.PyArray_DATA(self.gamma)) + cdef REAL_t * phi = (np.PyArray_DATA(self.phi)) + cdef REAL_t * log_phi = (np.PyArray_DATA(self.log_phi)) + cdef REAL_t * lhood = (np.PyArray_DATA(self.lhood)) + + cdef REAL_t * lda_topics = (np.PyArray_DATA(self.lda.topics)) + cdef REAL_t * lda_alpha = (np.PyArray_DATA(self.lda.alpha)) + + ############### + # Finished setup of c structures here + ############### + + init_lda_post(gamma, phi, word_counts, lda_alpha, doc_length, num_topics) + + # sum of counts in a doc + cdef REAL_t total = sum(count for word_id, count in self.doc) + + cdef REAL_t lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length) + + cdef REAL_t lhood_old = 0.0 + cdef REAL_t converged = 0.0 + cdef int iter_ = 0 + + # TODO Why first iteration starts here is done outside of the loop? 
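    # The first gamma/phi update and likelihood evaluation are unrolled here so that `converged`
    # holds a meaningful value before the while-loop condition is checked (it is initialized to 0.0,
    # which would otherwise skip the loop entirely).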
+ iter_ += 1 + lhood_old = lhood_v + update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length) + + model = "DTM" + + # if model == "DTM" or sslm is None: + update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) + + lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length) + + converged = abs((lhood_old - lhood_v) / (lhood_old * total)) + + while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter: + + iter_ += 1 + lhood_old = lhood_v + update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length) + model = "DTM" + + update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) + + lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length) + + converged = np.fabs((lhood_old - lhood_v) / (lhood_old * total)) + + free(word_ids) + free(word_counts) + + return lhood_v diff --git a/gensim/models/ldaseq_sslm_inner.pxd b/gensim/models/ldaseq_sslm_inner.pxd new file mode 100644 index 0000000000..4895249190 --- /dev/null +++ b/gensim/models/ldaseq_sslm_inner.pxd @@ -0,0 +1,27 @@ +cimport numpy as np +ctypedef np.float64_t REAL_t + + + +cdef struct StateSpaceLanguageModelConfig: + int num_time_slices, vocab_len + REAL_t chain_variance, obs_variance + + REAL_t * obs + REAL_t * mean + REAL_t * variance + + REAL_t * fwd_mean + REAL_t * fwd_variance + REAL_t * zeta + REAL_t * e_log_prob + # T + REAL_t * word_counts + # T + REAL_t * totals + + REAL_t * deriv + # T * (T+1) + REAL_t * mean_deriv_mtx + int word + diff --git a/gensim/models/ldaseq_sslm_inner.pyx b/gensim/models/ldaseq_sslm_inner.pyx new file mode 100644 index 0000000000..9245a8cd95 --- /dev/null +++ b/gensim/models/ldaseq_sslm_inner.pyx @@ -0,0 +1,740 @@ +#!/usr/bin/env cython +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# cython: embedsignature=True + +from libc.math cimport pow, log, exp, abs, sqrt +from libc.string cimport memcpy +from libc.stdlib cimport malloc, free +from libc.stdint cimport uintptr_t + + +cimport numpy as np + + +cdef init_sslm_config(StateSpaceLanguageModelConfig * config, model): + + config[0].num_time_slices = model.num_time_slices + config[0].vocab_len = model.vocab_len + + config[0].chain_variance = model.chain_variance + config[0].obs_variance = model.obs_variance + + config[0].obs = (np.PyArray_DATA(model.obs)) + config[0].mean = (np.PyArray_DATA(model.mean)) + config[0].variance = (np.PyArray_DATA(model.variance)) + + config[0].fwd_mean = (np.PyArray_DATA(model.fwd_mean)) + config[0].fwd_variance = (np.PyArray_DATA(model.fwd_variance)) + + config[0].zeta = (np.PyArray_DATA(model.zeta)) + + config[0].e_log_prob = (np.PyArray_DATA(model.e_log_prob)) + + # Default initialization should raise exception if it used without proper initialization + config[0].deriv = NULL + config[0].mean_deriv_mtx = NULL + word = -1 + + +import numpy as np +from scipy import optimize + + +cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_variance, const REAL_t *obs, + const int word, const int num_time_slices, + const REAL_t obs_variance, const REAL_t chain_variance): + """Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) + `_. + + Notes + ----- + This function essentially computes E[\beta_{t,w}] for t = 1:T. + + .. 
:math:: + + Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta + + .. :math:: + + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + + Parameters + ---------- + mean: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the mean values to be used for inference for each word for a time slice. + fwd_mean: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + The forward posterior values for the mean + fwd_variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + The forward posterior values for the variance + obs: obs + A matrix containing the document to topic ratios + num_time_slices: int + Number of time slices in the model. + word: int + The word's ID to process. + chain_variance : REAL_t + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + obs_variance: REAL_t + Observed variance used to approximate the true and forward variance. + + + """ + # print("") + cdef Py_ssize_t T = num_time_slices + obs = obs + word * num_time_slices + fwd_variance = fwd_variance + word * (num_time_slices + 1) + mean = mean + word * (num_time_slices + 1) + fwd_mean = fwd_mean + word * (num_time_slices + 1) + + cdef Py_ssize_t t + cdef REAL_t c + + # forward + fwd_mean[0] = 0 + + for t in range(1, T + 1): + c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) + fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] + + # backward pass + mean[T] = fwd_mean[T] + + for t in range(T - 1, -1, -1): + if chain_variance == 0.0: + c = 0.0 + else: + c = chain_variance / (fwd_variance[t] + chain_variance) + mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] + + +cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, + const REAL_t obs_variance, const REAL_t chain_variance, + const int word, const int num_time_slices): + r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) + `_. + + This function accepts the word to compute variance for, along with the associated sslm class object, + and returns the `variance` and the posterior approximation `fwd_variance`. + + Notes + ----- + This function essentially computes Var[\beta_{t,w}] for t = 1:T + + .. :math:: + + fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = + (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * + (fwd\_variance[t - 1] + obs\_variance) + + .. :math:: + + variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = + fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * + (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) + + Parameters + ---------- + variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the variance values to be used for inference of word in a time slice + fwd_variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + The forward posterior values for the variance + obs_variance: REAL_t + Observed variance used to approximate the true and forward variance. 
+ chain_variance : REAL_t + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + word: int + The word's ID to process. + num_time_slices: int + Number of time slices in the model. + """ + cdef int INIT_VARIANCE_CONST = 1000 + + cdef Py_ssize_t T = num_time_slices + variance = variance + word * (num_time_slices + 1) + fwd_variance = fwd_variance + word * (num_time_slices + 1) + cdef REAL_t c + cdef Py_ssize_t t + + # forward pass. Set initial variance very high + fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST + + for t in range(1, T + 1): + if obs_variance != 0.0: + c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) + else: + c = 0 + fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) + + # backward pass + variance[T] = fwd_variance[T] + for t in range(T - 1, -1, -1): + if fwd_variance[t] > 0.0: + c = pow((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) + else: + c = 0 + variance[t] = c * (variance[t + 1] - chain_variance) + (1 - c) * fwd_variance[t] + + +cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_variance, const REAL_t chain_variance, + const int word, const int time, const int num_time_slices): + """Helper functions for optimizing a function. + + Compute the derivative of: + + .. :math:: + + E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. + + Parameters + ---------- + deriv : a pointer to C-array of the REAL_t type and size of (num_time_slices) + Derivative for each time slice. + variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the variance values to be used for inference of word in a time slice + obs_variance: REAL_t + Observed variance used to approximate the true and forward variance. + chain_variance : REAL_t + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + word : int + The word's ID. + time : int + The time slice. + num_time_slices + Number of time slices in the model. + """ + + cdef Py_ssize_t T = num_time_slices + + cdef REAL_t *fwd_variance = variance + word * (num_time_slices + 1) + cdef Py_ssize_t t + cdef REAL_t val + cdef REAL_t w + + deriv[0] = 0 + + # forward pass + for t in range(1, T + 1): + if obs_variance > 0.0: + w = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) + else: + w = 0.0 + val = w * deriv[t - 1] + + if time == t - 1: + val += (1 - w) + + deriv[t] = val + + for t in range(T - 1, -1, -1): + if chain_variance == 0.0: + w = 0.0 + else: + w = chain_variance / (fwd_variance[t] + chain_variance) + + deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] + + +cdef compute_obs_deriv(REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_deriv_mtx, const REAL_t *variance, + const REAL_t *zeta, const REAL_t *totals, const REAL_t *word_counts, + const REAL_t chain_variance, const int word, const int num_time_slices + ): + """Derivation of obs which is used in derivative function `df_obs` while optimizing. + + Parameters + ---------- + deriv: + mean: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the mean values to be used for inference for each word for a time slice. + mean_deriv_mtx: + Mean derivative for each time slice. 
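        Expected layout is (num_time_slices, num_time_slices + 1), stored as a flat, row-major C array.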
+ variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the variance values to be used for inference of word in a time slice + zeta: a pointer to C-array of the REAL_t type and size of (num_time_slices) + An extra variational parameter with a value for each time slice. + word_counts : a pointer to C-array of the REAL_t type and size of (num_time_slices) + Total word counts for each time slice. + totals : a pointer to C-array of the REAL_t type and size of (num_time_slices) + The totals for each time slice. + chain_variance : REAL_t + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + word : int + The word's ID to process. + num_time_slices: int + Number of time slices in the model. + """ + + cdef REAL_t init_mult = 1000 + + cdef Py_ssize_t T = num_time_slices + + mean = mean + word * (num_time_slices + 1) + variance = variance + word * (num_time_slices + 1) + + cdef Py_ssize_t u, t + cdef REAL_t term1, term2, term3, term4 + + cdef REAL_t *temp_vect = malloc(T * sizeof(REAL_t)) + if temp_vect == NULL: + raise + + for u in range(T): + temp_vect[u] = exp(mean[u + 1] + variance[u + 1] / 2) + + cdef REAL_t *mean_deriv = NULL + + for t in range(T): + + mean_deriv = mean_deriv_mtx + t * (num_time_slices + 1) + term1 = 0.0 + term2 = 0.0 + term3 = 0.0 + term4 = 0.0 + + for u in range(1, T + 1): + term1 += (mean[u] - mean[u - 1]) * (mean_deriv[u] - mean_deriv[u - 1]) + term2 += (word_counts[u - 1] - (totals[u - 1] * temp_vect[u - 1] / zeta[u - 1])) * mean_deriv[u] + + if chain_variance != 0.0: + + # TODO should not it be term2 here, in not prime version term2 + term1 = - (term1 / chain_variance) - (mean[0] * mean_deriv[0]) / (init_mult * chain_variance) + else: + term1 = 0.0 + + deriv[t] = term1 + term2 + term3 + term4 + + free(temp_vect) + +cdef update_zeta(REAL_t * zeta, const REAL_t *mean, const REAL_t *variance, + const int num_time_slices, const int vocab_len): + """Update the Zeta variational parameter. + + Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), + over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. + Parameters + ---------- + zeta: a pointer to C-array of the REAL_t type and size of (num_time_slices) + An extra variational parameter with a value for each time slice. + mean: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the mean values to be used for inference for each word for a time slice. + variance: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the variance values to be used for inference of word in a time slice + num_time_slices: int + Number of time slies in the model. + vocab_len: + Length of the model's vocabulary + """ + + cdef Py_ssize_t i, w + + cdef REAL_t temp + + for i in range(num_time_slices): + temp = 0.0 + + for w in range(vocab_len): + # TODO check if it possible to use BLAS here + # TODO compare with original code, some info about log + temp += exp(mean[w * (num_time_slices + 1) + i + 1] + variance[ + w * (num_time_slices + 1) + i + 1] / 2.0) + + zeta[i] = temp + + +cdef REAL_t compute_bound(StateSpaceLanguageModelConfig * config, REAL_t *sstats, REAL_t *totals): + """Compute the maximized lower bound achieved for the log probability of the true posterior. + + Uses the formula presented in the appendix of the DTM paper (formula no. 5). 
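    For every time slice the bound accumulates an expected-counts term, a zeta normalizer term and an
    entropy term, and subtracts the Gaussian chain penalty (see the per-term accumulation below).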
+ + Parameters + ---------- + config + A pointer to the instance of config structure which stores links to the data. + sstats : numpy.ndarray + Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the first + time slice, expected shape (`self.vocab_len`, `num_topics`). + totals : list of int of length `len(self.time_slice)` + The totals for each time slice. + + Returns + ------- + float + The maximized lower bound. + + """ + + cdef int vocab_len = config[0].vocab_len + cdef Py_ssize_t T = config[0].num_time_slices + + cdef REAL_t term_1 = 0.0 + cdef REAL_t term_2 = 0.0 + cdef REAL_t term_3 = 0.0 + + cdef REAL_t val = 0.0 + cdef REAL_t ent = 0.0 + + cdef REAL_t chain_variance = config[0].chain_variance + + cdef REAL_t *mean = config[0].mean + cdef REAL_t *fwd_mean = config[0].fwd_mean + cdef REAL_t *variance = config[0].variance + cdef REAL_t *zeta = config[0].zeta + + cdef Py_ssize_t i, t, w + + for i in range(vocab_len): + config[0].word = i + compute_post_mean(mean, fwd_mean, config[0].fwd_variance, config[0].obs, + i, T, config[0].obs_variance, chain_variance) + + update_zeta(zeta, mean, variance, T, vocab_len) + + val = 0.0 + + for i in range(vocab_len): + val += variance[i * (config[0].num_time_slices + 1)] - variance[i * (config[0].num_time_slices + 1) + T] + + # TODO check if it is correct, not val (2.0 / chain_variance) + val = val / 2.0 * chain_variance + + cdef REAL_t m, prev_m, v + + for t in range(1, T + 1): + + term_1 = 0.0 + term_2 = 0.0 + ent = 0.0 + + for w in range(vocab_len): + m = mean[w * (config[0].num_time_slices + 1) + t] + prev_m = mean[w * (config[0].num_time_slices + 1) + t - 1] + + v = variance[w * (config[0].num_time_slices + 1) + t] + + term_1 += \ + (pow(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - log(chain_variance) + term_2 += sstats[w * config[0].num_time_slices + t - 1] * m + + ent += log(v) / 2 # note the 2pi's cancel with term1 (see doc) + + term_3 = -totals[t - 1] * log(zeta[t - 1]) + + val += term_2 + term_3 + ent - term_1 + + return val + +# +cdef compute_expected_log_prob(REAL_t *e_log_prob, const REAL_t *zeta, const REAL_t *mean, + const int vocab_len, const int num_time_slices): + """Compute the expected log probability given values of m. + + The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; + The below implementation is the result of solving the equation and is implemented as in the original + Blei DTM code. + + Parameters + ---------- + e_log_prob: + A matrix containing the topic to word ratios. + zeta: a pointer to C-array of the REAL_t type and size of (num_time_slices) + An extra variational parameter with a value for each time slice. + mean: a pointer to C-array of the REAL_t type and size of (vocab_len, num_time_slices + 1) + Contains the mean values to be used for inference for each word for a time slice. + num_time_slices: int + Number of time slies in the model. + vocab_len: + Length of the model's vocabulary + + """ + + cdef Py_ssize_t w, t + + for w in range(vocab_len): + for t in range(num_time_slices): + e_log_prob[w * num_time_slices + t] = mean[w * (num_time_slices + 1) + t + 1] - log( + zeta[t]) + + +cdef update_obs(StateSpaceLanguageModelConfig *config, REAL_t *sstats, REAL_t *totals): + """Optimize the bound with respect to the observed variables. + + Parameters + ---------- + config + A pointer to the instance of config structure which stores links to the data. + sstats + Sufficient statistics for a particular topic. 
Corresponds to matrix beta in the linked paper for the + current time slice, expected shape (vocab_len, num_time_slices). + + totals: + + Returns + ------- + (numpy.ndarray of float, numpy.ndarray of float) + The updated optimized values for obs and the zeta variational parameter. + + """ + + cdef int OBS_NORM_CUTOFF = 2 + cdef REAL_t STEP_SIZE = 0.01 + cdef REAL_t TOL = 0.001 + + cdef Py_ssize_t W = config[0].vocab_len + cdef Py_ssize_t T = config[0].num_time_slices + + cdef int runs = 0 + + cdef REAL_t *mean_deriv_mtx = malloc(T * (T + 1) * sizeof(REAL_t)) + if mean_deriv_mtx == NULL: + raise + cdef Py_ssize_t w, t + cdef REAL_t counts_norm + + cdef REAL_t * obs + config[0].totals = totals + + np_norm_cutoff_obs = np.zeros(T, dtype=np.double) + np_w_counts = np.zeros(T, dtype=np.double) + np_obs = np.zeros(T, dtype=np.double) + + # This is a work memory for df_obs function + working_array = np.zeros(T, dtype=np.double) + + cdef REAL_t *norm_cutoff_obs = NULL + cdef REAL_t *w_counts + + for w in range(W): + w_counts = sstats + w * config[0].num_time_slices + config[0].word = w + + counts_norm = 0.0 + + # now we find L2 norm of w_counts + for i in range(config[0].num_time_slices): + counts_norm += w_counts[i] * w_counts[i] + + counts_norm = sqrt(counts_norm) + + if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not NULL: + obs = config[0].obs + w * config[0].num_time_slices + norm_cutoff_obs = (np.PyArray_DATA(np_norm_cutoff_obs)) + # norm_cutoff_obs = malloc(config[0].num_time_slices * sizeof(REAL_t)) + if norm_cutoff_obs == NULL: + raise + memcpy(norm_cutoff_obs, obs, config[0].num_time_slices * sizeof(REAL_t)) + + else: + if counts_norm < OBS_NORM_CUTOFF: + np_w_counts = np.zeros(config[0].num_time_slices, dtype=np.double) + w_counts = (np.PyArray_DATA(np_w_counts)) + + for t in range(T): + compute_mean_deriv(mean_deriv_mtx + t * (config[0].num_time_slices + 1), config[0].variance, + config[0].obs_variance, config[0].chain_variance, w, t, T) + + np_deriv = np.zeros(T, dtype=np.double) + deriv = (np.PyArray_DATA(np_deriv)) + config[0].deriv = deriv + + obs = (np.PyArray_DATA(np_obs)) + memcpy(obs, config[0].obs + w * config[0].num_time_slices, config[0].num_time_slices * sizeof(REAL_t)) + + config[0].word_counts = w_counts + config[0].mean_deriv_mtx = mean_deriv_mtx + + # Passing C config structure as integer in Python code + args = ((config),working_array) + + temp_obs = optimize.fmin_cg( + f=f_obs, fprime=df_obs, x0=np_obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0 + ) + + obs = (np.PyArray_DATA(temp_obs)) + + runs += 1 + + if counts_norm < OBS_NORM_CUTOFF: + + norm_cutoff_obs = (np.PyArray_DATA(np_norm_cutoff_obs)) + memcpy(norm_cutoff_obs, obs, config[0].num_time_slices * sizeof(REAL_t)) + + memcpy(config[0].obs + w * config[0].num_time_slices, obs, config[0].num_time_slices * sizeof(REAL_t)) + + update_zeta(config[0].zeta, config[0].mean, config[0].variance, + config[0].num_time_slices, config[0].vocab_len) + + free(mean_deriv_mtx) + + +# the following functions are used in update_obs as the objective function. +def f_obs(_x, uintptr_t c, work_array): + """Function which we are optimising for minimizing obs. + + Parameters + ---------- + _x : np.ndarray of float64 + The obs values for this word. + c: uintptr_t + An pointer's value or address where config structure is stored in the memory. + work_array: + Additional work memory + Returns + ------- + REAL_t + The value of the objective function evaluated at point `x`. 
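        In this DTM-only code path the value is -(term1 + term2); term3 and term4 are reserved for the
        Document Influence Model and stay zero.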
+ + """ + cdef StateSpaceLanguageModelConfig * config = c + cdef REAL_t *x = (np.PyArray_DATA(_x)) + + # flag + cdef int init_mult = 1000 + cdef Py_ssize_t T = config[0].num_time_slices + cdef Py_ssize_t t + + cdef REAL_t val = 0.0 + cdef REAL_t term1 = 0.0 + cdef REAL_t term2 = 0.0 + + # term 3 and 4 for DIM + cdef REAL_t term3 = 0.0 + cdef REAL_t term4 = 0.0 + + # obs[word] = x + memcpy(config[0].obs + config[0].word * config[0].num_time_slices, x, config[0].num_time_slices * sizeof(REAL_t)) + + compute_post_mean(config[0].mean, config[0].fwd_mean, config[0].fwd_variance, config[0].obs, + config[0].word, config[0].num_time_slices, config[0].obs_variance, config[0].chain_variance) + + cdef REAL_t *mean = config[0].mean + config[0].word * (config[0].num_time_slices + 1) + cdef REAL_t *variance = config[0].variance + config[0].word * (config[0].num_time_slices + 1) + + for t in range(1, T + 1): + + term1 += (mean[t] - mean[t - 1]) * (mean[t] - mean[t - 1]) + + term2 += config[0].word_counts[t - 1] * mean[t] - config[0].totals[t - 1] * \ + exp(mean[t] + variance[t] / 2) / config[0].zeta[t - 1] + + + if config[0].chain_variance > 0.0: + + term1 = -(term1 / (2 * config[0].chain_variance)) - \ + mean[0] * mean[0] / (2 * init_mult * config[0].chain_variance) + else: + term1 = 0.0 + + return -(term1 + term2 + term3 + term4) + + +def df_obs(_x, uintptr_t c, work_array): + """Derivative of the objective function which optimises obs. + + Parameters + ---------- + _x : np.ndarray of float64 + The obs values for this word. + c: uintptr_t + An pointer's value or address where config structure is stored in the memory. + work_array: + Additional work memory + + Returns + ------- + np.ndarray of float64 + The derivative of the objective function evaluated at point `x`. + + """ + cdef StateSpaceLanguageModelConfig * config = c + cdef REAL_t *x = (np.PyArray_DATA(_x)) + + + memcpy(config[0].obs + config[0].num_time_slices * config[0].word, x, config[0].num_time_slices * sizeof(REAL_t)) + + compute_post_mean(config[0].mean, config[0].fwd_mean, config[0].fwd_variance, config[0].obs, + config[0].word, config[0].num_time_slices, config[0].obs_variance, config[0].chain_variance) + + compute_obs_deriv(config[0].deriv, config[0].mean, config[0].mean_deriv_mtx, config[0].variance, + config[0].zeta, config[0].totals, config[0].word_counts, + config[0].chain_variance, config[0].word, config[0].num_time_slices) + + for i in range(config[0].num_time_slices): + config[0].deriv[i] = -config[0].deriv[i] + + cdef REAL_t *temp_ptr = (np.PyArray_DATA(work_array)) + + memcpy(temp_ptr, config[0].deriv, config[0].num_time_slices * sizeof(REAL_t)) + + return work_array + + +def fit_sslm(model, np_sstats): + """Fits variational distribution. + + This is essentially the m-step. + Maximizes the approximation of the true posterior for a particular topic using the provided sufficient + statistics. Updates the values using :meth:`~gensim.models.ldaseqmodel.sslm.update_obs` and + :meth:`~gensim.models.ldaseqmodel.sslm.compute_expected_log_prob`. + + Parameters + ---------- + model + An instance of SSLM model + sstats : numpy.ndarray + Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the + current time slice, expected shape (vocab_len, num_time_slices). + + Returns + ------- + float + The lower bound for the true posterior achieved using the fitted approximate distribution. 
+ + """ + + # Initialize C structures based on Python instance of the model + cdef StateSpaceLanguageModelConfig* config = malloc(sizeof(StateSpaceLanguageModelConfig)) + init_sslm_config(config, model) + + cdef int W = config[0].vocab_len + cdef REAL_t old_bound = 0.0 + cdef REAL_t sslm_fit_threshold = 0.000001 + cdef int sslm_max_iter = 2 + cdef REAL_t converged = sslm_fit_threshold + 1 + + cdef int w + + for w in range(W): + config[0].word = w + compute_post_variance(config[0].variance, config[0].fwd_variance, config[0].obs_variance, + config[0].chain_variance, w, config[0].num_time_slices) + + cdef REAL_t *sstats = (np.PyArray_DATA(np_sstats)) + + # column sum of sstats + np_totals = np_sstats.sum(axis=0) + cdef REAL_t *totals = (np.PyArray_DATA(np_totals)) + + cdef int iter_ = 0 + + cdef REAL_t bound = compute_bound(config, sstats, totals) + + while converged > sslm_fit_threshold and iter_ < sslm_max_iter: + iter_ += 1 + old_bound = bound + update_obs(config, sstats, totals) + + bound = compute_bound(config, sstats, totals) + + converged = abs((bound - old_bound) / old_bound) + + compute_expected_log_prob(config[0].e_log_prob, config[0].zeta, config[0].mean, + W, config[0].num_time_slices) + + free(config) + return bound diff --git a/gensim/models/ldaseqmodel_optimized.py b/gensim/models/ldaseqmodel_optimized.py new file mode 100644 index 0000000000..ebbcfced6b --- /dev/null +++ b/gensim/models/ldaseqmodel_optimized.py @@ -0,0 +1,1007 @@ +import logging + +import numpy as np +from gensim import utils, matutils +from gensim.models import ldamodel +from .ldaseq_sslm_inner import fit_sslm +from .ldaseq_posterior_inner import fit_lda_post + +logger = logging.getLogger(__name__) + + +class LdaSeqModel_V4(utils.SaveLoad): + """Estimate Dynamic Topic Model parameters based on a training corpus.""" + def __init__( + self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, + random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100, + ): + """ + + Parameters + ---------- + corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional + Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + If not given, the model is left untrained (presumably because you want to call + :meth:`~gensim.models.ldamodel.LdaSeqModel.update` manually). + time_slice : list of int, optional + Number of documents in each time-slice. Each time slice could for example represent a year's published + papers, in case the corpus comes from a journal publishing over multiple years. + It is assumed that `sum(time_slice) == num_documents`. + id2word : dict of (int, str), optional + Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for + debugging and topic printing. + alphas : float, optional + The prior probability for the model. + num_topics : int, optional + The number of requested latent topics to be extracted from the training corpus. + initialize : {'gensim', 'own', 'ldamodel'}, optional + Controls the initialization of the DTM model. Supports three different modes: + * 'gensim': Uses gensim's LDA initialization. + * 'own': Uses your own initialization matrix of an LDA model that has been previously trained. + * 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument. 
+ sstats : numpy.ndarray , optional + Sufficient statistics used for initializing the model if `initialize == 'own'`. Corresponds to matrix + beta in the linked paper for time slice 0, expected shape (`self.vocab_len`, `num_topics`). + lda_model : :class:`~gensim.models.ldamodel.LdaModel` + Model whose sufficient statistics will be used to initialize the current object if `initialize == 'gensim'`. + obs_variance : float, optional + Observed variance used to approximate the true and forward variance as shown in + `David M. Blei, John D. Lafferty: "Dynamic Topic Models" + `_. + chain_variance : float, optional + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + passes : int, optional + Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel` + random_state : {numpy.random.RandomState, int}, optional + Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results. + lda_inference_max_iter : int, optional + Maximum number of iterations in the inference step of the LDA training. + em_min_iter : int, optional + Minimum number of iterations until converge of the Expectation-Maximization algorithm + em_max_iter : int, optional + Maximum number of iterations until converge of the Expectation-Maximization algorithm. + chunksize : int, optional + Number of documents in the corpus do be processed in in a chunk. + + """ + self.id2word = id2word + if corpus is None and self.id2word is None: + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality' + ) + + if self.id2word is None: + logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + self.id2word = utils.dict_from_corpus(corpus) + self.vocab_len = len(self.id2word) + elif self.id2word: + self.vocab_len = len(self.id2word) + else: + self.vocab_len = 0 + + if corpus is not None: + try: + self.corpus_len = len(corpus) + except TypeError: + logger.warning("input corpus stream has no len(); counting documents") + self.corpus_len = sum(1 for _ in corpus) + + self.time_slice = time_slice + if self.time_slice is not None: + self.num_time_slices = len(time_slice) + + self.num_topics = num_topics + self.num_time_slices = len(time_slice) + self.alphas = np.full(num_topics, alphas) + + # topic_chains contains for each topic a 'state space language model' object + # which in turn has information about each topic + # the sslm class is described below and contains information + # on topic-word probabilities and doc-topic probabilities. + self.topic_chains = [] + for topic in range(num_topics): + sslm_ = sslm( + num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, + chain_variance=chain_variance, obs_variance=obs_variance + ) + self.topic_chains.append(sslm_) + + # the following are class variables which are to be integrated during Document Influence Model + self.top_doc_phis = None + self.influence = None + self.renormalized_influence = None + self.influence_sum_lgl = None + + # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. 
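        # A minimal usage sketch (the corpus and the time_slice values below are hypothetical):
        #   ldaseq = LdaSeqModel_V4(corpus=corpus, time_slice=[10, 10, 11], num_topics=5)
        #   ldaseq.print_topics(time=0)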
+ if corpus is not None and time_slice is not None: + self.max_doc_len = max(len(line) for line in corpus) + + if initialize == 'gensim': + lda_model = ldamodel.LdaModel( + corpus, id2word=self.id2word, num_topics=self.num_topics, + passes=passes, alpha=self.alphas, random_state=random_state, + dtype=np.float64 + ) + self.sstats = np.transpose(lda_model.state.sstats) + if initialize == 'ldamodel': + self.sstats = np.transpose(lda_model.state.sstats) + if initialize == 'own': + self.sstats = sstats + + # initialize model from sstats + self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats) + + # fit DTM + self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) + + def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + """Initialize State Space Language Model, topic-wise. + + Parameters + ---------- + topic_chain_variance : float + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve. + topic_obs_variance : float + Observed variance used to approximate the true and forward variance as shown in + `David M. Blei, John D. Lafferty: "Dynamic Topic Models" + `_. + alpha : float + The prior probability for the model. + init_suffstats : numpy.ndarray + Sufficient statistics used for initializing the model, expected shape (`self.vocab_len`, `num_topics`). + + """ + self.alphas = alpha + for k, chain in enumerate(self.topic_chains): + sstats = init_suffstats[:, k] + chain.sslm_counts_init(topic_obs_variance, topic_chain_variance, sstats) + + # initialize the below matrices only if running DIM + # ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + + def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): + """Fit a LDA Sequence model (DTM). + + This method will iteratively setup LDA models and perform EM steps until the sufficient statistics convergence, + or until the maximum number of iterations is reached. Because the true posterior is intractable, an + appropriately tight lower bound must be used instead. This function will optimize this bound, by minimizing + its true Kullback-Liebler Divergence with the true posterior. + + Parameters + ---------- + corpus : {iterable of list of (int, float), scipy.sparse.csc} + Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + lda_inference_max_iter : int + Maximum number of iterations for the inference step of LDA. + em_min_iter : int + Minimum number of time slices to be inspected. + em_max_iter : int + Maximum number of time slices to be inspected. + chunksize : int + Number of documents to be processed in each chunk. + + Returns + ------- + float + The highest lower bound for the true posterior produced after all iterations. + + """ + LDASQE_EM_THRESHOLD = 1e-4 + # if bound is low, then we increase iterations. 
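        # More precisely: whenever the bound decreases between EM iterations, lda_inference_max_iter is
        # multiplied by ITER_MULT_LOW while it is still below LOWER_ITER; once EM converges, the
        # remaining iterations (up to em_max_iter) run with the inference cap raised to MAX_ITER.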
+ LOWER_ITER = 10 + ITER_MULT_LOW = 2 + MAX_ITER = 500 + + num_topics = self.num_topics + vocab_len = self.vocab_len + data_len = self.num_time_slices + corpus_len = self.corpus_len + + bound = 0 + convergence = LDASQE_EM_THRESHOLD + 1 + iter_ = 0 + + while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter): + + logger.info(" EM iter %i", iter_) + logger.info("E Step") + # TODO: bound is initialized to 0 + old_bound = bound + + # initiate sufficient statistics + topic_suffstats = [] + for topic in range(num_topics): + topic_suffstats.append(np.zeros((vocab_len, data_len))) + + # set up variables + gammas = np.zeros((corpus_len, num_topics)) + lhoods = np.zeros((corpus_len, num_topics + 1)) + # compute the likelihood of a sequential corpus under an LDA + # seq model and find the evidence lower bound. This is the E - Step + bound, gammas = \ + self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) + self.gammas = gammas + + logger.info("M Step") + + # fit the variational distribution. This is the M - Step + topic_bound = self.fit_lda_seq_topics(topic_suffstats) + bound += topic_bound + + if (bound - old_bound) < 0: + # if max_iter is too low, increase iterations. + if lda_inference_max_iter < LOWER_ITER: + lda_inference_max_iter *= ITER_MULT_LOW + logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter) + + # check for convergence + convergence = np.fabs((bound - old_bound) / old_bound) + + if convergence < LDASQE_EM_THRESHOLD: + + lda_inference_max_iter = MAX_ITER + logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter) + convergence = 1.0 + + logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, convergence) + + iter_ += 1 + + return bound + + def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, + iter_, lda_inference_max_iter, chunksize): + """Inference (or E-step) for the lower bound EM optimization. + + This is used to set up the gensim :class:`~gensim.models.ldamodel.LdaModel` to be used for each time-slice. + It also allows for Document Influence Model code to be written in. + + Parameters + ---------- + corpus : {iterable of list of (int, float), scipy.sparse.csc} + Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + topic_suffstats : numpy.ndarray + Sufficient statistics for time slice 0, used for initializing the model if `initialize == 'own'`, + expected shape (`self.vocab_len`, `num_topics`). + gammas : numpy.ndarray + Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. + lhoods : list of float + The total log probability lower bound for each topic. Corresponds to the phi variational parameters in the + linked paper. + iter_ : int + Current iteration. + lda_inference_max_iter : int + Maximum number of iterations for the inference step of LDA. + chunksize : int + Number of documents to be processed in each chunk. + + Returns + ------- + (float, list of float) + The first value is the highest lower bound for the true posterior. + The second value is the list of optimized dirichlet variational parameters for the approximation of + the posterior. 
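            Note that only the "DTM" code path is exercised here; the "DIM" branch is a placeholder for
            the Document Influence Model.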
+ + """ + num_topics = self.num_topics + vocab_len = self.vocab_len + bound = 0.0 + + lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) + lda.topics = np.zeros((vocab_len, num_topics)) + ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda) + + model = "DTM" + if model == "DTM": + bound, gammas = self.inferDTMseq( + corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize + ) + elif model == "DIM": + self.InfluenceTotalFixed(corpus) + bound, gammas = self.inferDIMseq( + corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize + ) + + return bound, gammas + + def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize): + """Compute the likelihood of a sequential corpus under an LDA seq model, and reports the likelihood bound. + + Parameters + ---------- + corpus : {iterable of list of (int, float), scipy.sparse.csc} + Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + topic_suffstats : numpy.ndarray + Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`). + gammas : numpy.ndarray + Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. + lhoods : list of float of length `self.num_topics` + The total log probability bound for each topic. Corresponds to phi from the linked paper. + lda : :class:`~gensim.models.ldamodel.LdaModel` + The trained LDA model of the previous iteration. + ldapost : :class:`~gensim.models.ldaseqmodel.LdaPost` + Posterior probability variables for the given LDA model. This will be used as the true (but intractable) + posterior. + iter_ : int + The current iteration. + bound : float + The LDA bound produced after all iterations. + lda_inference_max_iter : int + Maximum number of iterations for the inference step of LDA. + chunksize : int + Number of documents to be processed in each chunk. + + Returns + ------- + (float, list of float) + The first value is the highest lower bound for the true posterior. + The second value is the list of optimized dirichlet variational parameters for the approximation of + the posterior. + + """ + doc_index = 0 # overall doc_index in corpus + time = 0 # current time-slice + doc_num = 0 # doc-index in current time-slice + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + + time_slice = np.cumsum(np.array(self.time_slice)) + + for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): + # iterates chunk size for constant memory footprint + for doc in chunk: + # this is used to update the time_slice and create a new lda_seq slice every new time_slice + if doc_index > time_slice[time]: + time += 1 + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + doc_num = 0 + + gam = gammas[doc_index] + lhood = lhoods[doc_index] + + ldapost.gamma = gam + ldapost.lhood = lhood + ldapost.doc = doc + + # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. 
+ if iter_ == 0: + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter + ) + else: + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter + ) + + if topic_suffstats is not None: + topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats) + + gammas[doc_index] = ldapost.gamma + bound += doc_lhood + doc_index += 1 + doc_num += 1 + + return bound, gammas + + def make_lda_seq_slice(self, lda, time): + """Update the LDA model topic-word values using time slices. + + Parameters + ---------- + + lda : :class:`~gensim.models.ldamodel.LdaModel` + The stationary model to be updated + time : int + The time slice assigned to the stationary model. + + Returns + ------- + lda : :class:`~gensim.models.ldamodel.LdaModel` + The stationary model updated to reflect the passed time slice. + + """ + for k in range(self.num_topics): + lda.topics[:, k] = self.topic_chains[k].e_log_prob[:, time] + + lda.alpha = np.copy(self.alphas) + return lda + + def fit_lda_seq_topics(self, topic_suffstats): + """Fit the sequential model topic-wise. + + Parameters + ---------- + topic_suffstats : numpy.ndarray + Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`). + + Returns + ------- + float + The sum of the optimized lower bounds for all topics. + + """ + lhood = 0 + + for k, chain in enumerate(self.topic_chains): + logger.info("Fitting topic number %i", k) + lhood_term = chain.fit_sslm(topic_suffstats[k]) + lhood += lhood_term + + return lhood + + def print_topic_times(self, topic, top_terms=20): + """Get the most relevant words for a topic, for each timeslice. This can be used to inspect the evolution of a + topic through time. + + Parameters + ---------- + topic : int + The index of the topic. + top_terms : int, optional + Number of most relevant words associated with the topic to be returned. + + Returns + ------- + list of list of str + Top `top_terms` relevant terms for the topic for each time slice. + + """ + topics = [] + for time in range(self.num_time_slices): + topics.append(self.print_topic(topic, time, top_terms)) + + return topics + + def print_topics(self, time=0, top_terms=20): + """Get the most relevant words for every topic. + + Parameters + ---------- + time : int, optional + The time slice in which we are interested in (since topics evolve over time, it is expected that the most + relevant words will also gradually change). + top_terms : int, optional + Number of most relevant words to be returned for each topic. + + Returns + ------- + list of list of (str, float) + Representation of all topics. Each of them is represented by a list of pairs of words and their assigned + probability. + + """ + return [self.print_topic(topic, time, top_terms) for topic in range(self.num_topics)] + + def print_topic(self, topic, time=0, top_terms=20): + """Get the list of words most relevant to the given topic. + + Parameters + ---------- + topic : int + The index of the topic to be inspected. + time : int, optional + The time slice in which we are interested in (since topics evolve over time, it is expected that the most + relevant words will also gradually change). + top_terms : int, optional + Number of words associated with the topic to be returned. + + Returns + ------- + list of (str, float) + The representation of this topic. Each element in the list includes the word itself, along with the + probability assigned to it by the topic. 
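            The pairs are sorted by probability in descending order.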
+ + """ + topic = self.topic_chains[topic].e_log_prob + topic = np.transpose(topic) + topic = np.exp(topic[time]) + topic = topic / topic.sum() + bestn = matutils.argsort(topic, top_terms, reverse=True) + beststr = [(self.id2word[id_], topic[id_]) for id_ in bestn] + return beststr + + def doc_topics(self, doc_number): + """Get the topic mixture for a document. + + Uses the priors for the dirichlet distribution that approximates the true posterior with the optimal + lower bound, and therefore requires the model to be already trained. + + + Parameters + ---------- + doc_number : int + Index of the document for which the mixture is returned. + + Returns + ------- + list of length `self.num_topics` + Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex. + + """ + doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis] + return doc_topic[doc_number] + + def dtm_vis(self, time, corpus): + """Get the information needed to visualize the corpus model at a given time slice, using the pyLDAvis format. + + Parameters + ---------- + time : int + The time slice we are interested in. + corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional + The corpus we want to visualize at the given time slice. + + Returns + ------- + doc_topics : list of length `self.num_topics` + Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex. + topic_term : numpy.ndarray + The representation of each topic as a multinomial over words in the vocabulary, + expected shape (`num_topics`, vocabulary length). + doc_lengths : list of int + The number of words in each document. These could be fixed, or drawn from a Poisson distribution. + term_frequency : numpy.ndarray + The term frequency matrix (denoted as beta in the original Blei paper). This could also be the TF-IDF + representation of the corpus, expected shape (number of documents, length of vocabulary). + vocab : list of str + The set of unique terms existing in the cropuse's vocabulary. + + """ + doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis] + + def normalize(x): + return x / x.sum() + + topic_term = [ + normalize(np.exp(chain.e_log_prob.T[time])) + for k, chain in enumerate(self.topic_chains) + ] + + doc_lengths = [] + term_frequency = np.zeros(self.vocab_len) + for doc_no, doc in enumerate(corpus): + doc_lengths.append(len(doc)) + + for term, freq in doc: + term_frequency[term] += freq + + vocab = [self.id2word[i] for i in range(len(self.id2word))] + + return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab + + def dtm_coherence(self, time): + """Get the coherence for each topic. + + Can be used to measure the quality of the model, or to inspect the convergence through training via a callback. + + Parameters + ---------- + time : int + The time slice. + + Returns + ------- + list of list of str + The word representation for each topic, for each time slice. This can be used to check the time coherence + of topics as time evolves: If the most relevant words remain the same then the topic has somehow + converged or is relatively static, if they change rapidly the topic is evolving. 
+ + """ + coherence_topics = [] + for topics in self.print_topics(time): + coherence_topic = [] + for word, dist in topics: + coherence_topic.append(word) + coherence_topics.append(coherence_topic) + + return coherence_topics + + def __getitem__(self, doc): + """Get the topic mixture for the given document, using the inferred approximation of the true posterior. + + Parameters + ---------- + doc : list of (int, float) + The doc in BOW format. Can be an unseen document. + + Returns + ------- + list of float + Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex. + + """ + lda_model = ldamodel.LdaModel( + num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) + lda_model.topics = np.zeros((self.vocab_len, self.num_topics)) + ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) + + time_lhoods = [] + for time in range(self.num_time_slices): + lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice + lhood = LdaPost.fit_lda_post(ldapost, 0, time, self) + time_lhoods.append(lhood) + + doc_topic = ldapost.gamma / ldapost.gamma.sum() + # should even the likelihoods be returned? + return doc_topic + + +class sslm(utils.SaveLoad): + """Encapsulate the inner State Space Language Model for DTM. + + Some important attributes of this class: + + * `obs` is a matrix containing the document to topic ratios. + * `e_log_prob` is a matrix containing the topic to word ratios. + * `mean` contains the mean values to be used for inference for each word for a time slice. + * `variance` contains the variance values to be used for inference of word in a time slice. + * `fwd_mean` and`fwd_variance` are the forward posterior values for the mean and the variance. + * `zeta` is an extra variational parameter with a value for each time slice. + + """ + + def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): + self.vocab_len = vocab_len + self.num_time_slices = num_time_slices + self.obs_variance = obs_variance + self.chain_variance = chain_variance + self.num_topics = num_topics + + # setting up matrices + self.obs = np.zeros((vocab_len, num_time_slices), dtype=np.float64) + self.e_log_prob = np.zeros((vocab_len, num_time_slices), dtype=np.float64) + self.mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.zeta = np.zeros(num_time_slices, dtype=np.float64) + + # the following are class variables which are to be integrated during Document Influence Model + self.m_update_coeff = None + self.mean_t = None + self.variance_t = None + self.influence_sum_lgl = None + self.w_phi_l = None + self.w_phi_sum = None + self.w_phi_l_sq = None + self.m_update_coeff_g = None + + def update_zeta(self): + """Update the Zeta variational parameter. + + Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), + over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. + + Returns + ------- + list of float + The updated zeta values for each time slice. 
+ + """ + for j, val in enumerate(self.zeta): + self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) + return self.zeta + + def compute_post_variance(self, word, chain_variance): + r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) + `_. + + This function accepts the word to compute variance for, along with the associated sslm class object, + and returns the `variance` and the posterior approximation `fwd_variance`. + + Notes + ----- + This function essentially computes Var[\beta_{t,w}] for t = 1:T + + .. :math:: + + fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = + (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * + (fwd\_variance[t - 1] + obs\_variance) + + .. :math:: + + variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = + fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * + (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) + + Parameters + ---------- + word: int + The word's ID. + chain_variance : float + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + + Returns + ------- + (numpy.ndarray, numpy.ndarray) + The first returned value is the variance of each word in each time slice, the second value is the + inferred posterior variance for the same pairs. + + """ + INIT_VARIANCE_CONST = 1000 + + T = self.num_time_slices + variance = self.variance[word] + fwd_variance = self.fwd_variance[word] + # forward pass. Set initial variance very high + fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST + for t in range(1, T + 1): + if self.obs_variance: + c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + else: + c = 0 + fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) + + # backward pass + variance[T] = fwd_variance[T] + for t in range(T - 1, -1, -1): + if fwd_variance[t] > 0.0: + c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) + else: + c = 0 + variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) + + return variance, fwd_variance + + def compute_post_mean(self, word, chain_variance): + """Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) + `_. + + Notes + ----- + This function essentially computes E[\beta_{t,w}] for t = 1:T. + + .. :math:: + + Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta + + .. :math:: + + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + + Parameters + ---------- + word: int + The word's ID. + chain_variance : float + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + + Returns + ------- + (numpy.ndarray, numpy.ndarray) + The first returned value is the mean of each word in each time slice, the second value is the + inferred posterior mean for the same pairs. 
+ + """ + T = self.num_time_slices + obs = self.obs[word] + fwd_variance = self.fwd_variance[word] + mean = self.mean[word] + fwd_mean = self.fwd_mean[word] + + # forward + fwd_mean[0] = 0 + for t in range(1, T + 1): + c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] + + # backward pass + mean[T] = fwd_mean[T] + for t in range(T - 1, -1, -1): + if chain_variance == 0.0: + c = 0.0 + else: + c = chain_variance / (fwd_variance[t] + chain_variance) + mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] + return mean, fwd_mean + + def compute_expected_log_prob(self): + """Compute the expected log probability given values of m. + + The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; + The below implementation is the result of solving the equation and is implemented as in the original + Blei DTM code. + + Returns + ------- + numpy.ndarray of float + The expected value for the log probabilities for each word and time slice. + + """ + for (w, t), val in np.ndenumerate(self.e_log_prob): + self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t]) + return self.e_log_prob + + def sslm_counts_init(self, obs_variance, chain_variance, sstats): + """Initialize the State Space Language Model with LDA sufficient statistics. + + Called for each topic-chain and initializes initial mean, variance and Topic-Word probabilities + for the first time-slice. + + Parameters + ---------- + obs_variance : float, optional + Observed variance used to approximate the true and forward variance. + chain_variance : float + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + sstats : numpy.ndarray + Sufficient statistics of the LDA model. Corresponds to matrix beta in the linked paper for time slice 0, + expected shape (`self.vocab_len`, `num_topics`). + + """ + W = self.vocab_len + T = self.num_time_slices + + log_norm_counts = np.copy(sstats) + log_norm_counts /= sum(log_norm_counts) + log_norm_counts += 1.0 / W + log_norm_counts /= sum(log_norm_counts) + log_norm_counts = np.log(log_norm_counts) + + # setting variational observations to transformed counts + self.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T) + # set variational parameters + self.obs_variance = obs_variance + self.chain_variance = chain_variance + + # # compute post variance, mean + for w in range(W): + self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) + self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) + + self.zeta = self.update_zeta() + self.e_log_prob = self.compute_expected_log_prob() + + def fit_sslm(self, sstats): + """Fits variational distribution. + + This is essentially the m-step. + Maximizes the approximation of the true posterior for a particular topic using the provided sufficient + statistics. Updates the values using :meth:`~gensim.models.ldaseqmodel.sslm.update_obs` and + :meth:`~gensim.models.ldaseqmodel.sslm.compute_expected_log_prob`. + + Parameters + ---------- + sstats : numpy.ndarray + Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the + current time slice, expected shape (`self.vocab_len`, `num_topics`). + + Returns + ------- + float + The lower bound for the true posterior achieved using the fitted approximate distribution. 
+ + """ + + return fit_sslm(self, sstats) + + +class LdaPost(utils.SaveLoad): + """Posterior values associated with each set of documents. + + TODO: use **Hoffman, Blei, Bach: Online Learning for Latent Dirichlet Allocation, NIPS 2010.** + to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel. + + """ + + def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): + """Initialize the posterior value structure for the given LDA model. + + Parameters + ---------- + doc : list of (int, int) + A BOW representation of the document. Each element in the list is a pair of a word's ID and its number + of occurences in the document. + lda : :class:`~gensim.models.ldamodel.LdaModel`, optional + The underlying LDA model. + max_doc_len : int, optional + The maximum number of words in a document. + num_topics : int, optional + Number of topics discovered by the LDA model. + gamma : numpy.ndarray, optional + Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. + lhood : float, optional + The log likelihood lower bound. + + """ + self.doc = doc + self.lda = lda + self.gamma = gamma + self.lhood = lhood + if self.gamma is None: + self.gamma = np.zeros(num_topics) + if self.lhood is None: + self.lhood = np.zeros(num_topics + 1) + + if max_doc_len is not None and num_topics is not None: + self.phi = np.zeros((max_doc_len, num_topics)) + self.log_phi = np.zeros((max_doc_len, num_topics)) + + # the following are class variables which are to be integrated during Document Influence Model + + self.doc_weight = None + self.renormalized_doc_weight = None + + def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, + lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): + """Posterior inference for lda. + + Parameters + ---------- + doc_number : int + The documents number. + time : int + Time slice. + ldaseq : object + Unused. + LDA_INFERENCE_CONVERGED : float + Epsilon value used to check whether the inference step has sufficiently converged. + lda_inference_max_iter : int + Maximum number of iterations in the inference step. + g : object + Unused. Will be useful when the DIM model is implemented. + g3_matrix: object + Unused. Will be useful when the DIM model is implemented. + g4_matrix: object + Unused. Will be useful when the DIM model is implemented. + g5_matrix: object + Unused. Will be useful when the DIM model is implemented. + + Returns + ------- + float + The optimal lower bound for the true posterior using the approximate distribution. + """ + + return np.array(fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED, + lda_inference_max_iter, g, g3_matrix, g4_matrix, g5_matrix)) + + + def update_lda_seq_ss(self, time, doc, topic_suffstats): + """Update lda sequence sufficient statistics from an lda posterior. + + This is very similar to the :meth:`~gensim.models.ldaseqmodel.LdaPost.update_gamma` method and uses + the same formula. + + Parameters + ---------- + time : int + The time slice. + doc : list of (int, float) + Unused but kept here for backwards compatibility. The document set in the constructor (`self.doc`) is used + instead. + topic_suffstats : list of float + Sufficient statistics for each topic. + + Returns + ------- + list of float + The updated sufficient statistics for each topic. 
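+
+        Notes
+        -----
+        For the `n`-th `(word_id, count)` pair of `self.doc`, the update performed below is
+        `topic_suffstats[k][word_id, time] += count * phi[n, k]` for every topic `k`. A vectorized sketch of
+        the same update (illustrative only; it assumes `self.doc` is non-empty and each entry of
+        `topic_suffstats` is a numpy array of shape (vocab_len, num_time_slices)):
+
+        .. sourcecode:: pycon
+
+            >>> import numpy as np
+            >>>
+            >>> ids = np.array([word_id for word_id, _ in self.doc])
+            >>> counts = np.array([count for _, count in self.doc], dtype=np.float64)
+            >>> for k in range(self.lda.num_topics):
+            ...     np.add.at(topic_suffstats[k][:, time], ids, counts * self.phi[:len(ids), k])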
+ + """ + num_topics = self.lda.num_topics + + for k in range(num_topics): + topic_ss = topic_suffstats[k] + n = 0 + for word_id, count in self.doc: + topic_ss[word_id][time] += count * self.phi[n][k] + n += 1 + topic_suffstats[k] = topic_ss + + return topic_suffstats + + + + +"[[('idexx', 0.02256082958356584), ('electrical', 0.021255572724880007), ('skills', 0.019944382064918525), ('engineering', 0.015763035068946996), ('models', 0.013750631038974458), ('bank', 0.013583726745509497), ('modeling', 0.01203761886072807), ('complex', 0.010914594579298338), ('engineers', 0.010400197077673822), ('reviews', 0.010341923125796125), ('validation', 0.010293220205119354), ('interns', 0.0102928498107953), ('laboratories', 0.010292839655664899), ('solutions', 0.009697385952098011), ('senior', 0.008667538842761514), ('relationships', 0.008642879046552596), ('developing', 0.008619893796003735), ('documentation', 0.008610828170283354), ('tasks', 0.008605851091404736), ('components', 0.0085837114798782)], [('engineering', 0.02951417287621784), ('skills', 0.028313440245959602), ('knowledge', 0.025956248413893723), ('technical', 0.024245552645358926), ('testing', 0.01727409220691255), ('management', 0.016617274195619907), ('teams', 0.014589132561545997), ('environment', 0.012537027351412494), ('complex', 0.012292851849788823), ('reviews', 0.011188854688188414), ('technology', 0.011122285040026962), ('responsibilities', 0.010945741042933459), ('participates', 0.009658533518869254), ('quality', 0.009587766436523074), ('asrc', 0.009485862858633645), ('applications', 0.009302567710268318), ('water', 0.00928599554777531), ('solutions', 0.008539835904207396), ('hardware', 0.00843600934087972), ('engineers', 0.007947946419547743)]]" \ No newline at end of file diff --git a/setup.py b/setup.py index a24203353d..76f10ed563 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,8 @@ 'gensim._matutils': 'gensim/_matutils.c', 'gensim.models.nmf_pgd': 'gensim/models/nmf_pgd.c', 'gensim.similarities.fastss': 'gensim/similarities/fastss.c', + 'gensim.models.ldaseq_sslm_inner': 'gensim/models/ldaseq_sslm_inner.c', + 'gensim.models.ldaseq_posterior_inner': 'gensim/models/ldaseq_posterior_inner.c' } cpp_extensions = { From c3efd57eb3eac6d94c4bdfbfe8910d748d4c1be5 Mon Sep 17 00:00:00 2001 From: Sharganov Artem Date: Mon, 14 Jun 2021 22:05:11 +0300 Subject: [PATCH 2/7] Fix error in LdaPosterior initialization --- gensim/models/ldaseq_posterior_inner.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ldaseq_posterior_inner.pyx b/gensim/models/ldaseq_posterior_inner.pyx index 2caa1b0d0c..c95e7a63e4 100644 --- a/gensim/models/ldaseq_posterior_inner.pyx +++ b/gensim/models/ldaseq_posterior_inner.pyx @@ -253,7 +253,7 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, # TODO Would it be better to create numpy array first? for i in range(doc_length): word_ids[i] = self.doc[i][0] - word_counts[i] = self.doc[i][0] + word_counts[i] = self.doc[i][1] cdef REAL_t * gamma = (np.PyArray_DATA(self.gamma)) cdef REAL_t * phi = (np.PyArray_DATA(self.phi)) From 2015bdb441dc61499eb3412b64bc330d025a6ed9 Mon Sep 17 00:00:00 2001 From: Sharganov Artem Date: Fri, 18 Jun 2021 01:59:11 +0300 Subject: [PATCH 3/7] Cythonized sslm_counts_init method of sslm class Reduced time for sslm initialization (it was especially critical for large datasets). Removed duplicated code. 
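
The C-level StateSpaceLanguageModelConfig is now allocated once inside
sslm_counts_init and its address cached on the Python side as
config_c_address, so that fit_sslm can reuse the same struct instead of
allocating a fresh one on every call (freeing it is still a TODO, see the
diff below). A rough sketch of the resulting call pattern for one topic
chain (illustrative only; W, T, K and sstats_k stand for the vocabulary
size, number of time slices, number of topics and one topic's sufficient
statistics):

    chain = sslm(vocab_len=W, num_time_slices=T, num_topics=K)
    chain.sslm_counts_init(obs_variance, chain_variance, sstats_k)  # allocates and fills the C config
    bound = chain.fit_sslm(sstats_k)  # reuses it through config_c_address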
--- gensim/models/ldaseq_sslm_inner.pyx | 62 ++++++++- gensim/models/ldaseqmodel_optimized.py | 175 +------------------------ 2 files changed, 63 insertions(+), 174 deletions(-) diff --git a/gensim/models/ldaseq_sslm_inner.pyx b/gensim/models/ldaseq_sslm_inner.pyx index 9245a8cd95..19936b9993 100644 --- a/gensim/models/ldaseq_sslm_inner.pyx +++ b/gensim/models/ldaseq_sslm_inner.pyx @@ -42,6 +42,62 @@ import numpy as np from scipy import optimize +def sslm_counts_init(model, obs_variance, chain_variance, sstats): + """Initialize the State Space Language Model with LDA sufficient statistics. + + Called for each topic-chain and initializes initial mean, variance and Topic-Word probabilities + for the first time-slice. + + Parameters + ---------- + obs_variance : float, optional + Observed variance used to approximate the true and forward variance. + chain_variance : float + Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. + sstats : numpy.ndarray + Sufficient statistics of the LDA model. Corresponds to matrix beta in the linked paper for time slice 0, + expected shape (`self.vocab_len`, `num_topics`). + + """ + W = model.vocab_len + T = model.num_time_slices + + log_norm_counts = np.copy(sstats) + log_norm_counts /= sum(log_norm_counts) + log_norm_counts += 1.0 / W + log_norm_counts /= sum(log_norm_counts) + log_norm_counts = np.log(log_norm_counts) + + cdef StateSpaceLanguageModelConfig * config = malloc( + sizeof(StateSpaceLanguageModelConfig)) + + + # setting variational observations to transformed counts + model.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T) + # set variational parameters + model.obs_variance = obs_variance + model.chain_variance = chain_variance + + init_sslm_config(config, model) + + cdef int w + cdef int vocab_len = model.vocab_len + + # # compute post variance, mean + for w in range(vocab_len): + compute_post_variance(config.variance, config.fwd_variance, + config.obs_variance, config.chain_variance, + w, config.num_time_slices) + + compute_post_mean(config.mean, config.fwd_mean, config.fwd_variance, + config.obs, w, config.num_time_slices, + config.obs_variance, config.chain_variance) + + update_zeta(config.zeta, config.mean, config.variance, config.num_time_slices, config.vocab_len) + compute_expected_log_prob(config.e_log_prob, config.zeta, config.mean, + config.vocab_len, config.num_time_slices) + model.config_c_address = (config) + cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_variance, const REAL_t *obs, const int word, const int num_time_slices, const REAL_t obs_variance, const REAL_t chain_variance): @@ -698,7 +754,8 @@ def fit_sslm(model, np_sstats): """ # Initialize C structures based on Python instance of the model - cdef StateSpaceLanguageModelConfig* config = malloc(sizeof(StateSpaceLanguageModelConfig)) + cdef StateSpaceLanguageModelConfig * config = ((model.config_c_address)) + init_sslm_config(config, model) cdef int W = config[0].vocab_len @@ -736,5 +793,6 @@ def fit_sslm(model, np_sstats): compute_expected_log_prob(config[0].e_log_prob, config[0].zeta, config[0].mean, W, config[0].num_time_slices) - free(config) + # TODO find a way/place where to free a memory + # free(config) return bound diff --git a/gensim/models/ldaseqmodel_optimized.py b/gensim/models/ldaseqmodel_optimized.py index ebbcfced6b..97eae4666b 100644 --- a/gensim/models/ldaseqmodel_optimized.py +++ b/gensim/models/ldaseqmodel_optimized.py @@ -3,7 +3,7 @@ import numpy as np from gensim import 
utils, matutils from gensim.models import ldamodel -from .ldaseq_sslm_inner import fit_sslm +from .ldaseq_sslm_inner import fit_sslm, sslm_counts_init from .ldaseq_posterior_inner import fit_lda_post logger = logging.getLogger(__name__) @@ -670,157 +670,8 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.w_phi_sum = None self.w_phi_l_sq = None self.m_update_coeff_g = None + self.config_c_address = 0 - def update_zeta(self): - """Update the Zeta variational parameter. - - Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), - over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. - - Returns - ------- - list of float - The updated zeta values for each time slice. - - """ - for j, val in enumerate(self.zeta): - self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) - return self.zeta - - def compute_post_variance(self, word, chain_variance): - r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) - `_. - - This function accepts the word to compute variance for, along with the associated sslm class object, - and returns the `variance` and the posterior approximation `fwd_variance`. - - Notes - ----- - This function essentially computes Var[\beta_{t,w}] for t = 1:T - - .. :math:: - - fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = - (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * - (fwd\_variance[t - 1] + obs\_variance) - - .. :math:: - - variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = - fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * - (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) - - Parameters - ---------- - word: int - The word's ID. - chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - - Returns - ------- - (numpy.ndarray, numpy.ndarray) - The first returned value is the variance of each word in each time slice, the second value is the - inferred posterior variance for the same pairs. - - """ - INIT_VARIANCE_CONST = 1000 - - T = self.num_time_slices - variance = self.variance[word] - fwd_variance = self.fwd_variance[word] - # forward pass. Set initial variance very high - fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST - for t in range(1, T + 1): - if self.obs_variance: - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) - else: - c = 0 - fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) - - # backward pass - variance[T] = fwd_variance[T] - for t in range(T - 1, -1, -1): - if fwd_variance[t] > 0.0: - c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) - else: - c = 0 - variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) - - return variance, fwd_variance - - def compute_post_mean(self, word, chain_variance): - """Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) - `_. - - Notes - ----- - This function essentially computes E[\beta_{t,w}] for t = 1:T. - - .. :math:: - - Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + - (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta - - .. 
:math:: - - Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) - = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + - (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] - - Parameters - ---------- - word: int - The word's ID. - chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - - Returns - ------- - (numpy.ndarray, numpy.ndarray) - The first returned value is the mean of each word in each time slice, the second value is the - inferred posterior mean for the same pairs. - - """ - T = self.num_time_slices - obs = self.obs[word] - fwd_variance = self.fwd_variance[word] - mean = self.mean[word] - fwd_mean = self.fwd_mean[word] - - # forward - fwd_mean[0] = 0 - for t in range(1, T + 1): - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) - fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] - - # backward pass - mean[T] = fwd_mean[T] - for t in range(T - 1, -1, -1): - if chain_variance == 0.0: - c = 0.0 - else: - c = chain_variance / (fwd_variance[t] + chain_variance) - mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] - return mean, fwd_mean - - def compute_expected_log_prob(self): - """Compute the expected log probability given values of m. - - The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; - The below implementation is the result of solving the equation and is implemented as in the original - Blei DTM code. - - Returns - ------- - numpy.ndarray of float - The expected value for the log probabilities for each word and time slice. - - """ - for (w, t), val in np.ndenumerate(self.e_log_prob): - self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t]) - return self.e_log_prob def sslm_counts_init(self, obs_variance, chain_variance, sstats): """Initialize the State Space Language Model with LDA sufficient statistics. @@ -839,28 +690,8 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): expected shape (`self.vocab_len`, `num_topics`). """ - W = self.vocab_len - T = self.num_time_slices - - log_norm_counts = np.copy(sstats) - log_norm_counts /= sum(log_norm_counts) - log_norm_counts += 1.0 / W - log_norm_counts /= sum(log_norm_counts) - log_norm_counts = np.log(log_norm_counts) - - # setting variational observations to transformed counts - self.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T) - # set variational parameters - self.obs_variance = obs_variance - self.chain_variance = chain_variance - - # # compute post variance, mean - for w in range(W): - self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) - self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) - self.zeta = self.update_zeta() - self.e_log_prob = self.compute_expected_log_prob() + sslm_counts_init(self, obs_variance, chain_variance, sstats) def fit_sslm(self, sstats): """Fits variational distribution. 
From f22670b293a9e0eb886a3e451343ddbbe2eb0a2e Mon Sep 17 00:00:00 2001 From: Sharganov Artem Date: Wed, 23 Jun 2021 00:10:12 +0300 Subject: [PATCH 4/7] Code refactoring + merge of optimized code with original implementation --- gensim/models/__init__.py | 1 - gensim/models/ldaseq_posterior_inner.pyx | 87 +-- gensim/models/ldaseq_sslm_inner.pyx | 275 ++++---- gensim/models/ldaseqmodel.py | 810 +--------------------- gensim/models/ldaseqmodel_optimized.py | 838 ----------------------- 5 files changed, 224 insertions(+), 1787 deletions(-) delete mode 100644 gensim/models/ldaseqmodel_optimized.py diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index ec47f03715..a2d0af279f 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -22,7 +22,6 @@ from .fasttext import FastText # noqa:F401 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from .nmf import Nmf # noqa:F401 -from .ldaseqmodel_optimized import LdaSeqModel_V4 from gensim import interfaces, utils diff --git a/gensim/models/ldaseq_posterior_inner.pyx b/gensim/models/ldaseq_posterior_inner.pyx index c95e7a63e4..1f838227f4 100644 --- a/gensim/models/ldaseq_posterior_inner.pyx +++ b/gensim/models/ldaseq_posterior_inner.pyx @@ -1,3 +1,5 @@ +import numpy as np + from scipy.special.cython_special cimport gammaln from scipy.special.cython_special cimport psi @@ -7,14 +9,15 @@ from libc.stdlib cimport malloc, free cimport numpy as np -import numpy as np ctypedef np.float64_t REAL_t -cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, - int * word_ids, REAL_t * lda_topics, const int num_topics, - const int doc_length): +cdef update_phi( + REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, + int * word_ids, REAL_t * lda_topics, const int num_topics, + const int doc_length + ): """Update variational multinomial parameters, based on a document and a time-slice. @@ -48,8 +51,6 @@ cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, cdef REAL_t * dig = malloc(num_topics * sizeof(REAL_t)) if dig == NULL: raise - if dig == NULL: - print("NULL pointer") for k in range(num_topics): dig[k] = psi(gamma[k]) @@ -59,9 +60,7 @@ cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, for i in range(doc_length): for k in range(num_topics): - # TODO check if proper indexing - log_phi[i * num_topics + k] = \ - dig[k] + lda_topics[word_ids[i] * num_topics + k] + log_phi[i * num_topics + k] = dig[k] + lda_topics[word_ids[i] * num_topics + k] log_phi_row = log_phi + i * num_topics @@ -73,7 +72,6 @@ cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, v = log(exp(v) + exp(log_phi_row[i])) # subtract every element by v - # TODO blas for i in range(num_topics): log_phi_row[i] = log_phi_row[i] - v @@ -87,8 +85,10 @@ cdef update_phi_fixed(): return -cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, - const int *word_counts, const int num_topics, const int doc_length): +cdef update_gamma( + REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, + const int *word_counts, const int num_topics, const int doc_length + ): """Update variational dirichlet parameters. This operations is described in the original Blei LDA paper: @@ -96,13 +96,13 @@ cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, Parameters ---------- - gamma: size of num_topics. int/out-parameter + gamma: size of num_topics. - phi: size of (max_doc_len, num_topics). in-parameter + phi: size of (max_doc_len, num_topics). 
- lda_alpha: size of num_topics. in-parameter + lda_alpha: size of num_topics. - word_counts: size of doc_length. in-parameter + word_counts: size of doc_length. num_topics: number of topics in the model @@ -118,10 +118,12 @@ cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, gamma[k] += phi[i * num_topics + k] * word_counts[i] -cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi, - const REAL_t * lda_alpha, const REAL_t *lda_topics, - int *word_counts, int *word_ids, - const int num_topics, const int doc_length): +cdef REAL_t compute_lda_lhood( + REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi, + const REAL_t * lda_alpha, const REAL_t *lda_topics, + int *word_counts, int *word_ids, + const int num_topics, const int doc_length + ): """Compute the log likelihood bound. Parameters ---------- @@ -129,7 +131,7 @@ cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t lhood: size of num_topics + 1 - phi: size of (max_doc_len, num_topics). in-parameter + phi: size of (max_doc_len, num_topics). log_phi: size of (max_doc_len, num_topics). in-parameter @@ -154,7 +156,6 @@ cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t cdef int i cdef REAL_t gamma_sum = 0.0 - for i in range(num_topics): gamma_sum += gamma[i] @@ -172,9 +173,7 @@ cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t e_log_theta_k = psi(gamma[k]) - digsum - lhood_term = \ - (lda_alpha[k] - gamma[k]) * e_log_theta_k + \ - gammaln(gamma[k]) - gammaln(lda_alpha[k]) + lhood_term = (lda_alpha[k] - gamma[k]) * e_log_theta_k + gammaln(gamma[k]) - gammaln(lda_alpha[k]) # TODO: check why there's an IF for i in range(doc_length): @@ -189,9 +188,11 @@ cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t return lhood_v -cdef init_lda_post(REAL_t *gamma, REAL_t * phi, const int *word_counts, - const REAL_t *lda_alpha, - const int doc_length, const int num_topics): +cdef init_lda_post( + REAL_t *gamma, REAL_t * phi, const int *word_counts, + const REAL_t *lda_alpha, + const int doc_length, const int num_topics + ): """Initialize variational posterior. """ @@ -216,8 +217,10 @@ cdef init_lda_post(REAL_t *gamma, REAL_t * phi, const int *word_counts, phi_doc[j] = init_value -def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, - lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): +def fit_lda_post( + self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, + lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None + ): """Posterior inference for lda. 
Parameters @@ -272,9 +275,11 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, # sum of counts in a doc cdef REAL_t total = sum(count for word_id, count in self.doc) - cdef REAL_t lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, - lda_alpha, lda_topics, word_counts, word_ids, - num_topics, doc_length) + cdef REAL_t lhood_v = compute_lda_lhood( + lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length + ) cdef REAL_t lhood_old = 0.0 cdef REAL_t converged = 0.0 @@ -290,9 +295,11 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, # if model == "DTM" or sslm is None: update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) - lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, - lda_alpha, lda_topics, word_counts, word_ids, - num_topics, doc_length) + lhood_v = compute_lda_lhood( + lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length + ) converged = abs((lhood_old - lhood_v) / (lhood_old * total)) @@ -305,9 +312,11 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) - lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, - lda_alpha, lda_topics, word_counts, word_ids, - num_topics, doc_length) + lhood_v = compute_lda_lhood( + lhood, gamma, phi, log_phi, + lda_alpha, lda_topics, word_counts, word_ids, + num_topics, doc_length + ) converged = np.fabs((lhood_old - lhood_v) / (lhood_old * total)) diff --git a/gensim/models/ldaseq_sslm_inner.pyx b/gensim/models/ldaseq_sslm_inner.pyx index 19936b9993..5fbd6920c7 100644 --- a/gensim/models/ldaseq_sslm_inner.pyx +++ b/gensim/models/ldaseq_sslm_inner.pyx @@ -4,12 +4,12 @@ # cython: cdivision=True # cython: embedsignature=True + from libc.math cimport pow, log, exp, abs, sqrt from libc.string cimport memcpy from libc.stdlib cimport malloc, free from libc.stdint cimport uintptr_t - cimport numpy as np @@ -35,7 +35,7 @@ cdef init_sslm_config(StateSpaceLanguageModelConfig * config, model): # Default initialization should raise exception if it used without proper initialization config[0].deriv = NULL config[0].mean_deriv_mtx = NULL - word = -1 + config[0].word = -1 import numpy as np @@ -85,22 +85,29 @@ def sslm_counts_init(model, obs_variance, chain_variance, sstats): # # compute post variance, mean for w in range(vocab_len): - compute_post_variance(config.variance, config.fwd_variance, - config.obs_variance, config.chain_variance, - w, config.num_time_slices) - - compute_post_mean(config.mean, config.fwd_mean, config.fwd_variance, - config.obs, w, config.num_time_slices, - config.obs_variance, config.chain_variance) + compute_post_variance( + config.variance, config.fwd_variance, + config.obs_variance, config.chain_variance, + w, config.num_time_slices + ) + + compute_post_mean( + config.mean, config.fwd_mean, config.fwd_variance, + config.obs, w, config.num_time_slices, + config.obs_variance, config.chain_variance + ) update_zeta(config.zeta, config.mean, config.variance, config.num_time_slices, config.vocab_len) - compute_expected_log_prob(config.e_log_prob, config.zeta, config.mean, - config.vocab_len, config.num_time_slices) + compute_expected_log_prob(config.e_log_prob, config.zeta, config.mean, config.vocab_len, config.num_time_slices) + model.config_c_address = (config) -cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_variance, 
const REAL_t *obs, - const int word, const int num_time_slices, - const REAL_t obs_variance, const REAL_t chain_variance): + +cdef compute_post_mean( + REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_variance, const REAL_t *obs, + const int word, const int num_time_slices, + const REAL_t obs_variance, const REAL_t chain_variance + ): """Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) `_. @@ -141,8 +148,7 @@ cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_varianc """ - # print("") - cdef Py_ssize_t T = num_time_slices + obs = obs + word * num_time_slices fwd_variance = fwd_variance + word * (num_time_slices + 1) mean = mean + word * (num_time_slices + 1) @@ -154,14 +160,14 @@ cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_varianc # forward fwd_mean[0] = 0 - for t in range(1, T + 1): + for t in range(1, num_time_slices + 1): c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] # backward pass - mean[T] = fwd_mean[T] + mean[num_time_slices] = fwd_mean[num_time_slices] - for t in range(T - 1, -1, -1): + for t in range(num_time_slices - 1, -1, -1): if chain_variance == 0.0: c = 0.0 else: @@ -169,9 +175,11 @@ cdef compute_post_mean(REAL_t *mean, REAL_t *fwd_mean, const REAL_t *fwd_varianc mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] -cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, - const REAL_t obs_variance, const REAL_t chain_variance, - const int word, const int num_time_slices): +cdef compute_post_variance( + REAL_t *variance, REAL_t *fwd_variance, + const REAL_t obs_variance, const REAL_t chain_variance, + const int word, const int num_time_slices + ): r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) `_. @@ -211,7 +219,6 @@ cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, """ cdef int INIT_VARIANCE_CONST = 1000 - cdef Py_ssize_t T = num_time_slices variance = variance + word * (num_time_slices + 1) fwd_variance = fwd_variance + word * (num_time_slices + 1) cdef REAL_t c @@ -220,7 +227,7 @@ cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, # forward pass. 
Set initial variance very high fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST - for t in range(1, T + 1): + for t in range(1, num_time_slices + 1): if obs_variance != 0.0: c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) else: @@ -228,8 +235,8 @@ cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) # backward pass - variance[T] = fwd_variance[T] - for t in range(T - 1, -1, -1): + variance[num_time_slices] = fwd_variance[num_time_slices] + for t in range(num_time_slices - 1, -1, -1): if fwd_variance[t] > 0.0: c = pow((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) else: @@ -237,8 +244,10 @@ cdef compute_post_variance(REAL_t *variance, REAL_t *fwd_variance, variance[t] = c * (variance[t + 1] - chain_variance) + (1 - c) * fwd_variance[t] -cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_variance, const REAL_t chain_variance, - const int word, const int time, const int num_time_slices): +cdef compute_mean_deriv( + REAL_t *deriv, const REAL_t *variance, const REAL_t obs_variance, const REAL_t chain_variance, + const int word, const int time, const int num_time_slices + ): """Helper functions for optimizing a function. Compute the derivative of: @@ -265,8 +274,6 @@ cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_ Number of time slices in the model. """ - cdef Py_ssize_t T = num_time_slices - cdef REAL_t *fwd_variance = variance + word * (num_time_slices + 1) cdef Py_ssize_t t cdef REAL_t val @@ -275,7 +282,7 @@ cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_ deriv[0] = 0 # forward pass - for t in range(1, T + 1): + for t in range(1, num_time_slices + 1): if obs_variance > 0.0: w = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) else: @@ -287,7 +294,7 @@ cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_ deriv[t] = val - for t in range(T - 1, -1, -1): + for t in range(num_time_slices - 1, -1, -1): if chain_variance == 0.0: w = 0.0 else: @@ -296,10 +303,11 @@ cdef compute_mean_deriv(REAL_t *deriv, const REAL_t *variance, const REAL_t obs_ deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] -cdef compute_obs_deriv(REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_deriv_mtx, const REAL_t *variance, - const REAL_t *zeta, const REAL_t *totals, const REAL_t *word_counts, - const REAL_t chain_variance, const int word, const int num_time_slices - ): +cdef compute_obs_deriv( + REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_deriv_mtx, const REAL_t *variance, + const REAL_t *zeta, const REAL_t *totals, const REAL_t *word_counts, + const REAL_t chain_variance, const int word, const int num_time_slices + ): """Derivation of obs which is used in derivative function `df_obs` while optimizing. 
Parameters @@ -327,24 +335,22 @@ cdef compute_obs_deriv(REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_der cdef REAL_t init_mult = 1000 - cdef Py_ssize_t T = num_time_slices - mean = mean + word * (num_time_slices + 1) variance = variance + word * (num_time_slices + 1) cdef Py_ssize_t u, t cdef REAL_t term1, term2, term3, term4 - cdef REAL_t *temp_vect = malloc(T * sizeof(REAL_t)) + cdef REAL_t *temp_vect = malloc(num_time_slices * sizeof(REAL_t)) if temp_vect == NULL: raise - for u in range(T): + for u in range(num_time_slices): temp_vect[u] = exp(mean[u + 1] + variance[u + 1] / 2) cdef REAL_t *mean_deriv = NULL - for t in range(T): + for t in range(num_time_slices): mean_deriv = mean_deriv_mtx + t * (num_time_slices + 1) term1 = 0.0 @@ -352,7 +358,7 @@ cdef compute_obs_deriv(REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_der term3 = 0.0 term4 = 0.0 - for u in range(1, T + 1): + for u in range(1, num_time_slices + 1): term1 += (mean[u] - mean[u - 1]) * (mean_deriv[u] - mean_deriv[u - 1]) term2 += (word_counts[u - 1] - (totals[u - 1] * temp_vect[u - 1] / zeta[u - 1])) * mean_deriv[u] @@ -367,8 +373,10 @@ cdef compute_obs_deriv(REAL_t *deriv, const REAL_t *mean, const REAL_t *mean_der free(temp_vect) -cdef update_zeta(REAL_t * zeta, const REAL_t *mean, const REAL_t *variance, - const int num_time_slices, const int vocab_len): +cdef update_zeta( + REAL_t * zeta, const REAL_t *mean, const REAL_t *variance, + const int num_time_slices, const int vocab_len + ): """Update the Zeta variational parameter. Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), @@ -395,8 +403,6 @@ cdef update_zeta(REAL_t * zeta, const REAL_t *mean, const REAL_t *variance, temp = 0.0 for w in range(vocab_len): - # TODO check if it possible to use BLAS here - # TODO compare with original code, some info about log temp += exp(mean[w * (num_time_slices + 1) + i + 1] + variance[ w * (num_time_slices + 1) + i + 1] / 2.0) @@ -425,8 +431,8 @@ cdef REAL_t compute_bound(StateSpaceLanguageModelConfig * config, REAL_t *sstats """ - cdef int vocab_len = config[0].vocab_len - cdef Py_ssize_t T = config[0].num_time_slices + cdef int vocab_len = config.vocab_len + cdef Py_ssize_t num_time_slices = config.num_time_slices cdef REAL_t term_1 = 0.0 cdef REAL_t term_2 = 0.0 @@ -435,47 +441,48 @@ cdef REAL_t compute_bound(StateSpaceLanguageModelConfig * config, REAL_t *sstats cdef REAL_t val = 0.0 cdef REAL_t ent = 0.0 - cdef REAL_t chain_variance = config[0].chain_variance + cdef REAL_t chain_variance = config.chain_variance - cdef REAL_t *mean = config[0].mean - cdef REAL_t *fwd_mean = config[0].fwd_mean - cdef REAL_t *variance = config[0].variance - cdef REAL_t *zeta = config[0].zeta + cdef REAL_t *mean = config.mean + cdef REAL_t *fwd_mean = config.fwd_mean + cdef REAL_t *variance = config.variance + cdef REAL_t *zeta = config.zeta cdef Py_ssize_t i, t, w for i in range(vocab_len): - config[0].word = i - compute_post_mean(mean, fwd_mean, config[0].fwd_variance, config[0].obs, - i, T, config[0].obs_variance, chain_variance) + config.word = i + compute_post_mean( + mean, fwd_mean, config.fwd_variance, config.obs, + i, num_time_slices, config.obs_variance, chain_variance + ) - update_zeta(zeta, mean, variance, T, vocab_len) + update_zeta(zeta, mean, variance, num_time_slices, vocab_len) val = 0.0 for i in range(vocab_len): - val += variance[i * (config[0].num_time_slices + 1)] - variance[i * (config[0].num_time_slices + 1) + T] + val += variance[i * (num_time_slices + 1)] - variance[i 
* (num_time_slices + 1) + num_time_slices] # TODO check if it is correct, not val (2.0 / chain_variance) val = val / 2.0 * chain_variance cdef REAL_t m, prev_m, v - for t in range(1, T + 1): + for t in range(1, num_time_slices + 1): term_1 = 0.0 term_2 = 0.0 ent = 0.0 for w in range(vocab_len): - m = mean[w * (config[0].num_time_slices + 1) + t] - prev_m = mean[w * (config[0].num_time_slices + 1) + t - 1] + m = mean[w * (num_time_slices + 1) + t] + prev_m = mean[w * (num_time_slices + 1) + t - 1] - v = variance[w * (config[0].num_time_slices + 1) + t] + v = variance[w * (num_time_slices + 1) + t] - term_1 += \ - (pow(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - log(chain_variance) - term_2 += sstats[w * config[0].num_time_slices + t - 1] * m + term_1 += (pow(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - log(chain_variance) + term_2 += sstats[w * num_time_slices + t - 1] * m ent += log(v) / 2 # note the 2pi's cancel with term1 (see doc) @@ -486,8 +493,10 @@ cdef REAL_t compute_bound(StateSpaceLanguageModelConfig * config, REAL_t *sstats return val # -cdef compute_expected_log_prob(REAL_t *e_log_prob, const REAL_t *zeta, const REAL_t *mean, - const int vocab_len, const int num_time_slices): +cdef compute_expected_log_prob( + REAL_t *e_log_prob, const REAL_t *zeta, const REAL_t *mean, + const int vocab_len, const int num_time_slices + ): """Compute the expected log probability given values of m. The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; @@ -541,68 +550,74 @@ cdef update_obs(StateSpaceLanguageModelConfig *config, REAL_t *sstats, REAL_t *t cdef REAL_t STEP_SIZE = 0.01 cdef REAL_t TOL = 0.001 - cdef Py_ssize_t W = config[0].vocab_len - cdef Py_ssize_t T = config[0].num_time_slices + cdef Py_ssize_t vocab_len = config.vocab_len + cdef Py_ssize_t num_time_slices = config.num_time_slices cdef int runs = 0 - cdef REAL_t *mean_deriv_mtx = malloc(T * (T + 1) * sizeof(REAL_t)) + cdef REAL_t *mean_deriv_mtx = malloc(num_time_slices * (num_time_slices + 1) * sizeof(REAL_t)) if mean_deriv_mtx == NULL: raise cdef Py_ssize_t w, t cdef REAL_t counts_norm cdef REAL_t * obs - config[0].totals = totals + config.totals = totals - np_norm_cutoff_obs = np.zeros(T, dtype=np.double) - np_w_counts = np.zeros(T, dtype=np.double) - np_obs = np.zeros(T, dtype=np.double) + # TODO check if it should be changed to C memory allocation + np_norm_cutoff_obs = np.zeros(num_time_slices, dtype=np.double) + # TODO check if it should be changed to C memory allocation + np_w_counts = np.zeros(num_time_slices, dtype=np.double) + + # Allocate it as numpy array to pass it to Python code (optimize.fmin_cg) + np_obs = np.zeros(num_time_slices, dtype=np.double) # This is a work memory for df_obs function - working_array = np.zeros(T, dtype=np.double) + working_array = np.zeros(num_time_slices, dtype=np.double) cdef REAL_t *norm_cutoff_obs = NULL cdef REAL_t *w_counts - for w in range(W): - w_counts = sstats + w * config[0].num_time_slices - config[0].word = w + for w in range(vocab_len): + w_counts = sstats + w * num_time_slices + config.word = w counts_norm = 0.0 # now we find L2 norm of w_counts - for i in range(config[0].num_time_slices): + for i in range(num_time_slices): counts_norm += w_counts[i] * w_counts[i] counts_norm = sqrt(counts_norm) if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not NULL: - obs = config[0].obs + w * config[0].num_time_slices + obs = config.obs + w * num_time_slices norm_cutoff_obs = 
(np.PyArray_DATA(np_norm_cutoff_obs)) # norm_cutoff_obs = malloc(config[0].num_time_slices * sizeof(REAL_t)) if norm_cutoff_obs == NULL: raise - memcpy(norm_cutoff_obs, obs, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(norm_cutoff_obs, obs, num_time_slices * sizeof(REAL_t)) else: if counts_norm < OBS_NORM_CUTOFF: - np_w_counts = np.zeros(config[0].num_time_slices, dtype=np.double) + np_w_counts = np.zeros(num_time_slices, dtype=np.double) w_counts = (np.PyArray_DATA(np_w_counts)) - for t in range(T): - compute_mean_deriv(mean_deriv_mtx + t * (config[0].num_time_slices + 1), config[0].variance, - config[0].obs_variance, config[0].chain_variance, w, t, T) + for t in range(num_time_slices): + compute_mean_deriv( + mean_deriv_mtx + t * (num_time_slices + 1), config.variance, + config.obs_variance, config.chain_variance, w, t, num_time_slices + ) - np_deriv = np.zeros(T, dtype=np.double) + np_deriv = np.zeros(num_time_slices, dtype=np.double) deriv = (np.PyArray_DATA(np_deriv)) - config[0].deriv = deriv + config.deriv = deriv obs = (np.PyArray_DATA(np_obs)) - memcpy(obs, config[0].obs + w * config[0].num_time_slices, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(obs, config.obs + w * num_time_slices, num_time_slices * sizeof(REAL_t)) - config[0].word_counts = w_counts - config[0].mean_deriv_mtx = mean_deriv_mtx + config.word_counts = w_counts + config.mean_deriv_mtx = mean_deriv_mtx # Passing C config structure as integer in Python code args = ((config),working_array) @@ -618,12 +633,12 @@ cdef update_obs(StateSpaceLanguageModelConfig *config, REAL_t *sstats, REAL_t *t if counts_norm < OBS_NORM_CUTOFF: norm_cutoff_obs = (np.PyArray_DATA(np_norm_cutoff_obs)) - memcpy(norm_cutoff_obs, obs, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(norm_cutoff_obs, obs, num_time_slices * sizeof(REAL_t)) - memcpy(config[0].obs + w * config[0].num_time_slices, obs, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(config.obs + w * num_time_slices, obs, num_time_slices * sizeof(REAL_t)) - update_zeta(config[0].zeta, config[0].mean, config[0].variance, - config[0].num_time_slices, config[0].vocab_len) + update_zeta(config.zeta, config.mean, config.variance, + num_time_slices, config.vocab_len) free(mean_deriv_mtx) @@ -651,7 +666,8 @@ def f_obs(_x, uintptr_t c, work_array): # flag cdef int init_mult = 1000 - cdef Py_ssize_t T = config[0].num_time_slices + cdef Py_ssize_t num_time_slices = config.num_time_slices + cdef Py_ssize_t t cdef REAL_t val = 0.0 @@ -663,26 +679,28 @@ def f_obs(_x, uintptr_t c, work_array): cdef REAL_t term4 = 0.0 # obs[word] = x - memcpy(config[0].obs + config[0].word * config[0].num_time_slices, x, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(config.obs + config.word * num_time_slices, x, num_time_slices * sizeof(REAL_t)) - compute_post_mean(config[0].mean, config[0].fwd_mean, config[0].fwd_variance, config[0].obs, - config[0].word, config[0].num_time_slices, config[0].obs_variance, config[0].chain_variance) + compute_post_mean( + config.mean, config.fwd_mean, config.fwd_variance, config.obs, + config.word, num_time_slices, config.obs_variance, config.chain_variance + ) - cdef REAL_t *mean = config[0].mean + config[0].word * (config[0].num_time_slices + 1) - cdef REAL_t *variance = config[0].variance + config[0].word * (config[0].num_time_slices + 1) + cdef REAL_t *mean = config.mean + config.word * (num_time_slices + 1) + cdef REAL_t *variance = config.variance + config.word * (num_time_slices + 1) - for t in range(1, T + 1): + for t in range(1, 
num_time_slices + 1): term1 += (mean[t] - mean[t - 1]) * (mean[t] - mean[t - 1]) - term2 += config[0].word_counts[t - 1] * mean[t] - config[0].totals[t - 1] * \ - exp(mean[t] + variance[t] / 2) / config[0].zeta[t - 1] + term2 += config.word_counts[t - 1] * mean[t] - config.totals[t - 1] * \ + exp(mean[t] + variance[t] / 2) / config.zeta[t - 1] - if config[0].chain_variance > 0.0: + if config.chain_variance > 0.0: - term1 = -(term1 / (2 * config[0].chain_variance)) - \ - mean[0] * mean[0] / (2 * init_mult * config[0].chain_variance) + term1 = -(term1 / (2 * config.chain_variance)) - \ + mean[0] * mean[0] / (2 * init_mult * config.chain_variance) else: term1 = 0.0 @@ -711,21 +729,25 @@ def df_obs(_x, uintptr_t c, work_array): cdef REAL_t *x = (np.PyArray_DATA(_x)) - memcpy(config[0].obs + config[0].num_time_slices * config[0].word, x, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(config.obs + config.num_time_slices * config.word, x, config.num_time_slices * sizeof(REAL_t)) - compute_post_mean(config[0].mean, config[0].fwd_mean, config[0].fwd_variance, config[0].obs, - config[0].word, config[0].num_time_slices, config[0].obs_variance, config[0].chain_variance) + compute_post_mean( + config.mean, config.fwd_mean, config.fwd_variance, config.obs, + config.word, config.num_time_slices, config.obs_variance, config.chain_variance + ) - compute_obs_deriv(config[0].deriv, config[0].mean, config[0].mean_deriv_mtx, config[0].variance, - config[0].zeta, config[0].totals, config[0].word_counts, - config[0].chain_variance, config[0].word, config[0].num_time_slices) + compute_obs_deriv( + config.deriv, config.mean, config.mean_deriv_mtx, config.variance, + config.zeta, config.totals, config.word_counts, + config.chain_variance, config.word, config.num_time_slices + ) - for i in range(config[0].num_time_slices): - config[0].deriv[i] = -config[0].deriv[i] + for i in range(config.num_time_slices): + config.deriv[i] = -config.deriv[i] cdef REAL_t *temp_ptr = (np.PyArray_DATA(work_array)) - memcpy(temp_ptr, config[0].deriv, config[0].num_time_slices * sizeof(REAL_t)) + memcpy(temp_ptr, config.deriv, config.num_time_slices * sizeof(REAL_t)) return work_array @@ -754,22 +776,25 @@ def fit_sslm(model, np_sstats): """ # Initialize C structures based on Python instance of the model - cdef StateSpaceLanguageModelConfig * config = ((model.config_c_address)) + cdef StateSpaceLanguageModelConfig * config = \ + ((model.config_c_address)) init_sslm_config(config, model) - cdef int W = config[0].vocab_len cdef REAL_t old_bound = 0.0 cdef REAL_t sslm_fit_threshold = 0.000001 - cdef int sslm_max_iter = 2 cdef REAL_t converged = sslm_fit_threshold + 1 + cdef int sslm_max_iter = 2 + cdef int vocab_len = config.vocab_len cdef int w - for w in range(W): - config[0].word = w - compute_post_variance(config[0].variance, config[0].fwd_variance, config[0].obs_variance, - config[0].chain_variance, w, config[0].num_time_slices) + for w in range(vocab_len): + + compute_post_variance( + config.variance, config.fwd_variance, config.obs_variance, + config.chain_variance, w, config.num_time_slices + ) cdef REAL_t *sstats = (np.PyArray_DATA(np_sstats)) @@ -790,8 +815,10 @@ def fit_sslm(model, np_sstats): converged = abs((bound - old_bound) / old_bound) - compute_expected_log_prob(config[0].e_log_prob, config[0].zeta, config[0].mean, - W, config[0].num_time_slices) + compute_expected_log_prob( + config.e_log_prob, config.zeta, config.mean, + vocab_len, config.num_time_slices + ) # TODO find a way/place where to free a memory # 
free(config) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 0f222c9c6c..9ecc60fa43 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -55,23 +55,22 @@ import logging import numpy as np -from scipy.special import digamma, gammaln -from scipy import optimize - from gensim import utils, matutils from gensim.models import ldamodel - +from .ldaseq_sslm_inner import fit_sslm, sslm_counts_init +from .ldaseq_posterior_inner import fit_lda_post logger = logging.getLogger(__name__) class LdaSeqModel(utils.SaveLoad): """Estimate Dynamic Topic Model parameters based on a training corpus.""" + def __init__( self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100, - ): + ): """ Parameters @@ -211,7 +210,7 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s self.alphas = alpha for k, chain in enumerate(self.topic_chains): sstats = init_suffstats[:, k] - sslm.sslm_counts_init(chain, topic_obs_variance, topic_chain_variance, sstats) + chain.sslm_counts_init(topic_obs_variance, topic_chain_variance, sstats) # initialize the below matrices only if running DIM # ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) @@ -297,7 +296,6 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, convergence = np.fabs((bound - old_bound) / old_bound) if convergence < LDASQE_EM_THRESHOLD: - lda_inference_max_iter = MAX_ITER logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter) convergence = 1.0 @@ -426,16 +424,16 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. 
if iter_ == 0: - doc_lhood = LdaPost.fit_lda_post( - ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter + doc_lhood = ldapost.fit_lda_post( + doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter ) else: - doc_lhood = LdaPost.fit_lda_post( - ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter + doc_lhood = ldapost.fit_lda_post( + doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter ) if topic_suffstats is not None: - topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats) + topic_suffstats = ldapost.update_lda_seq_ss(time, doc, topic_suffstats) gammas[doc_index] = ldapost.gamma bound += doc_lhood @@ -485,7 +483,7 @@ def fit_lda_seq_topics(self, topic_suffstats): for k, chain in enumerate(self.topic_chains): logger.info("Fitting topic number %i", k) - lhood_term = sslm.fit_sslm(chain, topic_suffstats[k]) + lhood_term = chain.fit_sslm(topic_suffstats[k]) lhood += lhood_term return lhood @@ -679,7 +677,7 @@ def __getitem__(self, doc): time_lhoods = [] for time in range(self.num_time_slices): lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice - lhood = LdaPost.fit_lda_post(ldapost, 0, time, self) + lhood = ldapost.fit_lda_post(0, time, self) time_lhoods.append(lhood) doc_topic = ldapost.gamma / ldapost.gamma.sum() @@ -709,13 +707,13 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.num_topics = num_topics # setting up matrices - self.obs = np.zeros((vocab_len, num_time_slices)) - self.e_log_prob = np.zeros((vocab_len, num_time_slices)) - self.mean = np.zeros((vocab_len, num_time_slices + 1)) - self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1)) - self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1)) - self.variance = np.zeros((vocab_len, num_time_slices + 1)) - self.zeta = np.zeros(num_time_slices) + self.obs = np.zeros((vocab_len, num_time_slices), dtype=np.float64) + self.e_log_prob = np.zeros((vocab_len, num_time_slices), dtype=np.float64) + self.mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) + self.zeta = np.zeros(num_time_slices, dtype=np.float64) # the following are class variables which are to be integrated during Document Influence Model self.m_update_coeff = None @@ -726,157 +724,7 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.w_phi_sum = None self.w_phi_l_sq = None self.m_update_coeff_g = None - - def update_zeta(self): - """Update the Zeta variational parameter. - - Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), - over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. - - Returns - ------- - list of float - The updated zeta values for each time slice. - - """ - for j, val in enumerate(self.zeta): - self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) - return self.zeta - - def compute_post_variance(self, word, chain_variance): - r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) - `_. 
- - This function accepts the word to compute variance for, along with the associated sslm class object, - and returns the `variance` and the posterior approximation `fwd_variance`. - - Notes - ----- - This function essentially computes Var[\beta_{t,w}] for t = 1:T - - .. :math:: - - fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = - (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * - (fwd\_variance[t - 1] + obs\_variance) - - .. :math:: - - variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = - fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * - (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) - - Parameters - ---------- - word: int - The word's ID. - chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - - Returns - ------- - (numpy.ndarray, numpy.ndarray) - The first returned value is the variance of each word in each time slice, the second value is the - inferred posterior variance for the same pairs. - - """ - INIT_VARIANCE_CONST = 1000 - - T = self.num_time_slices - variance = self.variance[word] - fwd_variance = self.fwd_variance[word] - # forward pass. Set initial variance very high - fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST - for t in range(1, T + 1): - if self.obs_variance: - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) - else: - c = 0 - fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) - - # backward pass - variance[T] = fwd_variance[T] - for t in range(T - 1, -1, -1): - if fwd_variance[t] > 0.0: - c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) - else: - c = 0 - variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) - - return variance, fwd_variance - - def compute_post_mean(self, word, chain_variance): - """Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) - `_. - - Notes - ----- - This function essentially computes E[\beta_{t,w}] for t = 1:T. - - .. :math:: - - Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + - (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta - - .. :math:: - - Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) - = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + - (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] - - Parameters - ---------- - word: int - The word's ID. - chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - - Returns - ------- - (numpy.ndarray, numpy.ndarray) - The first returned value is the mean of each word in each time slice, the second value is the - inferred posterior mean for the same pairs. 
- - """ - T = self.num_time_slices - obs = self.obs[word] - fwd_variance = self.fwd_variance[word] - mean = self.mean[word] - fwd_mean = self.fwd_mean[word] - - # forward - fwd_mean[0] = 0 - for t in range(1, T + 1): - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) - fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] - - # backward pass - mean[T] = fwd_mean[T] - for t in range(T - 1, -1, -1): - if chain_variance == 0.0: - c = 0.0 - else: - c = chain_variance / (fwd_variance[t] + chain_variance) - mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] - return mean, fwd_mean - - def compute_expected_log_prob(self): - """Compute the expected log probability given values of m. - - The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; - The below implementation is the result of solving the equation and is implemented as in the original - Blei DTM code. - - Returns - ------- - numpy.ndarray of float - The expected value for the log probabilities for each word and time slice. - - """ - for (w, t), val in np.ndenumerate(self.e_log_prob): - self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t]) - return self.e_log_prob + self.config_c_address = 0 def sslm_counts_init(self, obs_variance, chain_variance, sstats): """Initialize the State Space Language Model with LDA sufficient statistics. @@ -895,28 +743,8 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): expected shape (`self.vocab_len`, `num_topics`). """ - W = self.vocab_len - T = self.num_time_slices - - log_norm_counts = np.copy(sstats) - log_norm_counts /= sum(log_norm_counts) - log_norm_counts += 1.0 / W - log_norm_counts /= sum(log_norm_counts) - log_norm_counts = np.log(log_norm_counts) - - # setting variational observations to transformed counts - self.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T) - # set variational parameters - self.obs_variance = obs_variance - self.chain_variance = chain_variance - - # compute post variance, mean - for w in range(W): - self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) - self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) - self.zeta = self.update_zeta() - self.e_log_prob = self.compute_expected_log_prob() + sslm_counts_init(self, obs_variance, chain_variance, sstats) def fit_sslm(self, sstats): """Fits variational distribution. @@ -938,308 +766,8 @@ def fit_sslm(self, sstats): The lower bound for the true posterior achieved using the fitted approximate distribution. 
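For reference while reviewing, the methods deleted here (`compute_post_variance`, `compute_post_mean`, `update_zeta`, `compute_expected_log_prob`) together form the variational Kalman smoother that the Cython `sslm_counts_init` and `fit_sslm` entry points now run internally. A standalone NumPy sketch of the same recursions, on an illustrative single-word toy trajectory (the names and numbers below are not part of the patch):

    import numpy as np

    INIT_VARIANCE_CONST = 1000

    def smooth_word(obs, obs_variance, chain_variance):
        # Forward-backward pass for one word, mirroring the removed Python methods.
        T = len(obs)
        fwd_variance = np.zeros(T + 1)
        variance = np.zeros(T + 1)
        fwd_mean = np.zeros(T + 1)
        mean = np.zeros(T + 1)

        # forward variance; the initial variance is set very high
        fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST
        for t in range(1, T + 1):
            c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance) if obs_variance else 0.0
            fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance)

        # backward (smoothed) variance
        variance[T] = fwd_variance[T]
        for t in range(T - 1, -1, -1):
            c = (fwd_variance[t] / (fwd_variance[t] + chain_variance)) ** 2 if fwd_variance[t] > 0.0 else 0.0
            variance[t] = c * (variance[t + 1] - chain_variance) + (1 - c) * fwd_variance[t]

        # forward mean, driven by the variational observations
        for t in range(1, T + 1):
            c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance)
            fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1]

        # backward (smoothed) mean
        mean[T] = fwd_mean[T]
        for t in range(T - 1, -1, -1):
            c = 0.0 if chain_variance == 0.0 else chain_variance / (fwd_variance[t] + chain_variance)
            mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1]

        return mean, variance

    # one word observed over three time slices
    mean, variance = smooth_word(np.array([-3.1, -2.9, -3.0]), obs_variance=0.5, chain_variance=0.005)

    # zeta[t] = sum over words of exp(mean[t + 1] + variance[t + 1] / 2);
    # with a single word the vocabulary sum collapses to that word's contribution
    zeta = np.exp(mean[1:] + variance[1:] / 2)

    # expected log probability: E[log p(w | t)] = mean[t + 1] - log(zeta[t])
    e_log_prob = mean[1:] - np.log(zeta)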
""" - W = self.vocab_len - bound = 0 - old_bound = 0 - sslm_fit_threshold = 1e-6 - sslm_max_iter = 2 - converged = sslm_fit_threshold + 1 - - # computing variance, fwd_variance - self.variance, self.fwd_variance = \ - (np.array(x) for x in zip(*(self.compute_post_variance(w, self.chain_variance) for w in range(W)))) - - # column sum of sstats - totals = sstats.sum(axis=0) - iter_ = 0 - - model = "DTM" - if model == "DTM": - bound = self.compute_bound(sstats, totals) - if model == "DIM": - bound = self.compute_bound_fixed(sstats, totals) - - logger.info("initial sslm bound is %f", bound) - - while converged > sslm_fit_threshold and iter_ < sslm_max_iter: - iter_ += 1 - old_bound = bound - self.obs, self.zeta = self.update_obs(sstats, totals) - - if model == "DTM": - bound = self.compute_bound(sstats, totals) - if model == "DIM": - bound = self.compute_bound_fixed(sstats, totals) - - converged = np.fabs((bound - old_bound) / old_bound) - logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, converged) - - self.e_log_prob = self.compute_expected_log_prob() - return bound - - def compute_bound(self, sstats, totals): - """Compute the maximized lower bound achieved for the log probability of the true posterior. - - Uses the formula presented in the appendix of the DTM paper (formula no. 5). - - Parameters - ---------- - sstats : numpy.ndarray - Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the first - time slice, expected shape (`self.vocab_len`, `num_topics`). - totals : list of int of length `len(self.time_slice)` - The totals for each time slice. - - Returns - ------- - float - The maximized lower bound. - - """ - w = self.vocab_len - t = self.num_time_slices - - term_1 = 0 - term_2 = 0 - term_3 = 0 - - val = 0 - ent = 0 - - chain_variance = self.chain_variance - # computing mean, fwd_mean - self.mean, self.fwd_mean = \ - (np.array(x) for x in zip(*(self.compute_post_mean(w, self.chain_variance) for w in range(w)))) - self.zeta = self.update_zeta() - - val = sum(self.variance[w][0] - self.variance[w][t] for w in range(w)) / 2 * chain_variance - - logger.info("Computing bound, all times") - - for t in range(1, t + 1): - term_1 = 0.0 - term_2 = 0.0 - ent = 0.0 - for w in range(w): - - m = self.mean[w][t] - prev_m = self.mean[w][t - 1] - - v = self.variance[w][t] - - # w_phi_l is only used in Document Influence Model; the values are always zero in this case - # w_phi_l = sslm.w_phi_l[w][t - 1] - # exp_i = np.exp(-prev_m) - # term_1 += (np.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - - # (v / chain_variance) - np.log(chain_variance) - - term_1 += \ - (np.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance) - term_2 += sstats[w][t - 1] * m - ent += np.log(v) / 2 # note the 2pi's cancel with term1 (see doc) - - term_3 = -totals[t - 1] * np.log(self.zeta[t - 1]) - val += term_2 + term_3 + ent - term_1 - - return val - def update_obs(self, sstats, totals): - """Optimize the bound with respect to the observed variables. - - TODO: - This is by far the slowest function in the whole algorithm. - Replacing or improving the performance of this would greatly speed things up. - - Parameters - ---------- - sstats : numpy.ndarray - Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the first - time slice, expected shape (`self.vocab_len`, `num_topics`). 
- totals : list of int of length `len(self.time_slice)` - The totals for each time slice. - - Returns - ------- - (numpy.ndarray of float, numpy.ndarray of float) - The updated optimized values for obs and the zeta variational parameter. - - """ - - OBS_NORM_CUTOFF = 2 - STEP_SIZE = 0.01 - TOL = 1e-3 - - W = self.vocab_len - T = self.num_time_slices - - runs = 0 - mean_deriv_mtx = np.zeros((T, T + 1)) - - norm_cutoff_obs = None - for w in range(W): - w_counts = sstats[w] - counts_norm = 0 - # now we find L2 norm of w_counts - for i in range(len(w_counts)): - counts_norm += w_counts[i] * w_counts[i] - - counts_norm = np.sqrt(counts_norm) - - if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: - obs = self.obs[w] - norm_cutoff_obs = np.copy(obs) - else: - if counts_norm < OBS_NORM_CUTOFF: - w_counts = np.zeros(len(w_counts)) - - # TODO: apply lambda function - for t in range(T): - mean_deriv_mtx[t] = self.compute_mean_deriv(w, t, mean_deriv_mtx[t]) - - deriv = np.zeros(T) - args = self, w_counts, totals, mean_deriv_mtx, w, deriv - obs = self.obs[w] - model = "DTM" - - if model == "DTM": - # slowest part of method - obs = optimize.fmin_cg( - f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0 - ) - if model == "DIM": - pass - runs += 1 - - if counts_norm < OBS_NORM_CUTOFF: - norm_cutoff_obs = obs - - self.obs[w] = obs - - self.zeta = self.update_zeta() - - return self.obs, self.zeta - - def compute_mean_deriv(self, word, time, deriv): - """Helper functions for optimizing a function. - - Compute the derivative of: - - .. :math:: - - E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. - - Parameters - ---------- - word : int - The word's ID. - time : int - The time slice. - deriv : list of float - Derivative for each time slice. - - Returns - ------- - list of float - Mean derivative for each time slice. - - """ - - T = self.num_time_slices - fwd_variance = self.variance[word] - - deriv[0] = 0 - - # forward pass - for t in range(1, T + 1): - if self.obs_variance > 0.0: - w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance) - else: - w = 0.0 - val = w * deriv[t - 1] - if time == t - 1: - val += (1 - w) - deriv[t] = val - - for t in range(T - 1, -1, -1): - if self.chain_variance == 0.0: - w = 0.0 - else: - w = self.chain_variance / (fwd_variance[t] + self.chain_variance) - deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] - - return deriv - - def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): - """Derivation of obs which is used in derivative function `df_obs` while optimizing. - - Parameters - ---------- - word : int - The word's ID. - word_counts : list of int - Total word counts for each time slice. - totals : list of int of length `len(self.time_slice)` - The totals for each time slice. - mean_deriv_mtx : list of float - Mean derivative for each time slice. - deriv : list of float - Mean derivative for each time slice. - - Returns - ------- - list of float - Mean derivative for each time slice. 
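The conjugate-gradient call buried in the removed `update_obs` (flagged above as by far the slowest function in the whole algorithm, and the main motivation for this Cython port) is a plain `scipy.optimize.fmin_cg` invocation over one word's `obs` row. A self-contained sketch of that call pattern, with a dummy quadratic standing in for the real `f_obs` / `df_obs` objective and gradient that are deleted further down:

    import numpy as np
    from scipy import optimize

    def f_obs_stub(x, *args):
        # placeholder objective; the real f_obs evaluates the negated bound terms for one word
        return 0.5 * float(np.dot(x, x))

    def df_obs_stub(x, *args):
        # placeholder gradient matching f_obs_stub
        return np.asarray(x)

    obs = np.array([-3.1, -2.9, -3.0])   # current variational observations for one word
    obs = optimize.fmin_cg(
        f=f_obs_stub, fprime=df_obs_stub, x0=obs, args=(),
        gtol=1e-3, epsilon=0.01, disp=0,   # TOL, STEP_SIZE and disp=0, as in the removed code
    )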
- - """ - - # flag - init_mult = 1000 - - T = self.num_time_slices - - mean = self.mean[word] - variance = self.variance[word] - - # only used for DIM mode - # w_phi_l = self.w_phi_l[word] - # m_update_coeff = self.m_update_coeff[word] - - # temp_vector holds temporary zeta values - self.temp_vect = np.zeros(T) - - for u in range(T): - self.temp_vect[u] = np.exp(mean[u + 1] + variance[u + 1] / 2) - - for t in range(T): - mean_deriv = mean_deriv_mtx[t] - term1 = 0 - term2 = 0 - term3 = 0 - term4 = 0 - - for u in range(1, T + 1): - mean_u = mean[u] - mean_u_prev = mean[u - 1] - dmean_u = mean_deriv[u] - dmean_u_prev = mean_deriv[u - 1] - - term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) - term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u - - model = "DTM" - if model == "DIM": - # do some stuff - pass - - if self.chain_variance: - term1 = - (term1 / self.chain_variance) - term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * self.chain_variance) - else: - term1 = 0.0 - - deriv[t] = term1 + term2 + term3 + term4 - - return deriv + return fit_sslm(self, sstats) class LdaPost(utils.SaveLoad): @@ -1288,137 +816,8 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.doc_weight = None self.renormalized_doc_weight = None - def update_phi(self, doc_number, time): - """Update variational multinomial parameters, based on a document and a time-slice. - - This is done based on the original Blei-LDA paper, where: - log_phi := beta * exp(Ψ(gamma)), over every topic for every word. - - TODO: incorporate lee-sueng trick used in - **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. - - Parameters - ---------- - doc_number : int - Document number. Unused. - time : int - Time slice. Unused. - - Returns - ------- - (list of float, list of float) - Multinomial parameters, and their logarithm, for each word in the document. - - """ - num_topics = self.lda.num_topics - # digamma values - dig = np.zeros(num_topics) - - for k in range(num_topics): - dig[k] = digamma(self.gamma[k]) - - n = 0 # keep track of iterations for phi, log_phi - for word_id, count in self.doc: - for k in range(num_topics): - self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k] - - log_phi_row = self.log_phi[n] - phi_row = self.phi[n] - - # log normalize - v = log_phi_row[0] - for i in range(1, len(log_phi_row)): - v = np.logaddexp(v, log_phi_row[i]) - - # subtract every element by v - log_phi_row = log_phi_row - v - phi_row = np.exp(log_phi_row) - self.log_phi[n] = log_phi_row - self.phi[n] = phi_row - n += 1 # increase iteration - - return self.phi, self.log_phi - - def update_gamma(self): - """Update variational dirichlet parameters. - - This operations is described in the original Blei LDA paper: - gamma = alpha + sum(phi), over every topic for every word. - - Returns - ------- - list of float - The updated gamma parameters for each word in the document. - - """ - self.gamma = np.copy(self.lda.alpha) - n = 0 # keep track of number of iterations for phi, log_phi - for word_id, count in self.doc: - phi_row = self.phi[n] - for k in range(self.lda.num_topics): - self.gamma[k] += phi_row[k] * count - n += 1 - - return self.gamma - - def init_lda_post(self): - """Initialize variational posterior. 
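The `update_phi` and `update_gamma` methods removed above are the standard LDA mean-field updates (log_phi := Ψ(gamma) + log beta, log-normalised per word; gamma := alpha + sum of count * phi); this is what the Cython `fit_lda_post` now iterates in C. A vectorised NumPy sketch of the two updates, with toy inputs that are illustrative only and not part of the patch:

    import numpy as np
    from scipy.special import logsumexp, psi

    def update_phi(gamma, lda_topics, word_ids):
        # lda_topics holds log topic-word values (lda.topics in the model),
        # shape (vocab_len, num_topics); word_ids are the ids present in the document.
        log_phi = psi(gamma) + lda_topics[word_ids]            # (doc_len, num_topics)
        log_phi -= logsumexp(log_phi, axis=1, keepdims=True)   # log-normalise each row
        return np.exp(log_phi), log_phi

    def update_gamma(alpha, phi, word_counts):
        # gamma = alpha + sum over words of count * phi
        return alpha + (word_counts[:, None] * phi).sum(axis=0)

    # toy document: three distinct words with counts 2, 1, 4 in a 2-topic model
    lda_topics = np.log(np.full((10, 2), 0.1))
    word_ids = np.array([0, 3, 7])
    counts = np.array([2.0, 1.0, 4.0])
    alpha = np.full(2, 0.01)
    gamma = np.full(2, alpha[0] + counts.sum() / 2)            # init_lda_post-style initialisation
    phi, log_phi = update_phi(gamma, lda_topics, word_ids)
    gamma = update_gamma(alpha, phi, counts)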
""" - total = sum(count for word_id, count in self.doc) - self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) - self.phi[:len(self.doc), :] = 1.0 / self.lda.num_topics - # doc_weight used during DIM - # ldapost.doc_weight = None - - def compute_lda_lhood(self): - """Compute the log likelihood bound. - - Returns - ------- - float - The optimal lower bound for the true posterior using the approximate distribution. - - """ - num_topics = self.lda.num_topics - gamma_sum = np.sum(self.gamma) - - # to be used in DIM - # sigma_l = 0 - # sigma_d = 0 - - lhood = gammaln(np.sum(self.lda.alpha)) - gammaln(gamma_sum) - self.lhood[num_topics] = lhood - - # influence_term = 0 - digsum = digamma(gamma_sum) - - model = "DTM" # noqa:F841 - for k in range(num_topics): - # below code only to be used in DIM mode - # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): - # influence_topic = ldapost.doc_weight[k] - # influence_term = \ - # - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d)) - - e_log_theta_k = digamma(self.gamma[k]) - digsum - lhood_term = \ - (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + \ - gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k]) - # TODO: check why there's an IF - n = 0 - for word_id, count in self.doc: - if self.phi[n][k] > 0: - lhood_term += \ - count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) - n += 1 - self.lhood[k] = lhood_term - lhood += lhood_term - # in case of DIM add influence term - # lhood += influence_term - - return lhood - def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, - lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): + lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): """Posterior inference for lda. Parameters @@ -1448,51 +847,8 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, The optimal lower bound for the true posterior using the approximate distribution. 
""" - self.init_lda_post() - # sum of counts in a doc - total = sum(count for word_id, count in self.doc) - - model = "DTM" - if model == "DIM": - # if in DIM then we initialise some variables here - pass - - lhood = self.compute_lda_lhood() - lhood_old = 0 - converged = 0 - iter_ = 0 - - # first iteration starts here - iter_ += 1 - lhood_old = lhood - self.gamma = self.update_gamma() - - model = "DTM" - - if model == "DTM" or sslm is None: - self.phi, self.log_phi = self.update_phi(doc_number, time) - elif model == "DIM" and sslm is not None: - self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) - - lhood = self.compute_lda_lhood() - converged = np.fabs((lhood_old - lhood) / (lhood_old * total)) - - while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter: - - iter_ += 1 - lhood_old = lhood - self.gamma = self.update_gamma() - model = "DTM" - - if model == "DTM" or sslm is None: - self.phi, self.log_phi = self.update_phi(doc_number, time) - elif model == "DIM" and sslm is not None: - self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) - - lhood = self.compute_lda_lhood() - converged = np.fabs((lhood_old - lhood) / (lhood_old * total)) - - return lhood + return np.array(fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED, + lda_inference_max_iter, g, g3_matrix, g4_matrix, g5_matrix)) def update_lda_seq_ss(self, time, doc, topic_suffstats): """Update lda sequence sufficient statistics from an lda posterior. @@ -1527,119 +883,3 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats): topic_suffstats[k] = topic_ss return topic_suffstats - - -# the following functions are used in update_obs as the objective function. -def f_obs(x, *args): - """Function which we are optimising for minimizing obs. - - Parameters - ---------- - x : list of float - The obs values for this word. - sslm : :class:`~gensim.models.ldaseqmodel.sslm` - The State Space Language Model for DTM. - word_counts : list of int - Total word counts for each time slice. - totals : list of int of length `len(self.time_slice)` - The totals for each time slice. - mean_deriv_mtx : list of float - Mean derivative for each time slice. - word : int - The word's ID. - deriv : list of float - Mean derivative for each time slice. - - Returns - ------- - list of float - The value of the objective function evaluated at point `x`. 
- - """ - sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args - # flag - init_mult = 1000 - - T = len(x) - val = 0 - term1 = 0 - term2 = 0 - - # term 3 and 4 for DIM - term3 = 0 - term4 = 0 - - sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) - - mean = sslm.mean[word] - variance = sslm.variance[word] - - # only used for DIM mode - # w_phi_l = sslm.w_phi_l[word] - # m_update_coeff = sslm.m_update_coeff[word] - - for t in range(1, T + 1): - mean_t = mean[t] - mean_t_prev = mean[t - 1] - - val = mean_t - mean_t_prev - term1 += val * val - term2 += word_counts[t - 1] * mean_t - totals[t - 1] * np.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1] - - model = "DTM" - if model == "DIM": - # stuff happens - pass - - if sslm.chain_variance > 0.0: - - term1 = - (term1 / (2 * sslm.chain_variance)) - term1 = term1 - mean[0] * mean[0] / (2 * init_mult * sslm.chain_variance) - else: - term1 = 0.0 - - final = -(term1 + term2 + term3 + term4) - - return final - - -def df_obs(x, *args): - """Derivative of the objective function which optimises obs. - - Parameters - ---------- - x : list of float - The obs values for this word. - sslm : :class:`~gensim.models.ldaseqmodel.sslm` - The State Space Language Model for DTM. - word_counts : list of int - Total word counts for each time slice. - totals : list of int of length `len(self.time_slice)` - The totals for each time slice. - mean_deriv_mtx : list of float - Mean derivative for each time slice. - word : int - The word's ID. - deriv : list of float - Mean derivative for each time slice. - - Returns - ------- - list of float - The derivative of the objective function evaluated at point `x`. - - """ - sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args - - sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) - - model = "DTM" - if model == "DTM": - deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv) - elif model == "DIM": - deriv = sslm.compute_obs_deriv_fixed( - p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) # noqa:F821 - - return np.negative(deriv) diff --git a/gensim/models/ldaseqmodel_optimized.py b/gensim/models/ldaseqmodel_optimized.py deleted file mode 100644 index 97eae4666b..0000000000 --- a/gensim/models/ldaseqmodel_optimized.py +++ /dev/null @@ -1,838 +0,0 @@ -import logging - -import numpy as np -from gensim import utils, matutils -from gensim.models import ldamodel -from .ldaseq_sslm_inner import fit_sslm, sslm_counts_init -from .ldaseq_posterior_inner import fit_lda_post - -logger = logging.getLogger(__name__) - - -class LdaSeqModel_V4(utils.SaveLoad): - """Estimate Dynamic Topic Model parameters based on a training corpus.""" - def __init__( - self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, - random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100, - ): - """ - - Parameters - ---------- - corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). - If not given, the model is left untrained (presumably because you want to call - :meth:`~gensim.models.ldamodel.LdaSeqModel.update` manually). - time_slice : list of int, optional - Number of documents in each time-slice. 
Each time slice could for example represent a year's published - papers, in case the corpus comes from a journal publishing over multiple years. - It is assumed that `sum(time_slice) == num_documents`. - id2word : dict of (int, str), optional - Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for - debugging and topic printing. - alphas : float, optional - The prior probability for the model. - num_topics : int, optional - The number of requested latent topics to be extracted from the training corpus. - initialize : {'gensim', 'own', 'ldamodel'}, optional - Controls the initialization of the DTM model. Supports three different modes: - * 'gensim': Uses gensim's LDA initialization. - * 'own': Uses your own initialization matrix of an LDA model that has been previously trained. - * 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument. - sstats : numpy.ndarray , optional - Sufficient statistics used for initializing the model if `initialize == 'own'`. Corresponds to matrix - beta in the linked paper for time slice 0, expected shape (`self.vocab_len`, `num_topics`). - lda_model : :class:`~gensim.models.ldamodel.LdaModel` - Model whose sufficient statistics will be used to initialize the current object if `initialize == 'gensim'`. - obs_variance : float, optional - Observed variance used to approximate the true and forward variance as shown in - `David M. Blei, John D. Lafferty: "Dynamic Topic Models" - `_. - chain_variance : float, optional - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - passes : int, optional - Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel` - random_state : {numpy.random.RandomState, int}, optional - Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results. - lda_inference_max_iter : int, optional - Maximum number of iterations in the inference step of the LDA training. - em_min_iter : int, optional - Minimum number of iterations until converge of the Expectation-Maximization algorithm - em_max_iter : int, optional - Maximum number of iterations until converge of the Expectation-Maximization algorithm. - chunksize : int, optional - Number of documents in the corpus do be processed in in a chunk. - - """ - self.id2word = id2word - if corpus is None and self.id2word is None: - raise ValueError( - 'at least one of corpus/id2word must be specified, to establish input space dimensionality' - ) - - if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") - self.id2word = utils.dict_from_corpus(corpus) - self.vocab_len = len(self.id2word) - elif self.id2word: - self.vocab_len = len(self.id2word) - else: - self.vocab_len = 0 - - if corpus is not None: - try: - self.corpus_len = len(corpus) - except TypeError: - logger.warning("input corpus stream has no len(); counting documents") - self.corpus_len = sum(1 for _ in corpus) - - self.time_slice = time_slice - if self.time_slice is not None: - self.num_time_slices = len(time_slice) - - self.num_topics = num_topics - self.num_time_slices = len(time_slice) - self.alphas = np.full(num_topics, alphas) - - # topic_chains contains for each topic a 'state space language model' object - # which in turn has information about each topic - # the sslm class is described below and contains information - # on topic-word probabilities and doc-topic probabilities. 
- self.topic_chains = [] - for topic in range(num_topics): - sslm_ = sslm( - num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, - chain_variance=chain_variance, obs_variance=obs_variance - ) - self.topic_chains.append(sslm_) - - # the following are class variables which are to be integrated during Document Influence Model - self.top_doc_phis = None - self.influence = None - self.renormalized_influence = None - self.influence_sum_lgl = None - - # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. - if corpus is not None and time_slice is not None: - self.max_doc_len = max(len(line) for line in corpus) - - if initialize == 'gensim': - lda_model = ldamodel.LdaModel( - corpus, id2word=self.id2word, num_topics=self.num_topics, - passes=passes, alpha=self.alphas, random_state=random_state, - dtype=np.float64 - ) - self.sstats = np.transpose(lda_model.state.sstats) - if initialize == 'ldamodel': - self.sstats = np.transpose(lda_model.state.sstats) - if initialize == 'own': - self.sstats = sstats - - # initialize model from sstats - self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats) - - # fit DTM - self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) - - def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): - """Initialize State Space Language Model, topic-wise. - - Parameters - ---------- - topic_chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve. - topic_obs_variance : float - Observed variance used to approximate the true and forward variance as shown in - `David M. Blei, John D. Lafferty: "Dynamic Topic Models" - `_. - alpha : float - The prior probability for the model. - init_suffstats : numpy.ndarray - Sufficient statistics used for initializing the model, expected shape (`self.vocab_len`, `num_topics`). - - """ - self.alphas = alpha - for k, chain in enumerate(self.topic_chains): - sstats = init_suffstats[:, k] - chain.sslm_counts_init(topic_obs_variance, topic_chain_variance, sstats) - - # initialize the below matrices only if running DIM - # ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - # ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - # ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - - def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): - """Fit a LDA Sequence model (DTM). - - This method will iteratively setup LDA models and perform EM steps until the sufficient statistics convergence, - or until the maximum number of iterations is reached. Because the true posterior is intractable, an - appropriately tight lower bound must be used instead. This function will optimize this bound, by minimizing - its true Kullback-Liebler Divergence with the true posterior. - - Parameters - ---------- - corpus : {iterable of list of (int, float), scipy.sparse.csc} - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). - lda_inference_max_iter : int - Maximum number of iterations for the inference step of LDA. - em_min_iter : int - Minimum number of time slices to be inspected. - em_max_iter : int - Maximum number of time slices to be inspected. - chunksize : int - Number of documents to be processed in each chunk. 
- - Returns - ------- - float - The highest lower bound for the true posterior produced after all iterations. - - """ - LDASQE_EM_THRESHOLD = 1e-4 - # if bound is low, then we increase iterations. - LOWER_ITER = 10 - ITER_MULT_LOW = 2 - MAX_ITER = 500 - - num_topics = self.num_topics - vocab_len = self.vocab_len - data_len = self.num_time_slices - corpus_len = self.corpus_len - - bound = 0 - convergence = LDASQE_EM_THRESHOLD + 1 - iter_ = 0 - - while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter): - - logger.info(" EM iter %i", iter_) - logger.info("E Step") - # TODO: bound is initialized to 0 - old_bound = bound - - # initiate sufficient statistics - topic_suffstats = [] - for topic in range(num_topics): - topic_suffstats.append(np.zeros((vocab_len, data_len))) - - # set up variables - gammas = np.zeros((corpus_len, num_topics)) - lhoods = np.zeros((corpus_len, num_topics + 1)) - # compute the likelihood of a sequential corpus under an LDA - # seq model and find the evidence lower bound. This is the E - Step - bound, gammas = \ - self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) - self.gammas = gammas - - logger.info("M Step") - - # fit the variational distribution. This is the M - Step - topic_bound = self.fit_lda_seq_topics(topic_suffstats) - bound += topic_bound - - if (bound - old_bound) < 0: - # if max_iter is too low, increase iterations. - if lda_inference_max_iter < LOWER_ITER: - lda_inference_max_iter *= ITER_MULT_LOW - logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter) - - # check for convergence - convergence = np.fabs((bound - old_bound) / old_bound) - - if convergence < LDASQE_EM_THRESHOLD: - - lda_inference_max_iter = MAX_ITER - logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter) - convergence = 1.0 - - logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, convergence) - - iter_ += 1 - - return bound - - def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, - iter_, lda_inference_max_iter, chunksize): - """Inference (or E-step) for the lower bound EM optimization. - - This is used to set up the gensim :class:`~gensim.models.ldamodel.LdaModel` to be used for each time-slice. - It also allows for Document Influence Model code to be written in. - - Parameters - ---------- - corpus : {iterable of list of (int, float), scipy.sparse.csc} - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). - topic_suffstats : numpy.ndarray - Sufficient statistics for time slice 0, used for initializing the model if `initialize == 'own'`, - expected shape (`self.vocab_len`, `num_topics`). - gammas : numpy.ndarray - Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. - lhoods : list of float - The total log probability lower bound for each topic. Corresponds to the phi variational parameters in the - linked paper. - iter_ : int - Current iteration. - lda_inference_max_iter : int - Maximum number of iterations for the inference step of LDA. - chunksize : int - Number of documents to be processed in each chunk. - - Returns - ------- - (float, list of float) - The first value is the highest lower bound for the true posterior. - The second value is the list of optimized dirichlet variational parameters for the approximation of - the posterior. 
- - """ - num_topics = self.num_topics - vocab_len = self.vocab_len - bound = 0.0 - - lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) - lda.topics = np.zeros((vocab_len, num_topics)) - ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda) - - model = "DTM" - if model == "DTM": - bound, gammas = self.inferDTMseq( - corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize - ) - elif model == "DIM": - self.InfluenceTotalFixed(corpus) - bound, gammas = self.inferDIMseq( - corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize - ) - - return bound, gammas - - def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize): - """Compute the likelihood of a sequential corpus under an LDA seq model, and reports the likelihood bound. - - Parameters - ---------- - corpus : {iterable of list of (int, float), scipy.sparse.csc} - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). - topic_suffstats : numpy.ndarray - Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`). - gammas : numpy.ndarray - Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. - lhoods : list of float of length `self.num_topics` - The total log probability bound for each topic. Corresponds to phi from the linked paper. - lda : :class:`~gensim.models.ldamodel.LdaModel` - The trained LDA model of the previous iteration. - ldapost : :class:`~gensim.models.ldaseqmodel.LdaPost` - Posterior probability variables for the given LDA model. This will be used as the true (but intractable) - posterior. - iter_ : int - The current iteration. - bound : float - The LDA bound produced after all iterations. - lda_inference_max_iter : int - Maximum number of iterations for the inference step of LDA. - chunksize : int - Number of documents to be processed in each chunk. - - Returns - ------- - (float, list of float) - The first value is the highest lower bound for the true posterior. - The second value is the list of optimized dirichlet variational parameters for the approximation of - the posterior. - - """ - doc_index = 0 # overall doc_index in corpus - time = 0 # current time-slice - doc_num = 0 # doc-index in current time-slice - lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice - - time_slice = np.cumsum(np.array(self.time_slice)) - - for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): - # iterates chunk size for constant memory footprint - for doc in chunk: - # this is used to update the time_slice and create a new lda_seq slice every new time_slice - if doc_index > time_slice[time]: - time += 1 - lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice - doc_num = 0 - - gam = gammas[doc_index] - lhood = lhoods[doc_index] - - ldapost.gamma = gam - ldapost.lhood = lhood - ldapost.doc = doc - - # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. 
- if iter_ == 0: - doc_lhood = LdaPost.fit_lda_post( - ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter - ) - else: - doc_lhood = LdaPost.fit_lda_post( - ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter - ) - - if topic_suffstats is not None: - topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats) - - gammas[doc_index] = ldapost.gamma - bound += doc_lhood - doc_index += 1 - doc_num += 1 - - return bound, gammas - - def make_lda_seq_slice(self, lda, time): - """Update the LDA model topic-word values using time slices. - - Parameters - ---------- - - lda : :class:`~gensim.models.ldamodel.LdaModel` - The stationary model to be updated - time : int - The time slice assigned to the stationary model. - - Returns - ------- - lda : :class:`~gensim.models.ldamodel.LdaModel` - The stationary model updated to reflect the passed time slice. - - """ - for k in range(self.num_topics): - lda.topics[:, k] = self.topic_chains[k].e_log_prob[:, time] - - lda.alpha = np.copy(self.alphas) - return lda - - def fit_lda_seq_topics(self, topic_suffstats): - """Fit the sequential model topic-wise. - - Parameters - ---------- - topic_suffstats : numpy.ndarray - Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`). - - Returns - ------- - float - The sum of the optimized lower bounds for all topics. - - """ - lhood = 0 - - for k, chain in enumerate(self.topic_chains): - logger.info("Fitting topic number %i", k) - lhood_term = chain.fit_sslm(topic_suffstats[k]) - lhood += lhood_term - - return lhood - - def print_topic_times(self, topic, top_terms=20): - """Get the most relevant words for a topic, for each timeslice. This can be used to inspect the evolution of a - topic through time. - - Parameters - ---------- - topic : int - The index of the topic. - top_terms : int, optional - Number of most relevant words associated with the topic to be returned. - - Returns - ------- - list of list of str - Top `top_terms` relevant terms for the topic for each time slice. - - """ - topics = [] - for time in range(self.num_time_slices): - topics.append(self.print_topic(topic, time, top_terms)) - - return topics - - def print_topics(self, time=0, top_terms=20): - """Get the most relevant words for every topic. - - Parameters - ---------- - time : int, optional - The time slice in which we are interested in (since topics evolve over time, it is expected that the most - relevant words will also gradually change). - top_terms : int, optional - Number of most relevant words to be returned for each topic. - - Returns - ------- - list of list of (str, float) - Representation of all topics. Each of them is represented by a list of pairs of words and their assigned - probability. - - """ - return [self.print_topic(topic, time, top_terms) for topic in range(self.num_topics)] - - def print_topic(self, topic, time=0, top_terms=20): - """Get the list of words most relevant to the given topic. - - Parameters - ---------- - topic : int - The index of the topic to be inspected. - time : int, optional - The time slice in which we are interested in (since topics evolve over time, it is expected that the most - relevant words will also gradually change). - top_terms : int, optional - Number of words associated with the topic to be returned. - - Returns - ------- - list of (str, float) - The representation of this topic. Each element in the list includes the word itself, along with the - probability assigned to it by the topic. 
- - """ - topic = self.topic_chains[topic].e_log_prob - topic = np.transpose(topic) - topic = np.exp(topic[time]) - topic = topic / topic.sum() - bestn = matutils.argsort(topic, top_terms, reverse=True) - beststr = [(self.id2word[id_], topic[id_]) for id_ in bestn] - return beststr - - def doc_topics(self, doc_number): - """Get the topic mixture for a document. - - Uses the priors for the dirichlet distribution that approximates the true posterior with the optimal - lower bound, and therefore requires the model to be already trained. - - - Parameters - ---------- - doc_number : int - Index of the document for which the mixture is returned. - - Returns - ------- - list of length `self.num_topics` - Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex. - - """ - doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis] - return doc_topic[doc_number] - - def dtm_vis(self, time, corpus): - """Get the information needed to visualize the corpus model at a given time slice, using the pyLDAvis format. - - Parameters - ---------- - time : int - The time slice we are interested in. - corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional - The corpus we want to visualize at the given time slice. - - Returns - ------- - doc_topics : list of length `self.num_topics` - Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex. - topic_term : numpy.ndarray - The representation of each topic as a multinomial over words in the vocabulary, - expected shape (`num_topics`, vocabulary length). - doc_lengths : list of int - The number of words in each document. These could be fixed, or drawn from a Poisson distribution. - term_frequency : numpy.ndarray - The term frequency matrix (denoted as beta in the original Blei paper). This could also be the TF-IDF - representation of the corpus, expected shape (number of documents, length of vocabulary). - vocab : list of str - The set of unique terms existing in the cropuse's vocabulary. - - """ - doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis] - - def normalize(x): - return x / x.sum() - - topic_term = [ - normalize(np.exp(chain.e_log_prob.T[time])) - for k, chain in enumerate(self.topic_chains) - ] - - doc_lengths = [] - term_frequency = np.zeros(self.vocab_len) - for doc_no, doc in enumerate(corpus): - doc_lengths.append(len(doc)) - - for term, freq in doc: - term_frequency[term] += freq - - vocab = [self.id2word[i] for i in range(len(self.id2word))] - - return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab - - def dtm_coherence(self, time): - """Get the coherence for each topic. - - Can be used to measure the quality of the model, or to inspect the convergence through training via a callback. - - Parameters - ---------- - time : int - The time slice. - - Returns - ------- - list of list of str - The word representation for each topic, for each time slice. This can be used to check the time coherence - of topics as time evolves: If the most relevant words remain the same then the topic has somehow - converged or is relatively static, if they change rapidly the topic is evolving. 
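The inspection helpers documented here (`print_topic_times`, `print_topics`, `doc_topics`, `dtm_coherence`) make up the public query surface of the model; the retained `LdaSeqModel` in `ldaseqmodel.py` exposes the same interface. A minimal usage sketch on an invented toy corpus (the data and parameter values are illustrative only):

    from gensim.corpora import Dictionary
    from gensim.models.ldaseqmodel import LdaSeqModel

    # four tiny documents split into two time slices of two documents each
    texts = [
        ["bank", "river", "water"], ["bank", "money", "loan"],
        ["river", "water", "flow"], ["money", "loan", "rate"],
    ]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    ldaseq = LdaSeqModel(
        corpus=corpus, time_slice=[2, 2], id2word=dictionary,
        num_topics=2, passes=2, chunksize=2,
    )
    print(ldaseq.print_topic_times(topic=0, top_terms=5))   # topic 0 across both time slices
    print(ldaseq.doc_topics(0))                             # topic mixture of the first document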
- - """ - coherence_topics = [] - for topics in self.print_topics(time): - coherence_topic = [] - for word, dist in topics: - coherence_topic.append(word) - coherence_topics.append(coherence_topic) - - return coherence_topics - - def __getitem__(self, doc): - """Get the topic mixture for the given document, using the inferred approximation of the true posterior. - - Parameters - ---------- - doc : list of (int, float) - The doc in BOW format. Can be an unseen document. - - Returns - ------- - list of float - Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex. - - """ - lda_model = ldamodel.LdaModel( - num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) - lda_model.topics = np.zeros((self.vocab_len, self.num_topics)) - ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) - - time_lhoods = [] - for time in range(self.num_time_slices): - lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice - lhood = LdaPost.fit_lda_post(ldapost, 0, time, self) - time_lhoods.append(lhood) - - doc_topic = ldapost.gamma / ldapost.gamma.sum() - # should even the likelihoods be returned? - return doc_topic - - -class sslm(utils.SaveLoad): - """Encapsulate the inner State Space Language Model for DTM. - - Some important attributes of this class: - - * `obs` is a matrix containing the document to topic ratios. - * `e_log_prob` is a matrix containing the topic to word ratios. - * `mean` contains the mean values to be used for inference for each word for a time slice. - * `variance` contains the variance values to be used for inference of word in a time slice. - * `fwd_mean` and`fwd_variance` are the forward posterior values for the mean and the variance. - * `zeta` is an extra variational parameter with a value for each time slice. - - """ - - def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): - self.vocab_len = vocab_len - self.num_time_slices = num_time_slices - self.obs_variance = obs_variance - self.chain_variance = chain_variance - self.num_topics = num_topics - - # setting up matrices - self.obs = np.zeros((vocab_len, num_time_slices), dtype=np.float64) - self.e_log_prob = np.zeros((vocab_len, num_time_slices), dtype=np.float64) - self.mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) - self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) - self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) - self.variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) - self.zeta = np.zeros(num_time_slices, dtype=np.float64) - - # the following are class variables which are to be integrated during Document Influence Model - self.m_update_coeff = None - self.mean_t = None - self.variance_t = None - self.influence_sum_lgl = None - self.w_phi_l = None - self.w_phi_sum = None - self.w_phi_l_sq = None - self.m_update_coeff_g = None - self.config_c_address = 0 - - - def sslm_counts_init(self, obs_variance, chain_variance, sstats): - """Initialize the State Space Language Model with LDA sufficient statistics. - - Called for each topic-chain and initializes initial mean, variance and Topic-Word probabilities - for the first time-slice. - - Parameters - ---------- - obs_variance : float, optional - Observed variance used to approximate the true and forward variance. 
- chain_variance : float - Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time. - sstats : numpy.ndarray - Sufficient statistics of the LDA model. Corresponds to matrix beta in the linked paper for time slice 0, - expected shape (`self.vocab_len`, `num_topics`). - - """ - - sslm_counts_init(self, obs_variance, chain_variance, sstats) - - def fit_sslm(self, sstats): - """Fits variational distribution. - - This is essentially the m-step. - Maximizes the approximation of the true posterior for a particular topic using the provided sufficient - statistics. Updates the values using :meth:`~gensim.models.ldaseqmodel.sslm.update_obs` and - :meth:`~gensim.models.ldaseqmodel.sslm.compute_expected_log_prob`. - - Parameters - ---------- - sstats : numpy.ndarray - Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the - current time slice, expected shape (`self.vocab_len`, `num_topics`). - - Returns - ------- - float - The lower bound for the true posterior achieved using the fitted approximate distribution. - - """ - - return fit_sslm(self, sstats) - - -class LdaPost(utils.SaveLoad): - """Posterior values associated with each set of documents. - - TODO: use **Hoffman, Blei, Bach: Online Learning for Latent Dirichlet Allocation, NIPS 2010.** - to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel. - - """ - - def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): - """Initialize the posterior value structure for the given LDA model. - - Parameters - ---------- - doc : list of (int, int) - A BOW representation of the document. Each element in the list is a pair of a word's ID and its number - of occurences in the document. - lda : :class:`~gensim.models.ldamodel.LdaModel`, optional - The underlying LDA model. - max_doc_len : int, optional - The maximum number of words in a document. - num_topics : int, optional - Number of topics discovered by the LDA model. - gamma : numpy.ndarray, optional - Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. - lhood : float, optional - The log likelihood lower bound. - - """ - self.doc = doc - self.lda = lda - self.gamma = gamma - self.lhood = lhood - if self.gamma is None: - self.gamma = np.zeros(num_topics) - if self.lhood is None: - self.lhood = np.zeros(num_topics + 1) - - if max_doc_len is not None and num_topics is not None: - self.phi = np.zeros((max_doc_len, num_topics)) - self.log_phi = np.zeros((max_doc_len, num_topics)) - - # the following are class variables which are to be integrated during Document Influence Model - - self.doc_weight = None - self.renormalized_doc_weight = None - - def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, - lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): - """Posterior inference for lda. - - Parameters - ---------- - doc_number : int - The documents number. - time : int - Time slice. - ldaseq : object - Unused. - LDA_INFERENCE_CONVERGED : float - Epsilon value used to check whether the inference step has sufficiently converged. - lda_inference_max_iter : int - Maximum number of iterations in the inference step. - g : object - Unused. Will be useful when the DIM model is implemented. - g3_matrix: object - Unused. Will be useful when the DIM model is implemented. - g4_matrix: object - Unused. 
Will be useful when the DIM model is implemented. - g5_matrix: object - Unused. Will be useful when the DIM model is implemented. - - Returns - ------- - float - The optimal lower bound for the true posterior using the approximate distribution. - """ - - return np.array(fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED, - lda_inference_max_iter, g, g3_matrix, g4_matrix, g5_matrix)) - - - def update_lda_seq_ss(self, time, doc, topic_suffstats): - """Update lda sequence sufficient statistics from an lda posterior. - - This is very similar to the :meth:`~gensim.models.ldaseqmodel.LdaPost.update_gamma` method and uses - the same formula. - - Parameters - ---------- - time : int - The time slice. - doc : list of (int, float) - Unused but kept here for backwards compatibility. The document set in the constructor (`self.doc`) is used - instead. - topic_suffstats : list of float - Sufficient statistics for each topic. - - Returns - ------- - list of float - The updated sufficient statistics for each topic. - - """ - num_topics = self.lda.num_topics - - for k in range(num_topics): - topic_ss = topic_suffstats[k] - n = 0 - for word_id, count in self.doc: - topic_ss[word_id][time] += count * self.phi[n][k] - n += 1 - topic_suffstats[k] = topic_ss - - return topic_suffstats - - - - -"[[('idexx', 0.02256082958356584), ('electrical', 0.021255572724880007), ('skills', 0.019944382064918525), ('engineering', 0.015763035068946996), ('models', 0.013750631038974458), ('bank', 0.013583726745509497), ('modeling', 0.01203761886072807), ('complex', 0.010914594579298338), ('engineers', 0.010400197077673822), ('reviews', 0.010341923125796125), ('validation', 0.010293220205119354), ('interns', 0.0102928498107953), ('laboratories', 0.010292839655664899), ('solutions', 0.009697385952098011), ('senior', 0.008667538842761514), ('relationships', 0.008642879046552596), ('developing', 0.008619893796003735), ('documentation', 0.008610828170283354), ('tasks', 0.008605851091404736), ('components', 0.0085837114798782)], [('engineering', 0.02951417287621784), ('skills', 0.028313440245959602), ('knowledge', 0.025956248413893723), ('technical', 0.024245552645358926), ('testing', 0.01727409220691255), ('management', 0.016617274195619907), ('teams', 0.014589132561545997), ('environment', 0.012537027351412494), ('complex', 0.012292851849788823), ('reviews', 0.011188854688188414), ('technology', 0.011122285040026962), ('responsibilities', 0.010945741042933459), ('participates', 0.009658533518869254), ('quality', 0.009587766436523074), ('asrc', 0.009485862858633645), ('applications', 0.009302567710268318), ('water', 0.00928599554777531), ('solutions', 0.008539835904207396), ('hardware', 0.00843600934087972), ('engineers', 0.007947946419547743)]]" \ No newline at end of file From 3f07b46374c1290ac4d474449a091014e81227e9 Mon Sep 17 00:00:00 2001 From: Sharganov Artem Date: Wed, 23 Jun 2021 17:13:36 +0300 Subject: [PATCH 5/7] Support compatibility with scipy 0.18.1 --- gensim/models/ldaseq_posterior_inner.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/gensim/models/ldaseq_posterior_inner.pyx b/gensim/models/ldaseq_posterior_inner.pyx index 1f838227f4..aa9eb70464 100644 --- a/gensim/models/ldaseq_posterior_inner.pyx +++ b/gensim/models/ldaseq_posterior_inner.pyx @@ -1,7 +1,12 @@ -import numpy as np +#!/usr/bin/env cython +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# cython: embedsignature=True + -from scipy.special.cython_special cimport gammaln 
-from scipy.special.cython_special cimport psi +import numpy as np +from scipy.special import gammaln, psi from libc.math cimport exp, log, abs from libc.string cimport memcpy From 7c6f2838737bef0de364489faebe7a4a45890e3a Mon Sep 17 00:00:00 2001 From: sharganov Date: Sat, 15 Jan 2022 21:37:27 +0300 Subject: [PATCH 6/7] Refactored code and removed numpy array allocation from loop --- gensim/models/ldaseqmodel.py | 110 ++++++++++++----------------------- 1 file changed, 38 insertions(+), 72 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 9ecc60fa43..8ba0e31022 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -13,8 +13,6 @@ #. Include DIM mode. Most of the infrastructure for this is in place. #. See if LdaPost can be replaced by LdaModel completely without breaking anything. -#. Heavy lifting going on in the Sslm class - efforts can be made to cythonise mathematical methods, in particular, - update_obs and the optimization takes a lot time. #. Try and make it distributed, especially around the E and M step. #. Remove all C/C++ coding style/syntax. @@ -154,22 +152,15 @@ def __init__( # which in turn has information about each topic # the sslm class is described below and contains information # on topic-word probabilities and doc-topic probabilities. - self.topic_chains = [] - for topic in range(num_topics): - sslm_ = sslm( - num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, + self.topic_chains = [ + sslm( + num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, chain_variance=chain_variance, obs_variance=obs_variance - ) - self.topic_chains.append(sslm_) - - # the following are class variables which are to be integrated during Document Influence Model - self.top_doc_phis = None - self.influence = None - self.renormalized_influence = None - self.influence_sum_lgl = None + ) for i in range(num_topics)] # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. if corpus is not None and time_slice is not None: + self.max_doc_len = max(len(line) for line in corpus) if initialize == 'gensim': @@ -188,7 +179,7 @@ def __init__( self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats) # fit DTM - self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) + self.fit(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): """Initialize State Space Language Model, topic-wise. @@ -207,17 +198,14 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s Sufficient statistics used for initializing the model, expected shape (`self.vocab_len`, `num_topics`). """ + # TODO why do we pass this alpha if it is already attr? self.alphas = alpha for k, chain in enumerate(self.topic_chains): + # а что мы сюда копируем? получается наша chain для каждого топика? 
             sstats = init_suffstats[:, k]
             chain.sslm_counts_init(topic_obs_variance, topic_chain_variance, sstats)
-            # initialize the below matrices only if running DIM
-            # ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
-            # ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
-            # ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
-
-    def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize):
+    def fit(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize):
         """Fit a LDA Sequence model (DTM).
 
         This method will iteratively setup LDA models and perform EM steps until the sufficient statistics convergence,
@@ -250,30 +238,34 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter,
         ITER_MULT_LOW = 2
         MAX_ITER = 500
 
-        num_topics = self.num_topics
-        vocab_len = self.vocab_len
-        data_len = self.num_time_slices
-        corpus_len = self.corpus_len
-
         bound = 0
         convergence = LDASQE_EM_THRESHOLD + 1
         iter_ = 0
 
+        # set up memory buffers that are reused on every iteration of the loop below
+        gammas = np.zeros((self.corpus_len, self.num_topics))
+        lhoods = np.zeros((self.corpus_len, self.num_topics + 1))
+
+        # initialize the sufficient statistics buffers
+        topic_suffstats = [np.zeros((self.vocab_len, self.num_time_slices))
+                           for topic in range(self.num_topics)]
+
+        # main optimization loop
         while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter):
 
             logger.info(" EM iter %i", iter_)
             logger.info("E Step")
-            # TODO: bound is initialized to 0
+
             old_bound = bound
 
-            # initiate sufficient statistics
-            topic_suffstats = []
-            for topic in range(num_topics):
-                topic_suffstats.append(np.zeros((vocab_len, data_len)))
+            # reset the sufficient statistics buffers from the previous iteration
+            for topic_stat in topic_suffstats:
+                topic_stat[:] = 0.0
+
+            # reset the gamma and likelihood buffers from the previous iteration
+            gammas[:] = 0.0
+            lhoods[:] = 0.0
 
-            # set up variables
-            gammas = np.zeros((corpus_len, num_topics))
-            lhoods = np.zeros((corpus_len, num_topics + 1))
             # compute the likelihood of a sequential corpus under an LDA
             # seq model and find the evidence lower bound. This is the E - Step
             bound, gammas = \
@@ -340,31 +332,22 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
             the posterior.
""" - num_topics = self.num_topics - vocab_len = self.vocab_len bound = 0.0 - lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) - lda.topics = np.zeros((vocab_len, num_topics)) - ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda) + lda = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) - model = "DTM" - if model == "DTM": - bound, gammas = self.inferDTMseq( - corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize - ) - elif model == "DIM": - self.InfluenceTotalFixed(corpus) - bound, gammas = self.inferDIMseq( - corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize - ) + lda.topics = np.zeros((self.vocab_len, self.num_topics)) + + ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=self.num_topics, lda=lda) + bound, gammas = self.infer_dtm_seq( + corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize + ) return bound, gammas - def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, - ldapost, iter_, bound, lda_inference_max_iter, chunksize): + def infer_dtm_seq(self, corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize): """Compute the likelihood of a sequential corpus under an LDA seq model, and reports the likelihood bound. Parameters @@ -699,12 +682,11 @@ class sslm(utils.SaveLoad): """ - def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): + def __init__(self, vocab_len=None, num_time_slices=None, obs_variance=0.5, chain_variance=0.005): self.vocab_len = vocab_len self.num_time_slices = num_time_slices self.obs_variance = obs_variance self.chain_variance = chain_variance - self.num_topics = num_topics # setting up matrices self.obs = np.zeros((vocab_len, num_time_slices), dtype=np.float64) @@ -715,17 +697,6 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.variance = np.zeros((vocab_len, num_time_slices + 1), dtype=np.float64) self.zeta = np.zeros(num_time_slices, dtype=np.float64) - # the following are class variables which are to be integrated during Document Influence Model - self.m_update_coeff = None - self.mean_t = None - self.variance_t = None - self.influence_sum_lgl = None - self.w_phi_l = None - self.w_phi_sum = None - self.w_phi_l_sq = None - self.m_update_coeff_g = None - self.config_c_address = 0 - def sslm_counts_init(self, obs_variance, chain_variance, sstats): """Initialize the State Space Language Model with LDA sufficient statistics. @@ -811,11 +782,6 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.phi = np.zeros((max_doc_len, num_topics)) self.log_phi = np.zeros((max_doc_len, num_topics)) - # the following are class variables which are to be integrated during Document Influence Model - - self.doc_weight = None - self.renormalized_doc_weight = None - def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): """Posterior inference for lda. 
@@ -882,4 +848,4 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats):
                 n += 1
             topic_suffstats[k] = topic_ss
 
-        return topic_suffstats
+        return topic_suffstats
\ No newline at end of file

From f20f196cae231cb4f926c084de1ed4e51dd909db Mon Sep 17 00:00:00 2001
From: sharganov
Date: Sat, 15 Jan 2022 21:43:51 +0300
Subject: [PATCH 7/7] Fix: changed behavior for cut off as in original implementation from Blei's lab

---
 gensim/models/ldaseq_sslm_inner.pyx | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gensim/models/ldaseq_sslm_inner.pyx b/gensim/models/ldaseq_sslm_inner.pyx
index 5fbd6920c7..62c7efd07c 100644
--- a/gensim/models/ldaseq_sslm_inner.pyx
+++ b/gensim/models/ldaseq_sslm_inner.pyx
@@ -593,10 +593,7 @@ cdef update_obs(StateSpaceLanguageModelConfig *config, REAL_t *sstats, REAL_t *t
         if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not NULL:
             obs = config.obs + w * num_time_slices
             norm_cutoff_obs = (np.PyArray_DATA(np_norm_cutoff_obs))
-            # norm_cutoff_obs = malloc(config[0].num_time_slices * sizeof(REAL_t))
-            if norm_cutoff_obs == NULL:
-                raise
-            memcpy(norm_cutoff_obs, obs, num_time_slices * sizeof(REAL_t))
+            memcpy(obs, norm_cutoff_obs, num_time_slices * sizeof(REAL_t))
         else:
             if counts_norm < OBS_NORM_CUTOFF:
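
For reference, the corrected branch above makes a low-count word reuse the cached optimized observations rather than overwriting the cache with its own, still unoptimized, row, which appears to be the behavior of Blei's original DTM code. Below is a minimal NumPy sketch of that intended behavior; the function and variable names (update_obs_for_word, counts, obs_row, cached_obs) and the cutoff value are illustrative placeholders, not the actual Cython identifiers in ldaseq_sslm_inner.pyx.

    import numpy as np

    OBS_NORM_CUTOFF = 2.0  # illustrative threshold; the real constant is defined in the .pyx module

    def update_obs_for_word(counts, obs_row, cached_obs):
        """Sketch of the fixed low-count branch of update_obs (illustrative names only)."""
        # Euclidean norm of this word's counts across all time slices
        counts_norm = np.sqrt(np.sum(counts ** 2))
        if counts_norm < OBS_NORM_CUTOFF and cached_obs is not None:
            # rare word and a cache exists: copy the cached optimized values into obs
            # (before the fix, the copy went the other way and clobbered the cache)
            obs_row[:] = cached_obs
        else:
            # the full per-word optimization of obs_row would run here (omitted in this sketch)
            if counts_norm < OBS_NORM_CUTOFF:
                # first rare word encountered: remember its optimized row for later reuse
                cached_obs = obs_row.copy()
        return obs_row, cached_obs

With this direction of the copy, the first rare word that goes through the full optimization fills the cache, and subsequent rare words simply reuse that row instead of being optimized again.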