Skip to content

Suggestion using a class #1

@casadoj

Description

@casadoj

Hi,

Thank you for the tutorial. I have tested it and come with a new version in which I create a Python class in which the functions you created are methods. Feel free to use it if you find it interesting:

import numpy as np
np.random.seed = 0
from scipy.stats import norm
import matplotlib.pyplot as plt

# define original data as a combination of 3 normal distributions

n_samples = 100
# mean and variance
mu1, sigma1 = -5, 1.2 
mu2, sigma2 = 5, 1.8 
mu3, sigma3 = 0, 1.6 

x1 = np.random.normal(loc=mu1, scale=np.sqrt(sigma1), size=n_samples)
x2 = np.random.normal(loc=mu2, scale=np.sqrt(sigma2), size=n_samples)
x3 = np.random.normal(loc=mu3, scale=np.sqrt(sigma3), size=n_samples)

X = np.concatenate((x1, x2, x3))



def plot_pdf(means, variances, alpha=0.5, linestyle='k--', ax=None):
    """
    Plot 1-D data and its PDF curve.

    """
    
    if isinstance(means, int) | isinstance(means, float):
        means = list(means)
    if isinstance(variances, int) | isinstance(variances, float):
        variances = list(variances)
    
    if ax is None:
        fig, ax = plt.subplots()
    for mean, variance in zip(means, variances):
        # Plot a historgram
        X = norm.rvs(mean, variance, size=1000)
        label=r"$\mu={0:.2f} \ ; \ \sigma={1:.2f}$".format(mean, variance)
        ax.hist(X, bins=50, density=True, alpha=alpha, label=label)

        # Plot the PDF
        x = np.linspace(X.min(), X.max(), 1000)
        y = norm.pdf(x, mean, variance)
        ax.plot(x, y, linestyle)
    
    ax.legend()
    ax.set_ylabel('pdf (-)')


# show the PDF of the original gaussian distributions
plot_pdf([mu1, mu2, mu3], [sigma1, sigma2, sigma3])


class GMM():
    
    def __init__(self, X, n_components):
        """Gaussian mixture model of n components
        
        Parameters:
        -----------
        X : array-like, shape (n_samples,)
            The data.
        n_components : int
            The number of clusters
            
        Returns:
        --------
        As methods of the class:
        pi : array-like, shape (n_components,)
            Mixing coefficients of each mixture components
        means : array-like, shape (n_components,)
            The means of each mixture component.
        variances : array-like, shape (n_components,)
            The variances of each mixture component.
        """

        pi = np.ones(n_components) / n_components
        means = np.random.choice(X, n_components)
        variances = np.random.random_sample(size=n_components)

        self.X = X
        self.n_components = n_components
        self.pi = pi
        self.means = means
        self.variances = variances
        
    def expectation(self):
        """Expectation step in the fitting of the Gaussian Mixture Model

        Returns
        -------
        weights : array-like, shape (n_components, n_samples)
        """

        weights = np.zeros((self.n_components, len(self.X)))
        for c in range(self.n_components):
            weights[c,:] = norm(loc=self.means[c], scale=np.sqrt(self.variances[c])).pdf(self.X)

        return weights
    
    def maximization(self, weights):
        """Maximization step in the fitting of the Gaussian Mixture Model

        Parameters
        ----------
        weights : array-like, shape (n_components,n_samples)
            initilized weights array

        Returns
        -------
        It updates the following methods of the class:
        pi : array-like, shape (n_components,)
            Mixing coefficients of each mixture components
        means : array-like, shape (n_components,)
            The means of each mixture component.
        variances : array-like, shape (n_components,)
            The variances of each mixture component.
        """
        
        pi_new, means_new, variances_new = self.pi, self.means, self.variances
        r = (gmm.pi * weights.transpose()).transpose()
        for c, r_c in enumerate(r):  

            r_ic = r_c / r.sum(axis=0)
            
            # update mixing coefficient, mean and variance
            pi_new[c] = r_ic.mean()
            means_new[c] = np.sum(r_ic * self.X) / r_ic.sum()
            variances_new[c] = np.sum(r_ic * (self.X - means_new[c])**2) / r_ic.sum()
            
        self.pi, self.means, self.variances = pi_new, means_new, variances_new
            
    def fit(self, n_steps=50, tol=1e-3, plot_intermediate_steps=None):
        """Fit the Gaussian Mixture Model to the training data. It iterates the expectation and maximization steps as many times as desired

        Parameters
        ----------
        n_steps: int
            Number of iterations to repeat the expectation-maximization steps
        plot_intermediate_steps: int
            Whether to plot the PDF of the mixture components every n iteration steps
            
        Returns:
        --------
        It updates the following methods of the class:
        pi : array-like, shape (n_components,)
            Mixing coefficients of each mixture components
        means : array-like, shape (n_components,)
            The means of each mixture component.
        variances : array-like, shape (n_components,)
            The variances of each mixture component.
            
        It generates the following methods to monitor the evolution of the fitting process
        means_step : array-like, shape (n_steps, n_components)
            The means of each mixture component in each iteration.
        variances : array-like, shape (n_steps, n_components)
            The variances of each mixture component in each iteration.
        """
        
        # arrays where the evolution of the parameters will be saved
        means_ = np.zeros((n_steps + 1, self.n_components)) * np.nan
        variances_ = means_.copy()
        
        # save (and plot) initialized parameters
        means_[0,:] = self.means
        variances_[0,:] = self.variances
        if isinstance(plot_intermediate_steps, int):
            self.plot_pdf(title=f'initialization')

        for step in range(1, n_steps + 1):
            # expectation step
            weights = self.expectation()
            # maximization step
            self.maximization(weights)
            # save (and plot) updated parameters
            means_[step] = self.means
            variances_[step] = self.variances
            if isinstance(plot_intermediate_steps, int):
                if step % plot_intermediate_steps == 0:
                    self.plot_pdf(title=f'iteration {step}')
            # stop fitting if no significant change in the means and variances
            means_old, variances_old = means_[step - 1, :], variances_[step - 1, :]
            delta_means = np.max(np.abs(self.means - means_old))
            delta_variances = np.max(np.abs(self.variances - variances_old))
            if (delta_means < tol) & (delta_variances < tol):
                print(f'Fitting stopeed at step {step}')
                break

        self.means_step, self.variances_step = means_, variances_
    
    def plot_pdf(self, ax=None, **kwargs):
        """
        Plot the PDF (probability density functions) of the mixture components
        
        Parameters:
        -----------
        ax : matplotlib.axes
            Axes where the plot will be added. If 'None' (default), a new axes is created
        """
        
        means, variances = self.means, self.variances

        if isinstance(means, int) | isinstance(means, float):
            means = list(means)
        if isinstance(variances, int) | isinstance(variances, float):
            variances = list(variances)
        
        if ax is None:
            fig, ax = plt.subplots(figsize=kwargs.get('figsize', (5, 4)))
            
        for mean, variance in zip(means, variances):
            # Plot a historgram
            X = norm.rvs(mean, variance, size=1000)
            label=r"$\mu={0:.2f} \ ; \ \sigma={1:.2f}$".format(mean, variance)
            ax.hist(X, bins=50, density=True, alpha=kwargs.get('alpha', .5), label=label)

            # Plot the PDF
            x = np.linspace(X.min(), X.max(), 1000)
            y = norm.pdf(x, mean, variance)
            ax.plot(x, y, ls=kwargs.get('linestyle', '--'), c=kwargs.get('color', 'k'))

        ax.legend()
        ax.set_ylabel('pdf (-)')
        if 'title' in kwargs:
            ax.set_title(kwargs['title'])
            
    def plot_fitting(self, **kwargs):
        """It plots the evolution of the paramenters of the mixture components
        """
        
        fig, ax = plt.subplots(nrows=2, sharex=True, figsize=kwargs.get('figsize', (5, 5)))
        for c in range(self.n_components):
            ax[0].plot(self.means_step[:,c])
            ax[1].plot(self.variances_step[:,c])
        ax[0].set_ylabel(r"$\mu$", rotation=0)
        ax[1].set_ylabel(r"$\sigma$", rotation=0)
        ax[1].set_xlabel('iteration')


# declare the Gaussian Mixture Model
gmm = GMM(X, n_components=3)

# fit the model
gmm.fit(plot_intermediate_steps=10)

# plot the evolution of the means and variances
gmm.plot_fitting()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions