-
Notifications
You must be signed in to change notification settings - Fork 18
Open
Description
Hi,
Thank you for the tutorial. I have tested it and come with a new version in which I create a Python class in which the functions you created are methods. Feel free to use it if you find it interesting:
import numpy as np
np.random.seed = 0
from scipy.stats import norm
import matplotlib.pyplot as plt
# define original data as a combination of 3 normal distributions
n_samples = 100
# mean and variance
mu1, sigma1 = -5, 1.2
mu2, sigma2 = 5, 1.8
mu3, sigma3 = 0, 1.6
x1 = np.random.normal(loc=mu1, scale=np.sqrt(sigma1), size=n_samples)
x2 = np.random.normal(loc=mu2, scale=np.sqrt(sigma2), size=n_samples)
x3 = np.random.normal(loc=mu3, scale=np.sqrt(sigma3), size=n_samples)
X = np.concatenate((x1, x2, x3))
def plot_pdf(means, variances, alpha=0.5, linestyle='k--', ax=None):
"""
Plot 1-D data and its PDF curve.
"""
if isinstance(means, int) | isinstance(means, float):
means = list(means)
if isinstance(variances, int) | isinstance(variances, float):
variances = list(variances)
if ax is None:
fig, ax = plt.subplots()
for mean, variance in zip(means, variances):
# Plot a historgram
X = norm.rvs(mean, variance, size=1000)
label=r"$\mu={0:.2f} \ ; \ \sigma={1:.2f}$".format(mean, variance)
ax.hist(X, bins=50, density=True, alpha=alpha, label=label)
# Plot the PDF
x = np.linspace(X.min(), X.max(), 1000)
y = norm.pdf(x, mean, variance)
ax.plot(x, y, linestyle)
ax.legend()
ax.set_ylabel('pdf (-)')
# show the PDF of the original gaussian distributions
plot_pdf([mu1, mu2, mu3], [sigma1, sigma2, sigma3])
class GMM():
def __init__(self, X, n_components):
"""Gaussian mixture model of n components
Parameters:
-----------
X : array-like, shape (n_samples,)
The data.
n_components : int
The number of clusters
Returns:
--------
As methods of the class:
pi : array-like, shape (n_components,)
Mixing coefficients of each mixture components
means : array-like, shape (n_components,)
The means of each mixture component.
variances : array-like, shape (n_components,)
The variances of each mixture component.
"""
pi = np.ones(n_components) / n_components
means = np.random.choice(X, n_components)
variances = np.random.random_sample(size=n_components)
self.X = X
self.n_components = n_components
self.pi = pi
self.means = means
self.variances = variances
def expectation(self):
"""Expectation step in the fitting of the Gaussian Mixture Model
Returns
-------
weights : array-like, shape (n_components, n_samples)
"""
weights = np.zeros((self.n_components, len(self.X)))
for c in range(self.n_components):
weights[c,:] = norm(loc=self.means[c], scale=np.sqrt(self.variances[c])).pdf(self.X)
return weights
def maximization(self, weights):
"""Maximization step in the fitting of the Gaussian Mixture Model
Parameters
----------
weights : array-like, shape (n_components,n_samples)
initilized weights array
Returns
-------
It updates the following methods of the class:
pi : array-like, shape (n_components,)
Mixing coefficients of each mixture components
means : array-like, shape (n_components,)
The means of each mixture component.
variances : array-like, shape (n_components,)
The variances of each mixture component.
"""
pi_new, means_new, variances_new = self.pi, self.means, self.variances
r = (gmm.pi * weights.transpose()).transpose()
for c, r_c in enumerate(r):
r_ic = r_c / r.sum(axis=0)
# update mixing coefficient, mean and variance
pi_new[c] = r_ic.mean()
means_new[c] = np.sum(r_ic * self.X) / r_ic.sum()
variances_new[c] = np.sum(r_ic * (self.X - means_new[c])**2) / r_ic.sum()
self.pi, self.means, self.variances = pi_new, means_new, variances_new
def fit(self, n_steps=50, tol=1e-3, plot_intermediate_steps=None):
"""Fit the Gaussian Mixture Model to the training data. It iterates the expectation and maximization steps as many times as desired
Parameters
----------
n_steps: int
Number of iterations to repeat the expectation-maximization steps
plot_intermediate_steps: int
Whether to plot the PDF of the mixture components every n iteration steps
Returns:
--------
It updates the following methods of the class:
pi : array-like, shape (n_components,)
Mixing coefficients of each mixture components
means : array-like, shape (n_components,)
The means of each mixture component.
variances : array-like, shape (n_components,)
The variances of each mixture component.
It generates the following methods to monitor the evolution of the fitting process
means_step : array-like, shape (n_steps, n_components)
The means of each mixture component in each iteration.
variances : array-like, shape (n_steps, n_components)
The variances of each mixture component in each iteration.
"""
# arrays where the evolution of the parameters will be saved
means_ = np.zeros((n_steps + 1, self.n_components)) * np.nan
variances_ = means_.copy()
# save (and plot) initialized parameters
means_[0,:] = self.means
variances_[0,:] = self.variances
if isinstance(plot_intermediate_steps, int):
self.plot_pdf(title=f'initialization')
for step in range(1, n_steps + 1):
# expectation step
weights = self.expectation()
# maximization step
self.maximization(weights)
# save (and plot) updated parameters
means_[step] = self.means
variances_[step] = self.variances
if isinstance(plot_intermediate_steps, int):
if step % plot_intermediate_steps == 0:
self.plot_pdf(title=f'iteration {step}')
# stop fitting if no significant change in the means and variances
means_old, variances_old = means_[step - 1, :], variances_[step - 1, :]
delta_means = np.max(np.abs(self.means - means_old))
delta_variances = np.max(np.abs(self.variances - variances_old))
if (delta_means < tol) & (delta_variances < tol):
print(f'Fitting stopeed at step {step}')
break
self.means_step, self.variances_step = means_, variances_
def plot_pdf(self, ax=None, **kwargs):
"""
Plot the PDF (probability density functions) of the mixture components
Parameters:
-----------
ax : matplotlib.axes
Axes where the plot will be added. If 'None' (default), a new axes is created
"""
means, variances = self.means, self.variances
if isinstance(means, int) | isinstance(means, float):
means = list(means)
if isinstance(variances, int) | isinstance(variances, float):
variances = list(variances)
if ax is None:
fig, ax = plt.subplots(figsize=kwargs.get('figsize', (5, 4)))
for mean, variance in zip(means, variances):
# Plot a historgram
X = norm.rvs(mean, variance, size=1000)
label=r"$\mu={0:.2f} \ ; \ \sigma={1:.2f}$".format(mean, variance)
ax.hist(X, bins=50, density=True, alpha=kwargs.get('alpha', .5), label=label)
# Plot the PDF
x = np.linspace(X.min(), X.max(), 1000)
y = norm.pdf(x, mean, variance)
ax.plot(x, y, ls=kwargs.get('linestyle', '--'), c=kwargs.get('color', 'k'))
ax.legend()
ax.set_ylabel('pdf (-)')
if 'title' in kwargs:
ax.set_title(kwargs['title'])
def plot_fitting(self, **kwargs):
"""It plots the evolution of the paramenters of the mixture components
"""
fig, ax = plt.subplots(nrows=2, sharex=True, figsize=kwargs.get('figsize', (5, 5)))
for c in range(self.n_components):
ax[0].plot(self.means_step[:,c])
ax[1].plot(self.variances_step[:,c])
ax[0].set_ylabel(r"$\mu$", rotation=0)
ax[1].set_ylabel(r"$\sigma$", rotation=0)
ax[1].set_xlabel('iteration')
# declare the Gaussian Mixture Model
gmm = GMM(X, n_components=3)
# fit the model
gmm.fit(plot_intermediate_steps=10)
# plot the evolution of the means and variances
gmm.plot_fitting()
Metadata
Metadata
Assignees
Labels
No labels