0

mlpack: fitting a K-component GMM with the EM algorithm, initialized from a (K-1)-component GMM fit

#include <time.h>

#include <cstdlib>  // std::atoi
#include <ctime>    // clock(), time()
#include <iostream>

#include <mlpack/core.hpp>

#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/allow_empty_clusters.hpp>
#include <mlpack/methods/kmeans/refined_start.hpp>
#include <mlpack/methods/kmeans/max_variance_new_cluster.hpp>

#include <mlpack/methods/gmm/gmm.hpp>
#include <mlpack/methods/gmm/phi.hpp>

#include <mlpack/methods/gmm/no_constraint.hpp>
#include <mlpack/methods/gmm/positive_definite_constraint.hpp>
#include <mlpack/methods/gmm/diagonal_constraint.hpp>
#include <mlpack/methods/gmm/eigenvalue_ratio_constraint.hpp>

int main(int argc, char* argv[])
{
    srand(time(NULL));

    int numDim = 3;
    int numGaussian = 3;
    int numObservation = 1000;
    if(argc > 1)
    {
        str2scalar(argv[1], numObservation);
    }

////////////////////////////////////////////////////////////
// Synthesize ground truth data.
    mlpack::gmm::GMM<> gmm(numGaussian, numDim);
    for(int n = 0; n < numGaussian; ++n)
    {
        gmm.Means()[n] = 10.0*arma::randu<arma::vec>(numDim);
        gmm.Covariances()[n] = arma::eye<arma::mat>(numDim,numDim) + 0.1*arma::randu<arma::mat>(numDim,numDim);
        mlpack::gmm::PositiveDefiniteConstraint::ApplyConstraint(gmm.Covariances()[n]);
    }
    double minWeight = 0.0;
    while(minWeight < (1.0/(double)(numGaussian*5)) )
    {
        gmm.Weights() = arma::randu<arma::vec>(numDim);
        gmm.Weights() /= arma::accu(gmm.Weights());
        minWeight = gmm.Weights().min();
    }
    arma::mat data(numDim, numObservation);
    for(int n = 0; n < numObservation; ++n)
    {
        data.col(n) = gmm.Random();
    }

////////////////////////////////////////////////////////////
// EM algorithm for GMM fitting without initial guess.
    mlpack::gmm::GMM<> gmmCur(numGaussian, numDim);
    clock_t t = clock();
    gmmCur.Estimate(
        data,
        1
    );
    std::cout << "EM algorithm without initial guess: " << (double)(clock()-t)/CLOCKS_PER_SEC << std::endl;
    std::cout << "  EM fitting result without initial guess:" << std::endl;
    for(int n = 0; n < numGaussian; ++n)
    {
        std::cout << "mean[" << n << "] = " << gmmCur.Means()[n].t();
        std::cout << "cov.[" << n << "] = " << gmmCur.Covariances()[n].t();
        std::cout << "wei.[" << n << "] = " << gmmCur.Weights()[n] << std::endl;
    }

////////////////////////////////////////////////////////////
// EM algorithm for GMM fitting with initial guess.
    // step.0: compute K-1 GMM by EM algorithm
    mlpack::gmm::GMM<> gmmPrev(numGaussian-1, numDim);
    gmmPrev.Estimate(
        data,
        1
    );

    // step.1: compute assignment with known K-1 centroids
    t = clock();
    mlpack::kmeans::KMeans<> kmeans; // No overclustering.
    arma::uvec tmp;
    arma::mat centroidPrev = arma::zeros<arma::mat>(numDim,numGaussian-1);
    arma::Col<size_t> assignmentsPrev;
    arma::Col<size_t> clusterCountsPrev = arma::zeros< arma::Col<size_t> >(numGaussian-1);
    for(int n = 0; n < (numGaussian-1); ++n)
    {
        centroidPrev.col(n) = gmmPrev.Means()[n];
    }
    kmeans.Cluster(
        data,
        numGaussian-1,
        assignmentsPrev,
        centroidPrev,
        false,
        true
    );
    tmp = arma::hist(assignmentsPrev, arma::linspace< arma::Col<size_t> >(0,numGaussian-2,numGaussian-1) );
    for(int n = 0; n < (numGaussian-1); ++n)
    {
        clusterCountsPrev(n) = tmp(n);
    }

    // step.2: find K-th centroid
    arma::mat centroidCur = arma::zeros<arma::mat>(numDim,numGaussian);
    arma::Col<size_t> assignmentsCur;
    arma::Col<size_t> clusterCountsCur = arma::zeros< arma::Col<size_t> >(numGaussian);
    for(int n = 0; n < (numGaussian-1); ++n)
    {
        centroidCur.col(n) = centroidPrev.col(n);
        clusterCountsCur(n) = tmp(n);
    }
    size_t numOfChangedPoints = mlpack::kmeans::MaxVarianceNewCluster::EmptyCluster(
        data,
        numGaussian-1,
        centroidCur,
        clusterCountsCur,
        assignmentsPrev
    );
    for(int m = 0; m < numObservation; ++m)
    {
        if( assignmentsPrev(m) == (numGaussian-1) )
        {
            centroidCur.col(numGaussian-1) = data.col(m);
        }
    }

    // step.3: re-compute assignments with K centroids
    kmeans.Cluster(
        data,
        numGaussian,
        assignmentsCur,
        centroidCur,
        false,
        true
    );
    tmp = arma::hist(assignmentsCur, arma::linspace< arma::Col<size_t> >(0,numGaussian-1,numGaussian) );
    for(int n = 0; n < numGaussian; ++n)
    {
        clusterCountsCur(n) = tmp(n);
    }

    // step.4 computes covariance and weight
    mlpack::gmm::GMM<> gmmCur_(numGaussian, numDim); // 3 dimensions, 4 components.
    gmmCur_.Means() = std::vector<arma::vec>(numGaussian, arma::zeros<arma::vec>(numDim));
    gmmCur_.Covariances() = std::vector<arma::mat>(numGaussian, arma::zeros<arma::mat>(numDim,numDim));
    gmmCur_.Weights() = arma::zeros<arma::vec>(numGaussian);
    for(int n = 0; n < numObservation; ++n)
    {
        gmmCur_.Means()[ assignmentsCur[n] ] += data.col(n);
    }
    for(int n = 0; n < numGaussian; ++n)
    {
        gmmCur_.Means()[n] /= (clusterCountsCur[n] > 1) ? (double)clusterCountsCur[n] : 1.0;
        gmmCur_.Weights()(n) = (double)clusterCountsCur(n) / (double)numObservation;
    }
    size_t cluster;
    arma::vec vecDiff;
    for(int n = 0; n < numObservation; ++n)
    {
        cluster = assignmentsCur[n];
        vecDiff = data.col(n) - gmmCur_.Means()[cluster];
        gmmCur_.Covariances()[cluster] += vecDiff * vecDiff.t();
    }
    for(int n = 0; n < numGaussian; ++n)
    {
        gmmCur_.Covariances()[n] /= (clusterCountsCur[n] >1) ? (double)clusterCountsCur[n] : 1.0;
        mlpack::gmm::PositiveDefiniteConstraint::ApplyConstraint(gmmCur_.Covariances()[n]);
    }

    // step.5 EM algorithm with known means, covariances, and weights
    gmmCur_.Estimate(
        data,
        1,
        true
    );

    std::cout << "EM algorithm with initial guess: " << (double)(clock()-t)/CLOCKS_PER_SEC << std::endl;
    std::cout << "  EM fitting result with initial guess:" << std::endl;
    for(int n = 0; n < numGaussian; ++n)
    {
        std::cout << "mean[" << n << "] = " << gmmCur_.Means()[n].t();
        std::cout << "cov.[" << n << "] = " << gmmCur_.Covariances()[n].t();
        std::cout << "wei.[" << n << "] = " << gmmCur_.Weights()[n] << std::endl;
    }

    std::cout << "  Ground truth:" << std::endl;
    for(int n = 0; n < numGaussian; ++n)
    {
        std::cout << "mean[" << n << "] = " << gmm.Means()[n].t();
        std::cout << "cov.[" << n << "] = " << gmm.Covariances()[n].t();
        std::cout << "wei.[" << n << "] = " << gmm.Weights()[n] << std::endl;
    }

    return 0;
}
0

mlpack: generating random observations from a GMM

// Synthesize observations by sampling from a randomly parameterized GMM.
int numGauss = 3;
int numDim = 3;
int numObs = 1000;
mlpack::gmm::GMM<> gmm(numGauss,numDim);
arma::mat covar(numDim, numDim);

// GenerateWeight: user-defined helper; presumably returns a normalized
// weight vector whose minimum entry is at least 1/(2*numGauss) -- TODO confirm.
gmm.Weights() = GenerateWeight(numGauss, 1.0/((double)numGauss*2.0));
// set k-th component's mean and covariance
for(int k = 0; k < numGauss; ++k)
{
    // Fixed: template brackets were HTML-escaped ("&amp;lt; ... &amp;gt;").
    gmm.Means()[k] = 255.0*arma::randu<arma::vec>(numDim);
    covar = arma::eye<arma::mat>(numDim, numDim);
    covar += 0.2*arma::randu<arma::mat>(numDim, numDim);
    // symmatu() symmetrizes the perturbed matrix so it is a valid covariance.
    gmm.Covariances()[k] = arma::symmatu(covar);
}
// Fixed: armaData was assigned without a declaration.
arma::mat armaData = arma::zeros(numDim, numObs);
for(int i = 0; i < numObs; ++i)
{
    armaData.col(i) = gmm.Random();
}
0

mlpack: GMM fitting

mlpack::gmm::GMM::Estimate() estimates the probability distribution directly from the given observations, using the algorithm in the FittingType class to fit the data. The fitting is performed `trials` times, and the model with the greatest log-likelihood among these trials is selected. By default, only one trial is performed. The log-likelihood of the best fit is returned.

template<typename FittingType = EMFit<>>
double mlpack::gmm::GMM< FittingType >::Estimate(
 	const arma::mat &  	observations,   // Observations of the model.
	const size_t trials = 1,            // Number of trials to perform; the model in these trials with the greatest log-likelihood will be selected.
	const bool useExistingModel = false	// If true, the existing model is used as an initial model for the estimation.  
)

A second overload estimates the distribution while taking into account the probability of each observation actually being from this distribution:

template<typename FittingType = EMFit<>>
double mlpack::gmm::GMM< FittingType >::Estimate(
 	const arma::mat &  	observations,   // Observations of the model.
	const arma::vec &  	probabilities,  // Probability of each observation being from this distribution.
	const size_t trials = 1,            // Number of trials to perform; the model in these trials with the greatest log-likelihood will be selected.
	const bool useExistingModel = false // If true, the existing model is used as an initial model for the estimation.	 
) 	
0

mlpack: create Gaussian Mixture Model

mlpack::gmm::GMM creates a Gaussian Mixture Model with different constructors.

Create an empty Gaussian Mixture Model, with zero gaussians.

template<typename FittingType = EMFit<>>
mlpack::gmm::GMM< FittingType >::GMM()

Create a GMM with the given number of Gaussians, each of which have the specified dimensionality.

template<typename FittingType = EMFit<>>
mlpack::gmm::GMM< FittingType >::GMM(
const size_t      gaussians,     // Number of Gaussians in this GMM.
const size_t      dimensionality // Dimensionality of each Gaussian.
)
// Example usage
// create a GMM with 3 6D Gaussians
size_t gaussians = 3;
size_t dimensionality = 6;
mlpack::gmm::GMM<> gmm(gaussians,dimensionality);

Create a GMM with the given number of Gaussians, each of which have the specified dimensionality.
Also, pass in an initialized FittingType class; this is useful in cases where the FittingType class needs to store some state.

template<typename FittingType = EMFit<>>
mlpack::gmm::GMM< FittingType >::GMM(
const size_t      gaussians,      // Number of Gaussians in this GMM.
const size_t      dimensionality, // Dimensionality of each Gaussian.
FittingType &      fitter            // Initialized fitting mechanism.
)

Create a GMM with the given means, covariances, and weights.

template<typename FittingType = EMFit<>>
mlpack::gmm::GMM< FittingType >::GMM(
const std::vector< arma::vec > & means,       // Means of the model.
const std::vector< arma::mat > & covariances, // Covariances of the model.
const arma::vec &                   weights      // Weights of the model.
)

Create a GMM with the given means, covariances, and weights, and use the given initialized FittingType class.
This is useful in cases where the FittingType class needs to store some state.

template<typename FittingType = EMFit<>>
mlpack::gmm::GMM< FittingType >::GMM(
const std::vector< arma::vec > & means,       // Means of the model.
const std::vector< arma::mat > & covariances, // Covariances of the model.
const arma::vec &                   weights,     // Weights of the model.
FittingType &                       fitter
)
0

mlpack: how to compute Gaussian probability density function

mlpack::gmm::phi() computes univariate/multivariate Gaussian probability density functions.

probability of a univariate Gaussian.
double mlpack::gmm::phi(
    const double x,    // Observation.
    const double mean, // Mean of univariate Gaussian.
    const double var   // Variance of univariate Gaussian.
) // returns Probability of x being observed from the given univariate Gaussian.
// Example usage
    double x, mean, var;
    ....
    double f = phi(x, mean, var);

probability of a multivariate Gaussian.

double mlpack::gmm::phi(
     const arma::vec &      x,    // Observation.
    const arma::vec &      mean, // Mean of multivariate Gaussian.
    const arma::mat &      cov      // Covariance of multivariate Gaussian.
) // returns Probability of x being observed from the given multivariate Gaussian.
// Example usage
    arma::vec x, mean;
    arma::mat cov;
    ....
    double f = phi(x, mean, cov);

a set of probabilities of a multivariate Gaussian.

void mlpack::gmm::phi(
     const arma::mat &      x,            // List of observations.
    const arma::vec &      mean,         // Mean of multivariate Gaussian.
    const arma::mat &      cov,          // Covariance of multivariate Gaussian.
    arma::vec &      probabilities    // Output probabilities for each input observation.
) // Calculates the multivariate Gaussian probability density function for each data point (column) in the given matrix, with respect to the given mean and variance.

probability of a multivariate Gaussian and its gradients

double mlpack::gmm::phi(
     const arma::vec &      x,    // Observation.
    const arma::vec &      mean, // Mean of multivariate Gaussian.
    const arma::mat &      cov,  // Covariance of multivariate Gaussian.
    const std::vector< arma::mat > & d_cov,  //
    arma::vec &                         g_mean, // gradients w.r.t. the mean
    arma::vec &                         g_cov     // gradients w.r.t. the covariance
) // Calculates the multivariate Gaussian probability density function and also the gradients with respect to the mean and the variance.
// Example usage
    arma::vec x, mean, g_mean, g_cov;
    std::vector<arma::mat> d_cov; // the dSigma
    ....
    double f = phi(x, mean, cov, d_cov, g_mean, g_cov); // gradients are taken by reference, not pointer
0

mlpack installation on Ubuntu 12.04

mlpack is a C++ machine learning library developed by the Fundamental Algorithmic and Statistical Tools Laboratory (FASTLab) at Georgia Tech. The machine learning methods implemented in mlpack are the following:

mlpack requires

0. Prerequisites
0.1. BLAS

  1. sudo apt-get install libblas-dev

0.2. LAPACK
Armadillo requires LAPACK to be compiled with gfortran.
#Lapack with gfortran

  1. wget http://www.netlib.org/lapack/lapack-3.5.0.tgz
  2. tar xzf lapack-3.5.0.tgz
  3. cd lapack-3.5.0/
  4. cmake-gui .
  5. # set CMAKE_Fortran_COMPILER = /usr/bin/gfortran for Armadillo
  6. # set CMAKE_Fortran_FLAGS = -fPIC
  7. make
  8. sudo make install

0.3 Armadillo >= 3.6.0 (with LAPACK support)

  1. wget http://downloads.sourceforge.net/project/arma/armadillo-3.930.2.tar.gz
  2. tar xzf armadillo-3.930.2.tar.gz
  3. cd armadillo-3.930.2
  4. cmake-gui .
  5. make
  6. sudo make install

0.4 LibXML2 >= 2.6.0

  1. sudo apt-get -yV install libxml2-dev

1. MLpack

  1. sudo apt-get -yV install txt2man
  2. wget http://www.mlpack.org/files/mlpack-1.0.7.tar.gz
  3. tar xzf mlpack-1.0.7.tar.gz
  4. mkdir mlpack-1.0.7-build
  5. cd mlpack-1.0.7-build
  6. cmake-gui ../mlpack-1.0.7
  7. make
  8. sudo make install
  9. sudo ldconfig