A quick tour of tMoE

Introduction

TMoE (t Mixture-of-Experts) provides a flexible and robust modelling framework for heterogeneous data with possibly heavy-tailed distributions and corrupted by atypical observations. TMoE consists of a mixture of K t expert regressors (of degree p) gated by a softmax gating network (of degree q), and is represented by:

$$f(y \mid x; \Psi) = \sum_{k=1}^{K} \pi_k(x; \alpha)\, t_{\nu_k}\!\left(y;\, \mu(x; \beta_k),\, \sigma_k^2\right),$$

where $t_{\nu_k}(y; \mu, \sigma^2)$ is the t density with location $\mu$, scale $\sigma^2$ and $\nu_k$ degrees of freedom, $\mu(x; \beta_k)$ is a polynomial mean function of degree $p$, and the gating weights $\pi_k(x; \alpha)$ are softmax functions of a degree-$q$ polynomial in $x$.
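
As a minimal, illustrative sketch (not the package's internal code), the density above can be evaluated in base R using dt() for the standardized t density and a softmax for the gating weights. The parameter shapes mirror alphak, betak, sigmak and nuk used in the simulation below, assuming (consistent with alphak holding K - 1 coefficient vectors) that the K-th expert is the softmax reference:

# Illustrative tMoE density at a single (x, y); assumes p = q = 1 and that
# alphak stacks the K - 1 gating coefficient vectors (last expert = reference)
tmoe_density <- function(y, x, alphak, betak, sigmak, nuk) {
  xg <- c(1, x)                             # gating covariates (degree q = 1)
  xe <- c(1, x)                             # expert covariates (degree p = 1)
  eta <- c(t(alphak) %*% xg, 0)             # gating linear predictors; reference = 0
  pik <- exp(eta - max(eta)) / sum(exp(eta - max(eta)))  # softmax gating weights
  muk <- as.vector(t(betak) %*% xe)         # expert means (betak is 2 x K)
  fk <- dt((y - muk) / sigmak, df = nuk) / sigmak  # location-scale t densities
  sum(pik * fk)                             # mixture density f(y | x)
}
# e.g. tmoe_density(y = 0, x = 0.5, alphak, betak, sigmak, nuk)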

Model estimation/learning is performed by a dedicated expectation conditional maximization (ECM) algorithm that maximizes the observed-data log-likelihood. We provide simulated examples to illustrate the use of the model in model-based clustering of heterogeneous regression data and in fitting non-linear regression functions.

This vignette was written in R Markdown, using the knitr package for production.

See help(package = "meteorits") for further details, and citation("meteorits") for the package references.

Application to a simulated dataset

Generate sample

library(meteorits)

n <- 500 # Size of the sample
alphak <- matrix(c(0, 8), ncol = 1) # Parameters of the gating network
betak <- matrix(c(0, -2.5, 0, 2.5), ncol = 2) # Regression coefficients of the experts
sigmak <- c(0.5, 0.5) # Standard deviations of the experts
nuk <- c(5, 7) # Degrees of freedom of the experts' t densities
x <- seq.int(from = -1, to = 1, length.out = n) # Inputs (predictors)

# Generate sample of size n
sample <- sampleUnivTMoE(alphak = alphak, betak = betak, sigmak = sigmak, 
                         nuk = nuk, x = x)
y <- sample$y
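
Before fitting, a quick base-R scatter plot of the simulated sample helps check the two regimes and the heavy-tailed noise:

# Visual check of the simulated sample
plot(x, y, pch = 20, col = "grey50",
     xlab = "x", ylab = "y", main = "Simulated tMoE sample")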

Set up tMoE model parameters

K <- 2 # Number of regressors/experts
p <- 1 # Order of the polynomial regression (regressors/experts)
q <- 1 # Order of the logistic regression (gating network)

Set up EM parameters

n_tries <- 1          # Number of runs of the EM algorithm (the best solution is kept)
max_iter <- 1500      # Maximum number of EM iterations
threshold <- 1e-5     # Threshold on the relative change of the log-likelihood
verbose <- TRUE       # Print the log-likelihood at each EM iteration
verbose_IRLS <- FALSE # Print the IRLS criterion values for the gating network updates

Estimation

tmoe <- emTMoE(X = x, Y = y, K = K, p = p, q = q, n_tries = n_tries,
               max_iter = max_iter, threshold = threshold,
               verbose = verbose, verbose_IRLS = verbose_IRLS)
## EM - tMoE: Iteration: 1 | log-likelihood: -661.194087633915
## EM - tMoE: Iteration: 2 | log-likelihood: -641.118034017673
## EM - tMoE: Iteration: 3 | log-likelihood: -631.662905038756
## EM - tMoE: Iteration: 4 | log-likelihood: -626.55856826644
## EM - tMoE: Iteration: 5 | log-likelihood: -623.59061473488
## EM - tMoE: Iteration: 6 | log-likelihood: -621.767501162972
## EM - tMoE: Iteration: 7 | log-likelihood: -620.597410048867
## EM - tMoE: Iteration: 8 | log-likelihood: -619.818223836116
## EM - tMoE: Iteration: 9 | log-likelihood: -619.282550006116
## EM - tMoE: Iteration: 10 | log-likelihood: -618.903927670549
## EM - tMoE: Iteration: 11 | log-likelihood: -618.629816175435
## EM - tMoE: Iteration: 12 | log-likelihood: -618.427276738059
## EM - tMoE: Iteration: 13 | log-likelihood: -618.275055085626
## EM - tMoE: Iteration: 14 | log-likelihood: -618.159054064068
## EM - tMoE: Iteration: 15 | log-likelihood: -618.069672537698
## EM - tMoE: Iteration: 16 | log-likelihood: -618.00020392537
## EM - tMoE: Iteration: 17 | log-likelihood: -617.945851530725
## EM - tMoE: Iteration: 18 | log-likelihood: -617.903110405362
## EM - tMoE: Iteration: 19 | log-likelihood: -617.869371012933
## EM - tMoE: Iteration: 20 | log-likelihood: -617.842659417472
## EM - tMoE: Iteration: 21 | log-likelihood: -617.821462960664
## EM - tMoE: Iteration: 22 | log-likelihood: -617.804610457525
## EM - tMoE: Iteration: 23 | log-likelihood: -617.791187861453
## EM - tMoE: Iteration: 24 | log-likelihood: -617.780477515413
## EM - tMoE: Iteration: 25 | log-likelihood: -617.771913458661
## EM - tMoE: Iteration: 26 | log-likelihood: -617.765047930176
## EM - tMoE: Iteration: 27 | log-likelihood: -617.759525868953

Summary

tmoe$summary()
## -------------------------------------
## Fitted t Mixture-of-Experts model
## -------------------------------------
## 
## tMoE model with K = 2 experts:
## 
##  log-likelihood df       AIC       BIC       ICL
##       -617.7595 10 -627.7595 -648.8326 -648.8644
## 
## Clustering table (Number of observations in each expert):
## 
##   1   2 
## 249 251 
## 
## Regression coefficients:
## 
##     Beta(k = 1) Beta(k = 2)
## 1    0.04114993  0.06536734
## X^1  2.45913316 -2.62908578
## 
## Variances:
## 
##  Sigma2(k = 1) Sigma2(k = 2)
##      0.3610664    0.08309702
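
Note that AIC, BIC and ICL are reported here on the penalized log-likelihood scale (higher is better). The values above are consistent with AIC = L - df and BIC = L - (df/2) log(n), which can be checked directly:

# Check of the reported criteria (penalized log-likelihood scale)
ll <- -617.7595; df <- 10; n <- 500
ll - df              # AIC: -627.7595
ll - df * log(n) / 2 # BIC: -648.8326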

Plots

Mean curve

tmoe$plot(what = "meancurve")

Confidence regions

tmoe$plot(what = "confregions")

Clusters

tmoe$plot(what = "clusters")

Log-likelihood

tmoe$plot(what = "loglikelihood")

Application to a real dataset

Load data

library(MASS)
data("mcycle")
x <- mcycle$times
y <- mcycle$accel
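
The mcycle data record head acceleration (in g) against time (in milliseconds) from a simulated motorcycle crash. A quick plot shows the strongly non-linear trend and heteroskedastic noise that the experts must capture:

# Quick look at the motorcycle data
plot(x, y, pch = 20, col = "grey50",
     xlab = "Time (ms)", ylab = "Acceleration (g)", main = "mcycle data")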

Set up tMoE model parameters

K <- 4 # Number of regressors/experts
p <- 2 # Order of the polynomial regression (regressors/experts)
q <- 1 # Order of the logistic regression (gating network)

Set up EM parameters

n_tries <- 1          # Number of runs of the EM algorithm (the best solution is kept)
max_iter <- 1500      # Maximum number of EM iterations
threshold <- 1e-5     # Threshold on the relative change of the log-likelihood
verbose <- TRUE       # Print the log-likelihood at each EM iteration
verbose_IRLS <- FALSE # Print the IRLS criterion values for the gating network updates

Estimation

tmoe <- emTMoE(X = x, Y = y, K = K, p = p, q = q, n_tries = n_tries,
               max_iter = max_iter, threshold = threshold,
               verbose = verbose, verbose_IRLS = verbose_IRLS)
## EM - tMoE: Iteration: 1 | log-likelihood: -591.942308500413
## EM - tMoE: Iteration: 2 | log-likelihood: -589.372299407737
## EM - tMoE: Iteration: 3 | log-likelihood: -587.794015902611
## EM - tMoE: Iteration: 4 | log-likelihood: -584.455742566615
## EM - tMoE: Iteration: 5 | log-likelihood: -574.789229340777
## EM - tMoE: Iteration: 6 | log-likelihood: -565.983526325089
## EM - tMoE: Iteration: 7 | log-likelihood: -562.099246937107
## EM - tMoE: Iteration: 8 | log-likelihood: -561.173856349954
## EM - tMoE: Iteration: 9 | log-likelihood: -560.583906519414
## EM - tMoE: Iteration: 10 | log-likelihood: -559.93965398455
## EM - tMoE: Iteration: 11 | log-likelihood: -559.231993514021
## EM - tMoE: Iteration: 12 | log-likelihood: -558.476295999767
## EM - tMoE: Iteration: 13 | log-likelihood: -557.693656091345
## EM - tMoE: Iteration: 14 | log-likelihood: -556.911169604265
## EM - tMoE: Iteration: 15 | log-likelihood: -556.161830976135
## EM - tMoE: Iteration: 16 | log-likelihood: -555.483400673225
## EM - tMoE: Iteration: 17 | log-likelihood: -554.913166853056
## EM - tMoE: Iteration: 18 | log-likelihood: -554.472683426026
## EM - tMoE: Iteration: 19 | log-likelihood: -554.157205558304
## EM - tMoE: Iteration: 20 | log-likelihood: -553.943101067019
## EM - tMoE: Iteration: 21 | log-likelihood: -553.802286665864
## EM - tMoE: Iteration: 22 | log-likelihood: -553.711030789358
## EM - tMoE: Iteration: 23 | log-likelihood: -553.652138119479
## EM - tMoE: Iteration: 24 | log-likelihood: -553.614051323505
## EM - tMoE: Iteration: 25 | log-likelihood: -553.589274940433
## EM - tMoE: Iteration: 26 | log-likelihood: -553.573023582867
## EM - tMoE: Iteration: 27 | log-likelihood: -553.562255259559
## EM - tMoE: Iteration: 28 | log-likelihood: -553.555034824713
## EM - tMoE: Iteration: 29 | log-likelihood: -553.550126678261

Summary

tmoe$summary()
## -------------------------------------
## Fitted t Mixture-of-Experts model
## -------------------------------------
## 
## tMoE model with K = 4 experts:
## 
##  log-likelihood df       AIC       BIC       ICL
##       -553.5501 26 -579.5501 -617.1247 -617.1197
## 
## Clustering table (Number of observations in each expert):
## 
##  1  2  3  4 
## 28 37 31 37 
## 
## Regression coefficients:
## 
##     Beta(k = 1) Beta(k = 2)  Beta(k = 3) Beta(k = 4)
## 1   -1.05033423  1015.71543 -2257.104595 292.6633863
## X^1 -0.10061109  -106.49028   143.950922 -12.1687226
## X^2 -0.00878205     2.50297    -2.246349   0.1248834
## 
## Variances:
## 
##  Sigma2(k = 1) Sigma2(k = 2) Sigma2(k = 3) Sigma2(k = 4)
##       1.664661      434.9651      147.2707      571.4554

Plots

Mean curve

tmoe$plot(what = "meancurve")

Confidence regions

tmoe$plot(what = "confregions")

Clusters

tmoe$plot(what = "clusters")

Log-likelihood

tmoe$plot(what = "loglikelihood")