Full R-Code for:
Maier, M. J. (2014). DirichletReg: Dirichlet Regression for Compositional Data in R. Research Report Series/Department of Statistics and Mathematics, 125. WU Vienna University of Economics and Business, Vienna. https://epub.wu.ac.at/4077/
library("DirichletReg")
head(ArcticLake)
## sand silt clay depth
## 1 0.775 0.195 0.030 10.4
## 2 0.719 0.249 0.032 11.7
## 3 0.507 0.361 0.132 12.8
## 4 0.522 0.409 0.066 13.0
## 5 0.700 0.265 0.035 15.7
## 6 0.665 0.322 0.013 16.3
AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced
AL[1:6, ]
## sand silt clay
## 1 0.7750000 0.1950000 0.0300000
## 2 0.7190000 0.2490000 0.0320000
## 3 0.5070000 0.3610000 0.1320000
## 4 0.5235707 0.4102307 0.0661986
## 5 0.7000000 0.2650000 0.0350000
## 6 0.6650000 0.3220000 0.0130000
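The warning is worth understanding: DR_data() rescales every row that does not sum exactly to one. Row 4, whose raw values sum to 0.997, is the only row above that changes. A minimal sketch of the same normalization done by hand:
sum(ArcticLake[4, 1:3])  # 0.997, not exactly 1
ArcticLake[4, 1:3]/sum(ArcticLake[4, 1:3])  # 0.5235707 0.4102307 0.0661986, as in AL above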
Code for Fig. 1 (left):
plot(AL, cex = 0.5, a2d = list(colored = FALSE, c.grid = FALSE))
Figure 1 (left): Arctic lake: Ternary plot.
Code for Fig. 1 (right):
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875",
"#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1)
Figure 1 (right): Arctic lake: Depth vs. composition.
lake1 <- DirichReg(AL ~ depth, ArcticLake)
lake1
## Call:
## DirichReg(formula = AL ~ depth, data = ArcticLake)
## using the common parametrization
##
## Log-likelihood: 101.4 on 6 df (104 BFGS + 1 NR Iterations)
##
## -----------------------------------------
## Coefficients for variable no. 1: sand
## (Intercept) depth
## 0.11662 0.02335
## -----------------------------------------
## Coefficients for variable no. 2: silt
## (Intercept) depth
## -0.31060 0.05557
## -----------------------------------------
## Coefficients for variable no. 3: clay
## (Intercept) depth
## -1.1520 0.0643
## -----------------------------------------
coef(lake1)
## $sand
## (Intercept) depth
## 0.11662480 0.02335114
##
## $silt
## (Intercept) depth
## -0.31059591 0.05556745
##
## $clay
## (Intercept) depth
## -1.15195642 0.06430175
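Under the common parametrization with log links, each alpha is alpha_c = exp(b0_c + b1_c * depth), and the expected proportion of component c is alpha_c divided by the total alpha_0. A sketch of the arithmetic (depth = 50 is an arbitrary illustrative value, not taken from the paper):
b <- coef(lake1)
a <- sapply(b, function(cf) exp(cf["(Intercept)"] + cf["depth"] * 50))  # alpha_c at depth 50
a/sum(a)  # expected proportions; should agree with predict(lake1, data.frame(depth = 50))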
lake2 <- update(lake1, . ~ . + I(depth^2) | . + I(depth^2) | . + I(depth^2))
anova(lake1, lake2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = AL ~ depth, data = ArcticLake)
## Model 2: DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) |
## depth + I(depth^2), data = ArcticLake)
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -202.74 6
## Model 2 -217.99 9 15.254 3 0.001612 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
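anova() performs a likelihood-ratio test here: the statistic is the deviance difference, compared to a chi-squared distribution with 9 - 6 = 3 extra parameters. The reported p-value can be reproduced directly:
pchisq(-202.74 - (-217.99), df = 3, lower.tail = FALSE)  # ~0.0016, as reported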
summary(lake2)
## Call:
## DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) | depth +
## I(depth^2), data = ArcticLake)
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## sand -1.7647 -0.7080 -0.1786 0.9598 3.0460
## silt -1.1379 -0.5330 -0.1546 0.2788 1.5604
## clay -1.7661 -0.6583 -0.0454 0.6584 2.0152
##
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 1: sand
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4361967 0.8026814 1.789 0.0736 .
## depth -0.0072382 0.0329433 -0.220 0.8261
## I(depth^2) 0.0001324 0.0002761 0.480 0.6315
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 2: silt
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0259705 0.7598827 -0.034 0.9727
## depth 0.0717450 0.0343089 2.091 0.0365 *
## I(depth^2) -0.0002679 0.0003088 -0.867 0.3857
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 3: clay
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7931487 0.7362293 -2.436 0.01487 *
## depth 0.1107906 0.0357705 3.097 0.00195 **
## I(depth^2) -0.0004872 0.0003308 -1.473 0.14079
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 109 on 9 df (162 BFGS + 2 NR Iterations)
## AIC: -200, BIC: -185
## Number of Observations: 39
## Link: Log
## Parametrization: common
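The information criteria in the summary follow the usual definitions, AIC = -2 logLik + 2k and BIC = -2 logLik + log(n) k, with k = 9 parameters and n = 39 observations. A quick check against the printed values:
-2 * 109 + 2 * 9        # AIC: -200
-2 * 109 + log(39) * 9  # BIC: about -185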
Code for Fig. 2:
par(mar = c(4, 4, 4, 4) + 0.1)
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875",
"#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1,
main = "Sediment Composition in an Arctic Lake")
Xnew <- data.frame(depth = seq(min(ArcticLake$depth), max(ArcticLake$depth), length.out = 100))
for (i in 1:3) lines(cbind(Xnew, predict(lake2, Xnew)[, i]), col = c("#E495A5", "#86B875",
    "#7DB0DD")[i], lwd = 2)
"#7DB0DD")[i], lwd = 2)
legend("topleft", legend = c("Sand", "Silt", "Clay"), lwd = 2, col = c("#E495A5",
"#86B875", "#7DB0DD"), pt.bg = c("#E495A5", "#86B875", "#7DB0DD"), pch = 21,
bty = "n")
par(new = TRUE)
plot(cbind(Xnew, predict(lake2, Xnew, F, F, T)), lty = "24", type = "l", ylim = c(0,
max(predict(lake2, Xnew, F, F, T))), axes = F, ann = F, lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("top", legend = c(expression(hat(mu[c] == hat(alpha)[c]/hat(alpha)[0])), expression(hat(phi) ==
hat(alpha)[0])), lty = c(1, 2), lwd = c(3, 2), bty = "n")
Figure 2: Arctic lake: Fitted values of the quadratic model.
AL <- ArcticLake
AL$AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced
dd <- range(ArcticLake$depth)
X <- data.frame(depth = seq(dd[1], dd[2], length.out = 200))
pp <- predict(DirichReg(AL ~ depth + I(depth^2), AL), X)
Code for Fig. 3:
plot(AL$AL, cex = 0.1, reset_par = FALSE)
points(toSimplex(AL$AL), pch = 16, cex = 0.5, col = gray(0.5))
lines(toSimplex(pp), lwd = 3, col = c("#6E1D34", "#004E42")[2])
Dols <- log(cbind(ArcticLake[, 2]/ArcticLake[, 1], ArcticLake[, 3]/ArcticLake[, 1]))
ols <- lm(Dols ~ depth + I(depth^2), ArcticLake)
p2 <- predict(ols, X)
p2m <- exp(cbind(0, p2[, 1], p2[, 2]))/rowSums(exp(cbind(0, p2[, 1], p2[, 2])))
lines(toSimplex(p2m), lwd = 3, col = c("#6E1D34", "#004E42")[1], lty = "21")
Figure 3: Arctic lake: OLS (dashed) vs. Dirichlet regression (solid) predictions.
Bld <- BloodSamples
Bld$Smp <- DR_data(Bld[, 1:4])
## Warning in DR_data(Bld[, 1:4]): not all rows sum up to 1 => normalization forced
blood1 <- DirichReg(Smp ~ Disease | 1, Bld, model = "alternative", base = 3)
blood2 <- DirichReg(Smp ~ Disease | Disease, Bld, model = "alternative", base = 3)
anova(blood1, blood2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = Smp ~ Disease | 1, data = Bld, model =
## "alternative", base = 3)
## Model 2: DirichReg(formula = Smp ~ Disease | Disease, data = Bld, model =
## "alternative", base = 3)
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -303.86 7
## Model 2 -304.61 8 0.7587 1 0.3837
summary(blood1)
## Call:
## DirichReg(formula = Smp ~ Disease | 1, data = Bld, model = "alternative", base
## = 3)
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## Albumin -2.1310 -0.9307 -0.1234 0.8149 2.8429
## Pre.Albumin -1.0687 -0.4054 -0.0789 0.1947 1.5691
## Globulin.A -2.0503 -1.0392 0.1938 0.7927 2.2393
## Globulin.B -1.8176 -0.5347 0.1488 0.5115 1.3284
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: Albumin
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.11639 0.09935 11.237 <2e-16 ***
## DiseaseB -0.07002 0.13604 -0.515 0.607
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Pre.Albumin
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5490 0.1082 5.076 3.86e-07 ***
## DiseaseB -0.1276 0.1493 -0.855 0.393
## ------------------------------------------------------------------
## Coefficients for variable no. 3: Globulin.A
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Globulin.B
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4863 0.1094 4.445 8.8e-06 ***
## DiseaseB 0.1819 0.1472 1.236 0.216
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.2227 0.1475 28.64 <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 151.9 on 7 df (44 BFGS + 1 NR Iterations)
## AIC: -289.9, BIC: -280
## Number of Observations: 30
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
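In the alternative parametrization the means follow a multinomial-logit link with the base category (Globulin.A, base = 3) fixed at zero, and the precision phi uses a log link. A sketch recovering the fitted composition for disease group A from the intercepts printed above (group B would add the DiseaseB coefficients to each linear predictor):
eta <- c(Albumin = 1.11639, Pre.Albumin = 0.5490, Globulin.A = 0, Globulin.B = 0.4863)
exp(eta)/sum(exp(eta))  # fitted mean composition for group A (Globulin.A is the reference)
exp(4.2227)             # fitted precision phi (log link), about 68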
Code for Fig. 4:
par(mfrow = c(1, 4), mar = c(4, 4, 4, 2) + 0.25)
for (i in 1:4) {
boxplot(Bld$Smp[, i] ~ Bld$Disease, ylim = range(Bld$Smp[, 1:4]), main = paste(names(Bld)[i]),
xlab = "Disease Type", ylab = "Proportion")
segments(c(-5, 1.5), unique(fitted(blood2)[, i]), c(1.5, 5), unique(fitted(blood2)[,
    i]), lwd = 2, lty = 2)
}
Figure 4: Blood samples: Box plots and fitted values (dashed lines indicate the fitted values for each group).
alpha <- predict(blood2, data.frame(Disease = factor(c("A", "B"))), F, T, F)
L <- sapply(1:2, function(i) ddirichlet(DR_data(Bld[31:36, 1:4]), unlist(alpha[i,
    ])))
## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced
## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced
LP <- L/rowSums(L)
dimnames(LP) <- list(paste("C", 1:6), c("A", "B"))
print(data.frame(round(LP * 100, 1), pred. = as.factor(ifelse(LP[, 1] > LP[, 2],
"==> A", "==> B"))), print.gap = 2)
## A B pred.
## C 1 59.4 40.6 ==> A
## C 2 43.2 56.8 ==> B
## C 3 38.4 61.6 ==> B
## C 4 43.8 56.2 ==> B
## C 5 36.6 63.4 ==> B
## C 6 70.2 29.8 ==> A
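Normalizing the Dirichlet likelihoods row-wise, as in LP <- L/rowSums(L), amounts to Bayes' rule with equal prior probabilities for the two diseases. If unequal base rates were warranted, one would weight the likelihood columns before normalizing; a sketch (the 50/50 prior below simply reproduces LP):
prior <- c(A = 0.5, B = 0.5)     # assumed equal priors; adjust if base rates differ
post <- sweep(L, 2, prior, "*")  # column-wise weighting of the likelihoods
post/rowSums(post)               # posterior class probabilities per sample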
Code for Fig. 5:
B2 <- DR_data(BloodSamples[, c(1, 2, 4)])
## Warning in DR_data(BloodSamples[, c(1, 2, 4)]): not all rows sum up to 1 =>
## normalization forced
plot(B2, cex = 0.001, reset_par = FALSE)
div.col <- colorRampPalette(c("#023FA5", "#c0c0c0", "#8E063B"))(100)
# expected values
temp <- (alpha/rowSums(alpha))[, c(1, 2, 4)]
points(toSimplex(temp/rowSums(temp)), pch = 22, bg = div.col[c(1, 100)], cex = 2,
lwd = 0.25)
# known values
temp <- B2[1:30, ]
points(toSimplex(temp/rowSums(temp)), pch = 21, bg = (div.col[c(1, 100)])[BloodSamples$Disease[1:30]],
cex = 0.5, lwd = 0.25)
# unclassified
temp <- B2[31:36, ]
points(toSimplex(temp/rowSums(temp)), pch = 21, bg = div.col[round(100 * LP[, 2],
0)], cex = 1, lwd = 0.5)
legend("topright", bty = "n", legend = c("Disease A", "Disease B", NA, "Expected Values"),
pch = c(21, 21, NA, 22), pt.bg = c(div.col[c(1, 100)], NA, "white"))
Figure 5: Blood samples: Observed values and predictions.
RS <- ReadingSkills
RS$acc <- DR_data(RS$accuracy)
## only one variable in [0, 1] supplied - beta-distribution assumed.
## check this assumption.
RS$dyslexia <- C(RS$dyslexia, treatment)
rs1 <- DirichReg(acc ~ dyslexia * iq | dyslexia * iq, RS, model = "alternative")
rs2 <- DirichReg(acc ~ dyslexia * iq | dyslexia + iq, RS, model = "alternative")
anova(rs1, rs2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = acc ~ dyslexia * iq | dyslexia * iq, data = RS,
## model = "alternative")
## Model 2: DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS,
## model = "alternative")
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -133.47 8
## Model 2 -131.80 7 1.6645 1 0.197
Code for Fig. 6:
g.ind <- as.numeric(RS$dyslexia)
g1 <- g.ind == 1  # normal
g2 <- g.ind != 1  # dyslexia
par(mar = c(4, 4, 4, 4) + 0.25)
plot(accuracy ~ iq, RS, pch = 21, bg = c("#E495A5", "#39BEB1")[3 - g.ind], cex = 1.5,
main = "Dyslexic (Red) vs. Control (Green) Group", xlab = "IQ Score", ylab = "Reading Accuracy",
xlim = range(ReadingSkills$iq))
x1 <- seq(min(RS$iq[g1]), max(RS$iq[g1]), length.out = 200)
x2 <- seq(min(RS$iq[g2]), max(RS$iq[g2]), length.out = 200)
n <- length(x1)
X <- data.frame(dyslexia = factor(rep(0:1, each = n), levels = 0:1, labels = c("no",
    "yes")), iq = c(x1, x2))
pv <- predict(rs2, X, TRUE, TRUE, TRUE)
lines(x1, pv$mu[1:n, 2], col = c("#E495A5", "#39BEB1")[2], lwd = 3)
lines(x2, pv$mu[(n + 1):(2 * n), 2], col = c("#E495A5", "#39BEB1")[1], lwd = 3)
a <- RS$accuracy
logRa_a <- log(a/(1 - a))
rlr <- lm(logRa_a ~ dyslexia * iq, RS)
ols <- 1/(1 + exp(-predict(rlr, X)))
lines(x1, ols[1:n], col = c("#AD6071", "#00897D")[2], lwd = 3, lty = 2)
lines(x2, ols[(n + 1):(2 * n)], col = c("#AD6071", "#00897D")[1], lwd = 3, lty = 2)
### precision plot
par(new = TRUE)
plot(x1, pv$phi[1:n], col = c("#6E1D34", "#004E42")[2], lty = "11", type = "l", ylim = c(0,
max(pv$phi)), axes = F, ann = F, lwd = 2, xlim = range(RS$iq))
lines(x2, pv$phi[(n + 1):(2 * n)], col = c("#6E1D34", "#004E42")[1], lty = "11",
type = "l", lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("topleft", legend = c(expression(hat(mu)), expression(hat(phi)), "OLS"), lty = c(1,
3, 2), lwd = c(3, 2, 3), bty = "n")
Figure 6: Reading skills: Predicted values of Dirichlet regression and OLS regression.
a <- RS$accuracy
logRa_a <- log(a/(1 - a))
rlr <- lm(logRa_a ~ dyslexia * iq, RS)
summary(rlr)
##
## Call:
## lm(formula = logRa_a ~ dyslexia * iq, data = RS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.66405 -0.37966 0.03687 0.40887 2.50345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8067 0.2822 9.944 2.27e-12 ***
## dyslexiayes -2.4113 0.4517 -5.338 4.01e-06 ***
## iq 0.7823 0.2992 2.615 0.0125 *
## dyslexiayes:iq -0.8457 0.4510 -1.875 0.0681 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 40 degrees of freedom
## Multiple R-squared: 0.6151, Adjusted R-squared: 0.5862
## F-statistic: 21.31 on 3 and 40 DF, p-value: 2.083e-08
summary(rs2)
## Call:
## DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS, model =
## "alternative")
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## 1 - accuracy -1.5661 -0.8204 -0.5112 0.5211 3.4334
## accuracy -3.4334 -0.5211 0.5112 0.8204 1.5661
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: 1 - accuracy
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: accuracy
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8649 0.2991 6.235 4.52e-10 ***
## dyslexiayes -1.4833 0.3029 -4.897 9.74e-07 ***
## iq 1.0676 0.3359 3.178 0.001482 **
## dyslexiayes:iq -1.1625 0.3452 -3.368 0.000757 ***
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5579 0.3336 4.670 3.01e-06 ***
## dyslexiayes 3.4931 0.5880 5.941 2.83e-09 ***
## iq 1.2291 0.4596 2.674 0.00749 **
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 65.9 on 7 df (56 BFGS + 2 NR Iterations)
## AIC: -117.8, BIC: -105.3
## Number of Observations: 44
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
confint(rs2)
##
## 95% Confidence Intervals (original form)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
## variable omitted
##
## Variable: accuracy
## 2.5% Est. 97.5%
## (Intercept) 1.279 1.86 2.451
## dyslexiayes -2.077 -1.48 -0.890
## iq 0.409 1.07 1.726
## dyslexiayes:iq -1.839 -1.16 -0.486
##
## - Gamma-Parameters
## 2.5% Est. 97.5%
## (Intercept) 0.904 1.56 2.21
## dyslexiayes 2.341 3.49 4.65
## iq 0.328 1.23 2.13
confint(rs2, exp = TRUE)
##
## 95% Confidence Intervals (exponentiated)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
## variable omitted
##
## Variable: accuracy
## 2.5% exp(Est.) 97.5%
## (Intercept) 3.592 6.455 11.601
## dyslexiayes 0.125 0.227 0.411
## iq 1.506 2.908 5.618
## dyslexiayes:iq 0.159 0.313 0.615
##
## - Gamma-Parameters
## 2.5% exp(Est.) 97.5%
## (Intercept) 2.47 4.75 9.13
## dyslexiayes 10.39 32.89 104.12
## iq 1.39 3.42 8.41
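The exponentiated intervals are simply exp() applied to the original ones; with the logit and log links, exp(coefficient) reads as a multiplicative effect (on the odds of the mean component, or on phi in the precision model). For example, the dyslexiayes row of the beta coefficients:
exp(c(-2.077, -1.48, -0.890))  # 0.125 0.227 0.411, matching the table above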
Code for Fig. 7:
c("#E495A5", "#39BEB1")[3 - as.numeric(RS$dyslexia)]
gcol <- c(-3, 3)
tmt <-
par(mfrow = c(3, 2), cex = 0.8)
qqnorm(residuals(rlr, "pearson"), ylim = tmt, xlim = tmt, pch = 21, bg = gcol, main = "Normal Q-Q-Plot: OLS Residuals",
cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rlr, "pearson"), lty = 2)
qqnorm(residuals(rs2, "standardized")[, 2], ylim = tmt, xlim = tmt, pch = 21, bg = gcol,
main = "Normal Q-Q-Plot: DirichReg Residuals", cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rs2, "standardized")[, 2], lty = 2)
plot(ReadingSkills$iq, residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3,
3), main = "OLS Residuals", xlab = "IQ", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(ReadingSkills$iq, residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "IQ", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rlr), residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3, 3),
main = "OLS Residuals", xlab = "Fitted", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rs2)[, 2], residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "Fitted", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)
Figure 7: Reading skills: Residual plots of OLS and Dirichlet regression models.