Introduction
In this vignette we explain how some functions of the package can be used to estimate a contingency table. We work on the eusilc
dataset of the laeken
package. All the functions presented below are explained in the manuscript by Raphaël Jauslin and Yves Tillé (2021) <arXiv:2105.08379>.
Contingency table
We will estimate the contingency table obtained when the factor variable representing the economic status, pl030,
is crossed with a discretized version of the equivalized household income, eqIncome.
To discretize the equivalized income, we compute the quantiles (0.15, 0.30, 0.45, 0.60, 0.75, 0.90) of the variable and define the categories as the intervals between these values (an alternative discretization using cut() is sketched at the end of this section).
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)

# Xm are the matching variables and id are the identifiers of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorical income split at the percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))

c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(eusilc$eqIncome > q[7])] <- "(90,100]"

# variables of interest
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
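As a side note, the same discretization could be written more compactly with the base R function cut(); the following is only a sketch and is not part of the original code.
# minimal sketch: quantile-based discretization with cut()
# c_income_alt is an illustrative name, not used in the rest of the vignette
c_income_alt <- cut(eusilc$eqIncome,
                    breaks = c(-Inf, q[2:7], Inf),
                    labels = c("(0,15]","(15,30]","(30,45]","(45,60]",
                               "(60,75]","(75,90]","(90,100]"))
# should agree with the step-by-step recoding of c_income above
table(c_income_alt, c_income)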
Sampling schemes
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities (a short sanity check follows the code below).
# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)

# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]

# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)

# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]

# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2

# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)

# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))

Y1_dis <- Y_dis[s1 == 1,]
Z2_dis <- Z_dis[s2 == 1,]
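As a small sanity check (not in the original vignette), we can verify that these design weights are the inverses of the simple random sampling inclusion probabilities n/N.
# inclusion probabilities under simple random sampling without replacement
# pik1 and pik2 are illustrative names
pik1 <- rep(n1/N, N)
pik2 <- rep(n2/N, N)

# the design weights are their inverses on the selected units (TRUE expected)
all.equal(d1, 1/pik1[s1 == 1])
all.equal(d2, 1/pik2[s2 == 1])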
Harmonization
Then the harmonization step must be performed. The harmonize
function returns the harmonized weights. If the true population totals happen to be known, they can be used instead of the estimates computed within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)

# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
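As a quick check, not part of the original vignette, the harmonized weights of the two samples should reproduce the same totals for the matching variables, which is what the output above illustrates.
# both harmonized samples should yield identical estimated totals (TRUE expected)
all.equal(colSums(w1*X1), colSums(w2*X2))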
Optimal transport matching
The statistical matching is done by using the otmatch
function. The contingency table is then estimated by extracting the id1
units (respectively the id2
units) and by using the function tapply
with the correct weights (an alternative using xtabs is sketched after the output below).
# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 1 2002 2002 12.3693242
#> 2403 2403 18503 6.6549822
#> 2403.1 2403 391904 7.0634999
#> 2404 2404 256903 0.7315227
#> 2404.1 2404 286302 12.9999855
#> 2505 2505 361005 12.9288819
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 590.712 842.961 691.578 913.959 642.791 763.596 579.974 5025.572
#> 2 92.120 172.447 177.192 178.544 156.887 130.758 94.599 1002.547
#> 3 44.705 49.082 72.319 99.973 59.875 98.070 77.787 501.812
#> 4 157.860 141.832 135.632 145.374 147.062 98.797 38.622 865.179
#> 5 490.223 391.945 514.436 493.816 489.049 484.409 491.583 3355.461
#> 6 21.357 68.943 20.831 10.583 23.219 9.848 21.952 176.734
#> 7 139.597 210.281 226.080 256.982 184.846 74.058 87.854 1179.696
#> Sum 1536.574 1877.491 1838.068 2099.231 1703.730 1659.536 1392.370 12107.000
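As a side note (not part of the original code), the same weighted estimate can be obtained in one call with the base R function xtabs(), which fills empty cells with 0 directly.
# weighted cross-tabulation of the matched categories
# YZ_ot_alt is an illustrative name
YZ_ot_alt <- xtabs(object$weight ~ Y1_ot$y + Z2_ot$z)
round(addmargins(YZ_ot_alt),3)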
Balanced sampling
As you can see from the previous section, the optimal transport result is generally not a one-to-one match: a unit of sample 1 can be matched with more than one unit of sample 2 having a non-zero weight. The bsmatch
function creates a one-to-one match by selecting a stratified balanced sample, yielding a data.frame in which each unit of sample 1 has exactly one imputed unit from sample 2 (a small check is given at the end of this section).
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 1 2002 2002 12.3693242
#> 2403 2403 18503 6.6549822
#> 2404 2404 256903 0.7315227
#> 2505 2505 361005 12.9288819
#> 3901 3901 474502 13.1517446
#> 4101.1 4101 138501 1.6895103
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 621.099 800.005 660.437 889.889 647.441 788.650 618.051 5025.572
#> 2 100.353 158.434 172.335 184.415 166.926 119.849 100.235 1002.547
#> 3 61.437 62.601 86.312 93.052 37.187 99.268 61.955 501.812
#> 4 149.533 114.650 138.389 165.151 123.416 108.776 65.264 865.179
#> 5 494.525 355.216 494.427 490.951 542.947 493.790 483.605 3355.461
#> 6 23.058 68.806 25.641 12.413 24.311 10.763 11.743 176.734
#> 7 145.609 200.086 237.068 267.377 188.187 60.880 80.490 1179.696
#> Sum 1595.614 1759.797 1814.609 2103.248 1730.415 1681.976 1421.342 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 651.249 862.018 647.975 885.562 679.848 772.688 526.232 5025.572
#> 2 87.458 170.447 182.069 171.690 157.381 120.034 113.468 1002.547
#> 3 48.814 24.211 72.143 107.549 61.131 101.476 86.488 501.812
#> 4 137.241 164.263 137.980 139.333 125.073 136.224 25.065 865.179
#> 5 455.864 388.036 536.370 525.841 471.763 469.060 508.527 3355.461
#> 6 23.058 68.806 25.641 0.000 23.053 0.000 36.176 176.734
#> 7 133.574 212.685 230.697 267.377 193.994 60.880 80.490 1179.696
#> Sum 1537.257 1890.466 1832.874 2097.351 1712.243 1660.362 1376.446 12107.000
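As mentioned above, bsmatch returns a one-to-one match. A small check, not in the original vignette, is that every id1 now appears at most once in the matched data.frame.
# TRUE expected: each unit of sample 1 keeps a single imputed unit of sample 2
anyDuplicated(BS$object$id1) == 0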
Prediction
Finally, the optimal transport weights can be used to predict, for each unit of sample 1, the distribution of the categories of the variable observed only in sample 2. The weights are split by id1 and normalized so that each row of the resulting matrix sums to 1.
# split the weight by id1
q_l <- split(object$weight,f = object$id1)

# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))

Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0 0.0000000 0 0.0000000 0.000000 0 1
#> [2,] 0 0.5148893 0 0.4851107 0.000000 0 0
#> [3,] 0 0.0000000 0 0.0000000 0.000000 1 0
#> [4,] 0 0.0000000 0 1.0000000 0.000000 0 0
#> [5,] 0 0.0000000 0 0.0000000 0.000000 0 1
#> [6,] 0 0.8656330 0 0.0000000 0.134367 0 0
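Each row of Z_pred estimates the distribution of the income categories for one unit of sample 1. As a final sketch (not part of the original code), a single imputed category per unit can be obtained by keeping the most probable one.
# assign each unit of sample 1 to its most probable category
# z_hat is an illustrative name
z_hat <- colnames(Z_pred)[max.col(Z_pred, ties.method = "first")]
head(z_hat)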