This file contains examples of the use of the weightedEM, unpack, and cwLocScat functions. It reproduces all the figures of the report “Analyzing cellwise weighted data” by P.J. Rousseeuw.
library("cellWise")
n = 10
d = 3
A = matrix(0.7, d, d); diag(A) = 1
A
##      [,1] [,2] [,3]
## [1,]  1.0  0.7  0.7
## [2,]  0.7  1.0  0.7
## [3,]  0.7  0.7  1.0
set.seed(12345)
library("MASS")
X = mvrnorm(n, rep(0,d), A)
colnames(X) = c("X1","X2","X3")
X[1,3] = X[2,2] = X[3,1] = X[4,1] = X[6,2] = NA
X # rows 1, 2, 3, 4, 6 have NAs
##                X1         X2         X3
##  [1,]  0.81494704  0.5501005         NA
##  [2,]  1.57453449         NA  0.5526973
##  [3,]          NA -0.2420284  0.2337319
##  [4,]          NA -0.5869239  0.2996664
##  [5,] -0.24745384  0.9288529  0.9443676
##  [6,] -0.74023654         NA -2.0885170
##  [7,]  0.19707169  0.9746399  0.5190202
##  [8,] -0.06183274 -0.1193971 -0.5598499
##  [9,]  0.21050943 -0.7740109 -0.1989791
## [10,] -0.82944245 -0.9502024 -0.6871550
w = c(1,2,1,1,2,2,1,1,1,1) # rowwise weights
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # number of iteration steps taken
## [1] 131
out$mu
##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6)
##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
plot(1:out$niter, out$loglikhd[1:out$niter], type='l',
     lty=1, col=4, xlab='step', ylab='log(likelihood)',
     main='log(likelihood) of weighted EM iterations')
plot of chunk unnamed-chunk-3
tail(out$loglikhd) # would have NAs for computeloglik=F
## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189
out$impX # imputed data, has no NA's
##                X1          X2         X3
##  [1,]  0.81494704  0.55010045  0.7764194
##  [2,]  1.57453449  0.01172782  0.5526973
##  [3,]  0.49065646 -0.24202836  0.2337319
##  [4,]  0.75818277 -0.58692386  0.2996664
##  [5,] -0.24745384  0.92885285  0.9443676
##  [6,] -0.74023654 -2.19170090 -2.0885170
##  [7,]  0.19707169  0.97463990  0.5190202
##  [8,] -0.06183274 -0.11939713 -0.5598499
##  [9,]  0.21050943 -0.77401094 -0.1989791
## [10,] -0.82944245 -0.95020238 -0.6871550
# The weights may be multiplied by a constant:
#
(w = c(1,2,1,1,2,2,1,1,1,1)/3) # divide weights by 3
##  [1] 0.3333333 0.6666667 0.3333333 0.3333333 0.6666667 0.6666667 0.3333333
##  [8] 0.3333333 0.3333333 0.3333333
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # OK, same results:
## [1] 131
out$mu # same
##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6) # same
##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
tail(out$loglikhd)
## [1] -11.3573 -11.3573 -11.3573 -11.3573 -11.3573 -11.3573
# converges to -11.3573 = -34.07189 / 3
# Create an equivalent matrix y without weights, by repeating
# some rows according to their integer weights:
#
Y = X[c(1,2,2,3,4,5,5,6,6,7,8,9,10),]
dim(Y)
## [1] 13  3
Y # This gives the same results:
##                X1         X2         X3
##  [1,]  0.81494704  0.5501005         NA
##  [2,]  1.57453449         NA  0.5526973
##  [3,]  1.57453449         NA  0.5526973
##  [4,]          NA -0.2420284  0.2337319
##  [5,]          NA -0.5869239  0.2996664
##  [6,] -0.24745384  0.9288529  0.9443676
##  [7,] -0.24745384  0.9288529  0.9443676
##  [8,] -0.74023654         NA -2.0885170
##  [9,] -0.74023654         NA -2.0885170
## [10,]  0.19707169  0.9746399  0.5190202
## [11,] -0.06183274 -0.1193971 -0.5598499
## [12,]  0.21050943 -0.7740109 -0.1989791
## [13,] -0.82944245 -0.9502024 -0.6871550
out = weightedEM(Y,crit=1e-12,computeloglik=T) # OK, same
out$niter
## [1] 131
out$mu
##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6)
##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
tail(out$loglikhd)
## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189
# converges to -34.07189 like before.
X = matrix(c(2.8,5.3,4.9,7.4,
             2.3,5.7,4.3,7.2,
             2.5,5.1,4.4,7.6),nrow=3,byrow=T)
W = matrix(c(0.8,1.0,0.3,0.4,
             0.3,0.5,0.9,0.5,
             1.0,0.6,0,0.7),nrow=3,byrow=T)
rownames(X) = rownames(W) = c("A","B","C")
colnames(X) = colnames(W) = c("V1","V2","V3","V4")
n = nrow(X); d = ncol(X)
X
##    V1  V2  V3  V4
## A 2.8 5.3 4.9 7.4
## B 2.3 5.7 4.3 7.2
## C 2.5 5.1 4.4 7.6
W
##    V1  V2  V3  V4
## A 0.8 1.0 0.3 0.4
## B 0.3 0.5 0.9 0.5
## C 1.0 0.6 0.0 0.7
out = unpack(X,W)
cbind(out$U,out$v) # OK
##    V1  V2  V3  V4    
## A  NA 5.3  NA  NA 0.2
## A 2.8 5.3  NA  NA 0.4
## A 2.8 5.3  NA 7.4 0.1
## A 2.8 5.3 4.9 7.4 0.3
## B  NA  NA 4.3  NA 0.4
## B  NA 5.7 4.3 7.2 0.2
## B 2.3 5.7 4.3 7.2 0.3
## C 2.5  NA  NA  NA 0.3
## C 2.5  NA  NA 7.6 0.1
## C 2.5 5.1  NA 7.6 0.6
dim(out$U)
## [1] 10  4
set.seed(12345)
n = 1000; d = 2
A = matrix(0.7, d, d); diag(A) = 1
A
##      [,1] [,2]
## [1,]  1.0  0.7
## [2,]  0.7  1.0
X = mvrnorm(n, rep(0,d), A)
head(X)
##            [,1]       [,2]
## [1,] -0.1098666  1.1895284
## [2,]  0.6233152  0.6848755
## [3,]  0.2309203 -0.4324656
## [4,] -0.1164846 -0.7197229
## [5,]  0.7061365  0.4110647
## [6,] -0.9412289 -2.4109163
W = abs(mvrnorm(n, rep(0,d), diag(rep(1,2))))
W = W/max(as.vector(W))
W[2,1] = 0
W[5,2] = 0
head(W)
##             [,1]       [,2]
## [1,] 0.151856434 0.18102767
## [2,] 0.000000000 0.32052154
## [3,] 0.120772713 0.17167154
## [4,] 0.003729069 0.32719369
## [5,] 0.327865177 0.00000000
## [6,] 0.285126341 0.01091696
fit = cwLocScat(X,W)
fit$cwMLEiter # number of iteration steps
## [1] 47
fit$cwMLEmu
##          1          2 
## 0.05972004 0.04231900
fit$cwMean
##          1          2 
## 0.06657723 0.03736100
fit$cwMLEsigma
##           1         2
## 1 0.9717504 0.6735339
## 2 0.6735339 1.0075136
fit$cwCov # similar to cwMLEsigma:
##           1         2
## 1 0.9759485 0.7398607
## 2 0.7398607 1.0299615
fit$sqrtCov # same diagonal:
##           1        2
## 1 0.9759485 0.718826
## 2 0.7188260 1.029961
data("data_personality_traits")
X <- data_personality_traits$X
W <- data_personality_traits$W
cbind(X,W) # as in table in the paper
##         t1 t2   t3   t4   t5   t6   t1   t2   t3   t4   t5   t6
##  [1,]  7.0  5  7.0  5.0  5.0  5.0 0.50 0.29 0.50 0.29 0.29 0.29
##  [2,] 10.0 10 10.0  7.0  8.5  7.0 1.00 1.00 1.00 0.50 0.58 0.50
##  [3,]  5.0  5 10.0  5.0  5.0  5.0 0.29 0.29 1.00 0.29 0.29 0.29
##  [4,] 10.0 10 10.0  5.0  5.0  5.0 1.00 1.00 1.00 0.29 0.29 0.29
##  [5,]  7.0  7  8.5  5.0  5.0  5.0 0.50 0.50 0.58 0.29 0.29 0.29
##  [6,] 10.0  5  5.0  8.5  8.5  5.0 1.00 0.29 0.29 0.58 0.58 0.29
##  [7,]  5.0  7  7.0  5.0  5.0  8.5 0.29 0.50 0.50 0.29 0.29 0.58
##  [8,] 10.0 10 10.0 10.0 10.0 10.0 1.00 1.00 1.00 1.00 1.00 1.00
##  [9,]  8.5  7  8.5  5.0  5.0  5.0 0.58 0.50 0.58 0.29 0.29 0.29
## [10,]  5.0 10  5.0  7.0  5.0  7.0 0.29 1.00 0.29 0.50 0.29 0.50
out = unpack(X,W)
cbind(out$U,out$v)
##      t1 t2   t3   t4   t5   t6     
## 1   7.0 NA  7.0   NA   NA   NA 0.21
## 1   7.0  5  7.0  5.0  5.0  5.0 0.29
## 2  10.0 10 10.0   NA   NA   NA 0.42
## 2  10.0 10 10.0   NA  8.5   NA 0.08
## 2  10.0 10 10.0  7.0  8.5  7.0 0.50
## 3    NA NA 10.0   NA   NA   NA 0.71
## 3   5.0  5 10.0  5.0  5.0  5.0 0.29
## 4  10.0 10 10.0   NA   NA   NA 0.71
## 4  10.0 10 10.0  5.0  5.0  5.0 0.29
## 5    NA NA  8.5   NA   NA   NA 0.08
## 5   7.0  7  8.5   NA   NA   NA 0.21
## 5   7.0  7  8.5  5.0  5.0  5.0 0.29
## 6  10.0 NA   NA   NA   NA   NA 0.42
## 6  10.0 NA   NA  8.5  8.5   NA 0.29
## 6  10.0  5  5.0  8.5  8.5  5.0 0.29
## 7    NA NA   NA   NA   NA  8.5 0.08
## 7    NA  7  7.0   NA   NA  8.5 0.21
## 7   5.0  7  7.0  5.0  5.0  8.5 0.29
## 8  10.0 10 10.0 10.0 10.0 10.0 1.00
## 9   8.5 NA  8.5   NA   NA   NA 0.08
## 9   8.5  7  8.5   NA   NA   NA 0.21
## 9   8.5  7  8.5  5.0  5.0  5.0 0.29
## 10   NA 10   NA   NA   NA   NA 0.50
## 10   NA 10   NA  7.0   NA  7.0 0.21
## 10  5.0 10  5.0  7.0  5.0  7.0 0.29
fit = cwLocScat(X,W)
fit$cwMLEiter
## [1] 49
round(fit$cwMLEmu,2)
##   t1   t2   t3   t4   t5   t6 
## 8.82 8.72 8.98 7.40 7.53 7.37
round(fit$cwMean,2)
##   t1   t2   t3   t4   t5   t6 
## 8.73 8.61 8.87 7.09 7.16 7.09
round(fit$cwMLEsigma, 2)
##      t1   t2   t3   t4   t5   t6
## t1 3.22 1.83 1.49 1.91 2.52 1.02
## t2 1.83 3.49 1.55 1.69 1.82 2.00
## t3 1.49 1.55 2.50 0.63 1.22 0.92
## t4 1.91 1.69 0.63 3.54 3.48 2.84
## t5 2.52 1.82 1.22 3.48 3.90 2.82
## t6 1.02 2.00 0.92 2.84 2.82 3.58
round(eigen(fit$cwMLEsigma)$values, 2)
## [1] 13.13  3.24  2.18  1.25  0.31  0.11
round(fit$cwCov, 2)
##      t1   t2   t3   t4   t5   t6
## t1 3.36 2.10 1.72 2.63 3.33 1.37
## t2 2.10 3.61 1.78 2.20 2.44 2.46
## t3 1.72 1.78 2.59 0.81 1.64 1.14
## t4 2.63 2.20 0.81 3.99 4.17 3.26
## t5 3.33 2.44 1.64 4.17 4.76 3.40
## t6 1.37 2.46 1.14 3.26 3.40 4.00
round(eigen(fit$cwCov)$values,5)
## [1] 15.91356  2.99031  2.26943  1.08606  0.05570  0.00416
round(cov(X), 2) # unweighted
##      t1   t2    t3    t4   t5   t6
## t1 4.96 1.61  1.44  2.01 3.00 0.07
## t2 1.61 4.93  1.38  1.39 1.26 2.17
## t3 1.44 1.38  4.04 -0.42 0.59 0.36
## t4 2.01 1.39 -0.42  3.29 3.25 1.93
## t5 3.00 1.26  0.59  3.25 3.90 1.89
## t6 0.07 2.17  0.36  1.93 1.89 3.29
round(eigen(cov(X))$values, 2)
## [1] 11.97  4.95  4.64  2.28  0.47  0.10
Now we reproduce the figure in the paper
ellips = function(covmat, mu, quant=0.95, npoints = 120)
{ # computes points of the ellipse t(X-mu)%*%covmat%*%(X-mu) = c
  # with c = qchisq(quant,df=2)
  if (!all(dim(covmat) == c(2, 2))) stop("covmat is not 2 by 2")
  eig = eigen(covmat)
  U = eig$vectors
  R = U %*% diag(sqrt(eig$values)) %*% t(U) # square root of covmat
  angles = seq(0, 2*pi, length = npoints+1)
  xy = cbind(cos(angles),sin(angles)) # points on the unit circle
  fac = sqrt(qchisq(quant, df=2))
  scale(fac*xy%*%R, center = -mu, scale=FALSE)
}  
n = nrow(X)
j = 3; k = 6 # to plot variables t3 and t6
xy = X[,c(j,k)]
cov2cor(cov(X)[c(j,k),c(j,k)]) # unweighted correlation is 0.10
##            t3         t6
## t3 1.00000000 0.09896998
## t6 0.09896998 1.00000000
cov2cor(fit$cwMLEsigma[c(j,k),c(j,k)]) # now correlation is 0.31
##           t3        t6
## t3 1.0000000 0.3077506
## t6 0.3077506 1.0000000
(wxy = W[,c(j,k)])
##         t3   t6
##  [1,] 0.50 0.29
##  [2,] 1.00 0.50
##  [3,] 1.00 0.29
##  [4,] 1.00 0.29
##  [5,] 0.58 0.29
##  [6,] 0.29 0.29
##  [7,] 0.50 0.58
##  [8,] 1.00 1.00
##  [9,] 0.58 0.29
## [10,] 0.29 0.50
duplicated(xy) # ties: row 4 equals row 3, and row 9 equals row 5
##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
wxy[3,] = wxy[3,] + wxy[4,] # add cell weights of rows 3 and 4
wxy[5,] = wxy[5,] + wxy[9,] # add cell weights of rows 5 and 9
(wxy = wxy[-c(4,9),]) # remove duplicate rows
##        t3   t6
## [1,] 0.50 0.29
## [2,] 1.00 0.50
## [3,] 2.00 0.58
## [4,] 1.16 0.58
## [5,] 0.29 0.29
## [6,] 0.50 0.58
## [7,] 1.00 1.00
## [8,] 0.29 0.50
(xy = xy[-c(4,9),]) # remove duplicate rows
##        t3   t6
## [1,]  7.0  5.0
## [2,] 10.0  7.0
## [3,] 10.0  5.0
## [4,]  8.5  5.0
## [5,]  5.0  5.0
## [6,]  7.0  8.5
## [7,] 10.0 10.0
## [8,]  5.0  7.0
# pdf("personality_cwMLE_cwCov.pdf",width=5.5,height=5.5)
myxlim = c(2,14); myylim = c(1,13)
plot(xy, pch=16, col="white", xlim=myxlim, ylim=myylim,
     xlab="",ylab="")
fac = 0.3 # for the size of the lines representing the cell weights
for(i in seq_len(nrow(xy))){
  WY = c(xy[i,1] - fac*wxy[i,1],xy[i,2]) # (WestX, Y) 
  EY = c(xy[i,1] + fac*wxy[i,1],xy[i,2]) # (EastX, Y) 
  XS = c(xy[i,1],xy[i,2] - fac*wxy[i,2]) # (X, SouthY)
  XN = c(xy[i,1],xy[i,2] + fac*wxy[i,2]) # (X, NorthY)
  lines(rbind(WY,EY),lwd=3)
  lines(rbind(XS,XN),lwd=3)
}
title(main="tolerance ellipses with and without cell weights",
      line=0.8,cex.main=1) # 1.2)
title(xlab="trait 3",line=2.1,cex.lab=1.0)
title(ylab="trait 6",line=2.1,cex.lab=1.0)
center1 = colMeans(X[,c(j,k)])
covmat1 = (n-1)*cov(X[,c(j,k)])/n                   
ell1 = ellips(covmat1, center1)
lines(ell1,lwd=1.5,col="red") # ellipse from unweighted covariance
fit2 = cwLocScat(xy,wxy)
center2 = fit2$cwMLEmu
covmat2 = fit2$cwMLEsigma
ell2 = ellips(covmat2, center2) # ellipse from cwMLE estimates
lines(ell2,lwd=1.5,col="blue")
center3 = fit2$cwMean
covmat3 = fit2$cwCov
ell3 = ellips(covmat3, center3) # ellipse from cwMean and cwCov
lines(ell3,lwd=1.5,lty=2,col="blue")
legend("topleft",c("cwMLE","cwCov","MLE"),
       col=c("blue","blue","red"), lty=c(1,2,1), lwd=1.5, cex=1)
plot of chunk unnamed-chunk-8
# dev.off()
# The blue ellipses, that use the cell weights, are a bit
# higher and more slanted than the red ellipse that doesn't,
# mainly due to the high cell weight of the y-coordinate of 
# the point in the top right corner.