General information
This page contains the R commands given in "Biplots in Practice".
You can find the data sets in the section 'data sets'
In this page commands that are given in the Computational Appendix of the book are given in brown, just as in the book.
Results of commands are given in green, just like in the book.
Comments about these commands are given in black.
EXTRA means commands given here (but not in the book) to obtain additional results contained in the corresponding chapter.
Both the comments and commands of EXTRA material are in blue, with results given in green.
R code
# Chapter 1: Biplots - the basic idea
# Read in European indicators data set (Exhibit 1.1) into # data frame EU2008 (see p.168) # For example, on Windows PC's, copy table, and then type: # EU2008 <- read.table("clipboard")
windows(width = 11, height = 6) par(mfrow = c(1,2)) plot(EU2008[,2:1], type = "n", cex.axis = 0.7, xlab = "GDP/capita", ylab = "Purchasing power/capita") text(EU2008[,2:1], labels = rownames(EU2008), col = "forestgreen", font = 2) plot(EU2008[,2:3], type = "n", cex.axis = 0.7, xlab = "GDP/capita", ylab = "Inflation rate") text(EU2008[,2:3], labels = rownames(EU2008), col = "forestgreen", font = 2)
# Note: we use "forestgreen" for the texts above above, more like # the green in the book. # Install the R package rgl for 3-d graphics, and then load # as follows:
library(rgl) plot3d(EU2008[,c(2,1,3)], xlab = "GDP", ylab = "Purchasing power", zlab = "Inflation", font = 2, col = "brown", type = "n") text3d(EU2008[,c(2,1,3)], text = rownames(EU2008), font = 2, col = "forestgreen")
# Chapter 2: Regression biplots
# 'bioenv' data set with 8 columns (first 8 columns of Exhibit 2.1) # in data frame bioenv. d is species d, y is pollution, x is depth
d <- bioenv[,4] y <- bioenv[,6] x <- bioenv[,7] summary(lm(d ~ y + x))
Call: lm(formula = d ~ y + x) Residuals: Min 1Q Median 3Q Max -11.7001 -2.4684 0.1749 3.0563 9.1803 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.13518 6.25721 0.980 0.33554 y -1.38766 0.48745 -2.847 0.00834 ** x 0.14822 0.06684 2.217 0.03520 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 5.162 on 27 degrees of freedom Multiple R-squared: 0.4416, Adjusted R-squared: 0.4003 F-statistic: 10.68 on 2 and 27 DF, p-value: 0.0003831
# two ways of calculating regression coefficients # 1. by doing the regression on standardized variables
ds <- (d-mean(d)) / sd(d) ys <- (y-mean(y)) / sd(y) xs <- (x-mean(x)) / sd(x) summary(lm(ds ~ ys + xs))
Call: lm(formula = ds ~ ys + xs) Residuals: Min 1Q Median 3Q Max -1.75515 -0.37028 0.02623 0.45847 1.37715 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.487e-17 1.414e-01 1.76e-16 1.00000 ys -4.457e-01 1.566e-01 -2.847 0.00834 ** xs 3.472e-01 1.566e-01 2.217 0.03520 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.7744 on 27 degrees of freedom Multiple R-squared: 0.4416, Adjusted R-squared: 0.4003 F-statistic: 10.68 on 2 and 27 DF, p-value: 0.0003831
# 2. by direct calculation
lm(d ~ y + x)$coefficients[2] * sd(y) / sd(d)
y -0.4457286
lm(d ~ y + x)$coefficients[3] * sd(x) / sd(d)
x 0.3471993
# finding and storing the standardized regression coefficients for # all five species - matrix B in (2.2)
B <- lm(bioenv[,1] ~ y+x)$coefficients[2:3] * c(sd(y),sd(x)) / sd(bioenv[,1]) for(j in 2:5) { B <- rbind(B,lm(bioenv[,j] ~ y + x)$coefficients[2:3] * c(sd(y),sd(x)) / sd(bioenv[j])) } B
y x B -0.7171713 0.02465266 -0.4986038 0.22885450 0.4910580 0.07424574 -0.4457286 0.34719935 -0.4750841 -0.39952072
# EXTRA: storing the variance explained for each regression # and computing overall variance explained var.expl <- var(lm(bioenv[,1] ~ y + x)$fitted.values) print(var(lm(bioenv[,j] ~ y + x)$fitted.values) / var(bioenv[,j]))
[1] 0.2351773
for(j in 2:5) { var.expl <- var.expl + var(lm(bioenv[,j] ~ y + x)$fitted.values) print(var(lm(bioenv[,j] ~ y + x)$fitted.values) / var(bioenv[,j])) }
[1] 0.3912441 [1] 0.2178098 [1] 0.4416404 [1] 0.2351773
# overall R2 var.expl / sum(apply(bioenv[,1:5], 2, var))
[1] 0.4145226
# EXTRA: doing the same as previous 'EXTRA', but for three predictor # variables, adding z = temperature z <- bioenv[,8] var.expl3 <- var(lm(bioenv[,1] ~ y + x + z)$fitted.values) print(var(lm(bioenv[,1] ~ y + x + z)$fitted.values) / var(bioenv[,1]))
[1] 0.5569366
for(j in 2:5) { var.expl3 <- var.expl3 + var(lm(bioenv[,j] ~ y + x + z)$fitted.values) print(var(lm(bioenv[,j] ~ y + x + z)$fitted.values) / var(bioenv[,j])) }
[1] 0.3924837 [1] 0.2181056 [1] 0.4416436 [1] 0.2463643
# overall R2 var.expl3 / sum(apply(bioenv[,1:5], 2, var))
[1] 0.4271042
# Drawing a regression biplot (Exhibit 2.5) using standardized values # of x and y. # species labels put at the point, arrows 5% shorter # Note: either close the previous graphics window, or open a new # one -on Windows machines open with command win.graph()
win.graph() plot(xs, ys, xlab = "x*(depth)", ylab = "y*(pollution)", type = "n", asp = 1, cex.axis = 0.7) text(xs, ys, labels = rownames(bioenv), col = "forestgreen") text(B[,2:1], labels = colnames(bioenv[,1:5]), col = "chocolate", font = 4) arrows(0, 0, 0.95*B[,2], 0.95*B[,1], col = "chocolate", angle = 15, length = 0.1)
# (Notice that we have varied the color definitions to be more like # those in the book)
# Chapter 3: Generalized linear model biplots
# d0 is the fourth-root transformed d; do regression of d0 on # ys and xs
d0 <- d^0.25 summary(lm(d0 ~ ys + xs))
Call: lm(formula = d0 ~ ys + xs) Residuals: Min 1Q Median 3Q Max -1.50933 -0.04458 0.13950 0.25483 0.85149 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.63908 0.09686 16.923 6.71e-16 *** ys -0.28810 0.10726 -2.686 0.0122 * xs 0.05959 0.10726 0.556 0.5831 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5305 on 27 degrees of freedom Multiple R-squared: 0.2765, Adjusted R-squared: 0.2229 F-statistic: 5.159 on 2 and 27 DF, p-value: 0.01266
# EXTRA: saving all the coefficients for the double square-root # regressions (Exhibit 3.1) # Also accumulating variance explained in each and computing overall dsqrt_abcde <- bioenv[,1:5]^0.25 dsqrt_abcde_coefs <- lm(dsqrt_abcde[,1] ~ ys + xs)$coefficients dsqrt.var.expl <- var(lm(dsqrt_abcde[,1] ~ ys + xs)$fitted.values) print(var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values) / var(dsqrt_abcde[,j]))
[1] 0.2284871
for(j in 2:5){ dsqrt_abcde_coefs <- rbind(dsqrt_abcde_coefs, lm(dsqrt_abcde[,j] ~ ys + xs)$coefficients) dsqrt.var.expl <- dsqrt.var.expl + var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values) print(var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values) / var(dsqrt_abcde[,j])) }
[1] 0.3621872 [1] 0.1591791 [1] 0.2764936 [1] 0.2284871
# overall R2 dsqrt.var.expl / sum(apply(dsqrt_abcde[,1:5], 2, var))
[1] 0.3393836
# EXTRA: drawing the biplot of Exhibit 3.2 plot(xs, ys, xlab = "x*(depth)", ylab = "y*(pollution)", type = "n", asp = 1, cex.axis = 0.7) text(xs, ys, labels = rownames(bioenv), col = "forestgreen") text(dsqrt_abcde_coefs[,3:2], labels = colnames(bioenv[,1:5]), col = "chocolate", font = 4) for(j in 1:5){ arrows(0, 0, 0.95*dsqrt_abcde_coefs[j,3], 0.95*dsqrt_abcde_coefs[j,2], col = "chocolate", angle = 15, length = 0.1) }
# performing Poisson regression for species d:
summary(glm(d ~ ys + xs, family = poisson))
Call: glm(formula = d ~ ys + xs, family = poisson) Deviance Residuals: Min 1Q Median 3Q Max -4.4123 -0.7585 0.1637 0.8262 2.7565 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 2.29617 0.06068 37.838 < 2e-16 *** ys -0.33682 0.07357 -4.578 4.69e-06 *** xs 0.19963 0.06278 3.180 0.00147 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for poisson family taken to be 1) Null deviance: 144.450 on 29 degrees of freedom Residual deviance: 88.671 on 27 degrees of freedom AIC: 208.55 Number of Fisher Scoring iterations: 5
# to get error deviance for Poisson regression of species d:
poisson.glm <- glm(d~ys+xs, family=poisson) poisson.glm$deviance/poisson.glm$null.deviance
[1] 0.6138564
# EXTRA: saving all the coefficients for the Poisson regressions # (Exhibit 3.4) pois_abcde_coefs <- glm(abcde[,1] ~ ys + xs, family = poisson)$coefficients for(j in 2:5){ pois_abcde_coefs <- rbind(pois_abcde_coefs, glm(abcde[,j] ~ ys + xs, family=poisson)$coefficients) } pois_abcde_coefs
(Intercept) ys xs pois_abcde_coefs 2.1786692 -1.1254080 -0.06714289 1.8525985 -0.8122742 0.18286062 2.0411089 0.4174381 0.05321664 2.2961652 -0.3368244 0.19963421 0.8277491 -0.8234332 -0.56805285
# performing logistic regression for species d, after converting to # presence/absence:
d01 <- d > 0 summary(glm(d01 ~ ys + xs, family = binomial))
Call: glm(formula = d01 ~ ys + xs, family = binomial) Deviance Residuals: Min 1Q Median 3Q Max -2.2749 0.2139 0.2913 0.3884 1.2874 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 2.7124 0.8533 3.179 0.00148 ** ys -1.1773 0.6522 -1.805 0.07105 . xs -0.1369 0.7097 -0.193 0.84708 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 19.505 on 29 degrees of freedom Residual deviance: 15.563 on 27 degrees of freedom AIC: 21.563 Number of Fisher Scoring iterations: 6
# to get error deviance for logistic regression of species d:
logistic.glm <- glm(d01 ~ ys + xs, family = binomial) logistic.glm$deviance / logistic.glm$null.deviance
[1] 0.7979165
# EXTRA: saving all the coefficients for the logistic regressions # (Exhibit 3.5) abcde <- bioenv[,1:5] abcde01 <- abcde > 0 logit_abcde_coefs <- glm(abcde01[,1] ~ ys + xs, family = binomial)$coefficients for(j in 2:5){ logit_abcde_coefs <- rbind(logit_abcde_coefs, glm(abcde01[,j] ~ ys + xs, family = binomial)$coefficients) } logit_abcde_coefs
(Intercept) ys xs logit_abcde_coefs 2.3842273 -2.8885410 0.8631339 1.2731575 -1.4183387 -0.1431720 0.8309807 0.9726882 0.3148828 2.7124406 -1.1772989 -0.1368686 0.2532229 -1.2796484 -0.7858995
# EXTRA: another example of a plot (Example 3.6) # This time the arrows are drawn to the exact point and labels # are offset plot(xs, ys, type = "n", xlab = "x* (depth)", ylab = "y* (pollution)", ylim = c(-3,2.6), cex.axis = 0.7, asp = 1) text(xs, ys, labels = rownames(bioenv), col = "forestgreen", cex=0.8) arrows(0, 0, logit_abcde_coefs[,3], logit_abcde_coefs[,2], col = "chocolate", length = 0.1, angle = 15, lwd = 1) text(1.05*logit_abcde_coefs[,3:2], labels = colnames(bioenv[,1:5]), col = "chocolate", font = 4, cex = 1.1)
# Chapter 4: Multidimensional scaling biplots
# 'countries' data set -- dissimilarities between 13 countries # (Exhibit 4.1) -- # assumed to have been read into data frame MT_matrix # R function cmdscale performs classical multidimensional scaling # Exhibit 4.2 obtained as follows
plot(cmdscale(MT_matrix), type = "n", xlab = "dim 1", ylab = "dim 2", asp = 1) text(cmdscale(MT_matrix), labels = colnames(MT_matrix), col = "forestgreen")
# EXTRA (*only* for MDS experts -- there is a bug in the cmdscale # function... read on...) # The eigenvalues of the MDS (notice that warnings are given that # some are negative) cmdscale(MT_matrix, eig = TRUE, k = 1)$eig # Function cmdscale does not allow obtaining the last (13th) # eigenvalue, and there is thus a bug in the computation of the # goodness-of-fit (GOF), which doesn't take last dimension into # account cmdscale(MT_matrix, eig = TRUE)$GOF
[1] 0.6177913 0.6500204
# In the above the first reported GOF value is wrong. It is based on # the first 12 eigenvalues: sum(abs(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig))
[1] 264.5464
# (it is not possible to set k=13) # Here is the correct total, which includes the last (negative) # eigenvalue MT_sum <- sum(abs(eigen(-0.5*cmdscale(MT_matrix, eig = TRUE, k = 12, x.ret = TRUE)$x)$values)) MT_sum
[1] 288.0903
# The following verifiess the second GOF value, which is correct: cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / sum(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:8])
[1] 0.3821252 0.2678952
# (0.6500204 = 0.3821252 + 0.2678952) # And the following revised calculation gives what the first GOF # should have been: cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / MT_sum
[1] 0.3334984 0.2338045
# that is, the sum for the two-dimensional solution is not 0.6177913, # but rather sum(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / MT_sum)
[1] 0.5673029
# 'atributes' data set -- ratings of countries on 6 attributes # (Exhibit 4.3) -- # assumed to have been read into data frame MT_ratings # keep coordinates of MDS plot in MT_dims
MT_dims <- cmdscale(MT_matrix, eig = TRUE, k = 2)$points colnames(MT_dims) <- c("dim1","dim2")
# calculate the regression coefficients of the 6 attributes on the # coordinates (Exhibit 4.4)
MT_coefs <- lm(MT_ratings[,1] ~ MT_dims[,1] + MT_dims[,2])$coefficients for(j in 2:6){ MT_coefs <- rbind(MT_coefs, MT_coefs <- lm(MT_ratings[,j] ~ MT_dims[,1] + MT_dims[,2])$coefficients) }
# plot the regression coefficients on the MDS plot (Exhibit 4.5)
plot(cmdscale(MT_matrix), type = "n", xlab = "dim 1", ylab = "dim 2", asp = 1) text(cmdscale(MT_matrix), labels = colnames(MT_matrix), col = "forestgreen") arrows(0, 0, MT_coefs[,2], MT_coefs[,3], col = "chocolate", length = 0.1, angle = 10) text(1.2*MT_coefs[,2:3], labels = colnames(MT_ratings), col = "chocolate", font = 4, cex = 0.8)
# Function to calculate chi-square distances between row or column # profiles of a matrix # Usage: chidist(N,1) calculates the chi-square distances between row # profiles # (for row profiles, chidist(N) is sufficient) # chidist(N,2) calculates the chi-square distances between # column profiles
chidist <- function(mat, rowcol = 1) { if(rowcol == 1) { prof <- mat / apply(mat, 1, sum) rootaveprof <- sqrt(apply(mat, 2, sum) / sum(mat)) } if(rowcol == 2) { prof <- t(mat) / apply(mat, 2, sum) rootaveprof <- sqrt(apply(mat, 1, sum) / sum(mat)) } dist(scale(prof, FALSE, rootaveprof)) }
# Note: functions defined in this way are added as objects to your # workspace, and are saved if the workspace is saved, otherwise they # are lost # Compute chi-square distances between row profiles, perform # classical MDS and plot
abcde <- bioenv[,1:5] abcde_mds <- cmdscale(chidist(abcde), eig = TRUE, k = 4) 100 * abcde_mds$eig / sum(abcde_mds$eig)
[1] 52.364447 22.014980 16.187784 9.432789
plot(abcde_mds$points[,1:2], type = "n", xlab = "dim 1", ylab = "dim 2", xlim = c(-1.2,1.6), ylim = c(-1.1,1.8), asp = 1) text(abcde_mds$points[,1:2], labels = rownames(abcde), col = "forestgreen")
# Now add columns (species) as biplot vectors # First convert to row profiles and standardize in the chi-squared way
abcde_prof <- abcde / apply(abcde, 1, sum) abcde_prof_stand <- t(t(abcde_prof) / sqrt(apply(abcde, 2, sum) / sum(abcde)))
# perform regressions and save coefficients for plotting
mds_coefs <- lm(abcde_prof_stand[,1] ~ abcde_mds$points[,1] + abcde_mds$points[,2])$coefficients for(j in 2:5){ mds_coefs <- rbind(mds_coefs, MT_coefs <- lm(abcde_prof_stand[,j] ~ abcde_mds$points[,1] + abcde_mds$points[,2])$coefficients) } arrows(0, 0, mds_coefs[,2], mds_coefs[,3], col = "chocolate", length = 0.1, angle = 10) text(1.1*mds_coefs[,2:3], labels = colnames(abcde), col = "chocolate", font = 4, cex = 0.9)
# Assume that the sediment vector has been read into a character # vector (e.g., in Windows, copy the vector from the Excel file, # without the column label, and then type in the command: # sediment <- scan("clipboard", what="character") # convert sediment (consisting of "C","G" and "S") to factor object
sediment <- as.factor(sediment)
# compute mean positions in biplot and plot them
sediment.means <- cbind(tapply(abcde_mds$points[,1], sediment, mean), tapply(abcde_mds$points[,2], sediment, mean)) text(sediment.means[,1], sediment.means[,2], labels = c("C","G","S"), font = 4, cex = 0.8, col = "chocolate")
# convert sediment categories to dummy variables
clay01 <- sediment == "C" gravel01 <- sediment == "G" sand01 <- sediment == "S"
# compute logistic regression coefficients and plot them connected # to origin
sediment_coefs <- glm(as.numeric(clay01) ~ abcde_mds$points[,1] + abcde_mds$points[,2], family = "binomial")$coefficients sediment_coefs <- rbind(sediment_coefs, glm(as.numeric(gravel01) ~ abcde_mds$points[,1] + abcde_mds$points[,2], family = "binomial")$coefficients) sediment_coefs <- rbind(sediment_coefs, glm(as.numeric(sand01) ~ abcde_mds$points[,1] + abcde_mds$points[,2], family = "binomial")$coefficients) segments(0, 0, sediment_coefs[,2], sediment_coefs[,3], col = "chocolate") text(sediment_coefs[,2:3], labels = c("C","G","S"), col = "chocolate", font = 4)
# Chapter 5: Reduced-dimension biplots
# Small example from Chapter 1, biplotted # (Note that the last two singular values, theoretically zero, and # their respective singular vectors, can turn out slightly differently # in different versions of R)
Y <- matrix(c(8,5,-2,2,4,2,0,-3,3,6,2,3,3,-3,-6,-6,-4,1,-1,-2), nrow = 5) svd(Y)
$d [1] 1.412505e+01 9.822577e+00 6.351831e-16 3.592426e-33 $u [,1] [,2] [,3] [,4] [1,] -0.6634255 -0.4574027 -0.59215653 2.640623e-35 [2,] -0.3641420 -0.4939878 0.78954203 2.167265e-34 [3,] 0.2668543 -0.3018716 -0.06579517 -9.128709e-01 [4,] -0.2668543 0.3018716 0.06579517 -1.825742e-01 [5,] -0.5337085 0.6037432 0.13159034 -3.651484e-01 $v [,1] [,2] [,3] [,4] [1,] -0.7313508 -0.2551980 -0.6276102 -0.0781372 [2,] -0.4339970 0.4600507 0.2264451 0.7407581 [3,] 0.1687853 -0.7971898 0.0556340 0.5769791 [4,] 0.4982812 0.2961685 -0.7427873 0.3350628
colnames(Y) <- c("A","B","C","D") rowcoord <- svd(Y)$u %*% diag(sqrt(svd(Y)$d)) colcoord <- svd(Y)$v %*% diag(sqrt(svd(Y)$d)) plot(rbind(rowcoord,colcoord)[,1:2], type = "n", xlab = "", ylab = "", asp = 1, cex.axis = 0.7) abline(h = 0, v = 0, lty = "dotted") text(rowcoord[,1:2], labels = 1:5, col = "forestgreen") text(colcoord[,1:2], labels = colnames(Y), col = "chocolate", font = 3)
# Chapter 6: Principal component biplots
# 'attributes' data set already in data frame MT_ratings (Chap. 4) # centre the data by the column means
MT_means <- apply(MT_ratings, 2, mean) MT_Y <- sweep(MT_ratings, 2, MT_means)
# compute matrix with equal row and column weights and compute SVD
MT_Y <- MT_Y/sqrt(nrow(MT_Y)*ncol(MT_Y)) MT_SVD <- svd(MT_Y)
# form biplot: row principal, column standard coordinates
MT_F <- sqrt(nrow(MT_Y))*MT_SVD$u%*%diag(MT_SVD$d) MT_G <- sqrt(ncol(MT_Y))*MT_SVD$v plot(rbind(MT_F[,1:2],MT_G[,1:2]), type = "n", asp = 1, xlim = c(-3.6,2.3), xlab = "dim 1", ylab = "dim 2", cex.axis = 0.7) text(MT_F[,1:2], labels = rownames(MT_ratings), col = "forestgreen", cex = 0.8) arrows(0, 0, MT_G[,1], MT_G[,2], col = "chocolate", length = 0.1, angle = 10) text(c(1.07,1.3,1.07,1.35,1.2,1.4)*MT_G[,1], c(1.07,1.07,1.05,1,1.16,1.1)*MT_G[,2], labels = colnames(MT_ratings), col = "chocolate", font = 4, cex = 0.8)
# covariance biplot: row standard, column principal coordinates
MT_F <- sqrt(nrow(MT_Y))*MT_SVD$u MT_G <- sqrt(ncol(MT_Y))*MT_SVD$v%*%diag(MT_SVD$d) plot(rbind(MT_F[,1:2],MT_G[,1:2]), type = "n", asp = 1, xlim = c(-3.6, 2.3), xlab = "dim 1", ylab = "dim 2", cex.axis = 0.7) text(MT_F[,1:2], labels = rownames(MT_ratings), col = "forestgreen", cex = 0.8) arrows(0, 0, MT_G[,1], MT_G[,2], col = "chocolate", length = 0.1, angle = 10) text(c(1.07,1.2,1.07,1.25,1.07,1.3)*MT_G[,1], c(1.07,1.07,1.04,1.02,1.16,1.07)*MT_G[,2], labels = colnames(MT_ratings), col = "chocolate", font = 4, cex = 0.8)
# scree plot
MT_percents <- 100*MT_SVD$d^2/sum(MT_SVD$d^2) MT_percents <- MT_percents[seq(6,1)] barplot(MT_percents, horiz = TRUE, cex.axis = 0.7)
# Chapter 7: Log-ratio biplots
# get the US Arrests data provided in R
data(USArrests)
# perform (weighted) log-ratio analysis on columns 1, 2 and 4 # rm and cm are the row and column margins; # mr and mc are the weighted means used in the double-centering
N <- USArrests[,c(1,2,4)] P <- N / sum(N) rm <- apply(P, 1, sum) cm <- apply(P, 2, sum) Y <- as.matrix(log(P)) mc <- t(Y) %*% as.vector(rm) Y <- Y - rep(1, nrow(P)) %*% t(mc) mr <- Y %*% as.vector(cm) Y <- Y - mr %*% t(rep(1,ncol(P))) Z <- diag(sqrt(rm)) %*% Y %*% diag(sqrt(cm)) svdZ <- svd(Z)
# compute form biplot coordinates from results of SVD
USA_F <- diag(1/sqrt(rm)) %*% svdZ$u[,1:2] %*% diag(svdZ$d[1:2]) USA_G <- diag(1/sqrt(cm)) %*% svdZ$v[,1:2]
# biplot - axes with different scales plotted individually
plot(rbind(USA_F, USA_G/20), xlim = c(-0.35,0.45), ylim = c(-0.18,0.23), asp = 1, type = "n", xlab = "", ylab = "", xaxt = "n", yaxt = "n", cex.axis = 0.7) axis(1, cex.axis = 0.7, col.axis = "forestgreen", col.ticks = "forestgreen") axis(2, cex.axis = 0.7, col.axis = "forestgreen", col.ticks = "forestgreen", at = seq(-0.2, 0.2, 0.2)) axis(3, cex.axis = 0.7, col.axis = "chocolate", col.ticks = "chocolate", at = seq(-0.4, 0.4, 0.2), labels = seq(-8,8,4)) axis(4, cex.axis = 0.7, col.axis = "chocolate", col.ticks = "chocolate", at = seq(-0.2, 0.2, 0.2), labels = seq(-4,4,4)) text(USA_F, labels = rownames(N), col = "forestgreen", font = 2, cex = 0.8) text(USA_G/20, labels = colnames(N), col = "chocolate", font = 4, cex = 0.9)
# total variance (inertia): either sum of squares of Z matrix, or sum # of squared singular values
sum(Z*Z)
[1] 0.01790182
sum(svdZ$d^2)
[1] 0.01790182
# fish morphology example: assume data read into data frame fish: # -- first two columns are sex, habitat # -- remaining 26 columns are the morphometric data
fish.morph<-fish[,3:ncol(fish)]
# log-ratio analysis is performed in the same way as above
N <- fish.morph P <- N/sum(N) rm <- apply(P, 1, sum) cm <- apply(P, 2, sum) Y <- as.matrix(log(P)) mc <- t(Y) %*% as.vector(rm) Y <- Y - rep(1,nrow(P)) %*% t(mc) mr <- Y %*% as.vector(cm) Y <- Y - mr %*% t(rep(1,ncol(P))) Z <- diag(sqrt(rm)) %*% Y %*% diag(sqrt(cm)) svdZ <- svd(Z)
# total variance
sum(Z*Z)
[1] 0.001960883
# calculate sex-habitat interaction and set up labels for four # sex*habitat groups
fish.sexhab <- 2*(fish[,2]-1)+fish[,1] fish.labels <- rep("fL", nrow(fish)) fish.labels[fish.sexhab=="2"] <- "mL" fish.labels[fish.sexhab=="3"] <- "fP" fish.labels[fish.sexhab=="4"] <- "mP"
# EXTRA: log-ratio biplot of morphological data # Exhibit 7.3 (first axis coordinates reversed) fish_F <- diag(1/sqrt(rm)) %*% svdZ$u %*% diag(svdZ$d) fish_G <- diag(1/sqrt(cm)) %*% svdZ$v fish_F[,1] <- -fish_F[,1] fish_G[,1] <- -fish_G[,1] plot(rbind(fish_F,fish_G/50), type = "n", xlab = "", ylab = "", asp = 1, xaxt = "n", yaxt = "n", cex.axis = 0.7) axis(1, col.axis = "forestgreen", col.ticks = "forestgreen") axis(2, col.axis = "forestgreen", col.ticks = "forestgreen") axis(3, col.axis = "chocolate", col.ticks = "chocolate", at = seq(-0.06, 0.06, 0.02), labels = seq(-3,3,1)) axis(4, col.axis = "chocolate", col.ticks = "chocolate", at = seq(-0.04, 0.04, 0.02), labels = seq(-2, 2, 1)) text(fish_F, labels = fish.labels, col = "forestgreen", font = 2, cex = 0.7) text(fish_G/50, labels = colnames(fish.morph), col = "chocolate", font = 4, cex=0.8)
# scatterplot of log-ratios in Exhibit 7.4
logFdlFal <- log(fish.morph[,"Fdl"] / fish.morph[,"Fal"]) logFdwFal <- log(fish.morph[,"Fdw"] / fish.morph[,"Fal"]) plot(logFdlFal, logFdwFal, asp = 1, pch = 24, cex = 0.7, xlab = "log(Fdl/Fal)", ylab = "log(Fdw/Fal)", col = "forestgreen") abline(a = 0.0107, b = 0.707, lty = 2, col = "chocolate")
# predictions of Fdw in Exhibit 7.5
Fdw_pred <- 1.0108*fish.morph[,"Fdl"]^0.707*fish.morph[,"Fal"]^0.293 plot(Fdw_pred, fish.morph[,"Fdw"], xlim = c(18,30), ylim = c(18,30), pch = 24, cex = 0.7, xlab = "predicted Fdw", ylab = "actual Fdw", col = "forestgreen") abline(a = 0, b = 1, lty = 2, col = "chocolate")
# correlation between predicted and observed
cor(Fdw_pred, fish.morph[,"Fdw"])
[1] 0.7496034
# Chapter 8: Correspondence analysis biplots
# Data set 'smoking' is in the ca package (to be installed from CRAN # first, before loading) # Its name is actually 'smoke' (Exhibit 8.1)
library(ca) data(smoke)
# computation of CA
N <- smoke P <- N/sum(N) rm <- apply(P, 1, sum) cm <- apply(P, 2, sum) Dr <- diag(rm) Dc <- diag(cm) Z <- diag(sqrt(1/rm))%*%(as.matrix(P)-rm%*%t(cm))%*%diag(sqrt(1/cm)) svdZ <- svd(Z)
# principal coordinates of rows, standard coordinates of columns
smoke_F <- diag(1/sqrt(rm))%*%svdZ$u %*%diag(svdZ$d) smoke_G <- diag(1/sqrt(cm))%*%svdZ$v
# plotting (notice that by default the first two columns are used)
plot(rbind(smoke_F, smoke_G), type = "n", xlab = "Dim 1", ylab = "Dim 2", asp = 1) text(smoke_F, labels = rownames(smoke), col = "forestgreen", font = 2, cex = 0.7) text(smoke_G, labels = colnames(smoke), col = "chocolate", font = 2, cex = 0.8)
# obtaining Exhibit 8.2 using ca package
plot(ca(smoke), map = "rowprincipal", col = c("forestgreen","chocolate"))
# Summary of CA, with contributions to inertia
summary(ca(smoke))
Principal inertias (eigenvalues): dim value % cum% scree plot 1 0.074759 87.8 87.8 ************************* 2 0.010017 11.8 99.5 *** 3 0.000414 0.5 100.0 -------- ----- Total: 0.085190 100.0 Rows: name mass qlt inr k=1 cor ctr k=2 cor ctr 1 | SM | 57 893 31 | -66 92 3 | -194 800 214 | 2 | JM | 93 991 139 | 259 526 84 | -243 465 551 | 3 | SE | 264 1000 450 | -381 999 512 | -11 1 3 | 4 | JE | 456 1000 308 | 233 942 331 | 58 58 152 | 5 | SC | 130 999 71 | -201 865 70 | 79 133 81 | Columns: name mass qlt inr k=1 cor ctr k=2 cor ctr 1 | none | 316 1000 577 | -393 994 654 | -30 6 29 | 2 | lght | 233 984 83 | 99 327 31 | 141 657 463 | 3 | medm | 321 983 148 | 196 982 166 | 7 1 2 | 4 | hevy | 130 995 192 | 294 684 150 | -198 310 506 |
# suppose the 'benthos' data have been read into data frame benthos # calculation of CA and the row contributions (ca package stores # standard coordinates in $rowcoord)
benthos.ca <- ca(benthos) benthos.F <- benthos.ca$rowcoord %*% diag(benthos.ca$sv) benthos.rowcon <- benthos.ca$rowmass * (benthos.F[,1]^2 + benthos.F[,2]^2) / sum(benthos.ca$sv[1:2]^2)
# set up a vector of species labels where those with less than 0.01 # contribution are labelled "á"
benthos.names <- rownames(benthos) benthos.names[benthos.rowcon < 0.01] <- "á"
# nonlinear transformation of contributions for plotting of labels
benthos.rowsize <- log(1+exp(1)*benthos.rowcon^0.3) benthos.rowsize[benthos.rowcon<0.01] <- 1
# plot the asymmetric map with label size related to contribution # (Exhibit 8.3)
FF <- benthos.ca$rowcoord GG <- benthos.ca$colcoord %*% diag(benthos.ca$sv) plot(c(FF[,1],GG[,1]), c(FF[,2],GG[,2]), type = "n", xlab = "", ylab = "", asp = 1) text(FF[,1], FF[,2], labels = benthos.names, cex = benthos.rowsize, col = "chocolate", font = 4) text(GG[,1], GG[,2], labels = colnames(benthos), cex = 0.8, col = "forestgreen", font = 2)
# The contribution biplot can be obtained using the map="colgreen" # option in the ca package where "green" refers to Greenacre, not a # colour! (The option "colgab" refers to Gabriel...) # (remember that the columns are the sites, and the species the rows, # which is the tranpose of the usual format).
# EXTRA: First substitute the species labels, using blank labels for # low contributions benthos.names[benthos.rowcon < 0.01] <- " " benthos.ca$rownames <- benthos.names
# Contribution biplot of Exhibit 8.4
plot(benthos.ca, map = "colgreen", mass = c(1,0), col = c("chocolate","forestgreen"), pch = c(17,24,16,1))
# draw lines to the species points
segments(0, 0, benthos.ca$rowcoord[,1]*sqrt(benthos.ca$rowmass), benthos.ca$rowcoord[,2]*sqrt(benthos.ca$rowmass), col = "chocolate")
# Chapter 9: Multiple correspondence analysis biplots I
# The original Spanish data (N=2471) is in the Excel file # women_Spain2002_original.xls # Later (in Chapter 10) we will show how to create the concatenated # matrix in R. # Here we use the prepared concatenated file for the reduced sample # (N=2107) which has no missing data and where categories H4 and H5 # have already been combined -- this concatenated matrix (Exhibit 9.1) # is in the Excel file women_Spain2002_concat.xls # So we assume that this concatenated table has been read into the data # frame 'women.concat' # the symmetric CA map, which is the default plotting option of the # ca package
plot(ca(women.concat))
# to change the sign of the second axis (if required)
women.ca <- ca(women.concat) women.ca$rowcoord[,2] <- -women.ca$rowcoord[,2] women.ca$colcoord[,2] <- -women.ca$colcoord[,2] plot(women.ca)
# to obtain Exhibit 9.2: plot symbols, compute principal coordinates, # add labels and inertias # NOTE: after plotting, adjust the window to accommodate the points # and labels as in Exhibit 9.2. Repeat the plotting again, if # necessary, in that window to reposition labels
par(cex.axis = 0.7) plot(women.ca, labels = 0, col = c("forestgreen", "chocolate")) women.F <- women.ca$rowcoord %*% diag(women.ca$sv) women.G <- women.ca$colcoord %*% diag(women.ca$sv) text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, col = "forestgreen", cex = 0.8, font = 2) text(women.G, labels = women.ca$colnames, pos = 4, offset = 0.3, col = "chocolate", cex = 0.8, font = 4) text(max(women.G[,1]), 0, "0.0571 (82.1%)", adj = c(0.6,-0.6), cex = 0.7) text(0, max(women.G[,2]), "0.0030 (4.4%)", adj = c(-0.1,-3), cex = 0.7)
# asymmetric biplot of Exhibit 9.3 # (again, if you need to adjust the window, repeat plotting in new # window to position labels better)
plot(women.ca, map = "rowprincipal", labels = c(0,2), col = c("forestgreen", "chocolate"))
# EXTRA: same as above, but labelling categories in colour plot(women.ca, map = "rowprincipal", labels = 0, col = c("forestgreen", "chocolate")) text(women.ca$colcoord, labels = women.ca$colnames, pos = 4, offset = 0.3, col = "chocolate", cex = 0.8, font = 4) # EXTRA: Exhibit 9.4 -- showing categories 'e5' and 'ma4' connected # to their 8 corresponding variables # computation of coordinates of these demographic categories per # substantive variable (the slight complication here is that the last # variable has 4 categories, while all the others have 5) women.e5 <- as.numeric(women.concat["e5",]) women.e5 <- women.e5 / sum(women.e5/8) women.e5.coord <- apply(women.e5[36:39] * women.ca$colcoord[36:39,], 2, sum) for(j in 7:1){ women.e5.coord <- rbind(apply(women.e5[(5*(j-1)+1):(5*j)] * women.ca$colcoord[(5*(j-1)+1):(5*j),], 2, sum), women.e5.coord) } women.ma4 <- as.numeric(women.concat["ma4",]) women.ma4 <- women.ma4 / sum(women.ma4/8) women.ma4.coord <- apply(women.ma4[36:39] * women.ca$colcoord[36:39,], 2, sum) for(j in 7:1){ women.ma4.coord <- rbind(apply(women.ma4[(5*(j-1)+1):(5*j)] * women.ca$colcoord[(5*(j-1)+1):(5*j),], 2, sum), women.ma4.coord) } # plot them with principal coordinates of rows and link to their # centroid plot(rbind(women.F, women.e5.coord, women.ma4.coord), type = "n", asp = 1, xlab = "", ylab = "") points(women.F, pch = 19, col = "forestgreen") text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, col = "forestgreen", cex = 0.9, font = 2) points(rbind(women.e5.coord, women.ma4.coord), pch = 22, col = "chocolate") text(rbind(women.e5.coord, women.ma4.coord), labels=LETTERS[1:8], pos = 4, offset = 0.3, col = "chocolate", cex = 0.9, font = 4) rownames(women.F) <- women.ca$rownames segments(women.F["e5",1], women.F["e5",2], women.e5.coord[,1], women.e5.coord[,2], col = "chocolate") segments(women.F["ma4",1], women.F["ma4",2], women.ma4.coord[,1], women.ma4.coord[,2], col = "chocolate") abline(h = 0, v = 0, lty = 3, col = "gray")
# Exhibit 9.6: contribution biplot using map="rowgreen" option
par(mar=c(3,2,1,1)) # (optional, reduces margins around plot)
plot(women.ca, map ="rowgreen", mass = c(FALSE,TRUE), col = c("forestgreen", "chocolate"))
# same again, but labelling points in colour after computing their # positions and also adding average age and gender groups, as shown in # Exhibit 9.6
plot(women.ca, map = "rowgreen", labels = 0, mass = c(FALSE,TRUE), col = c("forestgreen", "chocolate")) women.sex <- c(rep("m", 6),rep("f", 6)) women.age <- rep(c("a1","a2","a3","a4","a5","a6"), 2) women.sex.F <- cbind(tapply(women.F[12:23,1], women.sex, mean), tapply(women.F[12:23,2], women.sex, mean)) women.age.F <- cbind(tapply(women.F[12:23,1], women.age, mean), tapply(women.F[12:23,2], women.age, mean)) points(rbind(women.sex.F, women.age.F), pch = 21, col = "forestgreen", cex = 0.7) text(rbind(women.sex.F, women.age.F), labels = c("f","m","a1","a2","a3","a4","a5","a6"), pos = 4, offset = 0.3, col = "forestgreen", cex = 0.8, font = 2) text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, col = "forestgreen", cex = 0.8, font = 2) text(sqrt(women.ca$colmass)*women.ca$colcoord, labels = women.ca$colnames, pos = 4, offset = 0.3, col = "chocolate", cex = 0.8, font = 4)
# Chapter 10: Multiple correspondence analysis biplots II
# EXTRA: We make rather a long detour here to show how to # 1. read in the original Spanish data (N=2471) and remove cases # with missing data # 2. compute the corresponding indicator matrix # 3. compute the Burt matrix and extract the concatenated matrix # used in Chap. 9 # First read in the original data (2471x12 matrix in # women_Spain2002_original.xls) into data frame 'women'. We count # missing data (=9) for each case, and then keep the rows that have # no missing data dim(women)
[1] 2471 12
women.9 <- rep(0, nrow(women)) for(i in 1:nrow(women)) { for(j in 1:12) { if(women[i,j] == 9) women.9[i] <- women.9[i]+1 } } women <- women[women.9 == 0,] dim(women)
[1] 2107 12
# Compute the indicator matrix - there are 8x5 categories for the # substantive variables and 2+5+6+6 categories for the demographics, # totalling 59 women.Z <- matrix(0, nrow = nrow(women), ncol = 59) # We admit the following single statement is cryptic but it works! # (remember that education categories start at 0) women.Z[cbind(as.numeric(matrix(rep(1:nrow(women), 12), byrow = TRUE, nrow = 12)), as.numeric(t(women[,1:12]) + c(seq(0,35,5),40,42,48,53)))] <- 1 colnames(women.Z) <- c(sort(paste(c("A","B","C","D","E","F","G","H"), rep(1:5, 8), sep = "")),"m","f","ma","wi","di","se","si", paste("e", 0:5, sep = ""), paste("a", 1:6, sep = "")) # column sums of women.Z are the frequencies of each category apply(women.Z,2,sum)
A1 A2 A3 A4 A5 B1 B2 B3 B4 B5 C1 C2 C3 C4 397 932 91 598 89 124 956 237 665 125 165 988 237 620 C5 D1 D2 D3 D4 D5 E1 E2 E3 E4 E5 F1 F2 F3 97 92 763 298 741 213 130 749 249 747 232 506 1193 147 F4 F5 G1 G2 G3 G4 G5 H1 H2 H3 H4 H5 m f 236 25 94 425 180 888 520 1105 917 52 31 2 972 1135 ma wi di se si e0 e1 e2 e3 e4 e5 a1 a2 a3 1169 172 45 69 652 240 522 580 431 166 168 334 453 374 a4 a5 a6 325 251 370
# amalgamate the H4 and H5 columns (39 & 40) into column 39 and then # delete the 40th column women.Z[,39] <- women.Z[,39] + women.Z[,40] women.Z <- women.Z[,-40] colnames(women.Z)[39] <- "H4,5" # Compute the 39x39 Burt matrix pf the substantive variables from the # indicator matrix... women.Burt <- t(women.Z[,1:39]) %*% women.Z[,1:39] # ... or the Burt matrix is a by-product of the mjca function in the # ca package women.Burt <- mjca(women[,1:8])$Burt # after which you have to combine the categories H4 and H5 women.Burt[,39] <- women.Burt[,39]+women.Burt[,40] women.Burt[39,] <- women.Burt[39,]+women.Burt[40,] women.Burt <- women.Burt[-40,-40] rownames(women.Burt)[39] <- colnames(women.Burt)[39] <- "H4,5" # for the concatenated table we need to first create the interactive # variable and add it to the original data then compute the full Burt # matrix, then extract the required part of the Burt matrix and # combine H4 and H5 again women <- cbind(women, 6*(women[,"g"]-1)+ women[,"a"]) colnames(women)[13] <- "ga" women.concat <- mjca(women)$Burt[40+c(3:13,20:31),1:40] women.concat[,39] <- women.concat[,39]+women.concat[,40] women.concat <- women.concat[,1:39] colnames(women.concat)[39] <- "H4,5" # The matrix is correct, now just the labels of the interactively # coded variable have to be redefined! # Presently they are ga1, ga2, ..., ga12; we want to code the first 6 # as male, and the last 6 as female rownames(women.concat)[12:23] <- paste(c(rep("ma", 6), rep("fa", 6)), 1:6, sep = "")
# After that detour, back to MCA! # The total inertia of the Burt matrix (check that it's 39x39 first)
dim(women.Burt)
[1] 39 39
sum(ca(women.Burt)$sv^2)
[1] 0.677625
# Adjusted total ienrtia (average off-diagonal inertia)
(8/7)*(sum(ca(women.Burt)$sv^2)-(31/64))
[1] 0.2208571
# Before continuing, check that you have the correct indicator matrix, # it should be 2107 x 39 in this case. # Above we have described either reading in the full indicator matrix # from an Excel file, or computing it. # In either case you probably have to extract the first 40 columns # corresponding to the substantive variables, then do the combining of # H4 and H5 in the indicator matrix:
women.Z <- women.Z[,1:40] women.Z[,39] <- women.Z[,39] + women.Z[,40] women.Z <- women.Z[,-40] dim(women.Z)
[1] 2107 39
# total inertia of the 2107x39 indicator matrix:
sum(ca(women.Z)$sv^2)
[1] 3.875
# Exhibit 10.3 (with vertical axis possibly inverted, depending on R # version). More code given here for category labelling
par(cex.axis = 0.7) plot(ca(women.Z), map = "rowprincipal", labels = 0, col = c("lightgreen","chocolate"), pch = c(149,1,17,24), mass = c(FALSE,TRUE)) women.Z.ca <- ca(women.Z) text(women.Z.ca$colcoord, labels = women.Z.ca$colnames, pos = 2, offset = 0.3, col = "chocolate", cex = 0.8, font = 4)
# How many singular values are greater than 1/8:
which(ca(women.Burt)$sv > (1/8))
[1] 1 2 3 4 5 6 7 8 9
# The adjusted singular values for these first 9 axes
(8/7)*(ca(women.Burt)$sv[1:9]-(1/8))
[1] 0.342185682 0.232597737 0.124150258 0.114994566 0.034506798 [6] 0.025747549 0.014892541 0.008973295 0.006810267
# The parts of (adjusted) inertia on the first 9 axes
(64 / 49) * (ca(women.Burt)$sv[1:9] - (1/8))^2 / 0.2208571
[1] 0.5301665244 0.2449624996 0.0697885037 0.0598746893 0.0053913554 [6] 0.0030016525 0.0010042139 0.0003645797 0.0002099988
# Compute MCA using ca function and then replace the singular values # with the adjusted ones
women.Burt.ca <- ca(women.Burt) women.Burt.ca$sv[1:9] <- (8/7) * (ca(women.Burt)$sv[1:9] - (1/8))
# EXTRA: Exhibit 10.4 - first axis coordinates inverted to agree with # Exhibit, although this depends on version of R # In Exhibit 10.4, the row labels were made lower case externally women.Burt.ca$rowcoord[,1] <- -women.Burt.ca$rowcoord[,1] women.Burt.ca$colcoord[,1] <- -women.Burt.ca$colcoord[,1] plot(women.Burt.ca, map = "rowprincipal", labels = 0, pch = c(149,1,17,24), col = c("lightgreen","chocolate"), mass = c(FALSE,TRUE)) text(women.Burt.ca$colcoord, labels = women.Burt.ca$colnames, pos = 2, offset = 0.3, col = "chocolate", cex = 0.8, font = 4) women.Burt.Fadj <- women.Burt.ca$colcoord[,1:9] %*% diag((8/7) * (ca(women.Burt)$sv[1:9] - (1/8))) points(women.Burt.Fadj, pch = 19, col = "lightgreen", cex = 0.7) text(women.Burt.Fadj, labels = rownames(women.Burt), pos = 4, offset = 0.3, col = "forestgreen", cex = 0.7, font = 2) # EXTRA: Exhibit 10.5 - we assume that the singular values of # women.Burt.ca have already been substituted, and that the row # principal coordinates women.Burt.Fadj computed, as above. # The column coordinates are shrunk by the square roots of their # masses, and then plotted as before: women.Burt.Gctr <- sqrt(women.Burt.ca$colmass) * women.Burt.ca$colcoord women.Burt.ca$colcoord <- women.Burt.Gctr plot(women.Burt.ca, map = "rowprincipal", labels = 0, pch = c(149,1,17,24), col = c("lightgreen","chocolate")) text(women.Burt.Gctr, labels = women.Burt.ca$colnames, pos = 2, offset = 0.3, col = "chocolate", cex = 0.8, font = 4) points(women.Burt.Fadj, pch = 19, col = "lightgreen", cex = 0.7) text(women.Burt.Fadj, labels = women.Burt.ca$rownames, pos = 2, offset = 0.3, col = "forestgreen", cex = 0.7, font = 2)
# Exhibit 10.6 - some of the demographic category labels have been # edited externally in the book's version
women.BurtS.ca <- ca(rbind(women.Burt, women.concat), suprow = 40:62) women.BurtS.ca$rowcoord[,1] <- -women.BurtS.ca$rowcoord[,1] women.BurtS.ca$colcoord[,1] <- -women.BurtS.ca$colcoord[,1] women.BurtS.Gctr <- sqrt(women.BurtS.ca$colmass) * women.BurtS.ca$colcoord women.BurtS.ca$colcoord <- women.BurtS.Gctr women.BurtS.ca$sv[1:9] <- (8/7) * (women.BurtS.ca$sv[1:9]-(1/8)) plot(women.BurtS.ca, map = "rowprincipal", what = c("none","all"), col = "chocolate", labels = 0, pch = c(20,1,17,24)) text(women.BurtS.Gctr, labels = women.Burt.ca$colnames, pos = 2, offset = 0.3, col = "chocolate", cex = 0.8, font = 4) women.BurtS.Fsup <- women.BurtS.ca$rowcoord[40:62,] %*% diag(women.BurtS.ca$sv) points(women.BurtS.Fsup, pch = 21, col = "forestgreen") text(women.BurtS.Fsup, labels = women.BurtS.ca$rownames[40:62], pos = 2, offset = 0.3, col = "forestgreen", cex = 0.7, font = 2)
# EXTRA: an easier way to do the MCA using the mjca function in the # ca package. # This package takes in original data, so one has to convert the # original data first (e.g., combining H4 and H5) # before reading into the function # So let's start again and read in the original data, cut out the # missing data, combine H4 and H5, and creating the interactive # variable, and then use the package. # Suppose 'women' contains the original data again (2471 x 12) dim(women)
[1] 2471 12
# Take out any rows with missing data women.9 <- rep(0, nrow(women)) for(i in 1:nrow(women)) { for(j in 1:12) { if(women[i,j] == 9) women.9[i] <- women.9[i] + 1 } } women <- women[women.9 == 0,] dim(women)
[1] 2107 12
# Replace 5's for variable 'H' with 4's women[women[,"H"] == 5, "H"] <- 4 # Create interactive variable women <- cbind(women, 6 * (women[,"g"]-1) + women[,"a"]) colnames(women)[13] <- "ga" # Now use mjca function directly plot(mjca(women[,1:8]), what = c("none","all")) plot(mjca(women[,1:8]), what = c("none","all"), map = "rowgreen") # Notice that to get exactly Exhibit 10.6, we will have to work via # the Burt matrix women.BURT <- mjca(women)$Burt dim(women.BURT)
[1] 70 70
# 8 questions + gender + marital + education + age + age*gender # 39 + 2 + 5 + 6 + 6 + 12 = 70 plot(ca(women.BURT[c(1:39, 42:52, 59:70), 1:39], suprow = 40:62), what = c("passive","all"), map = "rowgreen") # only problem here is the scaling factor for the rows, because the # inertias are not adjusted