CARME-N: Correspondence Analysis and Related Methods Network

General information

This page contains the R commands given in "Biplots in Practice".

You can find the data sets in the section 'data sets'

In this page commands that are given in the Computational Appendix of the book are given in brown, just as in the book.
Results of commands are given in green, just like in the book.
Comments about these commands are given in black.
EXTRA means commands given here (but not in the book) to obtain additional results contained in the corresponding chapter.
Both the comments and commands of EXTRA material are in blue, with results given in green.

R code

# Chapter 1: Biplots - the basic idea

# Read in European indicators data set (Exhibit 1.1) into 
# data frame EU2008 (see p.168)
# For example, on Windows PC's, copy table, and then type:  
# EU2008 <- read.table("clipboard")

windows(width = 11, height = 6)
par(mfrow = c(1,2))
plot(EU2008[,2:1], type = "n", cex.axis = 0.7, xlab = "GDP/capita", 
     ylab = "Purchasing power/capita")
text(EU2008[,2:1], labels = rownames(EU2008), col = "forestgreen", 
     font = 2)
plot(EU2008[,2:3], type = "n", cex.axis = 0.7, xlab = "GDP/capita", 
     ylab = "Inflation rate")
text(EU2008[,2:3], labels = rownames(EU2008), col = "forestgreen", 
     font = 2)

# Note: we use "forestgreen" for the texts above above, more like 
# the green in the book.
# Install the R package rgl for 3-d graphics, and then load 
# as follows:

library(rgl)
plot3d(EU2008[,c(2,1,3)], xlab = "GDP", ylab = "Purchasing power", 
       zlab = "Inflation", font = 2, col = "brown", type = "n")
text3d(EU2008[,c(2,1,3)], text = rownames(EU2008), font = 2, 
       col = "forestgreen")

# Chapter 2: Regression biplots

# 'bioenv' data set with 8 columns (first 8 columns of Exhibit 2.1) 
# in data frame bioenv. d is species d, y is pollution, x is depth

d <- bioenv[,4]
y <- bioenv[,6]
x <- bioenv[,7]

summary(lm(d ~ y + x))

Call:
lm(formula = d ~ y + x)

Residuals:
     Min       1Q   Median       3Q      Max 
-11.7001  -2.4684   0.1749   3.0563   9.1803 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.13518    6.25721   0.980  0.33554   
y           -1.38766    0.48745  -2.847  0.00834 **
x            0.14822    0.06684   2.217  0.03520 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 5.162 on 27 degrees of freedom
Multiple R-squared: 0.4416,     Adjusted R-squared: 0.4003 
F-statistic: 10.68 on 2 and 27 DF,  p-value: 0.0003831

# two ways of calculating regression coefficients
# 1. by doing the regression on standardized variables

ds <- (d-mean(d)) / sd(d)
ys <- (y-mean(y)) / sd(y)
xs <- (x-mean(x)) / sd(x)
summary(lm(ds ~ ys + xs))

Call:
lm(formula = ds ~ ys + xs)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.75515 -0.37028  0.02623  0.45847  1.37715 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)   
(Intercept)  2.487e-17  1.414e-01 1.76e-16  1.00000   
ys          -4.457e-01  1.566e-01   -2.847  0.00834 **
xs           3.472e-01  1.566e-01    2.217  0.03520 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.7744 on 27 degrees of freedom
Multiple R-squared: 0.4416,     Adjusted R-squared: 0.4003 
F-statistic: 10.68 on 2 and 27 DF,  p-value: 0.0003831

# 2. by direct calculation

lm(d ~ y + x)$coefficients[2] * sd(y) / sd(d)

         y 
-0.4457286

lm(d ~ y + x)$coefficients[3] * sd(x) / sd(d)

        x 
0.3471993

# finding and storing the standardized regression coefficients for 
# all five species - matrix B in (2.2)

B <- lm(bioenv[,1] ~ y+x)$coefficients[2:3] * c(sd(y),sd(x)) / 
        sd(bioenv[,1])
for(j in 2:5) {
  B <- rbind(B,lm(bioenv[,j] ~ y + x)$coefficients[2:3] * 
         c(sd(y),sd(x)) / sd(bioenv[j]))
  }
B

           y           x
B -0.7171713  0.02465266
  -0.4986038  0.22885450
   0.4910580  0.07424574
  -0.4457286  0.34719935
  -0.4750841 -0.39952072

# EXTRA: storing the variance explained for each regression 
# and computing overall variance explained
var.expl <- var(lm(bioenv[,1] ~ y + x)$fitted.values)
print(var(lm(bioenv[,j] ~ y + x)$fitted.values) / var(bioenv[,j]))

[1] 0.2351773

for(j in 2:5) {
  var.expl <- var.expl + var(lm(bioenv[,j] ~ y + x)$fitted.values)
  print(var(lm(bioenv[,j] ~ y + x)$fitted.values) / var(bioenv[,j]))
  }

[1] 0.3912441
[1] 0.2178098
[1] 0.4416404
[1] 0.2351773

# overall R2
var.expl / sum(apply(bioenv[,1:5], 2, var))

[1] 0.4145226

# EXTRA: doing the same as previous 'EXTRA', but for three predictor 
# variables, adding z = temperature
z <- bioenv[,8]
var.expl3 <- var(lm(bioenv[,1] ~ y + x + z)$fitted.values)
print(var(lm(bioenv[,1] ~ y + x + z)$fitted.values) / var(bioenv[,1]))

[1] 0.5569366

for(j in 2:5) {
  var.expl3 <- var.expl3 + 
                 var(lm(bioenv[,j] ~ y + x + z)$fitted.values)
  print(var(lm(bioenv[,j] ~ y + x + z)$fitted.values) / 
          var(bioenv[,j]))
  }

[1] 0.3924837
[1] 0.2181056
[1] 0.4416436
[1] 0.2463643

# overall R2
var.expl3 / sum(apply(bioenv[,1:5], 2, var))

[1] 0.4271042

# Drawing a regression biplot (Exhibit 2.5) using standardized values 
# of x and y.
# species labels put at the point, arrows 5% shorter
# Note: either close the previous graphics window, or open a new  
# one -on Windows machines open with command win.graph()

win.graph()
plot(xs, ys, xlab = "x*(depth)", ylab = "y*(pollution)", type = "n", 
     asp = 1, cex.axis = 0.7)
text(xs, ys, labels = rownames(bioenv), col = "forestgreen")
text(B[,2:1], labels = colnames(bioenv[,1:5]), col = "chocolate", 
     font = 4)
arrows(0, 0, 0.95*B[,2], 0.95*B[,1], col = "chocolate", angle = 15, 
       length = 0.1)

# (Notice that we have varied the color definitions to be more like 
# those in the book)

# Chapter 3: Generalized linear model biplots

# d0 is the fourth-root transformed d; do regression of d0 on 
# ys and xs

d0 <- d^0.25
summary(lm(d0 ~ ys + xs))

Call:
lm(formula = d0 ~ ys + xs)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.50933 -0.04458  0.13950  0.25483  0.85149 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.63908    0.09686  16.923 6.71e-16 ***
ys          -0.28810    0.10726  -2.686   0.0122 *  
xs           0.05959    0.10726   0.556   0.5831    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.5305 on 27 degrees of freedom
Multiple R-squared: 0.2765,     Adjusted R-squared: 0.2229 
F-statistic: 5.159 on 2 and 27 DF,  p-value: 0.01266

# EXTRA: saving all the coefficients for the double square-root 
# regressions (Exhibit 3.1)
# Also accumulating variance explained in each and computing overall
dsqrt_abcde       <- bioenv[,1:5]^0.25
dsqrt_abcde_coefs <- lm(dsqrt_abcde[,1] ~ ys + xs)$coefficients
dsqrt.var.expl    <- var(lm(dsqrt_abcde[,1] ~ ys + xs)$fitted.values)
print(var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values) / 
        var(dsqrt_abcde[,j]))

[1] 0.2284871

for(j in 2:5){
  dsqrt_abcde_coefs <- rbind(dsqrt_abcde_coefs,
                           lm(dsqrt_abcde[,j] ~ ys + xs)$coefficients)
  dsqrt.var.expl <- dsqrt.var.expl + 
                      var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values)
  print(var(lm(dsqrt_abcde[,j] ~ ys + xs)$fitted.values) / 
          var(dsqrt_abcde[,j]))
  }

[1] 0.3621872
[1] 0.1591791
[1] 0.2764936
[1] 0.2284871

# overall R2
dsqrt.var.expl / sum(apply(dsqrt_abcde[,1:5], 2, var))

[1] 0.3393836

# EXTRA: drawing the biplot of Exhibit 3.2
plot(xs, ys, xlab = "x*(depth)", ylab = "y*(pollution)", type = "n", 
     asp = 1, cex.axis = 0.7)
text(xs, ys, labels = rownames(bioenv), col = "forestgreen")
text(dsqrt_abcde_coefs[,3:2], labels = colnames(bioenv[,1:5]), 
     col = "chocolate", font = 4)
for(j in 1:5){ 
  arrows(0, 0, 0.95*dsqrt_abcde_coefs[j,3], 
         0.95*dsqrt_abcde_coefs[j,2], col = "chocolate", angle = 15, 
         length = 0.1)
  }

# performing Poisson regression for species d:

summary(glm(d ~ ys + xs, family = poisson))

Call:
glm(formula = d ~ ys + xs, family = poisson)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-4.4123  -0.7585   0.1637   0.8262   2.7565  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.29617    0.06068  37.838  < 2e-16 ***
ys          -0.33682    0.07357  -4.578 4.69e-06 ***
xs           0.19963    0.06278   3.180  0.00147 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 144.450  on 29  degrees of freedom
Residual deviance:  88.671  on 27  degrees of freedom
AIC: 208.55

Number of Fisher Scoring iterations: 5

# to get error deviance for Poisson regression of species d:

poisson.glm <- glm(d~ys+xs, family=poisson)
poisson.glm$deviance/poisson.glm$null.deviance

[1] 0.6138564

# EXTRA: saving all the coefficients for the Poisson regressions 
# (Exhibit 3.4)
pois_abcde_coefs <- glm(abcde[,1] ~ ys + xs, 
                        family = poisson)$coefficients
for(j in 2:5){ 
  pois_abcde_coefs <- rbind(pois_abcde_coefs, 
              glm(abcde[,j] ~ ys + xs, family=poisson)$coefficients)
  }
pois_abcde_coefs

                 (Intercept)         ys          xs
pois_abcde_coefs   2.1786692 -1.1254080 -0.06714289
                   1.8525985 -0.8122742  0.18286062
                   2.0411089  0.4174381  0.05321664
                   2.2961652 -0.3368244  0.19963421
                   0.8277491 -0.8234332 -0.56805285

# performing logistic regression for species d, after converting to 
# presence/absence:

d01 <- d > 0
summary(glm(d01 ~ ys + xs, family = binomial))

Call:
glm(formula = d01 ~ ys + xs, family = binomial)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2749   0.2139   0.2913   0.3884   1.2874  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)   2.7124     0.8533   3.179  0.00148 **
ys           -1.1773     0.6522  -1.805  0.07105 . 
xs           -0.1369     0.7097  -0.193  0.84708   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 19.505  on 29  degrees of freedom
Residual deviance: 15.563  on 27  degrees of freedom
AIC: 21.563

Number of Fisher Scoring iterations: 6

# to get error deviance for logistic regression of species d:

logistic.glm <- glm(d01 ~ ys + xs, family = binomial)
logistic.glm$deviance / logistic.glm$null.deviance

[1] 0.7979165

# EXTRA: saving all the coefficients for the logistic regressions 
# (Exhibit 3.5)
abcde <- bioenv[,1:5]
abcde01 <- abcde > 0
logit_abcde_coefs <- glm(abcde01[,1] ~ ys + xs, 
                         family = binomial)$coefficients
for(j in 2:5){ 
  logit_abcde_coefs <- rbind(logit_abcde_coefs, 
        glm(abcde01[,j] ~ ys + xs, family = binomial)$coefficients)
  }
logit_abcde_coefs

                  (Intercept)         ys         xs
logit_abcde_coefs   2.3842273 -2.8885410  0.8631339
                    1.2731575 -1.4183387 -0.1431720
                    0.8309807  0.9726882  0.3148828
                    2.7124406 -1.1772989 -0.1368686
                    0.2532229 -1.2796484 -0.7858995

# EXTRA: another example of a plot (Example 3.6)
# This time the arrows are drawn to the exact point and labels 
# are offset
plot(xs, ys, type = "n", xlab = "x* (depth)", ylab = "y* (pollution)", 
     ylim = c(-3,2.6), cex.axis = 0.7, asp = 1)
text(xs, ys, labels = rownames(bioenv), col = "forestgreen", cex=0.8)
arrows(0, 0, logit_abcde_coefs[,3], logit_abcde_coefs[,2], 
       col = "chocolate", length = 0.1, angle = 15, lwd = 1)
text(1.05*logit_abcde_coefs[,3:2], labels = colnames(bioenv[,1:5]), 
     col = "chocolate", font = 4, cex = 1.1)

# Chapter 4: Multidimensional scaling biplots

# 'countries' data set -- dissimilarities between 13 countries 
# (Exhibit 4.1) -- 
# assumed to have been read into data frame MT_matrix 
# R function cmdscale performs classical multidimensional scaling
# Exhibit 4.2 obtained as follows

plot(cmdscale(MT_matrix), type = "n", xlab = "dim 1", ylab = "dim 2", 
     asp = 1)
text(cmdscale(MT_matrix), labels = colnames(MT_matrix), 
     col = "forestgreen")

# EXTRA (*only* for MDS experts -- there is a bug in the cmdscale 
# function... read on...)
# The eigenvalues of the MDS (notice that warnings are given that 
# some are negative)
cmdscale(MT_matrix, eig = TRUE, k = 1)$eig

# Function cmdscale does not allow obtaining the last (13th) 
# eigenvalue, and there is thus a bug in the computation of the 
# goodness-of-fit (GOF), which doesn't take last dimension into 
# account
cmdscale(MT_matrix, eig = TRUE)$GOF

[1] 0.6177913 0.6500204

# In the above the first reported GOF value is wrong. It is based on 
# the first 12 eigenvalues:
sum(abs(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig))

[1] 264.5464

# (it is not possible to set k=13)
# Here is the correct total, which includes the last (negative) 
# eigenvalue
MT_sum <- sum(abs(eigen(-0.5*cmdscale(MT_matrix, eig = TRUE, k = 12, 
           x.ret = TRUE)$x)$values))
MT_sum

[1] 288.0903

# The following verifiess the second GOF value, which is correct:
cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / 
  sum(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:8])

[1] 0.3821252 0.2678952

# (0.6500204 = 0.3821252 + 0.2678952)
# And the following revised calculation gives what the first GOF 
# should have been:
cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / MT_sum

[1] 0.3334984 0.2338045

# that is, the sum for the two-dimensional solution is not 0.6177913, 
# but rather
sum(cmdscale(MT_matrix, eig = TRUE, k = 12)$eig[1:2] / MT_sum)

[1] 0.5673029

# 'atributes' data set -- ratings of countries on 6 attributes 
# (Exhibit 4.3) -- 
# assumed to have been read into data frame MT_ratings

# keep coordinates of MDS plot in MT_dims

MT_dims <- cmdscale(MT_matrix, eig = TRUE, k = 2)$points
colnames(MT_dims) <- c("dim1","dim2")

# calculate the regression coefficients of the 6 attributes on the 
# coordinates (Exhibit 4.4)

MT_coefs <- lm(MT_ratings[,1] ~ MT_dims[,1] + 
                                MT_dims[,2])$coefficients
for(j in 2:6){ 
  MT_coefs <- rbind(MT_coefs, MT_coefs <- 
          lm(MT_ratings[,j] ~ MT_dims[,1] + MT_dims[,2])$coefficients)
  }

# plot the regression coefficients on the MDS plot (Exhibit 4.5)

plot(cmdscale(MT_matrix), type = "n", xlab = "dim 1", ylab = "dim 2", 
     asp = 1)
text(cmdscale(MT_matrix), labels = colnames(MT_matrix), 
     col = "forestgreen")
arrows(0, 0, MT_coefs[,2], MT_coefs[,3], col = "chocolate", 
       length = 0.1, angle = 10)
text(1.2*MT_coefs[,2:3], labels = colnames(MT_ratings), 
     col = "chocolate", font = 4, cex = 0.8)

# Function to calculate chi-square distances between row or column
# profiles of a matrix
# Usage: chidist(N,1) calculates the chi-square distances between row 
# profiles
#        (for row profiles, chidist(N) is sufficient)
#        chidist(N,2) calculates the chi-square distances between 
#        column profiles

chidist <- function(mat, rowcol = 1) {
  if(rowcol == 1) {
    prof <- mat / apply(mat, 1, sum)
    rootaveprof <- sqrt(apply(mat, 2, sum) / sum(mat))
    }
  if(rowcol == 2) {
    prof <- t(mat) / apply(mat, 2, sum)
    rootaveprof <- sqrt(apply(mat, 1, sum) / sum(mat))
    }
  dist(scale(prof, FALSE, rootaveprof))  
  }

# Note: functions defined in this way are added as objects to your 
# workspace, and are saved if the workspace is saved, otherwise they
# are lost

# Compute chi-square distances between row profiles, perform 
# classical MDS and plot

abcde <- bioenv[,1:5]
abcde_mds <- cmdscale(chidist(abcde), eig = TRUE, k = 4)
100 * abcde_mds$eig / sum(abcde_mds$eig)

[1] 52.364447 22.014980 16.187784  9.432789

plot(abcde_mds$points[,1:2], type = "n", xlab = "dim 1", 
     ylab = "dim 2", xlim = c(-1.2,1.6), ylim = c(-1.1,1.8), asp = 1) 
text(abcde_mds$points[,1:2], labels = rownames(abcde), 
     col = "forestgreen")

# Now add columns (species) as biplot vectors
# First convert to row profiles and standardize in the chi-squared way

abcde_prof <- abcde / apply(abcde, 1, sum)
abcde_prof_stand <- t(t(abcde_prof) / sqrt(apply(abcde, 2, sum) / 
                                           sum(abcde)))

# perform regressions and save coefficients for plotting

mds_coefs <- lm(abcde_prof_stand[,1] ~ abcde_mds$points[,1] + 
                abcde_mds$points[,2])$coefficients
for(j in 2:5){
  mds_coefs <- rbind(mds_coefs, MT_coefs <- 
                 lm(abcde_prof_stand[,j] ~ abcde_mds$points[,1] + 
                    abcde_mds$points[,2])$coefficients)
  }
arrows(0, 0, mds_coefs[,2], mds_coefs[,3], col = "chocolate", 
       length = 0.1, angle = 10)
text(1.1*mds_coefs[,2:3], labels = colnames(abcde), col = "chocolate", 
     font = 4, cex = 0.9)

# Assume that the sediment vector has been read into a character 
# vector (e.g., in Windows, copy the vector from the Excel file, 
# without the column label, and then type in the command:
#   sediment <- scan("clipboard", what="character")   

# convert sediment (consisting of "C","G" and "S") to factor object

sediment <- as.factor(sediment)

# compute mean positions in biplot and plot them

sediment.means <- cbind(tapply(abcde_mds$points[,1], sediment, mean), 
                        tapply(abcde_mds$points[,2], sediment, mean))
text(sediment.means[,1], sediment.means[,2], labels = c("C","G","S"), 
     font = 4, cex = 0.8, col = "chocolate")

# convert sediment categories to dummy variables

clay01   <- sediment == "C"
gravel01 <- sediment == "G"
sand01   <- sediment == "S"

# compute logistic regression coefficients and plot them connected 
# to origin

sediment_coefs <- glm(as.numeric(clay01) ~ abcde_mds$points[,1] + 
    abcde_mds$points[,2], family = "binomial")$coefficients
sediment_coefs <- rbind(sediment_coefs, 
    glm(as.numeric(gravel01) ~ abcde_mds$points[,1] + 
        abcde_mds$points[,2], family = "binomial")$coefficients)
sediment_coefs <- rbind(sediment_coefs, 
    glm(as.numeric(sand01) ~ abcde_mds$points[,1] + 
        abcde_mds$points[,2], family = "binomial")$coefficients)
segments(0, 0, sediment_coefs[,2], sediment_coefs[,3], 
         col = "chocolate")
text(sediment_coefs[,2:3], labels = c("C","G","S"), col = "chocolate", 
     font = 4)

# Chapter 5: Reduced-dimension biplots

# Small example from Chapter 1, biplotted
# (Note that the last two singular values, theoretically zero, and 
# their respective singular vectors, can turn out slightly differently 
# in different versions of R)

Y <- matrix(c(8,5,-2,2,4,2,0,-3,3,6,2,3,3,-3,-6,-6,-4,1,-1,-2), 
            nrow = 5)
svd(Y)

$d
[1] 1.412505e+01 9.822577e+00 6.351831e-16 3.592426e-33

$u
           [,1]       [,2]        [,3]          [,4]
[1,] -0.6634255 -0.4574027 -0.59215653  2.640623e-35
[2,] -0.3641420 -0.4939878  0.78954203  2.167265e-34
[3,]  0.2668543 -0.3018716 -0.06579517 -9.128709e-01
[4,] -0.2668543  0.3018716  0.06579517 -1.825742e-01
[5,] -0.5337085  0.6037432  0.13159034 -3.651484e-01

$v
           [,1]       [,2]       [,3]       [,4]
[1,] -0.7313508 -0.2551980 -0.6276102 -0.0781372
[2,] -0.4339970  0.4600507  0.2264451  0.7407581
[3,]  0.1687853 -0.7971898  0.0556340  0.5769791
[4,]  0.4982812  0.2961685 -0.7427873  0.3350628

colnames(Y) <- c("A","B","C","D")
rowcoord <- svd(Y)$u %*% diag(sqrt(svd(Y)$d))
colcoord <- svd(Y)$v %*% diag(sqrt(svd(Y)$d))
plot(rbind(rowcoord,colcoord)[,1:2], type = "n", xlab = "", ylab = "", 
     asp = 1, cex.axis = 0.7)
abline(h = 0, v = 0, lty = "dotted")
text(rowcoord[,1:2], labels = 1:5, col = "forestgreen")
text(colcoord[,1:2], labels = colnames(Y), col = "chocolate", 
     font = 3)

# Chapter 6: Principal component biplots

# 'attributes' data set already in data frame MT_ratings (Chap. 4)

# centre the data by the column means

MT_means <- apply(MT_ratings, 2, mean)
MT_Y     <- sweep(MT_ratings, 2, MT_means)

# compute matrix with equal row and column weights and compute SVD

MT_Y   <- MT_Y/sqrt(nrow(MT_Y)*ncol(MT_Y))
MT_SVD <- svd(MT_Y)

# form biplot: row principal, column standard coordinates

MT_F <- sqrt(nrow(MT_Y))*MT_SVD$u%*%diag(MT_SVD$d)
MT_G <- sqrt(ncol(MT_Y))*MT_SVD$v
plot(rbind(MT_F[,1:2],MT_G[,1:2]), type = "n", asp = 1, 
     xlim = c(-3.6,2.3), xlab = "dim 1", ylab = "dim 2", 
     cex.axis = 0.7) 
text(MT_F[,1:2], labels = rownames(MT_ratings), col = "forestgreen", 
     cex = 0.8) 
arrows(0, 0, MT_G[,1], MT_G[,2], col = "chocolate", length = 0.1, 
       angle = 10)
text(c(1.07,1.3,1.07,1.35,1.2,1.4)*MT_G[,1], 
     c(1.07,1.07,1.05,1,1.16,1.1)*MT_G[,2], 
     labels = colnames(MT_ratings), col = "chocolate", font = 4, 
     cex = 0.8)

# covariance biplot: row standard, column principal coordinates

MT_F <- sqrt(nrow(MT_Y))*MT_SVD$u
MT_G <- sqrt(ncol(MT_Y))*MT_SVD$v%*%diag(MT_SVD$d)
plot(rbind(MT_F[,1:2],MT_G[,1:2]), type = "n", asp = 1, 
     xlim = c(-3.6, 2.3), xlab = "dim 1", ylab = "dim 2", 
     cex.axis = 0.7) 
text(MT_F[,1:2], labels = rownames(MT_ratings), col = "forestgreen", 
     cex = 0.8) 
arrows(0, 0, MT_G[,1], MT_G[,2], col = "chocolate", length = 0.1, 
       angle = 10)
text(c(1.07,1.2,1.07,1.25,1.07,1.3)*MT_G[,1], 
     c(1.07,1.07,1.04,1.02,1.16,1.07)*MT_G[,2], 
     labels = colnames(MT_ratings), col = "chocolate", font = 4, 
     cex = 0.8)

# scree plot

MT_percents <- 100*MT_SVD$d^2/sum(MT_SVD$d^2)
MT_percents <- MT_percents[seq(6,1)]
barplot(MT_percents, horiz = TRUE, cex.axis = 0.7)

# Chapter 7: Log-ratio biplots

# get the US Arrests data provided in R

data(USArrests)

# perform (weighted) log-ratio analysis on columns 1, 2 and 4
# rm and cm are the row and column margins; 
# mr and mc are the weighted means used in the double-centering

N    <- USArrests[,c(1,2,4)]
P    <- N / sum(N)
rm   <- apply(P, 1, sum)
cm   <- apply(P, 2, sum)
Y    <- as.matrix(log(P))
mc   <- t(Y) %*% as.vector(rm)
Y    <- Y - rep(1, nrow(P)) %*% t(mc)
mr   <- Y %*% as.vector(cm)
Y    <- Y - mr %*% t(rep(1,ncol(P)))
Z    <- diag(sqrt(rm)) %*% Y %*% diag(sqrt(cm))
svdZ <- svd(Z)

# compute form biplot coordinates from results of SVD

USA_F <- diag(1/sqrt(rm)) %*% svdZ$u[,1:2] %*% diag(svdZ$d[1:2])
USA_G <- diag(1/sqrt(cm)) %*% svdZ$v[,1:2]

# biplot - axes with different scales plotted individually

plot(rbind(USA_F, USA_G/20), xlim = c(-0.35,0.45), 
     ylim = c(-0.18,0.23), asp = 1, type = "n", xlab = "", ylab = "", 
     xaxt = "n", yaxt = "n", cex.axis = 0.7)
axis(1, cex.axis = 0.7, col.axis = "forestgreen", 
     col.ticks = "forestgreen")
axis(2, cex.axis = 0.7, col.axis = "forestgreen", 
     col.ticks = "forestgreen", at = seq(-0.2, 0.2, 0.2))
axis(3, cex.axis = 0.7, col.axis = "chocolate", 
     col.ticks = "chocolate", at = seq(-0.4, 0.4, 0.2), 
     labels = seq(-8,8,4))
axis(4, cex.axis = 0.7, col.axis = "chocolate", 
     col.ticks = "chocolate", at = seq(-0.2, 0.2, 0.2), 
     labels = seq(-4,4,4))
text(USA_F, labels = rownames(N), col = "forestgreen", font = 2, 
     cex = 0.8)
text(USA_G/20, labels = colnames(N), col = "chocolate", font = 4, 
     cex = 0.9)

# total variance (inertia): either sum of squares of Z matrix, or sum 
# of squared singular values

sum(Z*Z)

[1] 0.01790182

sum(svdZ$d^2)

[1] 0.01790182

# fish morphology example: assume data read into data frame fish:
#   -- first two columns are sex, habitat
#   -- remaining 26 columns are the morphometric data

fish.morph<-fish[,3:ncol(fish)]

# log-ratio analysis is performed in the same way as above

N    <- fish.morph
P    <- N/sum(N)
rm   <- apply(P, 1, sum)
cm   <- apply(P, 2, sum)
Y    <- as.matrix(log(P))
mc   <- t(Y) %*% as.vector(rm)
Y    <- Y - rep(1,nrow(P)) %*% t(mc)
mr   <- Y %*% as.vector(cm)
Y    <- Y - mr %*% t(rep(1,ncol(P)))
Z    <- diag(sqrt(rm)) %*% Y %*% diag(sqrt(cm))
svdZ <- svd(Z)

# total variance

sum(Z*Z)

[1] 0.001960883

# calculate sex-habitat interaction and set up labels for four 
# sex*habitat groups

fish.sexhab                   <- 2*(fish[,2]-1)+fish[,1]
fish.labels                   <- rep("fL", nrow(fish))
fish.labels[fish.sexhab=="2"] <- "mL"
fish.labels[fish.sexhab=="3"] <- "fP"
fish.labels[fish.sexhab=="4"] <- "mP"

# EXTRA: log-ratio biplot of morphological data 
# Exhibit 7.3 (first axis coordinates reversed)
fish_F     <- diag(1/sqrt(rm)) %*% svdZ$u %*% diag(svdZ$d)
fish_G     <- diag(1/sqrt(cm)) %*% svdZ$v
fish_F[,1] <- -fish_F[,1]
fish_G[,1] <- -fish_G[,1]
plot(rbind(fish_F,fish_G/50), type = "n", xlab = "", ylab = "", 
     asp = 1, xaxt = "n", yaxt = "n", cex.axis = 0.7)
axis(1, col.axis = "forestgreen", col.ticks = "forestgreen")
axis(2, col.axis = "forestgreen", col.ticks = "forestgreen")
axis(3, col.axis = "chocolate", col.ticks = "chocolate", 
     at = seq(-0.06, 0.06, 0.02), labels = seq(-3,3,1))
axis(4, col.axis = "chocolate", col.ticks = "chocolate", 
     at = seq(-0.04, 0.04, 0.02), labels = seq(-2, 2, 1))
text(fish_F, labels = fish.labels, col = "forestgreen", font = 2, 
     cex = 0.7)
text(fish_G/50, labels = colnames(fish.morph), col = "chocolate", 
     font = 4, cex=0.8)

# scatterplot of log-ratios in Exhibit 7.4

logFdlFal <- log(fish.morph[,"Fdl"] / fish.morph[,"Fal"])	
logFdwFal <- log(fish.morph[,"Fdw"] / fish.morph[,"Fal"])
plot(logFdlFal, logFdwFal, asp = 1, pch = 24, cex = 0.7, 
     xlab = "log(Fdl/Fal)", ylab = "log(Fdw/Fal)", col = "forestgreen")
abline(a = 0.0107, b = 0.707, lty = 2, col = "chocolate")

# predictions of Fdw in Exhibit 7.5

Fdw_pred <- 1.0108*fish.morph[,"Fdl"]^0.707*fish.morph[,"Fal"]^0.293
plot(Fdw_pred, fish.morph[,"Fdw"], xlim = c(18,30), ylim = c(18,30), 
     pch = 24, cex = 0.7, xlab = "predicted Fdw", ylab = "actual Fdw", 
     col = "forestgreen")
abline(a = 0, b = 1, lty = 2, col = "chocolate")

# correlation between predicted and observed

cor(Fdw_pred, fish.morph[,"Fdw"])

[1] 0.7496034

# Chapter 8: Correspondence analysis biplots

# Data set 'smoking' is in the ca package (to be installed from CRAN 
# first, before loading)
# Its name is actually 'smoke' (Exhibit 8.1)

library(ca)
data(smoke)

# computation of CA

N    <- smoke
P    <- N/sum(N)
rm   <- apply(P, 1, sum)
cm   <- apply(P, 2, sum)
Dr   <- diag(rm)
Dc   <- diag(cm)
Z    <- diag(sqrt(1/rm))%*%(as.matrix(P)-rm%*%t(cm))%*%diag(sqrt(1/cm))
svdZ <- svd(Z)

# principal coordinates of rows, standard coordinates of columns

smoke_F <- diag(1/sqrt(rm))%*%svdZ$u %*%diag(svdZ$d)
smoke_G <- diag(1/sqrt(cm))%*%svdZ$v

# plotting (notice that by default the first two columns are used)

plot(rbind(smoke_F, smoke_G), type = "n", xlab = "Dim 1", 
     ylab = "Dim 2", asp = 1)
text(smoke_F, labels = rownames(smoke), col = "forestgreen", font = 2, 
     cex = 0.7)
text(smoke_G, labels = colnames(smoke), col = "chocolate", font = 2, 
     cex = 0.8)

# obtaining Exhibit 8.2 using ca package

plot(ca(smoke), map = "rowprincipal", 
     col = c("forestgreen","chocolate"))

# Summary of CA, with contributions to inertia

summary(ca(smoke))

Principal inertias (eigenvalues):

 dim    value      %   cum%   scree plot               
 1      0.074759  87.8  87.8  *************************
 2      0.010017  11.8  99.5  ***                      
 3      0.000414   0.5 100.0                           
        -------- -----                                 
 Total: 0.085190 100.0                                 


Rows:
    name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
1 |   SM |   57  893   31 |  -66  92   3 | -194 800 214 |
2 |   JM |   93  991  139 |  259 526  84 | -243 465 551 |
3 |   SE |  264 1000  450 | -381 999 512 |  -11   1   3 |
4 |   JE |  456 1000  308 |  233 942 331 |   58  58 152 |
5 |   SC |  130  999   71 | -201 865  70 |   79 133  81 |

Columns:
    name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
1 | none |  316 1000  577 | -393 994 654 |  -30   6  29 |
2 | lght |  233  984   83 |   99 327  31 |  141 657 463 |
3 | medm |  321  983  148 |  196 982 166 |    7   1   2 |
4 | hevy |  130  995  192 |  294 684 150 | -198 310 506 |

# suppose the 'benthos' data have been read into data frame benthos
# calculation of CA and the row contributions (ca package stores 
# standard coordinates in $rowcoord)

benthos.ca     <- ca(benthos)
benthos.F      <- benthos.ca$rowcoord %*% diag(benthos.ca$sv)
benthos.rowcon <- benthos.ca$rowmass * (benthos.F[,1]^2 + 
                    benthos.F[,2]^2) / sum(benthos.ca$sv[1:2]^2)

# set up a vector of species labels where those with less than 0.01 
# contribution are labelled "á"

benthos.names                        <- rownames(benthos)
benthos.names[benthos.rowcon < 0.01] <- "á"

# nonlinear transformation of contributions for plotting of labels

benthos.rowsize <- log(1+exp(1)*benthos.rowcon^0.3)
benthos.rowsize[benthos.rowcon<0.01] <- 1

# plot the asymmetric map with label size related to contribution 
# (Exhibit 8.3)

FF <- benthos.ca$rowcoord
GG <- benthos.ca$colcoord %*% diag(benthos.ca$sv)
plot(c(FF[,1],GG[,1]), c(FF[,2],GG[,2]), type = "n", xlab = "", 
     ylab = "", asp = 1)
text(FF[,1], FF[,2], labels = benthos.names, cex = benthos.rowsize, 
     col = "chocolate", font = 4)
text(GG[,1], GG[,2], labels = colnames(benthos), cex = 0.8, 
     col = "forestgreen", font = 2)

# The contribution biplot can be obtained using the map="colgreen" 
# option in the ca package where "green" refers to Greenacre, not a 
# colour! (The option "colgab" refers to Gabriel...)
# (remember that the columns are the sites, and the species the rows, 
# which is the tranpose of the usual format).

# EXTRA: First substitute the species labels, using blank labels for 
# low contributions
benthos.names[benthos.rowcon < 0.01] <- " "
benthos.ca$rownames <- benthos.names

# Contribution biplot of Exhibit 8.4

plot(benthos.ca, map = "colgreen", mass = c(1,0), 
     col = c("chocolate","forestgreen"), pch = c(17,24,16,1))

# draw lines to the species points

segments(0, 0, benthos.ca$rowcoord[,1]*sqrt(benthos.ca$rowmass), 
         benthos.ca$rowcoord[,2]*sqrt(benthos.ca$rowmass), 
         col = "chocolate")

# Chapter 9: Multiple correspondence analysis biplots I

# The original Spanish data (N=2471) is in the Excel file 
# women_Spain2002_original.xls
# Later (in Chapter 10) we will show how to create the concatenated 
# matrix in R.
# Here we use the prepared concatenated file for the reduced sample 
# (N=2107) which has no missing data and where categories H4 and H5 
# have already been combined -- this concatenated matrix (Exhibit 9.1) 
# is in the Excel file women_Spain2002_concat.xls
# So we assume that this concatenated table has been read into the data 
# frame 'women.concat'

# the symmetric CA map, which is the default plotting option of the 
# ca package

plot(ca(women.concat))

# to change the sign of the second axis (if required)

women.ca <- ca(women.concat)
women.ca$rowcoord[,2] <- -women.ca$rowcoord[,2]
women.ca$colcoord[,2] <- -women.ca$colcoord[,2]
plot(women.ca)

# to obtain Exhibit 9.2: plot symbols, compute principal coordinates, 
# add labels and inertias
# NOTE: after plotting, adjust the window to accommodate the points 
# and labels as in Exhibit 9.2. Repeat the plotting again, if 
# necessary, in that window to reposition labels

par(cex.axis = 0.7)
plot(women.ca, labels = 0, col = c("forestgreen", "chocolate"))
women.F <- women.ca$rowcoord %*% diag(women.ca$sv)
women.G <- women.ca$colcoord %*% diag(women.ca$sv)
text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, 
     col = "forestgreen", cex = 0.8, font = 2)
text(women.G, labels = women.ca$colnames, pos = 4, offset = 0.3, 
     col = "chocolate", cex = 0.8, font = 4)
text(max(women.G[,1]), 0, "0.0571 (82.1%)", adj = c(0.6,-0.6), 
     cex = 0.7) 
text(0, max(women.G[,2]), "0.0030 (4.4%)", adj = c(-0.1,-3), 
     cex = 0.7)

# asymmetric biplot of Exhibit 9.3
# (again, if you need to adjust the window, repeat plotting in new 
# window to position labels better)

plot(women.ca, map = "rowprincipal", labels = c(0,2), 
     col = c("forestgreen", "chocolate"))

# EXTRA: same as above, but labelling categories in colour
plot(women.ca, map = "rowprincipal", labels = 0, 
     col = c("forestgreen", "chocolate"))
text(women.ca$colcoord, labels = women.ca$colnames, pos = 4, 
     offset = 0.3, col = "chocolate", cex = 0.8, font = 4)

# EXTRA: Exhibit 9.4 -- showing categories 'e5' and 'ma4' connected 
# to their 8 corresponding variables
# computation of coordinates of these demographic categories per 
# substantive variable (the slight complication here is that the last 
# variable has 4 categories, while all the others have 5)
women.e5 <- as.numeric(women.concat["e5",])
women.e5 <- women.e5 / sum(women.e5/8)
women.e5.coord <- apply(women.e5[36:39] * 
                  women.ca$colcoord[36:39,], 2, sum)
for(j in 7:1){
  women.e5.coord <- rbind(apply(women.e5[(5*(j-1)+1):(5*j)] * 
       women.ca$colcoord[(5*(j-1)+1):(5*j),], 2, sum), women.e5.coord)
  }
women.ma4 <- as.numeric(women.concat["ma4",])
women.ma4 <- women.ma4 / sum(women.ma4/8)
women.ma4.coord <- apply(women.ma4[36:39] * 
                         women.ca$colcoord[36:39,], 2, sum)
for(j in 7:1){
  women.ma4.coord <- rbind(apply(women.ma4[(5*(j-1)+1):(5*j)] * 
      women.ca$colcoord[(5*(j-1)+1):(5*j),], 2, sum), women.ma4.coord)
  }

# plot them with principal coordinates of rows and link to their 
# centroid
plot(rbind(women.F, women.e5.coord, women.ma4.coord), type = "n", 
     asp = 1, xlab = "", ylab = "")
points(women.F, pch = 19, col = "forestgreen")
text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, 
     col = "forestgreen", cex = 0.9, font = 2)
points(rbind(women.e5.coord, women.ma4.coord), pch = 22, 
       col = "chocolate")
text(rbind(women.e5.coord, women.ma4.coord), labels=LETTERS[1:8], 
     pos = 4, offset = 0.3, col = "chocolate", cex = 0.9, font = 4)
rownames(women.F) <- women.ca$rownames
segments(women.F["e5",1], women.F["e5",2], women.e5.coord[,1], 
         women.e5.coord[,2], col = "chocolate")
segments(women.F["ma4",1], women.F["ma4",2], women.ma4.coord[,1], 
         women.ma4.coord[,2], col = "chocolate")
abline(h = 0, v = 0, lty = 3, col = "gray")

# Exhibit 9.6: contribution biplot using map="rowgreen" option

par(mar=c(3,2,1,1))         # (optional, reduces margins around plot)

plot(women.ca, map  ="rowgreen", mass = c(FALSE,TRUE), 
     col = c("forestgreen", "chocolate"))

# same again, but labelling points in colour after computing their 
# positions and also adding average age and gender groups, as shown in 
# Exhibit 9.6

plot(women.ca, map = "rowgreen", labels = 0, mass = c(FALSE,TRUE), 
     col = c("forestgreen", "chocolate"))
women.sex   <- c(rep("m", 6),rep("f", 6)) 
women.age   <- rep(c("a1","a2","a3","a4","a5","a6"), 2)
women.sex.F <- cbind(tapply(women.F[12:23,1], women.sex, mean), 
                     tapply(women.F[12:23,2], women.sex, mean))
women.age.F <- cbind(tapply(women.F[12:23,1], women.age, mean), 
                     tapply(women.F[12:23,2], women.age, mean))
points(rbind(women.sex.F, women.age.F), pch = 21, col = "forestgreen", 
       cex = 0.7)
text(rbind(women.sex.F, women.age.F), 
     labels = c("f","m","a1","a2","a3","a4","a5","a6"), pos = 4, 
     offset = 0.3, col = "forestgreen", cex = 0.8, font = 2)
text(women.F, labels = women.ca$rownames, pos = 4, offset = 0.3, 
     col = "forestgreen", cex = 0.8, font = 2)
text(sqrt(women.ca$colmass)*women.ca$colcoord, 
     labels = women.ca$colnames, pos = 4, offset = 0.3, 
     col = "chocolate", cex = 0.8, font = 4)

# Chapter 10: Multiple correspondence analysis biplots II

# EXTRA: We make rather a long detour here to show how to 
#    1. read in the original Spanish data (N=2471) and remove cases 
#       with missing data
#    2. compute the corresponding indicator matrix
#    3. compute the Burt matrix and extract the concatenated matrix 
#       used in Chap. 9

# First read in the original data (2471x12 matrix in 
# women_Spain2002_original.xls) into data frame 'women'. We count 
# missing data (=9) for each case, and then keep the rows that have 
# no missing data
dim(women)

[1] 2471   12

women.9 <- rep(0, nrow(women))
for(i in 1:nrow(women)) {
  for(j in 1:12) {
    if(women[i,j] == 9) women.9[i] <- women.9[i]+1
  }
}
women <- women[women.9 == 0,]

dim(women)

[1] 2107   12

# Compute the indicator matrix - there are 8x5 categories for the 
# substantive variables and 2+5+6+6 categories for the demographics, 
# totalling 59 
women.Z <- matrix(0, nrow = nrow(women), ncol = 59)

# We admit the following single statement is cryptic but it works!
# (remember that education categories start at 0)
women.Z[cbind(as.numeric(matrix(rep(1:nrow(women), 12), byrow = TRUE, 
          nrow = 12)), as.numeric(t(women[,1:12]) + 
          c(seq(0,35,5),40,42,48,53)))] <- 1
colnames(women.Z) <- c(sort(paste(c("A","B","C","D","E","F","G","H"),
        rep(1:5, 8), sep = "")),"m","f","ma","wi","di","se","si",
        paste("e", 0:5, sep = ""), paste("a", 1:6, sep = ""))
 
# column sums of women.Z are the frequencies of each category
apply(women.Z,2,sum)

  A1   A2   A3   A4   A5   B1   B2   B3   B4   B5   C1   C2   C3   C4   
 397  932   91  598   89  124  956  237  665  125  165  988  237  620 
  C5   D1   D2   D3   D4   D5   E1   E2   E3   E4   E5   F1   F2   F3
  97   92  763  298  741  213  130  749  249  747  232  506 1193  147
  F4   F5   G1   G2   G3   G4   G5   H1   H2   H3   H4   H5    m    f
 236   25   94  425  180  888  520 1105  917   52   31    2  972 1135
  ma   wi   di   se   si   e0   e1   e2   e3   e4   e5   a1   a2   a3
1169  172   45   69  652  240  522  580  431  166  168  334  453  374 
  a4   a5   a6 
 325  251  370

# amalgamate the H4 and H5 columns (39 & 40) into column 39 and then 
# delete the 40th column
women.Z[,39] <- women.Z[,39] + women.Z[,40]
women.Z      <- women.Z[,-40]
colnames(women.Z)[39] <- "H4,5"

# Compute the 39x39 Burt matrix pf the substantive variables from the 
# indicator matrix...
women.Burt <- t(women.Z[,1:39]) %*% women.Z[,1:39]

# ... or the Burt matrix is a by-product of the mjca function in the 
# ca package
women.Burt <- mjca(women[,1:8])$Burt

# after which you have to combine the categories H4 and H5
women.Burt[,39] <- women.Burt[,39]+women.Burt[,40]
women.Burt[39,] <- women.Burt[39,]+women.Burt[40,]
women.Burt <- women.Burt[-40,-40]
rownames(women.Burt)[39] <- colnames(women.Burt)[39] <- "H4,5"

# for the concatenated table we need to first create the interactive 
# variable and add it to the original data then compute the full Burt 
# matrix, then extract the required part of the Burt matrix and 
# combine H4 and H5 again
women               <- cbind(women, 6*(women[,"g"]-1)+ women[,"a"])
colnames(women)[13] <- "ga"
women.concat        <- mjca(women)$Burt[40+c(3:13,20:31),1:40]
women.concat[,39]   <- women.concat[,39]+women.concat[,40]
women.concat        <- women.concat[,1:39]
colnames(women.concat)[39] <- "H4,5"

# The matrix is correct, now just the labels of the interactively 
# coded variable have to be redefined!
# Presently they are ga1, ga2, ..., ga12; we want to code the first 6 
# as male, and the last 6 as female
rownames(women.concat)[12:23] <- paste(c(rep("ma", 6), 
                                       rep("fa", 6)), 1:6, sep = "")

# After that detour, back to MCA!
# The total inertia of the Burt matrix (check that it's 39x39 first)

dim(women.Burt)

[1] 39 39

sum(ca(women.Burt)$sv^2)

[1] 0.677625

# Adjusted total ienrtia (average off-diagonal inertia)

(8/7)*(sum(ca(women.Burt)$sv^2)-(31/64))

[1] 0.2208571

# Before continuing, check that you have the correct indicator matrix, 
# it should be 2107 x 39 in this case.
# Above we have described either reading in the full indicator matrix 
# from an Excel file, or computing it.
# In either case you probably have to extract the first 40 columns 
# corresponding to the substantive variables, then do the combining of 
# H4 and H5 in the indicator matrix:

women.Z      <- women.Z[,1:40]
women.Z[,39] <- women.Z[,39] + women.Z[,40]
women.Z      <- women.Z[,-40]

dim(women.Z)

[1] 2107 39

# total inertia of the 2107x39 indicator matrix:

sum(ca(women.Z)$sv^2)

[1] 3.875

# Exhibit 10.3 (with vertical axis possibly inverted, depending on R 
# version). More code given here for category labelling

par(cex.axis = 0.7)
plot(ca(women.Z), map = "rowprincipal", labels = 0, 
     col = c("lightgreen","chocolate"), pch = c(149,1,17,24), 
     mass = c(FALSE,TRUE))
women.Z.ca <- ca(women.Z)
text(women.Z.ca$colcoord, labels = women.Z.ca$colnames, pos = 2, 
     offset = 0.3, col = "chocolate", cex = 0.8, font = 4)

# How many singular values are greater than 1/8:

which(ca(women.Burt)$sv > (1/8))

[1] 1 2 3 4 5 6 7 8 9

# The adjusted singular values for these first 9 axes

(8/7)*(ca(women.Burt)$sv[1:9]-(1/8))

[1] 0.342185682 0.232597737 0.124150258 0.114994566 0.034506798 
[6] 0.025747549 0.014892541 0.008973295 0.006810267

# The parts of (adjusted) inertia on the first 9 axes

(64 / 49) * (ca(women.Burt)$sv[1:9] - (1/8))^2 / 0.2208571

[1] 0.5301665244 0.2449624996 0.0697885037 0.0598746893 0.0053913554 
[6] 0.0030016525 0.0010042139 0.0003645797 0.0002099988

# Compute MCA using ca function and then replace the singular values 
# with the adjusted ones

women.Burt.ca <- ca(women.Burt)
women.Burt.ca$sv[1:9] <- (8/7) * (ca(women.Burt)$sv[1:9] - (1/8))

# EXTRA: Exhibit 10.4 - first axis coordinates inverted to agree with 
# Exhibit, although this depends on version of R
# In Exhibit 10.4, the row labels were made lower case externally
women.Burt.ca$rowcoord[,1] <- -women.Burt.ca$rowcoord[,1]
women.Burt.ca$colcoord[,1] <- -women.Burt.ca$colcoord[,1]

plot(women.Burt.ca, map = "rowprincipal", labels = 0, 
     pch = c(149,1,17,24), col = c("lightgreen","chocolate"), 
     mass = c(FALSE,TRUE))
text(women.Burt.ca$colcoord, labels = women.Burt.ca$colnames, 
     pos = 2, offset = 0.3, col = "chocolate", cex = 0.8, font = 4)
women.Burt.Fadj <- women.Burt.ca$colcoord[,1:9] %*% 
                     diag((8/7) * (ca(women.Burt)$sv[1:9] - (1/8)))
points(women.Burt.Fadj, pch = 19, col = "lightgreen", cex = 0.7)
text(women.Burt.Fadj, labels = rownames(women.Burt), pos = 4, 
     offset = 0.3, col = "forestgreen", cex = 0.7, font = 2)

# EXTRA: Exhibit 10.5 - we assume that the singular values of 
# women.Burt.ca have already been substituted, and that the row 
# principal coordinates women.Burt.Fadj computed, as above.
# The column coordinates are shrunk by the square roots of their 
# masses, and then plotted as before:
women.Burt.Gctr <- sqrt(women.Burt.ca$colmass) * 
                     women.Burt.ca$colcoord
women.Burt.ca$colcoord <- women.Burt.Gctr

plot(women.Burt.ca, map = "rowprincipal", labels = 0, 
     pch = c(149,1,17,24), col = c("lightgreen","chocolate"))
text(women.Burt.Gctr, labels = women.Burt.ca$colnames, pos = 2, 
     offset = 0.3, col = "chocolate", cex = 0.8, font = 4)
points(women.Burt.Fadj, pch = 19, col = "lightgreen", cex = 0.7)
text(women.Burt.Fadj, labels = women.Burt.ca$rownames, pos = 2, 
     offset = 0.3, col = "forestgreen", cex = 0.7, font = 2)

# Exhibit 10.6 - some of the demographic category labels have been 
# edited externally in the book's version

women.BurtS.ca <- ca(rbind(women.Burt, women.concat), suprow = 40:62)
women.BurtS.ca$rowcoord[,1] <- -women.BurtS.ca$rowcoord[,1]
women.BurtS.ca$colcoord[,1] <- -women.BurtS.ca$colcoord[,1]
women.BurtS.Gctr <- sqrt(women.BurtS.ca$colmass) * 
                      women.BurtS.ca$colcoord
women.BurtS.ca$colcoord <- women.BurtS.Gctr
women.BurtS.ca$sv[1:9]  <- (8/7) * (women.BurtS.ca$sv[1:9]-(1/8))
plot(women.BurtS.ca, map = "rowprincipal", what = c("none","all"), 
     col = "chocolate", labels = 0, pch = c(20,1,17,24))
text(women.BurtS.Gctr, labels = women.Burt.ca$colnames, pos = 2, 
     offset = 0.3, col = "chocolate", cex = 0.8, font = 4)
women.BurtS.Fsup <- women.BurtS.ca$rowcoord[40:62,] %*% 
                      diag(women.BurtS.ca$sv)
points(women.BurtS.Fsup, pch = 21, col = "forestgreen")
text(women.BurtS.Fsup, labels = women.BurtS.ca$rownames[40:62], 
     pos = 2, offset = 0.3, col = "forestgreen", cex = 0.7, font = 2)

# EXTRA: an easier way to do the MCA using the mjca function in the 
# ca package.
# This package takes in original data, so one has to convert the 
# original data first (e.g., combining H4 and H5)
# before reading into the function
# So let's start again and read in the original data, cut out the 
# missing data, combine H4 and H5, and creating the interactive 
# variable, and then use the package.
# Suppose 'women' contains the original data again (2471 x 12) 

dim(women)

[1] 2471   12

# Take out any rows with missing data 
women.9 <- rep(0, nrow(women))
for(i in 1:nrow(women)) {
  for(j in 1:12) {
    if(women[i,j] == 9) women.9[i] <- women.9[i] + 1
    }
  }
women <- women[women.9 == 0,]

dim(women)

[1] 2107   12

# Replace 5's for variable 'H' with 4's
women[women[,"H"] == 5, "H"] <- 4

# Create interactive variable
women <- cbind(women, 6 * (women[,"g"]-1) + women[,"a"])
colnames(women)[13] <- "ga"

# Now use mjca function directly
plot(mjca(women[,1:8]), what = c("none","all"))

plot(mjca(women[,1:8]), what = c("none","all"), map = "rowgreen")

# Notice that to get exactly Exhibit 10.6, we will have to work via 
# the Burt matrix
women.BURT <- mjca(women)$Burt

dim(women.BURT)

[1] 70 70

#  8 questions + gender + marital + education + age + age*gender
#      39      +   2    +    5    +     6     +  6  +    12      = 70
plot(ca(women.BURT[c(1:39, 42:52, 59:70), 1:39], suprow = 40:62), 
     what = c("passive","all"), map = "rowgreen")

# only problem here is the scaling factor for the rows, because the 
# inertias are not adjusted