CARME-N: Correspondence Analysis and Related Methods Network

General information

Below you can find all the R commands from chapter 1-10 of Correspondence Analysis in Practice, Third Edition.

You can find the data sets as Excel files in the section 'data sets'

# Chapter 2: Profiles and the Profile Space

# The data file 'travels.xls' contains the original count data
# so first calculate the row profiles in Excel by dividing each
# row by its row total.  Then copy the profiles with their row
# and column names, remembering to leave the top left hand cell
# empty.
# Alternatively, copy the five lines below

               Holidays HalfDays FullDays
Norway            0.333    0.056    0.611
Canada            0.067    0.200    0.733
Greece            0.138    0.862    0.000
France/Germany    0.083    0.083    0.833

# To read the data into R the following statement must be typed 
# in the R command window and not copied and pasted (since the 
# data are in the "clipboard") [but Ctrl-R works from the script]

profiles <- read.table("clipboard")

# On a Mac the equivalent of the clipboard is the following:

profiles <- read.table(pipe("pbpaste"))

profiles

# You need to have installed and loaded the rgl package now

require(rgl)
rgl.lines(c(0,1.2), c(0,0),   c(0,0))
rgl.lines(c(0,0)  , c(0,1.2), c(0,0))
rgl.lines(c(0,0)  , c(0,0),   c(0,1.2))

rgl.lines(c(0,0), c(0,1), c(1,0), size = 2)
rgl.lines(c(0,1), c(1,0), c(0,0), size = 2)
rgl.lines(c(0,1), c(0,0), c(1,0), size = 2)

rgl.points(profiles[,3], profiles[,1], profiles[,2], size = 4)
rgl.texts(profiles[,3], profiles[,1], profiles[,2], text = rownames(profiles))
rgl.texts(matrix(c(0,1.25,0, 0,0,1.25,1.4,0,0), byrow = T, ncol = 3), 
          text = colnames(profiles), col = "red")

# Now expand the 3-d graphics window and spin the points around using the
# left mouse button; the right button or mouse wheel allows zooming

# Chapter 3: Masses and Centroids

# Copy data in file 'readers.xls' or copy following lines

   C1 C2 C3
E1  5  7  2
E2 18 46 20
E3 19 29 39
E4 12 40 49
E5  3  7 16

# Again, TYPE the following statement (or Mac equivalent)

table <- read.table("clipboard")

table

table.pro <- table/apply(table,1,sum)
table.x   <- 1-table.pro[,1]-table.pro[,3]/2
table.y   <- table.pro[,3]*sqrt(3)/2
plot.new()
lines(c(0,1,0.5,0), c(0,0,sqrt(3)/2,0), col = "gray")
text(c(0,1,0.5), c(0,0,sqrt(3)/2), labels = colnames(table), col = "red")
text(table.x, table.y, labels = rownames(table), col = "blue")

# Chapter 4: Chi-square Distance and Inertia

# calculate chi-square statistic

table.rowsum <- apply(table, 1, sum)
table.colsum <- apply(table, 2, sum)
table.sum    <- sum(table)
table.exp    <- table.rowsum%o%table.colsum/table.sum
chi2         <- sum((table-table.exp)^2/table.exp)
chi2

# chi-square distances of row profiles to average profile

table.colmass <- table.colsum/table.sum
sqrt(apply((t(table.pro)-table.colmass)^2/table.colmass, 2, sum))

# all chi-square distances, including those to average

tablec.pro <- rbind(table.pro,table.colmass)
rownames(tablec.pro)[6] <- "ave"
dist(sweep(tablec.pro, 2, sqrt(table.colmass), FUN = "/"))

# Chapter 5: Plotting Chi-square Distance

table.wt <- sqrt(table.colmass)

require(rgl)

rgl.lines(c(0,1.1/table.wt[2]), c(0,0), c(0,0))
rgl.lines(c(0,0), c(0,1.1/table.wt[3]), c(0,0))
rgl.lines(c(0,0), c(0,0), c(0,1.1/table.wt[1]))

rgl.lines(c(0,0), c(0,1/table.wt[3]), c(1/table.wt[1],0), size = 2)
rgl.lines(c(0,1/table.wt[2]), c(1/table.wt[3],0), c(0,0), size = 2)
rgl.lines(c(0,1/table.wt[2]), c(0,0), c(1/table.wt[1],0), size = 2)

table.chi <- t(t(table.pro)/table.wt)
rgl.texts(table.chi[,2], table.chi[,3], table.chi[,1], text = rownames(table))
rgl.texts(matrix(c( 0,0,1.15/table.wt[1], 1.15/table.wt[2],0,0, 0,1.15/table.wt[3],0), 
          byrow = T, ncol = 3), text = colnames(table))

# Chapter 6: Reduction of Dimensionality

# First read data from file 'health.xls' as described before into
# data frame health

health.P    <- health/sum(health)
health.r    <- apply(health.P, 1, sum)
health.c    <- apply(health.P, 2, sum)
health.Dr   <- diag(health.r)
health.Dc   <- diag(health.c)
health.Drmh <- diag(1/sqrt(health.r))
health.Dcmh <- diag(1/sqrt(health.c))

health.P   <- as.matrix(health.P)
health.S   <- health.Drmh%*%(health.P-health.r%o%health.c)%*%health.Dcmh
health.svd <- svd(health.S)

health.rsc <- health.Drmh%*%health.svd$u
health.csc <- health.Dcmh%*%health.svd$v
health.rpc <- health.rsc%*%diag(health.svd$d)
health.cpc <- health.csc%*%diag(health.svd$d)

plot(health.rpc[,1], health.rpc[,2], type = "n", asp = 1, 
     xlab = "CA dimension 1", ylab = "CA dimension 2")
abline(v = 0, h = 0, col = "gray", lty = 2)
text(health.rpc[,1], health.rpc[,2], label = rownames(health))

# Chapter 7: Optimal Scaling

# rescale coordinates to lie between 0 and 100

health.range <- diff(range(health.csc[,1]))
health.scale <- (health.csc[,1]-min(health.csc[,1]))*100/health.range
health.scale

# Chapter 8: Symmetry of Row and Column Analyses

# row-column correlation and square (inertia on 1st axis)

health.cor <- t(health.rsc[,1])%*%health.P%*%health.csc[,1]
health.cor^2

# standardization of row scores

t(health.rsc[,1])%*%health.Dr%*%health.rsc[,1]

# Chapter 9: Two-dimensional Maps

# For this example you can read in the data from 'smoke.xls' as
# before, or after installing and loading the ca package, the 
# data are available by simply entering:

require(ca)
data(smoke)
smoke

# CA of smoke data, and symmetric map

ca(smoke)
plot(ca(smoke))

# asymmetric map (this is a biplot)

plot(ca(smoke), map = "rowprincipal")

# Chapter 10: Three More Examples

# for example, author data set in ca package

require(ca)
data(author)
plot3d.ca(ca(author), labels = c(2,1), sf = 0.000001)

# The marine biology data are in file 'benthos.xls', with two
# sheets: one with the biological abundance data, and one with
# environmental data used later in Chapter 27.
# The funding data are in file 'funding.xls', see next chapter