General information
Below you can find all the R commands from chapter 1-10 of Correspondence Analysis in Practice, Third Edition.
You can find the data sets as Excel files in the section 'data sets'
# Chapter 2: Profiles and the Profile Space
# The data file 'travels.xls' contains the original count data # so first calculate the row profiles in Excel by dividing each # row by its row total. Then copy the profiles with their row # and column names, remembering to leave the top left hand cell # empty. # Alternatively, copy the five lines below
Holidays HalfDays FullDays Norway 0.333 0.056 0.611 Canada 0.067 0.200 0.733 Greece 0.138 0.862 0.000 France/Germany 0.083 0.083 0.833
# To read the data into R the following statement must be typed # in the R command window and not copied and pasted (since the # data are in the "clipboard") [but Ctrl-R works from the script]
profiles <- read.table("clipboard")
# On a Mac the equivalent of the clipboard is the following:
profiles <- read.table(pipe("pbpaste")) profiles
# You need to have installed and loaded the rgl package now
require(rgl) rgl.lines(c(0,1.2), c(0,0), c(0,0)) rgl.lines(c(0,0) , c(0,1.2), c(0,0)) rgl.lines(c(0,0) , c(0,0), c(0,1.2)) rgl.lines(c(0,0), c(0,1), c(1,0), size = 2) rgl.lines(c(0,1), c(1,0), c(0,0), size = 2) rgl.lines(c(0,1), c(0,0), c(1,0), size = 2) rgl.points(profiles[,3], profiles[,1], profiles[,2], size = 4) rgl.texts(profiles[,3], profiles[,1], profiles[,2], text = rownames(profiles)) rgl.texts(matrix(c(0,1.25,0, 0,0,1.25,1.4,0,0), byrow = T, ncol = 3), text = colnames(profiles), col = "red")
# Now expand the 3-d graphics window and spin the points around using the # left mouse button; the right button or mouse wheel allows zooming
# Chapter 3: Masses and Centroids
# Copy data in file 'readers.xls' or copy following lines
C1 C2 C3 E1 5 7 2 E2 18 46 20 E3 19 29 39 E4 12 40 49 E5 3 7 16
# Again, TYPE the following statement (or Mac equivalent)
table <- read.table("clipboard") table table.pro <- table/apply(table,1,sum) table.x <- 1-table.pro[,1]-table.pro[,3]/2 table.y <- table.pro[,3]*sqrt(3)/2 plot.new() lines(c(0,1,0.5,0), c(0,0,sqrt(3)/2,0), col = "gray") text(c(0,1,0.5), c(0,0,sqrt(3)/2), labels = colnames(table), col = "red") text(table.x, table.y, labels = rownames(table), col = "blue")
# Chapter 4: Chi-square Distance and Inertia
# calculate chi-square statistic
table.rowsum <- apply(table, 1, sum) table.colsum <- apply(table, 2, sum) table.sum <- sum(table) table.exp <- table.rowsum%o%table.colsum/table.sum chi2 <- sum((table-table.exp)^2/table.exp) chi2
# chi-square distances of row profiles to average profile
table.colmass <- table.colsum/table.sum sqrt(apply((t(table.pro)-table.colmass)^2/table.colmass, 2, sum))
# all chi-square distances, including those to average
tablec.pro <- rbind(table.pro,table.colmass) rownames(tablec.pro)[6] <- "ave" dist(sweep(tablec.pro, 2, sqrt(table.colmass), FUN = "/"))
# Chapter 5: Plotting Chi-square Distance
table.wt <- sqrt(table.colmass) require(rgl) rgl.lines(c(0,1.1/table.wt[2]), c(0,0), c(0,0)) rgl.lines(c(0,0), c(0,1.1/table.wt[3]), c(0,0)) rgl.lines(c(0,0), c(0,0), c(0,1.1/table.wt[1])) rgl.lines(c(0,0), c(0,1/table.wt[3]), c(1/table.wt[1],0), size = 2) rgl.lines(c(0,1/table.wt[2]), c(1/table.wt[3],0), c(0,0), size = 2) rgl.lines(c(0,1/table.wt[2]), c(0,0), c(1/table.wt[1],0), size = 2) table.chi <- t(t(table.pro)/table.wt) rgl.texts(table.chi[,2], table.chi[,3], table.chi[,1], text = rownames(table)) rgl.texts(matrix(c( 0,0,1.15/table.wt[1], 1.15/table.wt[2],0,0, 0,1.15/table.wt[3],0), byrow = T, ncol = 3), text = colnames(table))
# Chapter 6: Reduction of Dimensionality
# First read data from file 'health.xls' as described before into # data frame health
health.P <- health/sum(health) health.r <- apply(health.P, 1, sum) health.c <- apply(health.P, 2, sum) health.Dr <- diag(health.r) health.Dc <- diag(health.c) health.Drmh <- diag(1/sqrt(health.r)) health.Dcmh <- diag(1/sqrt(health.c)) health.P <- as.matrix(health.P) health.S <- health.Drmh%*%(health.P-health.r%o%health.c)%*%health.Dcmh health.svd <- svd(health.S) health.rsc <- health.Drmh%*%health.svd$u health.csc <- health.Dcmh%*%health.svd$v health.rpc <- health.rsc%*%diag(health.svd$d) health.cpc <- health.csc%*%diag(health.svd$d) plot(health.rpc[,1], health.rpc[,2], type = "n", asp = 1, xlab = "CA dimension 1", ylab = "CA dimension 2") abline(v = 0, h = 0, col = "gray", lty = 2) text(health.rpc[,1], health.rpc[,2], label = rownames(health))
# Chapter 7: Optimal Scaling
# rescale coordinates to lie between 0 and 100
health.range <- diff(range(health.csc[,1])) health.scale <- (health.csc[,1]-min(health.csc[,1]))*100/health.range health.scale
# Chapter 8: Symmetry of Row and Column Analyses
# row-column correlation and square (inertia on 1st axis)
health.cor <- t(health.rsc[,1])%*%health.P%*%health.csc[,1] health.cor^2
# standardization of row scores
t(health.rsc[,1])%*%health.Dr%*%health.rsc[,1]
# Chapter 9: Two-dimensional Maps
# For this example you can read in the data from 'smoke.xls' as # before, or after installing and loading the ca package, the # data are available by simply entering:
require(ca) data(smoke) smoke
# CA of smoke data, and symmetric map
ca(smoke) plot(ca(smoke))
# asymmetric map (this is a biplot)
plot(ca(smoke), map = "rowprincipal")
# Chapter 10: Three More Examples
# for example, author data set in ca package
require(ca) data(author) plot3d.ca(ca(author), labels = c(2,1), sf = 0.000001)
# The marine biology data are in file 'benthos.xls', with two # sheets: one with the biological abundance data, and one with # environmental data used later in Chapter 27. # The funding data are in file 'funding.xls', see next chapter