# This file contains some dimension reduction methods illustrated on
# the iris data
#
# Authors:
#     R.W. Oldford, 2004
#
# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using, e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

#
# Get the data:
# (iris3 is a dataset shipped with R's datasets package, so it is
#  loaded with data() rather than library())
#
data(iris3)
#
# Now we will build the iris data frame just as was done
# in the file web441("iris-multinom.R")
#
# Of the two methods of building a data frame described there,
# the first will be used here.
#
# Each species is a slice in a three way array,
# which will be separated into three matrices to
# begin with.

setosa     <- iris3[,,1]
versicolor <- iris3[,,2]
virginica  <- iris3[,,3]

# Put these together into a single array
iris <- rbind(setosa, versicolor, virginica)

# Construct some Species labels
# (stored as a factor so that indexing by Species below works
#  regardless of R's stringsAsFactors default)
#
Species <- factor(c(rep("Setosa",     nrow(setosa)),
                    rep("Versicolor", nrow(versicolor)),
                    rep("Virginica",  nrow(virginica))))

# some meaningful row labels
rownames(iris) <- c(paste("Setosa",     1:nrow(setosa)),
                    paste("Versicolor", 1:nrow(versicolor)),
                    paste("Virginica",  1:nrow(virginica)))
#
# And to be perverse, a more meaningful collection
# of variate names
#
irisVars <- c("SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
#
# which can replace the column names we started with
#
colnames(iris) <- irisVars
#
# Now to construct the data frame
#
iris.df <- data.frame(iris, Species = Species)

# Select a training set (say half the data).
# Note that unlike other authors, I choose to randomly
# sample from the whole data set rather than stratify by the
# Species.  The thinking here is to produce a training and
# a test set according to how the data might arrive, rather than
# to have two sets which most resemble the whole dataset.
#
set.seed(2568)
n     <- nrow(iris.df)
train <- sort(sample(1:n, floor(n/2)))

# Training data will be:
iris.train <- iris.df[train,]

# negate the indices to get the test data:
iris.test <- iris.df[-train,]

# plot the data
xvars <- 1:4
newwindow()
pairs(iris.df[,xvars],
      main = "Anderson's Iris Data -- 3 species",
      col = c("red", "green3", "blue")[iris.df[,"Species"]])
#
# Find the directions for the principal components
# via the singular value decomposition of the X matrix
#
# First centre and scale the data
iris.x    <- iris.train[,xvars]
iris.xbar <- apply(iris.x, 2, mean)
iris.sd   <- apply(iris.x, 2, sd)
iris.x01  <- (iris.x -
              matrix(iris.xbar, nrow(iris.x), ncol(iris.x), byrow = TRUE)
              ) /
             matrix(iris.sd, nrow(iris.x), ncol(iris.x), byrow = TRUE)

# plot the standardized data (zero mean, sd = 1)
newwindow()
pairs(iris.x01[,xvars],
      main = "Anderson's Iris Data (standardized) -- 3 species",
      col = c("red", "green3", "blue")[iris.train[,"Species"]])
#
# Now let's look for the principal directions
#
iris.svd <- svd(iris.x01[,xvars])
u <- iris.svd$u
d <- iris.svd$d
v <- iris.svd$v
X <- iris.x01
#
# Note that
#       u %*% diag(d) %*% t(v)
# equals
#       X
# so the difference is zero (subject to floating point arithmetic)
X - u %*% diag(d) %*% t(v)
#
# Note that the values of d are the square roots of the eigenvalues
# of t(X) %*% X, so d^2 is proportional to the variances (and d to the
# standard deviations) along the principal axes.
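#
# As a quick cross-check (a sketch only), the same directions and spreads
# can be recovered with prcomp() on the centred and scaled training data.
# The name iris.pca is introduced just for this comparison, and columns of
# the rotation may differ from v by a sign flip.
iris.pca <- prcomp(iris.x, center = TRUE, scale. = TRUE)
# the loadings agree with v up to sign
round(abs(v) - abs(iris.pca$rotation), 10)
# and prcomp's standard deviations are d / sqrt(n - 1), n = number of training rows
round(d / sqrt(nrow(iris.x) - 1) - iris.pca$sdev, 10)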
# The proportion of the total variance explained by only the first principal
# direction is
#
d2 <- d^2
d[1]^2/sum(d2)
#
# Or to do this for every direction, cumulatively summing to get
# the proportion of the total variance explained by the principal
# directions up to that point
cumsum(d2/sum(d2))

# I (rwo) think that ratios of standard deviations are more meaningful
# (statistically and geometrically) and so would rather look at
d / max(d)
# to give me an idea of the relative spreads of the data in the
# different principal directions.
# We might call these the standardized singular values.
# These values give a rough sense of the dimensionality of the data, as in
sum(d/max(d))
# More interesting is to identify the point at which the
# spread drops dramatically, or where it drops beyond some
# threshold (e.g. 1/30).  An interesting plot might be
#
newwindow()
par(mfcol = c(1,2))
plot(0, 0, xlim = c(1, length(d)), ylim = c(0,1),
     main = "Standardized singular values",
     xlab = "i", ylab = "sing. val", type = "n")
points(1:length(d), d / max(d))
lines(1:length(d), d / max(d), lty = 2)

plot(0, 0, xlim = c(1, length(d)), ylim = c(0,1),
     main = "Proportion of total variance",
     xlab = "i", ylab = "Cumulative proportion", type = "n")
points(1:length(d), cumsum(d2/sum(d2)))
lines(1:length(d), cumsum(d2/sum(d2)), lty = 2)
#
# We might get away with only one or two of the principal components.
#
# The data in the new coordinate system are
#
xpc <- u %*% diag(d)
colnames(xpc) <- paste("PC", 1:ncol(xpc), sep = "_")

newwindow()
par(mfcol = c(1,1))
pairs(xpc,
      main = "Anderson's Iris Data (principal components) -- 3 species",
      col = c("red", "green3", "blue")[iris.train[,"Species"]])
#
# Let's look at the contents of the vectors that correspond to the first
# two principal components
v1 <- v[,1]
v2 <- v[,2]
#
# We can associate the weights with the variates through the
# following formatting (which is less complicated than it looks)
cat("First principal component: ",
    paste(prettyNum(v1[1:(length(v1)-1)], digits = 2),
          irisVars[1:(length(v1)-1)], "+"),
    paste(prettyNum(v1[length(v1)], digits = 2),
          irisVars[length(v1)]))
# looks (approximately) like a difference of the sepal dimensions
# plus an average of the petal dimensions

cat("Second principal component: ",
    paste(prettyNum(v2[1:(length(v2)-1)], digits = 2),
          irisVars[1:(length(v2)-1)], "+"),
    paste(prettyNum(v2[length(v2)], digits = 2),
          irisVars[length(v2)]))
# which seems to be mostly a measure of sepal size.
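#
# A small sanity check (sketch): the principal component scores
# u %*% diag(d) stored in xpc are the same as the projection of the
# standardized data onto the directions in v, which is exactly how the
# test data get projected below.  The difference should be zero up to
# floating point error.
max(abs(xpc - as.matrix(iris.x01) %*% v))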
#####################################
#
# Try fitting a multinomial model using only the first two pcs
#
# multinom() is provided by the nnet package
library(MASS)
library(nnet)

iris.pc <- data.frame(xpc, Species = Species[train])

fitpc <- multinom(formula = Species ~ PC_1 + PC_2,
                  data = iris.pc,
                  maxit = 1000)
summary(fitpc)

phat <- predict(fitpc, iris.pc[,1:4], type = "probs")

pred.tr <- apply(phat, 1,
                 function(x){
                   c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
                 })

table(c("Setosa", "Versicolor", "Virginica")[iris.pc[,"Species"]],
      pred.tr)
#
# On the test data
# (standardized with the training means and standard deviations,
#  then projected onto the training principal directions)
#
iris.xte   <- iris.test[,xvars]
iris.x01te <- (iris.xte -
               matrix(iris.xbar, nrow(iris.xte), ncol(iris.xte), byrow = TRUE)
               ) /
              matrix(iris.sd, nrow(iris.xte), ncol(iris.xte), byrow = TRUE)

xpc.te <- as.matrix(iris.x01te) %*% v
colnames(xpc.te) <- colnames(xpc)

phat.te <- predict(fitpc, xpc.te[,1:4], type = "probs")

pred.te <- apply(phat.te, 1,
                 function(x){
                   c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
                 })

table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]],
      pred.te)

##############
#
# Compare this to a multinomial model based on all 4 original variates
#
iris.mn4 <- multinom(formula = Species ~ PetalWidth + PetalLength +
                               SepalWidth + SepalLength,
                     data = iris.train,
                     maxit = 1000)
#
# On the training data
pred4 <- apply(predict(iris.mn4, iris.train[,1:4], type = "probs"),
               1,
               function(x){
                 c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
               })
#
# and to a multinomial model based on 2 of the original variates
#
iris.mn2 <- multinom(formula = Species ~ PetalWidth + PetalLength,
                     data = iris.train,
                     maxit = 1000)
#
# On the training data
pred2 <- apply(predict(iris.mn2, iris.train[,1:4], type = "probs"),
               1,
               function(x){
                 c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
               })
#
# Using 4 variates
table(c("Setosa", "Versicolor", "Virginica")[iris.train[,"Species"]],
      pred4)
#
# Using 2 variates
table(c("Setosa", "Versicolor", "Virginica")[iris.train[,"Species"]],
      pred2)
#
# Using the first two pcs
table(c("Setosa", "Versicolor", "Virginica")[iris.pc[,"Species"]],
      pred.tr)
#
# On the test data, using 4 variates
pred4 <- apply(predict(iris.mn4, iris.test[,1:4], type = "probs"),
               1,
               function(x){
                 c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
               })
#
# and using 2 variates
pred2 <- apply(predict(iris.mn2, iris.test[,1:4], type = "probs"),
               1,
               function(x){
                 c("Setosa pred", "Versicolor pred", "Virginica pred")[x == max(x)]
               })
#
# Using 4 variates
table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]],
      pred4)
#
# Using 2 variates
table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]],
      pred2)
#
# Using the first two pcs
table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]],
      pred.te)
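#
# To condense each confusion table into a single number, a small helper
# (the name errorRate is introduced here only for illustration) can report
# the apparent error rate.  This sketch assumes every species shows up
# among the predictions, so each table is square with matching
# row/column order.
errorRate <- function(tab) 1 - sum(diag(tab)) / sum(tab)

# Test-set error rates: 4 variates, 2 variates, and the first two pcs
errorRate(table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]], pred4))
errorRate(table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]], pred2))
errorRate(table(c("Setosa", "Versicolor", "Virginica")[iris.test[,"Species"]], pred.te))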