################################################################# # # Main Functions for PGA and SGA # by Mu Zhu # Apr 2005 # ################################################################# pga <- function(y, X, m, N, B=100, mutation=1/ncol(X), start=NULL, prior = 0.3, ...) { # performs parallel genetic algorithm for variable selection # by Mu Zhu # first version = Jan 2002 # packaged again Apr 2005 # # Inputs: # m = population size, default set to roughly ncol(X) # N = number of generations per path # B = number of parallel paths # mutation = mutation rate, default set to roughly 1/m # start = starting population, pretty much useless # prior = controls the density of 1's in the initial population b <- 0; p <- ncol(X) rslt <- matrix(0, B, p) cat("Starting PGA...\n") while (b < B) { print(b) b <- b + 1 tmp <- sga(y, X, m, N, start, mutation, prior) rslt[b,] <- tmp$combo.gene } return(rslt) } sga <- function(y, X, m, N, start=NULL, mutation=1/ncol(X), prior=0.3, check.convg=F, thresh=0.05) { # performs single genetic algorithm for variable selection # by Mu Zhu # first version = Jan 2002 # packaged again Apr 2005 X <- as.matrix(X) n <- nrow(X); p <- ncol(X) perf <- convg <- numeric(N+1) if (check.convg) { entro<-numeric(N) } if (length(mutation) != N) { # possible to use a different mutation rate for each generation mutation <- rep(mutation[1], N) } if (is.null(start)) { curr <- matrix(sample(0:1, size=(m*p), prob=c((1-prior), prior), replace=T),m,p) } else { curr <- start } for (i in 1:N) { rslt <- onegen(curr, y, X, m, mutation[i]) curr <- rslt$new.gen # addition Jan 2005 by Mu Zhu if (check.convg) { pp <- apply(curr,2,mean) qq <- 1-pp lp <- ifelse(pp==0, 0, log(pp,base=2)) lq <- ifelse(qq==0, 0, log(qq,base=2)) entro[i]<-entropy <- mean(-(pp*lp+qq*lq)) print(entropy) if (i > 5 && mean(entro[(i-5):i]) < thresh) break } # addition ends convg[i] <- rslt$prev.best perf[i] <- rslt$prev.avg } final.score <- fitness(curr, y, X, m) convg[N+1] <- min(final.score) perf[N+1] <- mean(final.score) if (check.convg) # modified Jan 2005 by Mu Zhu { cat("Convergence achieved with ", i, "generations...\n") return(list(ans=i, popn=curr, combo.gene=apply(curr,2,mean))) } else{ # original return statement return(list(popn=curr, combo.gene=apply(curr,2,mean), best=curr[order(final.score),][1,], optval=min(final.score), convg, perf)) } } onegen <- function(curr, y, X, m, mutation) { # one iteration of GA # by Mu Zhu # Jan 2002 n <- nrow(X); p <- ncol(X) # evaluate the fitness of members of the current generation orig.score <- fitness(curr, y, X, m) # survival survived <- (1:m)[order(orig.score)][(1:(m/2))] new <- curr[survived,] # reproduction and mutation offspring <- get.offspring(new, p, m=m/2, mutation) combo <- rbind(new, offspring) return(list(prev.best=min(orig.score), prev.avg=mean(orig.score), new.gen=combo)) } get.offspring <- function(X, p, m, mutation) { # reproduction and mutation operations for GA # by Mu Zhu # Jan 2002 rslt <- matrix(0, m, p) for (i in 1:m) { # crossover/reproduction father <- sample(1:m, size=1) mother <- sample(1:m, size=1) breakpt <- sample(1:p, size=1) if (breakpt == 1) { tmp <- as.vector(X[mother,]) } else if(breakpt == p) { tmp <- as.vector(X[father,]) } else { tmp <- as.vector( c(X[father,][1:breakpt], X[mother,][(breakpt+1):p])) } # mutation u <- runif(1,0,1) if (u < mutation) { pos <- sample(1:p, size=1) tmp[pos] <- 1 - tmp[pos] } rslt[i,] <- tmp } return(rslt) } plot.bubble <- function(p, j, truth=NULL, num, grid=F, filename=NULL, ...) { # make a bubble plot # by Mu Zhu # first version = Jan 2002 # packaged again Apr 2005 # # Inputs: # p = dimension; how many variables in total # j = current generation summary to be plotted # truth = for simulation problems only; true variables are plotted with a different symbol # filename = if NULL, plot to standard R graphics window # num = how many parallel paths are used # grid = whether horizontal grid is turned on or not in the plot if (!is.null(filename)) { postscript(file=paste(filename, "-combo", ".ps", sep=""), width=8, height=8, horiz=F) } plot(1:p, j, type="n", xlab="Variables", ylab="Importance", main=paste("Bubble Plot: B =", num, "Parallel Paths", sep=" "), cex.lab=1.5, cex.main=1.5, ...) if (grid) abline(h=(1:9)/10, col="red", lty=2) if (!is.null(truth)) { points((1:p)[-truth], j[-truth], pch=4) points((1:p)[truth], j[truth], pch=1, cex=1.5) } else { points((1:p), j, pch=4) } if (!is.null(filename)) { dev.off() } } plot.movie <- function(p, info, truth=NULL, filename, ...) { # make a dynamic bubble plot # by Mu Zhu # first version = Jan 2002 # packaged again Apr 2005 # # Inputs: # p = dimension; how many variables in total # info = information to be plotted # truth = for simulation problems only; true variables are plotted with a different symbol # filename = can't be NULL for dynamic plots B <- nrow(info) postscript(file=paste(filename, "-movie", ".ps", sep=""), width=8, height=8, horiz=F) j.all<-info[1,] plot(1:p, j.all, type="n", xlab="Variables", ylab="Importance", main=paste("Using B = 1 Parallel Paths"), cex.lab=1.5, cex.main=1.5, ylim=c(0,1), ...) if (!is.null(truth)) { points((1:p)[-truth], j.all[-truth], pch=4) points((1:p)[truth], j.all[truth], pch=1, cex=1.5) } else { points((1:p), j.all, pch=1) } for (i in 2:B) { j.all<-apply(info[1:i,], 2, mean) plot(1:p, j.all, type="n", xlab="Variables", ylab="Importance", main=paste("Using B =", i, "Parallel Paths", sep=" "), cex.lab=1.5, cex.main=1.5, ylim=c(0,1), ...) if (!is.null(truth)) { points((1:p)[-truth], j.all[-truth], pch=4) points((1:p)[truth], j.all[truth], pch=1, cex=1.5) } else { points((1:p), j.all, pch=1) } } dev.off() } plot.order <- function(j.all, num, grid = F, filename=NULL, ...) { # make an order plot # by Mu Zhu # first version = Jan 2002 # packaged again Apr 2005 # # Inputs: # j.all = information to be plotted # filename = if NULL, plots to standard R graphics device # num = number of parallel paths used # grid = whether horizontal grid lines should be turned on or not if (!is.null(filename)) { postscript(file=paste(filename, "-val.ps", sep=""), height=8, width=8, horiz=F) } plot(j.all[order(-j.all)], xlab="Ordered Variables", ylab="Importance", main=paste("Order Plot: B=", num, "Parallel Paths", sep=" "), cex.lab=1.5, cex.main=1.5, ...) if (grid) abline(h=(1:9)/10, col="red", lty=2) if (!is.null(filename)) { dev.off() } } get.gap <- function(obj, subset) { tmp <- apply(obj,2,mean) return(min(tmp[subset])-max(tmp[-subset])) } get.pga.ans <- function(judge, B) { # make automatic variable selection decisions based on the largest # gap in the bubble plot # by Mu Zhu # Jan 2005 x <- sort(judge, decreasing=T); n<-length(x) gap <- x[1:(n-1)] - x[2:n]; m<-max(gap) number <- which(gap==m) if (m < 0.8225/sqrt(B)) cat("Gap not significant!\n") ans<-which(judge>=judge[order(judge,decreasing=T)][number]) return(ans) } get.topfew <- function(judge, num=5) { # pick the top few numbers of variables in the bubble plot # by Mu Zhu # Jan 2005 thresh <- judge[order(judge,decreasing=T)][num] ans <- which(judge >= thresh) return(ans) } summary.hard <- function(rslt, truth, name) { # summarizes simulation results; hard performanace metric # by Mu Zhu # Jan 2005 q <- length(truth) extra <- apply(rslt[,-truth],1,sum) miss <- q - apply(rslt[,truth],1,sum) correct <- (extra==0) & (miss==0) x<-data.frame( perc.correct=mean(correct)) dimnames(x)[[1]]<-name return(x) } summary.soft <- function(sv, truth, name) { # summarizes simulation results; soft performanace metric # by Mu Zhu # Feb 2005 q <- ncol(sv); k<-length(truth) ind<-rep(0,q); ind[truth]<-1 correct <- 0 for (i in 1:nrow(sv)) { ind.sorted <- ind[order(sv[i,], decreasing=T)] if (sum(which(ind.sorted==1)) == sum(1:k)) correct <- correct + 1 } x<-data.frame( perc.correct=correct/nrow(sv)) dimnames(x)[[1]]<-name return(x) }