###############################################################################################
##
## R Simulation Codes for "Bayesian Empirical Likelihood Inference with Complex Survey Data"
##
## A. Implement Bayesian Model Selection Procedure
## B. Quantile Regression Modelling with Survey Data
## C. Stratified PPS Sampling
##
## CREATED:     Tuesday, January 28, 2018
## MODIFIED:    Saturday, June 07, 2019
## AUTHORS:     Puying Zhao and Changbao Wu
## AFFILIATION: Department of Statistics and Actuarial Science, University of Waterloo
## EMAIL:       cbwu at uwaterloo dot ca
##
###############################################################################################

set.seed(7654321, kind = NULL)

library(MASS)
library(quantreg)
library(mvtnorm)

#########################################################################################
# R Function for Randomized Systematic PPS Sampling
#########################################################################################
syspps = function(x, n) {
  ##
  ## The population is first randomized!
  ##
  N = length(x)
  U = sample(N, N)
  xx = x[U]
  z = rep(0, N)
  for (i in 1:N)
    z[i] = n * sum(xx[1:i]) / sum(x)
  r = runif(1)
  s = numeric()
  for (i in 1:N) {
    if (z[i] >= r) {
      s = c(s, U[i])
      r = r + 1
    }
  }
  return(s[order(s)])
}

#########################################################################################
# Metropolis-Hastings MCMC
#########################################################################################
# Runs a Metropolis-Hastings MCMC chain for the Bayesian empirical likelihood
# posterior posterior_el() defined below. Proposal steps are sampled from a
# multivariate Gaussian distribution centred at the current state.
#########################################################################################
run_metropolis_MCMC <- function(startvalue, prop_sigma, iterations, burn_in, XY, ds, tau) {
  chain = array(dim = c(iterations + 1, length(startvalue)))
  chain[1, ] = startvalue
  for (i in 1:iterations) {
    proposal = mvrnorm(1, mu = chain[i, ], Sigma = prop_sigma)
    # Metropolis acceptance ratio on the log-posterior scale
    probab = exp(posterior_el(proposal, XY, ds, tau) - posterior_el(chain[i, ], XY, ds, tau))
    if (runif(1) < probab) {
      chain[i + 1, ] = proposal
    } else {
      chain[i + 1, ] = chain[i, ]
    }
  }
  chain <- chain[burn_in:iterations, ]
  if (length(startvalue) > 1)
    accept_rate <- length(unique(chain[, 1])) / (iterations - burn_in)
  else
    accept_rate <- length(unique(chain)) / (iterations - burn_in)
  # message(sprintf("Acceptance ratio = %.4f", accept_rate))
  val <- list("trace" = chain, "acceptance_ratio" = accept_rate)
  return(val)
}
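#########################################################################################
# Minimal stand-alone illustration of syspps() (not part of the simulation; kept
# commented out so the RNG stream under set.seed() above is left unchanged). All names
# below (x_demo, s_demo, pi_demo) are hypothetical. Over repeated draws, unit i is
# selected with probability close to pi_demo[i] = n * x[i] / sum(x).
#########################################################################################
# x_demo  <- rchisq(100, df = 2) + 0.1    # positive size measures for N = 100 units
# s_demo  <- syspps(x_demo, 10)           # sorted indices of the 10 sampled units
# pi_demo <- 10 * x_demo / sum(x_demo)    # first-order inclusion probabilities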
##############################################################################
# Estimate Marginal Likelihood
#
# Reference:
# Chib, S. and Jeliazkov, I. (2001). Marginal likelihood from the
# Metropolis-Hastings output. Journal of the American Statistical
# Association, 96, 270-281.
##############################################################################
marginal.likelihood <- function(trace, lik.fun, prior.fun, num.samples = 1000,
                                log = TRUE, mom, XY, ds, tau) {
  n = length(ds)
  # mean of the posterior draws serves as the high-density point theta.star
  theta.star <- colMeans(trace)
  V = cov(trace)
  lik.star <- lik.fun(theta.star, mom, XY, ds, tau)
  # get samples from the posterior
  g <- sample(1:nrow(trace), num.samples, replace = TRUE)
  theta.g <- trace[g, ]
  q.g <- dmvnorm(theta.g, mean = theta.star, sigma = V, log = FALSE)
  lik.g <- apply(theta.g, 1, lik.fun, mom, XY, ds, tau)
  alpha.g <- sapply(lik.g, function(l) min(1, exp(lik.star - l)))
  # get samples from the proposal
  theta.j <- rmvnorm(num.samples, mean = theta.star, sigma = V)
  lik.j <- apply(theta.j, 1, lik.fun, mom, XY, ds, tau)
  alpha.j <- sapply(lik.j, function(l) min(1, exp(l - lik.star)))
  # posterior ordinate estimate at theta.star (Chib-Jeliazkov identity)
  pi.hat <- mean(alpha.g * q.g) / mean(alpha.j)
  pi.star <- 0
  if (!is.null(prior.fun))
    pi.star <- prior.fun(theta.star)
  ln.m <- lik.star + pi.star - log(pi.hat)
  if (!log)
    ln.m <- exp(ln.m)
  list(ln.m = ln.m, ln.lik.star = lik.star, ln.pi.star = pi.star,
       ln.pi.hat = log(pi.hat))
}

#########################################################################################
# The Following Procedures Calculate the Empirical Likelihood Ratio for Estimating
# Equations
#########################################################################################
#
# The main function el.ee() calculates the empirical likelihood
# ratio for estimating equations.
#
# INPUTS:
#   mx         The survey weighted estimating functions of the form m1(pars, x, ds, tau),
#              which return an n x r matrix with typical element
#              m1_j(pars, x_i, ds_i, tau) for j = 1, ..., r and i = 1, ..., n
#   lam        Optional starting value for the Lagrange multiplier
#   maxit      Optional maximum number of iterations
#   gradtol    Optional tolerance for the convergence test
#   svdtol     Optional tolerance for detecting singularity while solving equations
#   itertrace  Optional flag to print results during iteration
#
# OUTPUTS:
#   -2LLR      Minus twice the log empirical likelihood ratio
#   Pval       Chi-squared p-value for -2LLR on r degrees of freedom
#   lambda     Lagrange multiplier
#   grad       Gradient of the log likelihood
#   hess       Hessian of the log likelihood
#   wts        Relative observation weights at the solution
#   nits       Number of iterations used

logelr.ee <- function(mx, lam) {
  mx <- as.matrix(mx)
  n <- nrow(mx)
  p <- ncol(mx)
  if (n <= p)
    stop("Need more observations than variables in logelr.")
  z <- mx
  arg <- 1 + z %*% lam
  return(-sum(llog(arg, 1/n)))
}

# Pseudo-logarithm and its first two derivatives: equal to log(z), 1/z and -1/z^2
# for z >= eps, with a quadratic extension below the threshold eps
llog <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- log(eps) - 1.5 + 2 * z[lo]/eps - 0.5 * (z[lo]/eps)^2
  ans[!lo] <- log(z[!lo])
  ans
}

llogp <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- 2/eps - z[lo]/eps^2
  ans[!lo] <- 1/z[!lo]
  ans
}

llogpp <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- -1/eps^2
  ans[!lo] <- -1/z[!lo]^2
  ans
}
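#########################################################################################
# Quick sanity check of the pseudo-logarithm family (illustrative only; commented out).
# llog() agrees with log() above the threshold eps and switches to the quadratic
# extension below it, so the EL criterion and its derivatives remain finite even when
# an entry of 1 + z %*% lam is near zero or negative. eps_demo is a hypothetical
# threshold playing the role of 1/n in el.ee() below.
#########################################################################################
# eps_demo <- 0.01
# llog(c(0.5, eps_demo/2), eps_demo)   # first entry equals log(0.5)
# llogp(0.5, eps_demo)                 # equals 1/0.5 = 2 above the threshold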
## EL for estimating equations; adapted from Owen's el.test()
## mx: m1(pars, x, ds, tau)
el.ee <- function(mx, lam, maxit = 25, gradtol = 1e-7, svdtol = 1e-9,
                  itertrace = FALSE, TINY = sqrt(.Machine$double.xmin)) {
  mx <- as.matrix(mx)
  n <- nrow(mx)
  p <- ncol(mx)
  if (n <= p)
    stop("Need more observations than length(mu) in el.test().")
  z <- mx
  scale <- mean(abs(z)) + TINY
  z <- z/scale
  if (!missing(lam)) {
    lam <- as.vector(lam)
    lam <- lam * scale
    if (logelr.ee(z, lam) > 0)
      lam <- rep(0, p)
  }
  if (missing(lam))
    lam <- rep(0, p)
  if (svdtol < TINY)
    svdtol <- TINY
  if (gradtol < TINY)
    gradtol <- TINY
  # step-length weights for mixing the Newton and gradient directions
  nwts <- c(3^-c(0:3), rep(0, 12))
  gwts <- 2^(-c(0:(length(nwts) - 1)))
  gwts <- (gwts^2 - nwts^2)^0.5
  gwts[12:16] <- gwts[12:16] * 10^-c(1:5)
  nits <- 0
  gsize <- gradtol + 1
  while (nits < maxit && gsize > gradtol) {
    arg <- 1 + z %*% lam
    wts1 <- as.vector(llogp(arg, 1/(n)))
    wts2 <- as.vector(-llogpp(arg, 1/(n)))^0.5
    grad <- as.matrix(-z * wts1)
    grad <- as.vector(rowsum(grad, rep(1, nrow(grad))))
    gsize <- mean(abs(grad))
    hess <- z * wts2
    # solve for the Newton step via a (regularized) SVD of the Hessian factor
    svdh <- svd(hess)
    if (min(svdh$d) < max(svdh$d) * svdtol)
      svdh$d <- svdh$d + max(svdh$d) * svdtol
    nstep <- svdh$v %*% (t(svdh$u)/svdh$d)
    nstep <- as.vector(nstep %*% matrix(wts1/wts2, n, 1))
    gstep <- -grad
    if (sum(nstep^2) < sum(gstep^2))
      gstep <- gstep * (sum(nstep^2)^0.5/sum(gstep^2)^0.5)
    ologelr <- -sum(llog(arg, 1/(n)))
    ninner <- 0
    # backtracking search over mixtures of the Newton and gradient steps
    for (i in 1:length(nwts)) {
      nlogelr <- logelr.ee(z, lam + nwts[i] * nstep + gwts[i] * gstep)
      if (nlogelr < ologelr) {
        lam <- lam + nwts[i] * nstep + gwts[i] * gstep
        ninner <- i
        break
      }
    }
    nits <- nits + 1
    if (ninner == 0)
      nits <- maxit
    if (itertrace)
      print(c(lam, nlogelr, gsize, ninner))
  }
  list(`-2LLR` = -2 * nlogelr, Pval = 1 - pchisq(-2 * nlogelr, df = p),
       lambda = lam/scale, grad = grad * scale,
       hess = t(hess) %*% hess * scale^2, wts = wts1, nits = nits)
}

#########################################################################################
# R Function for Survey Weighted Estimating Functions Using Quantile Regression Modelling
#########################################################################################
m1 = function(pars, XY, ds, tau) {
  X = XY[, -1]
  Y = XY[, 1]
  pars = as.numeric(pars)
  # row i is ds_i * (I(y_i < x_i' pars) - tau) * x_i
  mc = c(ds * ((Y < X %*% pars) - tau)) * X
  return(mc)
}

#########################################################################################
## R Function for the Empirical Likelihood Function
#########################################################################################
el_fun <- function(pars, mom, XY, ds, tau) {
  e <- mom(pars, XY, ds, tau)
  log_likelihood = el.ee(e)$'-2LLR'
  return(-0.5 * log_likelihood)
}

#########################################################################################
## R Function for the Prior Distribution
#########################################################################################
prior_reg <- function(pars) {
  d_par = length(pars)
  prior_d = rep(0, d_par)
  # independent N(0, 100^2) priors on the regression coefficients
  for (i in 1:d_par) {
    prior_d[i] = dnorm(pars[i], 0, 100, log = TRUE)
  }
  return(sum(prior_d))
}

#########################################################################################
## R Function for the Bayesian Empirical Likelihood Posterior Distribution
#########################################################################################
posterior_el <- function(pars, XY, ds, tau) {
  e <- m1(pars, XY, ds, tau)
  log_likelihood = el.ee(e)$'-2LLR'
  prior <- prior_reg(pars)
  return(-0.5 * log_likelihood + prior)
}

#########################################################################################
# Compute Bayesian Model Selection Results Based on 500 Simulation Runs
#########################################################################################
#---------------------------------------------------
## Finite Population Generation
#---------------------------------------------------
nsim = 500
N1 = 4000
N2 = 6000
N3 = 10000
N = N1 + N2 + N3
n1 = 15
n2 = 25
n3 = 60
n = n1 + n2 + n3
tau = 0.25
beta = c(0.5, 1, 0, 0)
p = length(beta)
rho = 0.7
K = 7
mf = matrix(0, nsim, K)   # model-selection indicators, one row per simulation run
X1 = runif(N)
X2 = rnorm(N)
X3 = 0.5 + X1 + X2 + rnorm(N)
X = cbind(X1, X2, X3)
X = cbind(rep(1, N), X)
XX = rchisq(N, 1) / 2     # extra covariate unrelated to Y, used by candidate model 7
err_n = rnorm(N)
# alternative error distributions (only err_n is used below)
err_ca = rcauchy(N)
err_t = rt(N, 3)
err_chi = rchisq(N, 2)
# centring by qnorm(tau) makes the tau-th conditional quantile of Y equal to X %*% beta
Y = X %*% beta + (err_n - qnorm(tau))
# Y = X %*% beta + (1 + X[,2]) * (err_n - qnorm(tau))
XY = cbind(Y, X)
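#########################################################################################
# Illustrative population-level check (commented out): with unit weights ds = 1 the
# estimating functions in m1() should average to roughly zero at the true beta, since
# P(Y < X %*% beta) = tau by the construction of the error term above.
#########################################################################################
# colMeans(m1(beta, XY, rep(1, N), tau))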
fit1 = rq(Y ~ X - 1, tau = tau)
beta_N = fit1$coefficients   # census quantile regression fit, used as starting values
#---------------------------------------------
# Size variable for PPS sampling
#---------------------------------------------
Z = X[, 2] + Y
Z = Z - min(Z) + 0.2   # shift so that min(Z) = 0.2
Z = Z / sum(Z)         # normalize so sum(Z) = 1; Z is correlated with Y (informative design)
Z1 = Z[1:N1] / sum(Z[1:N1])
Z2 = Z[(N1 + 1):(N1 + N2)] / sum(Z[(N1 + 1):(N1 + N2)])
Z3 = Z[(N1 + N2 + 1):N] / sum(Z[(N1 + N2 + 1):N])

#---------------------------------------------------
## Repeated Simulations Start from Here
#---------------------------------------------------
# Candidate models (XYs columns are Y, 1, X1, X2, X3; the true model is Model 1):
#   Model 1: intercept + X1          Model 5: intercept + X2
#   Model 2: intercept + X1 + X2     Model 6: intercept + X3
#   Model 3: intercept + X1 + X3     Model 7: intercept + XX
#   Model 4: intercept + X1 + X2 + X3
for (m in 1:nsim) {
  # stratified PPS sampling: one randomized systematic PPS sample per stratum
  sam1 = syspps(Z1, n1)
  sam2 = syspps(Z2, n2)
  sam3 = syspps(Z3, n3)
  XYs1 = XY[sam1, ]
  XYs2 = XY[N1 + sam2, ]
  XYs3 = XY[N1 + N2 + sam3, ]
  pi11 = n1 * Z1[sam1]
  pi12 = n2 * Z2[sam2]
  pi13 = n3 * Z3[sam3]
  XYs = rbind(XYs1, XYs2, XYs3)
  pis = c(pi11, pi12, pi13)
  ds = 1 / pis   # basic design weights
  XXs1 = XX[sam1]
  XXs2 = XX[N1 + sam2]
  XXs3 = XX[N1 + N2 + sam3]
  XXs = c(XXs1, XXs2, XXs3)
  #---------------------------------------------------
  # Model 1: intercept + X1
  mcmc_gmm1 = run_metropolis_MCMC(startvalue = beta_N[1:2],
                                  prop_sigma = 0.25 * diag(abs(beta_N[1:2])),
                                  iterations = 5000, burn_in = 2500,
                                  XY = XYs[, 1:3], ds, tau)
  ml_1 = marginal.likelihood(trace = mcmc_gmm1$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = XYs[, 1:3], ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 2: intercept + X1 + X2
  mcmc_gmm2 = run_metropolis_MCMC(startvalue = beta_N[1:3],
                                  prop_sigma = 0.25 * diag(abs(beta_N[1:3])),
                                  iterations = 5000, burn_in = 2500,
                                  XY = XYs[, 1:4], ds, tau)
  ml_2 = marginal.likelihood(trace = mcmc_gmm2$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = XYs[, 1:4], ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 3: intercept + X1 + X3
  mcmc_gmm3 = run_metropolis_MCMC(startvalue = c(beta_N[1:2], beta_N[4]),
                                  prop_sigma = 0.25 * diag(abs(c(beta_N[1:2], beta_N[4]))),
                                  iterations = 5000, burn_in = 2500,
                                  XY = cbind(XYs[, 1:3], XYs[, 5]), ds, tau)
  ml_3 = marginal.likelihood(trace = mcmc_gmm3$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = cbind(XYs[, 1:3], XYs[, 5]), ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 4: intercept + X1 + X2 + X3 (full model)
  mcmc_gmm4 = run_metropolis_MCMC(startvalue = beta_N,
                                  prop_sigma = 0.25 * diag(abs(beta_N)),
                                  iterations = 5000, burn_in = 2500,
                                  XY = XYs, ds, tau)
  ml_4 = marginal.likelihood(trace = mcmc_gmm4$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = XYs, ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 5: intercept + X2
  mcmc_gmm5 = run_metropolis_MCMC(startvalue = c(beta_N[1], beta_N[3]),
                                  prop_sigma = 0.25 * diag(abs(c(beta_N[1], beta_N[3]))),
                                  iterations = 5000, burn_in = 2500,
                                  XY = cbind(XYs[, 1:2], XYs[, 4]), ds, tau)
  ml_5 = marginal.likelihood(trace = mcmc_gmm5$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = cbind(XYs[, 1:2], XYs[, 4]), ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 6: intercept + X3
  mcmc_gmm6 = run_metropolis_MCMC(startvalue = c(beta_N[1], beta_N[4]),
                                  prop_sigma = 0.25 * diag(abs(c(beta_N[1], beta_N[4]))),
                                  iterations = 5000, burn_in = 2500,
                                  XY = cbind(XYs[, 1:2], XYs[, 5]), ds, tau)
  ml_6 = marginal.likelihood(trace = mcmc_gmm6$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = cbind(XYs[, 1:2], XYs[, 5]), ds = ds, tau = tau)
  #---------------------------------------------------
  # Model 7: intercept + XX (covariate unrelated to Y)
  mcmc_gmm7 = run_metropolis_MCMC(startvalue = beta_N[1:2],
                                  prop_sigma = 0.25 * diag(abs(beta_N[1:2])),
                                  iterations = 5000, burn_in = 2500,
                                  XY = cbind(XYs[, 1:2], XXs), ds, tau)
  ml_7 = marginal.likelihood(trace = mcmc_gmm7$trace, lik.fun = el_fun,
                             prior.fun = prior_reg, num.samples = 1000, log = TRUE,
                             mom = m1, XY = cbind(XYs[, 1:2], XXs), ds = ds, tau = tau)
  #---------------------------------------------------
  # select the model with the largest estimated log marginal likelihood
  lik_mat = c(ml_1$ln.m, ml_2$ln.m, ml_3$ln.m, ml_4$ln.m, ml_5$ln.m, ml_6$ln.m, ml_7$ln.m)
  lik_max = max(lik_mat)
  for (k in 1:K) {
    if (lik_mat[k] == lik_max) {
      mf[m, k] = 1
    } else {
      mf[m, k] = 0
    }
  }
  cat("ML =", lik_mat, "\n")            # log marginal likelihoods for the current run
  cat("MS =", colSums(mf) / m, "\n")    # running model-selection frequencies
  print(m)
}
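#########################################################################################
# Post-simulation summary (a minimal sketch; the labels M1, ..., M7 are hypothetical,
# with M1 the true model containing the intercept and X1 only). After the loop
# finishes, each row of mf holds one indicator vector, so the column means give the
# empirical frequency with which each candidate model attains the largest estimated
# marginal likelihood.
#########################################################################################
# ms_freq <- colSums(mf) / nsim
# names(ms_freq) <- paste0("M", 1:K)
# print(round(ms_freq, 3))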