###############################################################################################
##
## R Simulation Codes for "Bayesian Empirical Likelihood Inference with Complex Survey Data"
##
## A. Obtain Bayesian Credible Intervals Under a Noninformative Prior
## B. Quantile Regression Modelling with Survey Data
## C. Two-stage Cluster Sampling
##
## CREATED:     Tuesday, January 10, 2018
## MODIFIED:    Saturday, June 04, 2019
## AUTHORS:     Puying Zhao and Changbao Wu
## AFFILIATION: Department of Statistics and Actuarial Science, University of Waterloo
## EMAIL:       cbwu at uwaterloo dot ca
##
###############################################################################################

set.seed(7654321, kind = NULL)

library(MASS)
library(quantreg)
library(MHadaptive)

#########################################################################################
# R Function for Randomized Systematic PPS Sampling
#########################################################################################

syspps = function(x, n) {
  ##
  ## The population is first randomized!
  ##
  N = length(x)
  U = sample(N, N)
  xx = x[U]
  z = rep(0, N)
  for (i in 1:N) z[i] = n * sum(xx[1:i]) / sum(x)
  r = runif(1)
  s = numeric()
  for (i in 1:N) {
    if (z[i] >= r) {
      s = c(s, U[i])
      r = r + 1
    }
  }
  return(s[order(s)])
}
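## A minimal usage sketch for syspps() (the *_demo objects are hypothetical;
## the block is guarded with if (FALSE) so that sourcing this script does not
## disturb the random number stream fixed by set.seed() above): draw a
## randomized systematic PPS sample of 5 clusters from 20 clusters, using the
## cluster sizes as the size measure.
if (FALSE) {
  sizes_demo <- c(rep(30, 5), rep(20, 5), rep(10, 10))  # hypothetical cluster sizes
  s_demo <- syspps(sizes_demo, 5)  # labels of the 5 selected clusters
  sizes_demo[s_demo]               # larger clusters enter with higher probability
}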
#########################################################################################
# The Following Procedures Calculate the Empirical Likelihood Ratio for Estimating Equations
#########################################################################################
#
# The main function el.ee() calculates the empirical likelihood
# ratio for estimating equations.
#
# INPUTS:
# mx        The survey weighted estimating functions of the form m1(pars, x, ds, tau),
#           which returns an n x r matrix with typical element
#           m1_j(pars, x_i, ds_i, tau) for j = 1,...,r and i = 1,...,n
# lam       Optional starting value for the Lagrange multiplier
# maxit     Optional maximum number of iterations
# gradtol   Optional tolerance for the convergence test
# svdtol    Optional tolerance for detecting singularity
#           while solving the equations
# itertrace Optional flag to print results during the iterations
#
# OUTPUTS:
# logelr    Log empirical likelihood
# lambda    Lagrange multiplier
# grad      Gradient of the log likelihood
# hess      Hessian of the log likelihood
# wts       Relative observation weights at the solution
# nits      Number of iterations used

logelr.ee <- function(mx, lam) {
  mx <- as.matrix(mx)
  n <- nrow(mx)
  p <- ncol(mx)
  if (n <= p) stop("Need more observations than variables in logelr.")
  z <- mx
  arg <- 1 + z %*% lam
  return(-sum(llog(arg, 1/n)))
}

## llog() and its derivatives llogp() and llogpp() are pseudo-logarithm
## functions: log(z) for z >= eps, with a quadratic continuation below eps
## so that the Newton iterations remain well defined.

llog <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- log(eps) - 1.5 + 2 * z[lo]/eps - 0.5 * (z[lo]/eps)^2
  ans[!lo] <- log(z[!lo])
  ans
}

llogp <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- 2/eps - z[lo]/eps^2
  ans[!lo] <- 1/z[!lo]
  ans
}

llogpp <- function(z, eps) {
  ans <- z
  lo <- (z < eps)
  ans[lo] <- -1/eps^2
  ans[!lo] <- -1/z[!lo]^2
  ans
}

## Empirical likelihood for estimating equations
## mx: m1(pars, x, ds, tau)

el.ee <- function(mx, lam, maxit = 25, gradtol = 1e-7, svdtol = 1e-9,
                  itertrace = FALSE, TINY = sqrt(.Machine$double.xmin)) {
  mx <- as.matrix(mx)
  n <- nrow(mx)
  p <- ncol(mx)
  if (n <= p) stop("Need more observations than variables in el.ee().")
  z <- mx
  scale <- mean(abs(z)) + TINY
  z <- z/scale
  if (!missing(lam)) {
    lam <- as.vector(lam)
    lam <- lam * scale
    if (logelr.ee(z, lam) > 0) lam <- rep(0, p)
  }
  if (missing(lam)) lam <- rep(0, p)
  if (svdtol < TINY) svdtol <- TINY
  if (gradtol < TINY) gradtol <- TINY
  nwts <- c(3^-c(0:3), rep(0, 12))
  gwts <- 2^(-c(0:(length(nwts) - 1)))
  gwts <- (gwts^2 - nwts^2)^0.5
  gwts[12:16] <- gwts[12:16] * 10^-c(1:5)
  nits <- 0
  gsize <- gradtol + 1
  while (nits < maxit && gsize > gradtol) {
    arg <- 1 + z %*% lam
    wts1 <- as.vector(llogp(arg, 1/(n)))
    wts2 <- as.vector(-llogpp(arg, 1/(n)))^0.5
    grad <- as.matrix(-z * wts1)
    grad <- as.vector(rowsum(grad, rep(1, nrow(grad))))
    gsize <- mean(abs(grad))
    hess <- z * wts2
    svdh <- svd(hess)
    if (min(svdh$d) < max(svdh$d) * svdtol)
      svdh$d <- svdh$d + max(svdh$d) * svdtol
    nstep <- svdh$v %*% (t(svdh$u)/svdh$d)
    nstep <- as.vector(nstep %*% matrix(wts1/wts2, n, 1))
    gstep <- -grad
    if (sum(nstep^2) < sum(gstep^2))
      gstep <- gstep * (sum(nstep^2)^0.5/sum(gstep^2)^0.5)
    ologelr <- -sum(llog(arg, 1/(n)))
    ninner <- 0
    for (i in 1:length(nwts)) {
      nlogelr <- logelr.ee(z, lam + nwts[i] * nstep + gwts[i] * gstep)
      if (nlogelr < ologelr) {
        lam <- lam + nwts[i] * nstep + gwts[i] * gstep
        ninner <- i
        break
      }
    }
    nits <- nits + 1
    if (ninner == 0) nits <- maxit
    if (itertrace) print(c(lam, nlogelr, gsize, ninner))
  }
  list(`-2LLR` = -2 * nlogelr, Pval = 1 - pchisq(-2 * nlogelr, df = p),
       lambda = lam/scale, grad = grad * scale,
       hess = t(hess) %*% hess * scale^2, wts = wts1, nits = nits)
}

#########################################################################################
# R Function for Survey Weighted Estimating Functions Under Quantile Regression Modelling
#########################################################################################

m1 = function(pars, XY, ds, tau) {
  X = XY[, -1]
  Y = XY[, 1]
  c(ds * ((Y < X %*% pars) - tau)) * X
}

#########################################################################################
## R Function for the Prior Distributions
#########################################################################################

prior_reg <- function(pars) {
  a <- pars[1]  # intercept
  b <- pars[2]  # slope
  prior_a <- dnorm(a, 0, 100, log = TRUE)  ## diffuse (essentially noninformative)
  prior_b <- dnorm(b, 0, 100, log = TRUE)  ## normal priors on all parameters
  return(prior_a + prior_b)
}
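## A minimal sketch of how m1() and el.ee() fit together (all *_demo objects
## are hypothetical; guarded with if (FALSE) so it does not alter the
## simulation's random number stream): build the survey weighted estimating
## functions at a trial parameter value and evaluate -2 times the log
## empirical likelihood ratio there.
if (FALSE) {
  x_demo  <- cbind(1, rnorm(100))
  y_demo  <- x_demo %*% c(0.5, 1) + (rnorm(100) - qnorm(0.25))  # 0.25-quantile model
  xy_demo <- cbind(y_demo, x_demo)      # first column is the response, as in XY
  w_demo  <- rep(1, 100)                # equal design weights for illustration
  g_demo  <- m1(c(0.5, 1), xy_demo, w_demo, tau = 0.25)  # n x p estimating functions
  el.ee(g_demo)$`-2LLR`                 # -2 log EL ratio at the trial value
}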
#########################################################################################
## R Function for the Bayesian Empirical Likelihood Posterior Distribution
#########################################################################################

li_reg1 <- function(pars, XY, ds, tau) {
  e <- m1(pars, XY, ds, tau)
  log_likelihood = el.ee(e)$'-2LLR'
  prior <- prior_reg(pars)
  return(-0.5 * log_likelihood + prior)  # log EL (el.ee returns -2 log LR) + log prior
}

#########################################################################################
## R Function for the Bayesian Pseudo Likelihood Posterior Distribution
## Using the Asymmetric Laplace Distribution
#########################################################################################

li_reg2 <- function(pars, XY, ds, tau, rd) {
  n = length(ds)
  X = XY[, -1]
  Y = XY[, 1]
  sd_res = (Y - X %*% pars) / rd
  obj = ds * (tau - (sd_res < 0)) * sd_res
  log_likelihood = (log(tau * (1 - tau)) - log(rd)) * sum(ds) - sum(obj)
  prior <- prior_reg(pars)
  return(log_likelihood + prior)
}

#########################################################################################
# Compute Bayesian Credible Intervals Using 1000 Simulation Runs
#########################################################################################

#---------------------------------------------------
## Finite Population Generation:
#---------------------------------------------------
M1 = 30                                          # large cluster size
M2 = 20                                          # medium cluster size
M3 = 10                                          # small cluster size
N = 200 * M1 + 250 * M2 + 900 * M3               # population size
K = 1350                                         # total number of clusters
M = c(rep(30, 200), rep(20, 250), rep(10, 900))  # cluster sizes for PPS
MT = rep(0, K)                                   # cumulative cluster sizes
for (i in 1:K) MT[i] = sum(M[1:i])
tau = 0.25
p = 2
beta = c(0.5, 1)
X = rnorm(N)
X = cbind(rep(1, N), X)
err_n = rnorm(N)
err_ca = rcauchy(N)
err_t = rt(N, 3)
err_chi = rchisq(N, 2)
Y = X %*% beta + (err_n - qnorm(tau))
#Y = X %*% beta + (1 + X[,2]) * (err_n - qnorm(tau))
XY = cbind(Y, X)
fit1 = rq(Y ~ X - 1, tau = tau)
beta_N = fit1$coefficients                       # census (finite population) parameter

#---------------------------------------------
# Size variable for PPS sampling
#---------------------------------------------
Z = M / sum(M)

#---------------------------------------------------
nsim = 1000
n = 500                                          # overall sample size
k = n / 5                                        # number of clusters to be selected

BIAS1 = c(0, 0)
COV_PR1 = c(0, 0)
LEN1 = c(0, 0)
beta.bayes.est.matrix1 = matrix(0, nsim, p)
beta.bayes.bci.1.matrix1 = matrix(0, nsim, 2)
beta.bayes.bci.2.matrix1 = matrix(0, nsim, 2)

BIAS2 = c(0, 0)
COV_PR2 = c(0, 0)
LEN2 = c(0, 0)
beta.bayes.est.matrix2 = matrix(0, nsim, p)
beta.bayes.bci.1.matrix2 = matrix(0, nsim, 2)
beta.bayes.bci.2.matrix2 = matrix(0, nsim, 2)
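## A quick sanity check (guarded; illustrative only): both log-posteriors can
## be evaluated at the census fit beta_N on a hypothetical simple random
## sample from the finite population generated above, before entering the
## simulation loop.
if (FALSE) {
  idx   <- sample(N, 100)               # hypothetical SRS of size 100
  w_srs <- rep(N / 100, 100)            # SRS design weights
  li_reg1(beta_N, XY[idx, ], w_srs, tau)
  res0 <- Y[idx] - X[idx, ] %*% beta_N
  rd0  <- mean(w_srs * (tau - (res0 < 0)) * res0)  # scale, as computed in the loop below
  li_reg2(beta_N, XY[idx, ], w_srs, tau, rd0)
}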
#---------------------------------------------------
## Repeated Simulations Start from Here:
#---------------------------------------------------
for (m in 1:nsim) {
  sam = syspps(Z, k)
  sam = sam[order(sam)]
  Zs = Z[sam]
  Ms = M[sam]                           ## sizes of the selected clusters
  MTs = MT[sam]                         ## positions of the selected clusters
  #-----------------------------------------------------------
  # Second stage sample and final sample data
  #---
  Ys = rep(0, n)
  Xs = matrix(0, n, p)
  for (i in 1:k) {
    s2 = sample(Ms[i], 5)
    Ys[(5 * (i - 1) + 1):(5 * i)] = Y[MTs[i] - Ms[i] + s2]
    Xs[(5 * (i - 1) + 1):(5 * i), ] = X[MTs[i] - Ms[i] + s2, ]
  }
  #---
  # First order inclusion probabilities (= f = n/N)
  #---
  pi1 = rep(0, n)
  for (i in 1:k) pi1[(5 * (i - 1) + 1):(5 * i)] = rep(k * Zs[i] * 5 / Ms[i], 5)
  ds = 1 / pi1
  XYs = cbind(Ys, Xs)
  #---------------------------------------------------
  ## Bayesian Empirical Likelihood Inference
  #---------------------------------------------------
  mcmc_r1 <- Metro_Hastings(li_func = li_reg1, pars = c(0.45, 0.95),
                            prop_sigma = 0.0035 * diag(rep(1, p)),
                            par_names = c('a', 'b'),
                            iterations = 5000, burn_in = 1000,
                            XY = XYs, ds = ds, tau = tau)
  mcmc_r1 <- Metro_Hastings(li_func = li_reg1, pars = c(0.45, 0.95),
                            prop_sigma = mcmc_r1$prop_sigma,
                            par_names = c('a', 'b'),
                            iterations = 10000, burn_in = 5000,
                            XY = XYs, ds = ds, tau = tau)
  BST1 = BCI(mcmc_r1)
  beta.bayes.est.matrix1[m, ] = BST1[, 3]
  beta.bayes.bci.1.matrix1[m, ] = BST1[1, 1:2]
  beta.bayes.bci.2.matrix1[m, ] = BST1[2, 1:2]
  #---------------------------------------------------
  ## Bayesian Pseudo Likelihood Inference
  #---------------------------------------------------
  Xs = XYs[, -1]
  Ys = XYs[, 1]
  fit_ds = rq(Ys ~ Xs - 1, tau = tau, weights = ds)
  pars_ds = fit_ds$coefficients
  res_ds = Ys - Xs %*% pars_ds
  rd = mean(ds * (tau - (res_ds < 0)) * res_ds)
  mcmc_r2 <- Metro_Hastings(li_func = li_reg2, pars = c(0.45, 0.95),
                            prop_sigma = 0.0035 * diag(rep(1, p)),
                            par_names = c('a', 'b'),
                            iterations = 5000, burn_in = 1000,
                            XY = XYs, ds = ds, tau = tau, rd = rd)
  mcmc_r2 <- Metro_Hastings(li_func = li_reg2, pars = c(0.45, 0.95),
                            prop_sigma = mcmc_r2$prop_sigma,
                            par_names = c('a', 'b'),
                            iterations = 10000, burn_in = 5000,
                            XY = XYs, ds = ds, tau = tau, rd = rd)
  BST2 = BCI(mcmc_r2)
  beta.bayes.est.matrix2[m, ] = BST2[, 3]
  beta.bayes.bci.1.matrix2[m, ] = BST2[1, 1:2]
  beta.bayes.bci.2.matrix2[m, ] = BST2[2, 1:2]
  print(m)
}

#########################################################################################
## Simulation Results:
#########################################################################################

BIAS1 = colMeans(beta.bayes.est.matrix1) - beta_N
BIAS1 = as.numeric(BIAS1)
SD1 = apply(beta.bayes.est.matrix1, 2, sd)
SD1 = as.numeric(SD1)
RMSE1 = sqrt(colMeans((beta.bayes.est.matrix1 - t(array(beta_N, c(2, nsim))))^2))
RMSE1 = as.numeric(RMSE1)
COV_PR1[1] = mean(beta.bayes.bci.1.matrix1[, 2] > beta_N[1] &
                  beta.bayes.bci.1.matrix1[, 1] < beta_N[1])
COV_PR1[2] = mean(beta.bayes.bci.2.matrix1[, 2] > beta_N[2] &
                  beta.bayes.bci.2.matrix1[, 1] < beta_N[2])
LEN1[1] = mean(beta.bayes.bci.1.matrix1[, 2] - beta.bayes.bci.1.matrix1[, 1])
LEN1[2] = mean(beta.bayes.bci.2.matrix1[, 2] - beta.bayes.bci.2.matrix1[, 1])
COV_PR1 = floor(COV_PR1 * 1000) / 1000
LEN1 = floor(LEN1 * 1000) / 1000

BIAS2 = colMeans(beta.bayes.est.matrix2) - beta_N
BIAS2 = as.numeric(BIAS2)
SD2 = apply(beta.bayes.est.matrix2, 2, sd)
SD2 = as.numeric(SD2)
RMSE2 = sqrt(colMeans((beta.bayes.est.matrix2 - t(array(beta_N, c(2, nsim))))^2))
RMSE2 = as.numeric(RMSE2)
COV_PR2[1] = mean(beta.bayes.bci.1.matrix2[, 2] > beta_N[1] &
                  beta.bayes.bci.1.matrix2[, 1] < beta_N[1])
COV_PR2[2] = mean(beta.bayes.bci.2.matrix2[, 2] > beta_N[2] &
                  beta.bayes.bci.2.matrix2[, 1] < beta_N[2])
LEN2[1] = mean(beta.bayes.bci.1.matrix2[, 2] - beta.bayes.bci.1.matrix2[, 1])
LEN2[2] = mean(beta.bayes.bci.2.matrix2[, 2] - beta.bayes.bci.2.matrix2[, 1])
COV_PR2 = floor(COV_PR2 * 1000) / 1000
LEN2 = floor(LEN2 * 1000) / 1000

COV_PR1
LEN1
COV_PR2
LEN2
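## Optional convenience: collect the summary measures into a single table for
## reporting. This sketch only reorganizes the quantities computed above
## (BEL = Bayesian empirical likelihood, BPL = Bayesian pseudo likelihood).
results <- rbind(
  BEL = c(BIAS1, SD1, RMSE1, COV_PR1, LEN1),
  BPL = c(BIAS2, SD2, RMSE2, COV_PR2, LEN2)
)
colnames(results) <- c("BIAS.a", "BIAS.b", "SD.a", "SD.b", "RMSE.a", "RMSE.b",
                       "CP.a", "CP.b", "LEN.a", "LEN.b")
round(results, 3)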