Blain Waan Blain Waan - 3 months ago 7
R Question

Simulating two conditional variable with specified means and a correlation using R

This question arose from one of my previous questions where the problem of generating two correlated series was resolved with some limitations. We were trying to produce two correlated series that followed exponential distributions with certain parameters. For example, a variable tr with mean 1 and another variable t with mean 2 with a correlation of -0.5 were required satisfying the condition that t>tr). The following codes were tried in

R
.

rho <- -0.5
mu <- rep(0,2)
Sigma <- matrix(rho, nrow=2, ncol=2) + diag(2)*(1 - rho)

library(MASS)

compute.tr.t <- function(req.n, paccept) {
req.n <- round(req.n / paccept)
rawvars <- mvrnorm(req.n, mu=mu, Sigma=Sigma)
pvars <- pnorm(rawvars)
tr <- qexp(pvars[,1], 1/1)
t <- qexp(pvars[,2], 1/2)
keep <- which(t > tr)
return(data.frame(t=t[keep],tr=tr[keep]))
}

req.n <- n
paccept <- 1
res <- data.frame()

while (req.n > 0) {
new.res <- compute.tr.t(req.n, paccept)
res <- rbind(res, new.res)
req.n <- n - nrow(res)
paccept <- nrow(new.res) / n# updated paccept according to last step
}


The problem that occurred due to pruning data that do not satisfy the condition t>tr:


  1. The means were not preserved.

  2. The correlation was not preserved.



See the output below. It is evident that because of imposing such a condition the locations shift.

mean(res$tr)
[1] 0.4660927
mean(res$t)
[1] 2.859441
print(cor(res$tr,res$t))
[1] -0.237159


My question: is there a way to achieve two correlated and conditional variables (such that t>tr) keeping the series means near to that of the specified means? We may go along with the reduced correlation, but is it possible to at least preserve the means?

Answer

Updated answer every element of t strictly greater than tr:

n     <- 100
rho   <- 0.5
mu    <- rep(0,2)
Sigma <- matrix(rho, nrow=2, ncol=2) + diag(2)*(1 - rho)

library(MASS)

compute.tr.t <- function(req.n, paccept) {
  req.n      <- round(req.n / paccept)
  rawvars    <- mvrnorm(req.n, mu=mu, Sigma=Sigma)
  pvars      <- pnorm(rawvars)
  tr         <- qexp(pvars[,1], 1/1)
  t          <- qexp(pvars[,2], 1/2)
  tr         <- tr[(tr-mean(tr))^2  <.25 ] # can play with this value
  t          <- t[(t-mean(t))^2  <.25 ]
  m          <- min(length(t), length(tr))
  t          <- t[1:m]
  tr         <- tr[1:m]
  return(data.frame(t=t,tr=tr))
}

req.n   <- n
paccept <- 1
res     <- data.frame()

while (req.n > 0) {
  new.res <- compute.tr.t(req.n, paccept)
  res     <- rbind(res, new.res)
  req.n   <- n - nrow(res)
  paccept <- nrow(new.res) / n
}

mean(res$t)

[1] 1.972218

mean(res$tr)

[1] 0.590776

table(res$t > res$tr) # should be all true, rarely you'll get 1 trivial false that you can kick out
 TRUE 
 132
cor(res$t,res$tr) # suffered a little but not too bad, can probably improve

[1] .2527064

Original answer mean(t) > mean(tr) but not every element:

n     <- 100
rho   <- 0.5
mu    <- rep(0,2)
Sigma <- matrix(rho, nrow=2, ncol=2) + diag(2)*(1 - rho)

library(MASS)

compute.tr.t <- function(req.n, paccept) {
  req.n      <- round(req.n / paccept)
  rawvars    <- mvrnorm(req.n, mu=mu, Sigma=Sigma)
  pvars      <- pnorm(rawvars)
  tr         <- qexp(pvars[,1], 1/1)
  t          <- qexp(pvars[,2], 1/2)
  keep       <- which(t > tr)
  return(data.frame(t=t,tr=tr))
}

req.n   <- n
paccept <- 1
res     <- data.frame()

while (req.n > 0) {
  new.res <- compute.tr.t(req.n, paccept)
  res     <- rbind(res, new.res)
  req.n   <- n - nrow(res)
  paccept <- nrow(new.res) / n# updated paccept according to last step
}

mean(res$tr)

[1] 0.9399213

mean(res$t)

[1] 1.795431

print(cor(res$tr,res$t)) 

[1] 0.5075668

Since there's some randomness in this for good measure I ran it a 2nd time and got the following result:

mean(res$tr)

[1] 1.001255

mean(res$t)

[1] 1.922343

print(cor(res$tr,res$t)) 

[1] 0.6648311

After you run it once if you don't quite like the results a simple hack to meet any desired level of precision is:

while(
  (cor(res$tr,res$t) > .55 | cor(res$tr,res$t) < .45)
){
  n     <- 100
  rho   <- 0.5
  mu    <- rep(0,2)
  Sigma <- matrix(rho, nrow=2, ncol=2) + diag(2)*(1 - rho)

  library(MASS)

  compute.tr.t <- function(req.n, paccept) {
    req.n      <- round(req.n / paccept)
    rawvars    <- mvrnorm(req.n, mu=mu, Sigma=Sigma)
    pvars      <- pnorm(rawvars)
    tr         <- qexp(pvars[,1], 1/1)
    t          <- qexp(pvars[,2], 1/2)
    keep       <- which(t > tr)
    return(data.frame(t=t,tr=tr))
  }

  req.n   <- n
  paccept <- 1
  res     <- data.frame()

  while (req.n > 0) {
    new.res <- compute.tr.t(req.n, paccept)
    res     <- rbind(res, new.res)
    req.n   <- n - nrow(res)
    paccept <- nrow(new.res) / n# updated paccept according to last step
  }
}