Setup

## fix the RNG seed so that the random draws made later in this script
## (sample(), rnorm()) are reproducible
set.seed(1995)
## load required packages
library(floodgate)
library(methods)
library(conformalInference)
library(glmnet)
library(lars)
library(randomForest)
library(SAM)
library(ggplot2)
## load utility functions: related to model fitting
## NOTE(review): presumably this is where funs.list (indexed by algorithm
## name further down) is defined -- confirm against algo_utils.R
source("../utils/algo_utils.R")

#### problem setup
n <- 1000  # number of observations
p <- 1000  # number of covariates

Xmodel <- "gaussian"  # distribution of the covariates
rho <- 0.3            # auto-correlation coefficient

Ydist <- "binom"  # conditional model of the response
s <- 20           # number of non-null covariates
amplitude <- 10   # signal amplitude


split.prop <- 0.5  # fraction of the n samples placed in the first split
alevel <- 0.05     # confidence level setting (NOTE(review): 0.05 presumably yields 95% bounds -- confirm)
K <- 100           # null replicates counted in K_all in addition to the M_n below
M_n <- 400         # number of null replicates for estimating the conditional mean of mu(X)
K_all <- K + M_n   # total number of null replicates

Prepare data

## load the stored parameters of the covariate distribution for this rho
## NOTE(review): Sigma.chol, gamma_X.list and sigma_X.list used later are
## presumably restored by these two load() calls -- confirm
load(paste0("../inst/rho", rho, "_Sigma.RData"))
load(paste0("../inst/rho", rho, "_X_paras_gaussian.RData"))

## pick the s non-null variables at random (indices kept in increasing order)
S_star <- sort(sample(seq_len(p), s))
## non-null coefficients: random sign, magnitude amplitude / sqrt(n); zero elsewhere
beta <- numeric(p)
beta[S_star] <- sample(c(-1, 1), s, replace = TRUE) * amplitude / sqrt(n)

## generate the covariates X: i.i.d. standard normal entries times Sigma.chol
X <- matrix(rnorm(n * p), nrow = n, ncol = p) %*% Sigma.chol
## generate the response Y from X and beta under the conditional model Ydist
Y <- gen.Y(X, beta, Ydist = Ydist)

Sample null covariates and compute variable importance measures

## sample null covariates: K_all replicates, with S listing every index 1..p
nulls.list <- sample.gaussian.nulls(
  X = X,
  S = as.list(seq_len(p)),
  K = K_all,
  gamma_X.list_S = gamma_X.list,
  sigma_X.list_S = sigma_X.list
)
## compute MACMgap values from the true beta and the sampled nulls
MACMgap <- compute.movi(
  beta = beta,
  Xmodel = Xmodel,
  Ydist = Ydist,
  sigma_X.list = NULL,
  X = X,
  nulls.list = nulls.list
)

Run floodgate

## sample splitting: i1 is a random subset of floor(n * split.prop) row
## indices and i2 is its complement within 1..n
i1 <- sample(seq_len(n), floor(n * split.prop))
## setdiff() instead of (1:n)[-i1]: negative indexing with an empty i1 selects
## nothing, whereas the complement of an empty set must be every row
i2 <- setdiff(seq_len(n), i1)
n1 <- length(i1)  # size of the first split
n2 <- length(i2)  # size of the second split

## use LASSO to estimate the conditional mean
algo <- "lasso"
funs <- funs.list[[algo]]  # fitting functions registered under this algorithm name

Plot the results

The following plot shows the floodgate lower confidence bound (LCB), drawn as a black horizontal bar, and the MACMgap, drawn as a red star-shaped point. In addition to a regression function estimator \(\mu\), the model-fitting step on the training data also outputs a selected subset \(S\). We only plot the LCBs for covariates in \(S\) here.

## run floodgate on the (i1, i2) split to obtain LCBs
fg.out <- floodgate.binary(
  X, Y, i1, i2,
  M_n = M_n,
  nulls.list = nulls.list,
  gamma_X.list = gamma_X.list,
  sigma_X.list = sigma_X.list,
  Xmodel = Xmodel,
  funs = funs,
  algo = algo,
  alevel = alevel
)
#> Initial training on 500 samples with lasso algorithm... 
#> Calculating mu(Xk) for variable 30 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 45 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 74 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 105 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 112 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 120 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 121 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 142 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 169 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 206 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 207 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 208 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 212 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 258 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 266 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 292 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 310 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 312 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 314 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 332 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 336 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 344 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 360 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 389 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 399 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 422 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 444 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 447 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 498 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 505 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 548 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 553 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 574 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 607 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 609 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 619 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 620 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 623 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 626 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 631 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 640 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 662 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 666 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 682 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 686 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 720 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 727 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 733 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 747 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 751 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 777 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 782 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 799 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 819 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 820 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 827 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 833 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 842 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 844 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 845 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 848 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 857 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 877 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 878 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 883 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 900 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 903 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 931 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 954 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Calculating mu(Xk) for variable 987 with lasso algorithm on 500 samples without Monte Carlo samples... 
#> Performing fg analysis...\n
## collect the floodgate output and attach the MACMgap of the selected variables
inf.out <- as.data.frame(fg.out$inf.out)
S <- unlist(fg.out$S)
inf.out$MACMgap <- MACMgap[S]

## red stars: MACMgap; black bars: LCB; arrows run from the LCB to the MACMgap
ggplot(data = inf.out, mapping = aes(x = S, y = MACMgap)) +
  geom_point(shape = 8, size = 3.5, color = "red") +
  geom_errorbar(aes(ymin = LCB, ymax = LCB), color = "black", width = 15) +
  geom_segment(
    aes(x = S, y = LCB, xend = S, yend = MACMgap),
    arrow = arrow(type = "closed", length = unit(0.15, "cm"))
  ) +
  ylim(0, 0.05 + max(inf.out$MACMgap, inf.out$LCB)) +
  xlab("Selected variables") +
  ylab("MACMgap and LCB") +
  ggtitle(paste0("algo = ", algo))

The arrow for a given covariate starts from the LCB and ends at the MACMgap. The arrow provides a good illustration of the LCB’s performance, with its length being the half-width and its direction indicating coverage/miscoverage (upward: coverage; downward: miscoverage; leftward: coverage with \(\text{LCB}=\mathcal{I}=0\)).

Try with a different fitting algorithm

## switch the conditional-mean estimator to Binom_LASSO
algo <- "binom_lasso"
funs <- funs.list[[algo]]
## rerun floodgate on the same split and nulls to obtain LCBs, with progress messages on
fg.out <- floodgate.binary(
  X, Y, i1, i2,
  M_n = M_n,
  nulls.list = nulls.list,
  gamma_X.list = gamma_X.list,
  sigma_X.list = sigma_X.list,
  Xmodel = Xmodel,
  funs = funs,
  algo = algo,
  alevel = alevel,
  verbose = TRUE
)
#> Initial training on 500 samples with binom_lasso algorithm... 
#> Calculating mu(Xk) for variable 30 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 45 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 64 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 69 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 74 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 105 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 108 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 112 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 120 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 121 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 124 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 142 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 164 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 169 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 205 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 206 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 207 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 208 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 212 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 258 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 266 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 292 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 310 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 312 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 314 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 325 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 332 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 336 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 344 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 353 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 360 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 386 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 389 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 399 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 422 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 444 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 447 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 498 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 505 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 536 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 548 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 553 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 567 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 574 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 607 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 609 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 619 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 620 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 623 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 625 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 626 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 631 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 640 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 662 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 666 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 682 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 686 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 720 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 727 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 733 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 747 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 748 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 751 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 752 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 777 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 782 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 799 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 804 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 819 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 820 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 827 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 833 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 842 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 844 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 845 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 848 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 852 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 854 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 857 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 877 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 878 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 883 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 895 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 899 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 900 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 903 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 931 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 954 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Calculating mu(Xk) for variable 987 with binom_lasso algorithm on 500 samples and 500 null samples... 
#> Performing fg analysis...\n
## extract the output and attach the MACMgap of the selected variables
inf.out <- as.data.frame(fg.out$inf.out)
S <- unlist(fg.out$S)
inf.out$MACMgap <- MACMgap[S]
## plot: red stars are the MACMgap, black bars the LCB, arrows go from LCB to MACMgap
ggplot(data = inf.out, mapping = aes(x = S, y = MACMgap)) +
  geom_point(shape = 8, size = 3.5, color = "red") +
  geom_errorbar(aes(ymin = LCB, ymax = LCB), color = "black", width = 15) +
  geom_segment(
    aes(x = S, y = LCB, xend = S, yend = MACMgap),
    arrow = arrow(type = "closed", length = unit(0.15, "cm"))
  ) +
  ylim(0, 0.05 + max(inf.out$MACMgap, inf.out$LCB)) +
  xlab("Selected variables") +
  ylab("MACMgap and LCB") +
  ggtitle(paste0("algo = ", algo))