title: "SURF: Subsampling ranking forward selection"

author: "Lihui Liu"

package: SURF


Summary

Performs variable selection based on subsampling ranking forward selection. \textrm{Xo} is the matrix of predictor variables. \textrm{y} is the response variable. \textrm{X} is a matrix of additional predictors, which should be scaled to have sum 1 prior to analysis. \textrm{fold} is the number of folds for cross-validation. \textrm{Alpha} is the parameter for the elastic net method used in the subsampling procedure: the default value of 1 corresponds to LASSO. \textrm{alpha_u} is the upper bound of the significance level for the permutation test. \textrm{alpha} is a single significance level, or a sequence of significance levels, of interest. \textrm{prop} is the proportion of variables to remove in each subsample. \textrm{weights} indicates whether observations should be weighted by class size. When the class sizes are unbalanced, weighting observations can improve results. For the binomial family, this parameter should be set to TRUE or FALSE; for other families, the parameter can be FALSE, or the user can supply a weights vector whose length equals the sample size N. \textrm{B} is the number of subsamples to use for ranking the variables. \textrm{C} is the number of permutations to use for estimating the critical value of the null distribution. If the \textrm{doParallel} package is installed, the function can be run in parallel by setting \textrm{ncores} to the number of threads to use. If the default value of 1 is used, or if the \textrm{doParallel} package is not installed, the function does not run in parallel. \textrm{display.progress} indicates whether the function should display messages indicating its progress. \textrm{family} is a family variable for the glm fitting. Note that the \texttt{glmnet} package does not currently permit the use of non-standard link functions, so it will always use the default link function. However, the glm fitting will use the specified link. The default is binomial with logistic regression, because this is a common use case.
\textrm{pval} is the \(p\)-value for inclusion of a variable in the model; it has been discontinued since version 1.1.0. Under the null case, the number of false positives will be geometrically distributed with this as the probability of success, so if this parameter is set to \(p\), the expected number of false positives should be \(\frac{p}{1-p}\).


set.seed(12345)

library(SuRF.vs)
# ---- Simulated data shared by the examples below ----------------------------

N <- 50  # number of observations (rows of X)
p <- 20  # number of candidate predictors (columns of X)

# Number of predictors given a non-zero coefficient. Integer division keeps
# this a whole number: p / 3 is 6.67, which rnorm() and seq() would only
# silently truncate to 6.
nzc <- p %/% 3

X <- matrix(rnorm(N * p), nrow = N, ncol = p)
beta <- rnorm(nzc)

# Survival data for the Cox example: the first nzc columns of X drive the
# linear predictor, whose exponential is used as the rate of the exponential
# survival times.
fx <- X[, seq_len(nzc)] %*% beta / 3
hx <- exp(fx)
ty <- rexp(N, rate = hx)
tcens <- rbinom(n = N, size = 1, prob = 0.3)  # censoring indicator (1 or 0)

# ---- Tuning settings --------------------------------------------------------
# NOTE(review): the SURF() calls in this file pass most of these settings as
# literal values; only `alpha` and `alpha_u` are referenced by name.
Xo <- NULL
B <- 20
Alpha <- 1
fold <- 5

ncores <- 1
prop <- 0.1
C <- 3
alpha_u <- 0.2

# Significance levels of interest.
alpha <- seq(0.01, 0.1, length.out = 5)



 # Binomial model: logistic regression driven by the first two predictors.
 XX <- X[, 1:2]
 f <- 1 + XX %*% c(2, 1.5)
 # Success probabilities from the inverse logit of the linear predictor. They
 # get their own name so that the predictor count `p` is not overwritten.
 prob_y <- exp(f) / (1 + exp(f))
 # Draw exactly one response per row of X, so that y matches the design matrix
 # passed as Xo (a hard-coded 100 does not match the 50 rows of X).
 y <- rbinom(nrow(X), size = 1, prob = prob_y)
 weights <- FALSE
 family <- stats::binomial(link = "logit")
 surf_binary <- SURF(
   Xo = X, y = y, X = NULL,
   fold = 5, Alpha = 1, prop = 0.1, weights = weights,
   B = 50, C = 10, ncores = 1, display.progress = TRUE,
   family = family, alpha_u = 0.1, alpha = alpha
 )
#> [1] "clean completed"
#> Warning: from glmnet Fortran code (error code -99); Convergence for 99th lambda
#> value not reached after maxit=100000 iterations; solutions for larger lambdas
#> returned
#> Warning: from glmnet Fortran code (error code -100); Convergence for 100th
#> lambda value not reached after maxit=100000 iterations; solutions for larger
#> lambdas returned

#> Warning: from glmnet Fortran code (error code -100); Convergence for 100th
#> lambda value not reached after maxit=100000 iterations; solutions for larger
#> lambdas returned
#> [1] "subsample  completed"
#> [1] "ranking completed"
#> [1] "selection path completed"
#> [1] "select variable at the 'alpha' significance level completed"


 # Linear regression: the response is an exact linear function of the two
 # columns of XX (intercept 1, slopes 0.1 and 0.2), with no noise term added.
 y <- 1 + XX %*% c(0.1, 0.2)

 # Gaussian family with the identity link for the glm fitting.
 family <- stats::gaussian(link = "identity")
 surf_lm <- SURF(
   Xo = X,
   y = y,
   X = NULL,
   fold = 5,
   Alpha = 1,
   prop = 0.1,
   weights = weights,
   B = 100,
   C = 15,
   ncores = 1,
   display.progress = TRUE,
   family = family,
   alpha_u = 0.1,
   alpha = alpha
 )
#> [1] "clean completed"
#> [1] "subsample  completed"
#> [1] "ranking completed"
#> [1] "selection path completed"
#> [1] "select variable at the 'alpha' significance level completed"


 # Cox proportional hazards model: the response is a two-column matrix holding
 # the survival time and the event status (1 - censoring indicator).
 y <- cbind(time = ty, status = 1 - tcens)
 family <- list(family = "cox")
 surf_cox <- SURF(
   Xo = X,
   y = y,
   X = NULL,
   fold = 5,
   Alpha = 1,
   prop = 0.1,
   weights = FALSE,
   B = 50,
   C = 5,
   ncores = 1,
   display.progress = TRUE,
   family = family,
   alpha_u = alpha_u,
   alpha = alpha
 )
#> [1] "clean completed"
#> [1] "subsample  completed"
#> [1] "ranking completed"
#> [1] "selection path completed"
#> [1] "select variable at the 'alpha' significance level completed"