From a4eeea45b48cd5660b86e324a640b56a165d077d Mon Sep 17 00:00:00 2001 From: YuZhang2019 Date: Tue, 3 Dec 2019 13:57:50 -0800 Subject: [PATCH 01/19] aa --- hal_screen2.R | 154 +++++++++++++++++++++++++++++++++++++++++++++ test-hal_screen2.R | 57 +++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 hal_screen2.R create mode 100644 test-hal_screen2.R diff --git a/hal_screen2.R b/hal_screen2.R new file mode 100644 index 00000000..4253232a --- /dev/null +++ b/hal_screen2.R @@ -0,0 +1,154 @@ +#step1:do regular lasso for main term functions and their interaction functions +#step2:rank those basis functions based on their speed to become zero and choose k top basis functions +#step3:generate K*n basis functions and do regular lasso +#step4:output the fitting results, mean square error and the running time of step3 + +# hal_screen_goodbasis is aimed to screen main term functions and their interaction functions x1*x2,x1*x2*x3,etc +# hal_screen_rank is aimed to rank all the covariates based on their speed to become zero +# hal_screen_output is aimed to do regular lasso for K*n basis function and output the fitting performance and running time + +hal_screen_rank<-function(x, y, family, k = NULL, foldid = NULL, offset = NULL){ + n <- length(y)# length of y + p <- ncol(x)# column number of x + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, offset = offset) + rank_col <- list() + if (!is.null(k)){ + for (i in 1:length(rank_basis$lambda)) { + lambda <- as.matrix(rank_basis$lambda)# decreasing lambda + lam <- lambda[i] + coef <- as.list(coef(rank_basis, lam))# coef with lambda[i] + coef <- coef[-1]# remove the first row(intercept) + keep <- list(which(coef!=0, arr.ind = TRUE))# return the row which coef!=0 + rank_col <- c(rank_col, keep) + rank_col <- rank_col[!duplicated(rank_col)]#remove duplicated rows + } + 
rank_col <- rank_col[-1] #remove the last interger(0) row + rank_col <- rank_col[[k]] + rank_col <- lapply(rank_col, function(x) x=x) + } + else { + coef <- as.list(coef(rank_basis, rank_basis$lambda.min)) + coef <- coef[-1] + rank_col <- as.list(which(coef!=0, arr.ind = TRUE)) + } +} + + +hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_lists = NULL, foldid = NULL, offset = NULL, verbose = FALSE){ + n <- length(y) + p <- ncol(x) + + if (is.null(col_lists)) { + col_lists <- as.list(seq_len(p))# seq_len=(1,2,...,p) + } + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + good_cols <- unlist(col_lists) + interaction_col_lists <- list() + x_interaction_basis <- x + if (actual_max_degree >= 2) { + for (degree in 2:actual_max_degree) { + combs <- utils::combn(length(good_cols), degree) + degree_lists <- lapply(seq_len(ncol(combs)), function(col) good_cols[combs[, col]]) + interaction_col_lists <- c(interaction_col_lists, degree_lists) + for (col in seq_len(ncol(combs))) { + x_interaction <- matrix(1, ncol = 1, nrow = n) + for (row in combs[,col]) { + x_interaction <- x_interaction*x[,row] + } + x_interaction_basis <- cbind(x_interaction_basis, x_interaction) + } + }# get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] 
+ x_basis_lists<-as.list(matrix(0,ncol = length(col_lists)+length(interaction_col_lists))) + for (i in 1:length(x_basis_lists)) { + if (i<=length(col_lists)){ + x_basis_lists[[i]]<-col_lists[[i]] + } + else{ + x_basis_lists[[i]]<-interaction_col_lists[[i-length(col_lists)]] + } + }# get list((1,..)(12,13,...)(123,..)) + screened_rank <- hal_screen_rank(x_interaction_basis, y, k = k, + family = family, + foldid = foldid, + offset = offset) + screened_col <- lapply(screened_rank, function(x) x_basis_lists[[x]]) + set_interaction <- list() + set_mainterm <- list() + for(i in 1:length(screened_col)){ + if (length(screened_col[[i]])!=1){ + set_interaction <- c(set_interaction, as.list(screened_col[[i]])) + } + else{set_mainterm <- c(set_mainterm, as.list(screened_col[[i]]))}#get set of main terms + } + set_interaction <- set_interaction[!duplicated(set_interaction)]#get set of main terms that build all the interaction + screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm))#include all the main terms that build the interaction terms + } + else {screened_rank <- hal_screen_rank(x, y, k = k, + family = family, + foldid = foldid, + offset = offset) + screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) + } + return(screened_col) +}# find the K basis function + +hal_screen_output<-function(x, y, family, col_lists, foldid = NULL, offset = NULL){#generate K*n basis function and do regular lasso + n <- length(y)# length of y + p <- ncol(x)# column number of x + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + col_results <- list() + x_basis<-matrix(nrow = n, ncol = 1) + + for (i in seq_along(col_lists)) {# i from 1 to p + col_list <- col_lists[[i]] + basis_list <- basis_list_cols(col_list, x) #one by one generate basis_list + x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions + } + x_basis<-as.matrix(x_basis[,-1]) + 
screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, foldid = foldid)# do regular lasso for k*n basis functions + + + lambda_min <- screen_goodcols$lambda.min + lambda_1se <- screen_goodcols$lambda.1se + coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') + coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists + + pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = lambda_1se, newoffset = offset) + mse <- mean((pred - y)^2) + + col_result <- list( + coef_list = list(coef_list), + lambda_min = lambda_min, + lambda_1se = lambda_1se, + fit_performance = mse, + time = proc.time() + #TODO: calculate running time + ) + return(col_result) +} + diff --git a/test-hal_screen2.R b/test-hal_screen2.R new file mode 100644 index 00000000..3295c12b --- /dev/null +++ b/test-hal_screen2.R @@ -0,0 +1,57 @@ +devtools::uses_testthat() + +context("Unit test for HAL screening procedure") + +library(hal9001) +set.seed(749125) + +n <- 100 +p <- 3 +x <- xmat <- matrix(rnorm(n * p), n, p) +y <- sin(x[, 1]) * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) + +rank_b <- cv.glmnet(x, y, family = "gaussian") +coef_list <- as.list(coef(rank_b, rank_b$lambda.min)) +coef_list <- coef_list[-1] +select_list <- list(which(coef_list!=0, arr.ind = TRUE))# get selected columns +select_list <- as.list(select_list[[1]]) +select_rank1 <- hal_screen_rank(x, y, family = 'gaussian', k = length(select_list)) + +test_that("Rank function works properly with k(k!=NULL)", { + expect_equal(select_list, select_rank1)#k=length(select_list), equal +}) + +select_rank2 <- hal_screen_rank(x, y, family = 'gaussian') + +test_that("Rank function works properly without k", { + expect_equal(select_list, select_rank2)#k=NULL, equal +}) + +x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction +x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists +goodbasis <- hal_screen_goodbasis(x, y, 
actual_max_degree = 2, k = 6, family = 'gaussian') + +test_that("Goodbasis function works properly with interaction", { + expect_equal(x_basis_lists, goodbasis)#when k=6, they must be equal, all columns would be selected +}) + +x_basis<-matrix(nrow = n, ncol = 1) + +for (i in seq_along(x_basis_lists)) { + col_list <- x_basis_lists[[i]] + basis_list <- basis_list_cols(col_list, x) + x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions +} +x_basis<-as.matrix(x_basis[,-1]) + +screen_goodcols <- cv.glmnet(x_basis, y, family = 'gaussian') +lambda_min <- screen_goodcols$lambda.min +pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = screen_goodcols$lambda.1se, newoffset = offset) +mse <- mean((pred - y)^2) + +output_result <- hal_screen_output(x, y, family = 'gaussian', col_lists = goodbasis) + +test_that("Output function works properly with interaction", { + expect_equal(lambda_min, output_result$lambda_min) + expect_equal(mse, output_result$fit_performance) +}) From f2d8f36948c25b63793885814ace939ee1a7fe27 Mon Sep 17 00:00:00 2001 From: YuZhang2019 Date: Wed, 4 Dec 2019 15:22:24 -0800 Subject: [PATCH 02/19] new changes --- hal_screen2.R | 154 --------------------------------------------- test-hal_screen2.R | 57 ----------------- 2 files changed, 211 deletions(-) delete mode 100644 hal_screen2.R delete mode 100644 test-hal_screen2.R diff --git a/hal_screen2.R b/hal_screen2.R deleted file mode 100644 index 4253232a..00000000 --- a/hal_screen2.R +++ /dev/null @@ -1,154 +0,0 @@ -#step1:do regular lasso for main term functions and their interaction functions -#step2:rank those basis functions based on their speed to become zero and choose k top basis functions -#step3:generate K*n basis functions and do regular lasso -#step4:output the fitting results, mean square error and the running time of step3 - -# hal_screen_goodbasis is aimed to screen main term functions and their interaction functions x1*x2,x1*x2*x3,etc -# 
hal_screen_rank is aimed to rank all the covariates based on their speed to become zero -# hal_screen_output is aimed to do regular lasso for K*n basis function and output the fitting performance and running time - -hal_screen_rank<-function(x, y, family, k = NULL, foldid = NULL, offset = NULL){ - n <- length(y)# length of y - p <- ncol(x)# column number of x - - if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - - rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, offset = offset) - rank_col <- list() - if (!is.null(k)){ - for (i in 1:length(rank_basis$lambda)) { - lambda <- as.matrix(rank_basis$lambda)# decreasing lambda - lam <- lambda[i] - coef <- as.list(coef(rank_basis, lam))# coef with lambda[i] - coef <- coef[-1]# remove the first row(intercept) - keep <- list(which(coef!=0, arr.ind = TRUE))# return the row which coef!=0 - rank_col <- c(rank_col, keep) - rank_col <- rank_col[!duplicated(rank_col)]#remove duplicated rows - } - rank_col <- rank_col[-1] #remove the last interger(0) row - rank_col <- rank_col[[k]] - rank_col <- lapply(rank_col, function(x) x=x) - } - else { - coef <- as.list(coef(rank_basis, rank_basis$lambda.min)) - coef <- coef[-1] - rank_col <- as.list(which(coef!=0, arr.ind = TRUE)) - } -} - - -hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_lists = NULL, foldid = NULL, offset = NULL, verbose = FALSE){ - n <- length(y) - p <- ncol(x) - - if (is.null(col_lists)) { - col_lists <- as.list(seq_len(p))# seq_len=(1,2,...,p) - } - - if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - - good_cols <- unlist(col_lists) - interaction_col_lists <- list() - x_interaction_basis <- x - if (actual_max_degree >= 2) { - for (degree in 2:actual_max_degree) { - combs <- utils::combn(length(good_cols), degree) - degree_lists <- lapply(seq_len(ncol(combs)), 
function(col) good_cols[combs[, col]]) - interaction_col_lists <- c(interaction_col_lists, degree_lists) - for (col in seq_len(ncol(combs))) { - x_interaction <- matrix(1, ncol = 1, nrow = n) - for (row in combs[,col]) { - x_interaction <- x_interaction*x[,row] - } - x_interaction_basis <- cbind(x_interaction_basis, x_interaction) - } - }# get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] - x_basis_lists<-as.list(matrix(0,ncol = length(col_lists)+length(interaction_col_lists))) - for (i in 1:length(x_basis_lists)) { - if (i<=length(col_lists)){ - x_basis_lists[[i]]<-col_lists[[i]] - } - else{ - x_basis_lists[[i]]<-interaction_col_lists[[i-length(col_lists)]] - } - }# get list((1,..)(12,13,...)(123,..)) - screened_rank <- hal_screen_rank(x_interaction_basis, y, k = k, - family = family, - foldid = foldid, - offset = offset) - screened_col <- lapply(screened_rank, function(x) x_basis_lists[[x]]) - set_interaction <- list() - set_mainterm <- list() - for(i in 1:length(screened_col)){ - if (length(screened_col[[i]])!=1){ - set_interaction <- c(set_interaction, as.list(screened_col[[i]])) - } - else{set_mainterm <- c(set_mainterm, as.list(screened_col[[i]]))}#get set of main terms - } - set_interaction <- set_interaction[!duplicated(set_interaction)]#get set of main terms that build all the interaction - screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm))#include all the main terms that build the interaction terms - } - else {screened_rank <- hal_screen_rank(x, y, k = k, - family = family, - foldid = foldid, - offset = offset) - screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) - } - return(screened_col) -}# find the K basis function - -hal_screen_output<-function(x, y, family, col_lists, foldid = NULL, offset = NULL){#generate K*n basis function and do regular lasso - n <- length(y)# length of y - p <- ncol(x)# column number of x - - if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- 
rep(mean(y), n) - } - - col_results <- list() - x_basis<-matrix(nrow = n, ncol = 1) - - for (i in seq_along(col_lists)) {# i from 1 to p - col_list <- col_lists[[i]] - basis_list <- basis_list_cols(col_list, x) #one by one generate basis_list - x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions - } - x_basis<-as.matrix(x_basis[,-1]) - screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, foldid = foldid)# do regular lasso for k*n basis functions - - - lambda_min <- screen_goodcols$lambda.min - lambda_1se <- screen_goodcols$lambda.1se - coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') - coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists - - pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = lambda_1se, newoffset = offset) - mse <- mean((pred - y)^2) - - col_result <- list( - coef_list = list(coef_list), - lambda_min = lambda_min, - lambda_1se = lambda_1se, - fit_performance = mse, - time = proc.time() - #TODO: calculate running time - ) - return(col_result) -} - diff --git a/test-hal_screen2.R b/test-hal_screen2.R deleted file mode 100644 index 3295c12b..00000000 --- a/test-hal_screen2.R +++ /dev/null @@ -1,57 +0,0 @@ -devtools::uses_testthat() - -context("Unit test for HAL screening procedure") - -library(hal9001) -set.seed(749125) - -n <- 100 -p <- 3 -x <- xmat <- matrix(rnorm(n * p), n, p) -y <- sin(x[, 1]) * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) - -rank_b <- cv.glmnet(x, y, family = "gaussian") -coef_list <- as.list(coef(rank_b, rank_b$lambda.min)) -coef_list <- coef_list[-1] -select_list <- list(which(coef_list!=0, arr.ind = TRUE))# get selected columns -select_list <- as.list(select_list[[1]]) -select_rank1 <- hal_screen_rank(x, y, family = 'gaussian', k = length(select_list)) - -test_that("Rank function works properly with k(k!=NULL)", { - expect_equal(select_list, select_rank1)#k=length(select_list), equal -}) - -select_rank2 <- hal_screen_rank(x, y, family 
= 'gaussian') - -test_that("Rank function works properly without k", { - expect_equal(select_list, select_rank2)#k=NULL, equal -}) - -x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction -x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists -goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = 6, family = 'gaussian') - -test_that("Goodbasis function works properly with interaction", { - expect_equal(x_basis_lists, goodbasis)#when k=6, they must be equal, all columns would be selected -}) - -x_basis<-matrix(nrow = n, ncol = 1) - -for (i in seq_along(x_basis_lists)) { - col_list <- x_basis_lists[[i]] - basis_list <- basis_list_cols(col_list, x) - x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions -} -x_basis<-as.matrix(x_basis[,-1]) - -screen_goodcols <- cv.glmnet(x_basis, y, family = 'gaussian') -lambda_min <- screen_goodcols$lambda.min -pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = screen_goodcols$lambda.1se, newoffset = offset) -mse <- mean((pred - y)^2) - -output_result <- hal_screen_output(x, y, family = 'gaussian', col_lists = goodbasis) - -test_that("Output function works properly with interaction", { - expect_equal(lambda_min, output_result$lambda_min) - expect_equal(mse, output_result$fit_performance) -}) From c5b593ce48da9e9416ac133a309629dd07d94f77 Mon Sep 17 00:00:00 2001 From: YuZhang2019 Date: Wed, 4 Dec 2019 15:36:42 -0800 Subject: [PATCH 03/19] . 
--- R/hal_screen2.R | 154 ++++++++++++++++++++++++++++++ tests/testthat/test-hal_screen2.R | 57 +++++++++++ 2 files changed, 211 insertions(+) create mode 100644 R/hal_screen2.R create mode 100644 tests/testthat/test-hal_screen2.R diff --git a/R/hal_screen2.R b/R/hal_screen2.R new file mode 100644 index 00000000..4253232a --- /dev/null +++ b/R/hal_screen2.R @@ -0,0 +1,154 @@ +#step1:do regular lasso for main term functions and their interaction functions +#step2:rank those basis functions based on their speed to become zero and choose k top basis functions +#step3:generate K*n basis functions and do regular lasso +#step4:output the fitting results, mean square error and the running time of step3 + +# hal_screen_goodbasis is aimed to screen main term functions and their interaction functions x1*x2,x1*x2*x3,etc +# hal_screen_rank is aimed to rank all the covariates based on their speed to become zero +# hal_screen_output is aimed to do regular lasso for K*n basis function and output the fitting performance and running time + +hal_screen_rank<-function(x, y, family, k = NULL, foldid = NULL, offset = NULL){ + n <- length(y)# length of y + p <- ncol(x)# column number of x + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, offset = offset) + rank_col <- list() + if (!is.null(k)){ + for (i in 1:length(rank_basis$lambda)) { + lambda <- as.matrix(rank_basis$lambda)# decreasing lambda + lam <- lambda[i] + coef <- as.list(coef(rank_basis, lam))# coef with lambda[i] + coef <- coef[-1]# remove the first row(intercept) + keep <- list(which(coef!=0, arr.ind = TRUE))# return the row which coef!=0 + rank_col <- c(rank_col, keep) + rank_col <- rank_col[!duplicated(rank_col)]#remove duplicated rows + } + rank_col <- rank_col[-1] #remove the last interger(0) row + rank_col <- rank_col[[k]] + rank_col <- lapply(rank_col, function(x) x=x) 
+ } + else { + coef <- as.list(coef(rank_basis, rank_basis$lambda.min)) + coef <- coef[-1] + rank_col <- as.list(which(coef!=0, arr.ind = TRUE)) + } +} + + +hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_lists = NULL, foldid = NULL, offset = NULL, verbose = FALSE){ + n <- length(y) + p <- ncol(x) + + if (is.null(col_lists)) { + col_lists <- as.list(seq_len(p))# seq_len=(1,2,...,p) + } + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + good_cols <- unlist(col_lists) + interaction_col_lists <- list() + x_interaction_basis <- x + if (actual_max_degree >= 2) { + for (degree in 2:actual_max_degree) { + combs <- utils::combn(length(good_cols), degree) + degree_lists <- lapply(seq_len(ncol(combs)), function(col) good_cols[combs[, col]]) + interaction_col_lists <- c(interaction_col_lists, degree_lists) + for (col in seq_len(ncol(combs))) { + x_interaction <- matrix(1, ncol = 1, nrow = n) + for (row in combs[,col]) { + x_interaction <- x_interaction*x[,row] + } + x_interaction_basis <- cbind(x_interaction_basis, x_interaction) + } + }# get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] 
+ x_basis_lists<-as.list(matrix(0,ncol = length(col_lists)+length(interaction_col_lists))) + for (i in 1:length(x_basis_lists)) { + if (i<=length(col_lists)){ + x_basis_lists[[i]]<-col_lists[[i]] + } + else{ + x_basis_lists[[i]]<-interaction_col_lists[[i-length(col_lists)]] + } + }# get list((1,..)(12,13,...)(123,..)) + screened_rank <- hal_screen_rank(x_interaction_basis, y, k = k, + family = family, + foldid = foldid, + offset = offset) + screened_col <- lapply(screened_rank, function(x) x_basis_lists[[x]]) + set_interaction <- list() + set_mainterm <- list() + for(i in 1:length(screened_col)){ + if (length(screened_col[[i]])!=1){ + set_interaction <- c(set_interaction, as.list(screened_col[[i]])) + } + else{set_mainterm <- c(set_mainterm, as.list(screened_col[[i]]))}#get set of main terms + } + set_interaction <- set_interaction[!duplicated(set_interaction)]#get set of main terms that build all the interaction + screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm))#include all the main terms that build the interaction terms + } + else {screened_rank <- hal_screen_rank(x, y, k = k, + family = family, + foldid = foldid, + offset = offset) + screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) + } + return(screened_col) +}# find the K basis function + +hal_screen_output<-function(x, y, family, col_lists, foldid = NULL, offset = NULL){#generate K*n basis function and do regular lasso + n <- length(y)# length of y + p <- ncol(x)# column number of x + + if (is.null(foldid)) { + foldid <- sample(1:5, n, replace = TRUE) + } + + if (is.null(offset)) { + offset <- rep(mean(y), n) + } + + col_results <- list() + x_basis<-matrix(nrow = n, ncol = 1) + + for (i in seq_along(col_lists)) {# i from 1 to p + col_list <- col_lists[[i]] + basis_list <- basis_list_cols(col_list, x) #one by one generate basis_list + x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions + } + x_basis<-as.matrix(x_basis[,-1]) + 
screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, foldid = foldid)# do regular lasso for k*n basis functions + + + lambda_min <- screen_goodcols$lambda.min + lambda_1se <- screen_goodcols$lambda.1se + coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') + coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists + + pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = lambda_1se, newoffset = offset) + mse <- mean((pred - y)^2) + + col_result <- list( + coef_list = list(coef_list), + lambda_min = lambda_min, + lambda_1se = lambda_1se, + fit_performance = mse, + time = proc.time() + #TODO: calculate running time + ) + return(col_result) +} + diff --git a/tests/testthat/test-hal_screen2.R b/tests/testthat/test-hal_screen2.R new file mode 100644 index 00000000..3295c12b --- /dev/null +++ b/tests/testthat/test-hal_screen2.R @@ -0,0 +1,57 @@ +devtools::uses_testthat() + +context("Unit test for HAL screening procedure") + +library(hal9001) +set.seed(749125) + +n <- 100 +p <- 3 +x <- xmat <- matrix(rnorm(n * p), n, p) +y <- sin(x[, 1]) * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) + +rank_b <- cv.glmnet(x, y, family = "gaussian") +coef_list <- as.list(coef(rank_b, rank_b$lambda.min)) +coef_list <- coef_list[-1] +select_list <- list(which(coef_list!=0, arr.ind = TRUE))# get selected columns +select_list <- as.list(select_list[[1]]) +select_rank1 <- hal_screen_rank(x, y, family = 'gaussian', k = length(select_list)) + +test_that("Rank function works properly with k(k!=NULL)", { + expect_equal(select_list, select_rank1)#k=length(select_list), equal +}) + +select_rank2 <- hal_screen_rank(x, y, family = 'gaussian') + +test_that("Rank function works properly without k", { + expect_equal(select_list, select_rank2)#k=NULL, equal +}) + +x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction +x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists 
+goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = 6, family = 'gaussian') + +test_that("Goodbasis function works properly with interaction", { + expect_equal(x_basis_lists, goodbasis)#when k=6, they must be equal, all columns would be selected +}) + +x_basis<-matrix(nrow = n, ncol = 1) + +for (i in seq_along(x_basis_lists)) { + col_list <- x_basis_lists[[i]] + basis_list <- basis_list_cols(col_list, x) + x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions +} +x_basis<-as.matrix(x_basis[,-1]) + +screen_goodcols <- cv.glmnet(x_basis, y, family = 'gaussian') +lambda_min <- screen_goodcols$lambda.min +pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = screen_goodcols$lambda.1se, newoffset = offset) +mse <- mean((pred - y)^2) + +output_result <- hal_screen_output(x, y, family = 'gaussian', col_lists = goodbasis) + +test_that("Output function works properly with interaction", { + expect_equal(lambda_min, output_result$lambda_min) + expect_equal(mse, output_result$fit_performance) +}) From 02346d2618c7bc1a803917355a2435b1bd5810df Mon Sep 17 00:00:00 2001 From: Jeremy Coyle Date: Wed, 11 Dec 2019 10:52:38 -0800 Subject: [PATCH 04/19] small screening fixes --- R/hal.R | 18 ++---- R/hal_screen2.R | 104 ++++++++++++++---------------- tests/testthat/test-hal_screen2.R | 71 +++++++++++++------- 3 files changed, 103 insertions(+), 90 deletions(-) diff --git a/R/hal.R b/R/hal.R index 9c77454d..e0a49cb1 100644 --- a/R/hal.R +++ b/R/hal.R @@ -145,17 +145,13 @@ fit_hal <- function(X, # make design matrix for HAL if (is.null(basis_list)) { if (screen_basis) { - # NOTE: foldid is never missing since created above if not supplied - good_basis <- hal_screen_basis( - x = X, - y = Y, - family = family, - offset = offset, - foldid = foldid, - max_degree = max_degree - ) - basis_lists <- lapply(good_basis, basis_list_cols, X) - basis_list <- unlist(basis_lists, recursive = FALSE) + selected_cols <- hal_screen_goodbasis(x, 
y, actual_max_degree = max_degree, k = NULL, family = 'gaussian') + basis_list <- c() + for (i in seq_along(selected_cols)) { + col_list <- selected_cols[[i]] + basis_list <- c(basis_list,basis_list_cols(col_list, x)) + + } } else { basis_list <- enumerate_basis(X, max_degree) } diff --git a/R/hal_screen2.R b/R/hal_screen2.R index 4253232a..a4b54d2f 100644 --- a/R/hal_screen2.R +++ b/R/hal_screen2.R @@ -10,55 +10,50 @@ hal_screen_rank<-function(x, y, family, k = NULL, foldid = NULL, offset = NULL){ n <- length(y)# length of y p <- ncol(x)# column number of x - + if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - + rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, offset = offset) - rank_col <- list() + if (!is.null(k)){ - for (i in 1:length(rank_basis$lambda)) { - lambda <- as.matrix(rank_basis$lambda)# decreasing lambda - lam <- lambda[i] - coef <- as.list(coef(rank_basis, lam))# coef with lambda[i] - coef <- coef[-1]# remove the first row(intercept) - keep <- list(which(coef!=0, arr.ind = TRUE))# return the row which coef!=0 - rank_col <- c(rank_col, keep) - rank_col <- rank_col[!duplicated(rank_col)]#remove duplicated rows - } - rank_col <- rank_col[-1] #remove the last interger(0) row - rank_col <- rank_col[[k]] - rank_col <- lapply(rank_col, function(x) x=x) - } - else { - coef <- as.list(coef(rank_basis, rank_basis$lambda.min)) - coef <- coef[-1] - rank_col <- as.list(which(coef!=0, arr.ind = TRUE)) + coef_mat <- coef(rank_basis, rank_basis$lambda) + coef_mat <- coef_mat[-1,] + first_nz_lambda <- apply(coef_mat!=0,1,function(x)which(x)[1]) + rank_col <- order(first_nz_lambda) + select_col <- rank_col[1:k] + } else { + select_coefs <- coef(rank_basis, rank_basis$lambda.min) + select_coefs <- select_coefs[-1] + select_col <- which(select_coefs!=0) } + + return(select_col) + } hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_lists = NULL, foldid = 
NULL, offset = NULL, verbose = FALSE){ n <- length(y) p <- ncol(x) - + if (is.null(col_lists)) { col_lists <- as.list(seq_len(p))# seq_len=(1,2,...,p) } - + if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - + good_cols <- unlist(col_lists) interaction_col_lists <- list() x_interaction_basis <- x @@ -104,51 +99,52 @@ hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_li family = family, foldid = foldid, offset = offset) - screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) - } + screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) + } return(screened_col) }# find the K basis function hal_screen_output<-function(x, y, family, col_lists, foldid = NULL, offset = NULL){#generate K*n basis function and do regular lasso n <- length(y)# length of y p <- ncol(x)# column number of x - + if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - + col_results <- list() - x_basis<-matrix(nrow = n, ncol = 1) - + + basis_list <- c() + for (i in seq_along(col_lists)) {# i from 1 to p col_list <- col_lists[[i]] - basis_list <- basis_list_cols(col_list, x) #one by one generate basis_list - x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions + basis_list <- c(basis_list,basis_list_cols(col_list, x)) #one by one generate basis_list + } - x_basis<-as.matrix(x_basis[,-1]) + x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, foldid = foldid)# do regular lasso for k*n basis functions - - - lambda_min <- screen_goodcols$lambda.min - lambda_1se <- screen_goodcols$lambda.1se - coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') - coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists - - pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = 
lambda_1se, newoffset = offset) - mse <- mean((pred - y)^2) - - col_result <- list( - coef_list = list(coef_list), - lambda_min = lambda_min, - lambda_1se = lambda_1se, - fit_performance = mse, - time = proc.time() - #TODO: calculate running time - ) - return(col_result) + + + lambda_min <- screen_goodcols$lambda.min + lambda_1se <- screen_goodcols$lambda.1se + coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') + coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists + + pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = lambda_1se, newoffset = offset) + mse <- mean((pred - y)^2) + + col_result <- list( + coef_list = list(coef_list), + lambda_min = lambda_min, + lambda_1se = lambda_1se, + fit_performance = mse, + time = proc.time() + #TODO: calculate running time + ) + return(col_result) } diff --git a/tests/testthat/test-hal_screen2.R b/tests/testthat/test-hal_screen2.R index 3295c12b..98abaa3d 100644 --- a/tests/testthat/test-hal_screen2.R +++ b/tests/testthat/test-hal_screen2.R @@ -3,6 +3,7 @@ devtools::uses_testthat() context("Unit test for HAL screening procedure") library(hal9001) +library(glmnet) set.seed(749125) n <- 100 @@ -10,48 +11,68 @@ p <- 3 x <- xmat <- matrix(rnorm(n * p), n, p) y <- sin(x[, 1]) * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) -rank_b <- cv.glmnet(x, y, family = "gaussian") -coef_list <- as.list(coef(rank_b, rank_b$lambda.min)) -coef_list <- coef_list[-1] -select_list <- list(which(coef_list!=0, arr.ind = TRUE))# get selected columns -select_list <- as.list(select_list[[1]]) -select_rank1 <- hal_screen_rank(x, y, family = 'gaussian', k = length(select_list)) +testn <- 10000 +testx <- xmat <- matrix(rnorm(testn * p), testn, p) +testy <- sin(x[, 1]) * sin(x[, 2]) + rnorm(testn, mean = 0, sd = 0.2) +select_list <- 2 +select_rank1 <- hal_screen_rank(x, y, k=1, family = 'gaussian') test_that("Rank function works properly with k(k!=NULL)", { expect_equal(select_list, select_rank1)#k=length(select_list), equal 
}) +select_list <- c(2,3) select_rank2 <- hal_screen_rank(x, y, family = 'gaussian') test_that("Rank function works properly without k", { expect_equal(select_list, select_rank2)#k=NULL, equal }) -x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction -x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists -goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = 6, family = 'gaussian') +# x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction +# x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists +x_basis_lists <- list(1,2,c(1,2)) +goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = NULL, family = 'gaussian') + test_that("Goodbasis function works properly with interaction", { - expect_equal(x_basis_lists, goodbasis)#when k=6, they must be equal, all columns would be selected + x_basis_str <- lapply(x_basis_lists, paste, collapse=",") + goodbasis_str <- lapply(goodbasis, paste, collapse=",") + expect_setequal(x_basis_str, goodbasis_str)#when k=6, they must be equal, all columns would be selected }) +# +# x_basis<-matrix(nrow = n, ncol = 1) +# +# basis_list <- c() +# for (i in seq_along(x_basis_lists)) { +# col_list <- x_basis_lists[[i]] +# basis_list <- c(basis_list,basis_list_cols(col_list, x)) +# +# } +# +# x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions +# +# test_x_basis <- make_design_matrix(testx, basis_list) + +hal_with_screening <- fit_hal(x,y,screen_basis=TRUE) +hal_without_screening <- fit_hal(x,y,screen_basis=FALSE) -x_basis<-matrix(nrow = n, ncol = 1) +preds <- predict(hal_with_screening, new_data=testx) +mse_w_screening <- mean((preds - testy)^2) +preds <- predict(hal_without_screening, new_data=testx) +mse_wo_screening <- mean((preds - testy)^2) -for (i in seq_along(x_basis_lists)) { - col_list <- x_basis_lists[[i]] - basis_list <- 
basis_list_cols(col_list, x) - x_basis <- cbind(x_basis, make_design_matrix(x, basis_list))#generate k*n basis functions -} -x_basis<-as.matrix(x_basis[,-1]) +hal_with_screening$times +hal_without_screening$times -screen_goodcols <- cv.glmnet(x_basis, y, family = 'gaussian') -lambda_min <- screen_goodcols$lambda.min -pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, s = screen_goodcols$lambda.1se, newoffset = offset) -mse <- mean((pred - y)^2) -output_result <- hal_screen_output(x, y, family = 'gaussian', col_lists = goodbasis) +test_that("screening makes things faster", { + with_time <- hal_with_screening$times["total","elapsed"] + wo_time <- hal_without_screening$times["total","elapsed"] + expect_lt(with_time,wo_time) +}) -test_that("Output function works properly with interaction", { - expect_equal(lambda_min, output_result$lambda_min) - expect_equal(mse, output_result$fit_performance) +test_that("screening doesn't hurt mse too much", { + + expect_lt(mse_w_screening,mse_wo_screening*1.2) }) + From 6b187463c8c5b26aee2f839eb7305d2e4c689a6b Mon Sep 17 00:00:00 2001 From: Jeremy Coyle Date: Wed, 11 Dec 2019 11:22:03 -0800 Subject: [PATCH 05/19] fix tests --- R/hal.R | 4 ++-- tests/testthat/test-hal_screen2.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/hal.R b/R/hal.R index e0a49cb1..64f0c5da 100644 --- a/R/hal.R +++ b/R/hal.R @@ -145,11 +145,11 @@ fit_hal <- function(X, # make design matrix for HAL if (is.null(basis_list)) { if (screen_basis) { - selected_cols <- hal_screen_goodbasis(x, y, actual_max_degree = max_degree, k = NULL, family = 'gaussian') + selected_cols <- hal_screen_goodbasis(X, Y, actual_max_degree = max_degree, k = NULL, family = 'gaussian') basis_list <- c() for (i in seq_along(selected_cols)) { col_list <- selected_cols[[i]] - basis_list <- c(basis_list,basis_list_cols(col_list, x)) + basis_list <- c(basis_list,basis_list_cols(col_list, X)) } } else { diff --git a/tests/testthat/test-hal_screen2.R 
b/tests/testthat/test-hal_screen2.R index 98abaa3d..2cfcdb0e 100644 --- a/tests/testthat/test-hal_screen2.R +++ b/tests/testthat/test-hal_screen2.R @@ -7,13 +7,13 @@ library(glmnet) set.seed(749125) n <- 100 -p <- 3 +p <- 5 x <- xmat <- matrix(rnorm(n * p), n, p) -y <- sin(x[, 1]) * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) +y <- 10*x[, 1]+ 5*x[,2]+ 6*x[,1]*x[,2] + rnorm(n, mean = 0, sd = 0.2) testn <- 10000 testx <- xmat <- matrix(rnorm(testn * p), testn, p) -testy <- sin(x[, 1]) * sin(x[, 2]) + rnorm(testn, mean = 0, sd = 0.2) +testy <- 10*testx[, 1]+ 5*testx[,2]+ 6*testx[,1]*testx[,2] + rnorm(n, mean = 0, sd = 0.2) select_list <- 2 select_rank1 <- hal_screen_rank(x, y, k=1, family = 'gaussian') From 96f6f1acf79c8f42e7ece1001527a76729b4f065 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 17 Dec 2019 16:52:04 -0800 Subject: [PATCH 06/19] bump version for devel --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c81ed3d5..86038559 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hal9001 Title: The Scalable Highly Adaptive Lasso -Version: 0.2.5 +Version: 0.2.6 Authors@R: c( person("Jeremy", "Coyle", email = "jeremyrcoyle@gmail.com", role = c("aut", "cre"), @@ -60,4 +60,4 @@ LinkingTo: Rcpp, RcppEigen VignetteBuilder: knitr -RoxygenNote: 7.0.1 +RoxygenNote: 7.0.2 From de8def3693f468f33eb6e3d947a834e7d4ab47ec Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Sat, 4 Apr 2020 15:43:46 -0700 Subject: [PATCH 07/19] bug fixes --- DESCRIPTION | 2 +- R/hal.R | 18 ++- R/hal_screen2.R | 166 ++++++++++++++----------- R/predict.R | 20 ++- man/hal_quotes.Rd | 4 +- man/predict.hal9001.Rd | 17 ++- tests/testthat/test-hal_screen2.R | 60 +++++---- tests/testthat/test-stat_performance.R | 9 +- 8 files changed, 175 insertions(+), 121 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7df8fda4..9c90939a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,4 @@ LinkingTo: Rcpp, RcppEigen 
VignetteBuilder: knitr -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 diff --git a/R/hal.R b/R/hal.R index 57506c2b..391d503f 100644 --- a/R/hal.R +++ b/R/hal.R @@ -116,12 +116,21 @@ fit_hal <- function(X, screen_lambda = FALSE, ..., yolo = TRUE) { - # check arguments and catch function call call <- match.call(expand.dots = TRUE) fit_type <- match.arg(fit_type) family <- match.arg(family) + # catch dot arguments to stop misuse of glmnet's `lambda.min.ratio` + dot_args <- list(...) + if ("lambda.min.ratio" %in% names(dot_args) & family == "binomial") { + msg <- paste( + "`glmnet` silently ignores `lambda.min.ratio` when", + "`family = 'binomial'`." + ) + stop(msg) + } + # NOTE: NOT supporting binomial outcomes with lassi method currently if (fit_type == "lassi" && family == "binomial") { stop("For binary outcomes, please set argument 'fit_type' to 'glmnet'.") @@ -155,12 +164,13 @@ fit_hal <- function(X, # make design matrix for HAL if (is.null(basis_list)) { if (screen_basis) { - selected_cols <- hal_screen_goodbasis(X, Y, actual_max_degree = max_degree, k = NULL, family = 'gaussian') + selected_cols <- hal_screen_goodbasis(X, Y, + actual_max_degree = max_degree, + k = NULL, family = "gaussian") basis_list <- c() for (i in seq_along(selected_cols)) { col_list <- selected_cols[[i]] - basis_list <- c(basis_list,basis_list_cols(col_list, X)) - + basis_list <- c(basis_list, basis_list_cols(col_list, X)) } } else { basis_list <- enumerate_basis(X, max_degree) diff --git a/R/hal_screen2.R b/R/hal_screen2.R index a4b54d2f..6e413469 100644 --- a/R/hal_screen2.R +++ b/R/hal_screen2.R @@ -1,150 +1,168 @@ -#step1:do regular lasso for main term functions and their interaction functions -#step2:rank those basis functions based on their speed to become zero and choose k top basis functions -#step3:generate K*n basis functions and do regular lasso -#step4:output the fitting results, mean square error and the running time of step3 +# step1:do regular lasso for main term functions and 
their interaction functions +# step2:rank those basis functions based on their speed to become zero and choose k top basis functions +# step3:generate K*n basis functions and do regular lasso +# step4:output the fitting results, mean square error and the running time of step3 # hal_screen_goodbasis is aimed to screen main term functions and their interaction functions x1*x2,x1*x2*x3,etc # hal_screen_rank is aimed to rank all the covariates based on their speed to become zero # hal_screen_output is aimed to do regular lasso for K*n basis function and output the fitting performance and running time -hal_screen_rank<-function(x, y, family, k = NULL, foldid = NULL, offset = NULL){ - n <- length(y)# length of y - p <- ncol(x)# column number of x - +hal_screen_rank <- function(x, y, family, k = NULL, foldid = NULL, + offset = NULL) { + n <- length(y) # length of y + p <- ncol(x) # column number of x + if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - - rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, offset = offset) - - if (!is.null(k)){ + rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, + offset = offset) + + if (!is.null(k)) { coef_mat <- coef(rank_basis, rank_basis$lambda) - coef_mat <- coef_mat[-1,] - first_nz_lambda <- apply(coef_mat!=0,1,function(x)which(x)[1]) + coef_mat <- coef_mat[-1, ] + first_nz_lambda <- apply(coef_mat != 0, 1, function(x) which(x)[1]) rank_col <- order(first_nz_lambda) select_col <- rank_col[1:k] } else { select_coefs <- coef(rank_basis, rank_basis$lambda.min) select_coefs <- select_coefs[-1] - select_col <- which(select_coefs!=0) + select_col <- which(select_coefs != 0) } - + return(select_col) - } -hal_screen_goodbasis<-function(x, y, actual_max_degree, k = NULL, family, col_lists = NULL, foldid = NULL, offset = NULL, verbose = FALSE){ +hal_screen_goodbasis <- function(x, y, actual_max_degree, k = NULL, family, + col_lists = NULL, foldid 
= NULL, + offset = NULL, verbose = FALSE) { n <- length(y) p <- ncol(x) - + if (is.null(col_lists)) { - col_lists <- as.list(seq_len(p))# seq_len=(1,2,...,p) + col_lists <- as.list(seq_len(p)) # seq_len=(1,2,...,p) } - + if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - + good_cols <- unlist(col_lists) interaction_col_lists <- list() x_interaction_basis <- x if (actual_max_degree >= 2) { for (degree in 2:actual_max_degree) { combs <- utils::combn(length(good_cols), degree) - degree_lists <- lapply(seq_len(ncol(combs)), function(col) good_cols[combs[, col]]) + degree_lists <- lapply(seq_len(ncol(combs)), function(col) { + good_cols[combs[, col]] + }) interaction_col_lists <- c(interaction_col_lists, degree_lists) for (col in seq_len(ncol(combs))) { x_interaction <- matrix(1, ncol = 1, nrow = n) - for (row in combs[,col]) { - x_interaction <- x_interaction*x[,row] + for (row in combs[, col]) { + x_interaction <- x_interaction * x[, row] } x_interaction_basis <- cbind(x_interaction_basis, x_interaction) } - }# get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] - x_basis_lists<-as.list(matrix(0,ncol = length(col_lists)+length(interaction_col_lists))) + } # get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] 
+ x_basis_lists <- as.list(matrix(0, ncol = length(col_lists) + + length(interaction_col_lists))) for (i in 1:length(x_basis_lists)) { - if (i<=length(col_lists)){ - x_basis_lists[[i]]<-col_lists[[i]] - } - else{ - x_basis_lists[[i]]<-interaction_col_lists[[i-length(col_lists)]] + if (i <= length(col_lists)) { + x_basis_lists[[i]] <- col_lists[[i]] + } else { + x_basis_lists[[i]] <- interaction_col_lists[[i - length(col_lists)]] } - }# get list((1,..)(12,13,...)(123,..)) - screened_rank <- hal_screen_rank(x_interaction_basis, y, k = k, - family = family, - foldid = foldid, - offset = offset) + } # get list((1,..)(12,13,...)(123,..)) + screened_rank <- hal_screen_rank(x_interaction_basis, y, + k = k, + family = family, + foldid = foldid, + offset = offset + ) screened_col <- lapply(screened_rank, function(x) x_basis_lists[[x]]) set_interaction <- list() set_mainterm <- list() - for(i in 1:length(screened_col)){ - if (length(screened_col[[i]])!=1){ - set_interaction <- c(set_interaction, as.list(screened_col[[i]])) + if (length(screened_col) > 0) { + for (i in seq_along(screened_col)) { + if (length(screened_col[[i]]) > 1) { + set_interaction <- c(set_interaction, as.list(screened_col[[i]])) + } else { + set_mainterm <- c(set_mainterm, as.list(screened_col[[i]])) + } # get set of main terms } - else{set_mainterm <- c(set_mainterm, as.list(screened_col[[i]]))}#get set of main terms } - set_interaction <- set_interaction[!duplicated(set_interaction)]#get set of main terms that build all the interaction - screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm))#include all the main terms that build the interaction terms - } - else {screened_rank <- hal_screen_rank(x, y, k = k, - family = family, - foldid = foldid, - offset = offset) - screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) + # get set of main terms that build all the interaction + set_interaction <- set_interaction[!duplicated(set_interaction)] + # include all the main terms 
that build the interaction terms + screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm)) + } else { + screened_rank <- hal_screen_rank(x, y, + k = k, + family = family, + foldid = foldid, + offset = offset + ) + screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) } return(screened_col) -}# find the K basis function +} + +# find the K basis function +# generate K*n basis function and do regular lasso +hal_screen_output <- function(x, y, family, col_lists, foldid = NULL, + offset = NULL) { + n <- length(y) # length of y + p <- ncol(x) # column number of x -hal_screen_output<-function(x, y, family, col_lists, foldid = NULL, offset = NULL){#generate K*n basis function and do regular lasso - n <- length(y)# length of y - p <- ncol(x)# column number of x - if (is.null(foldid)) { foldid <- sample(1:5, n, replace = TRUE) } - + if (is.null(offset)) { offset <- rep(mean(y), n) } - + col_results <- list() - + basis_list <- c() - - for (i in seq_along(col_lists)) {# i from 1 to p + + for (i in seq_along(col_lists)) { # i from 1 to p col_list <- col_lists[[i]] - basis_list <- c(basis_list,basis_list_cols(col_list, x)) #one by one generate basis_list - + # one by one generate basis_list + basis_list <- c(basis_list, basis_list_cols(col_list, x)) } - x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions - screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, foldid = foldid)# do regular lasso for k*n basis functions - - + # generate k*n basis functions + x_basis <- make_design_matrix(x, basis_list) + # do regular lasso for k*n basis functions + screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, + foldid = foldid) + lambda_min <- screen_goodcols$lambda.min lambda_1se <- screen_goodcols$lambda.1se - coef <- coef.cv.glmnet(screen_goodcols, s = 'lambda.1se') - coef_list <- list(which(!coef[-1] == 0))#find non-zero column lists - - pred <- predict.cv.glmnet(screen_goodcols, newx = x_basis, 
s = lambda_1se, newoffset = offset) + coef <- coef.cv.glmnet(screen_goodcols, s = "lambda.1se") + coef_list <- list(which(!coef[-1] == 0)) # find non-zero column lists + + pred <- predict(screen_goodcols, newx = x_basis, s = lambda_1se, + newoffset = offset) mse <- mean((pred - y)^2) - + col_result <- list( coef_list = list(coef_list), lambda_min = lambda_min, lambda_1se = lambda_1se, fit_performance = mse, time = proc.time() - #TODO: calculate running time + # TODO: calculate running time ) return(col_result) } - diff --git a/R/predict.R b/R/predict.R index 8ead6ea5..f2001656 100644 --- a/R/predict.R +++ b/R/predict.R @@ -5,11 +5,15 @@ #' \code{hal9001}. #' #' @param object An object of class \code{hal9001}, containing the results of -#' fitting the Highly Adaptive Lasso, as produced by a call to \code{fit_hal}. +#' fitting the Highly Adaptive Lasso, as produced by \code{\link{fit_hal}}. #' @param offset A vector of offsets. Must be provided if provided at training #' @param lambda A single lambda value or a vector of lambdas to use for #' prediction. If \code{NULL}, a value of lambda will be selected based on -#' cross-validation, using \code{\link[glmnet]{cv.glmnet}}. +#' cross-validation, using \code{\link[glmnet]{cv.glmnet}}. NOTE that this +#' does NOT provide similar functionality to the equivalent argument in the +#' \pkg{glmnet} method of \code{\link[stats]{predict}}; rather, this argument +#' is used internally to screen lambdas (see \code{screen_lambda} argument of +#' \code{\link{fit_hal}} for details). #' @param ... Additional arguments passed to \code{predict} as necessary. #' @param new_data A \code{matrix} or \code{data.frame} containing new data #' (observations NOT used in fitting the \code{hal9001} object passed in via @@ -25,6 +29,14 @@ #' #' @export #' +#' @note This prediction method does not function similarly to the equivalent +#' method from \pkg{glmnet}. 
In particular, specifying argument \code{lambda} +#' will not return a subset of the lambdas originally specified in the call to +#' \code{\link{fit_hal}} nor result in re-fitting. This prediction method will +#' return predictions for all lambdas specified in \code{\link{fit_hal}} when +#' \code{cv_select = FALSE}. When \code{cv_select = TRUE}, predictions will +#' only be returned for the value of lambda selected by cross-validation. +#' #' @return A \code{numeric} vector of predictions from a fitted \code{hal9001} #' object. predict.hal9001 <- function(object, @@ -89,8 +101,8 @@ predict.hal9001 <- function(object, ) + object$coefs[1]) } } else { - # Note: there is no intercept in the Cox mode (its built into the baseline - # hazard, and like it, would cancel in the partial likelihood.) + # Note: there is no intercept in the Cox model (built into the baseline + # hazard and would cancel in the partial likelihood). # message(paste("The Cox Model is not commonly used for prediction,", # "proceed with caution.")) if (ncol(object$coefs) > 1) { diff --git a/man/hal_quotes.Rd b/man/hal_quotes.Rd index 83e2191d..eb6edd20 100644 --- a/man/hal_quotes.Rd +++ b/man/hal_quotes.Rd @@ -4,7 +4,9 @@ \name{hal_quotes} \alias{hal_quotes} \title{HAL9000 Quotes from "2001: A Space Odyssey"} -\format{A vector of quotes.} +\format{ +A vector of quotes. +} \usage{ hal_quotes } diff --git a/man/predict.hal9001.Rd b/man/predict.hal9001.Rd index 5ccb9c39..163242d5 100644 --- a/man/predict.hal9001.Rd +++ b/man/predict.hal9001.Rd @@ -15,13 +15,17 @@ } \arguments{ \item{object}{An object of class \code{hal9001}, containing the results of -fitting the Highly Adaptive Lasso, as produced by a call to \code{fit_hal}.} +fitting the Highly Adaptive Lasso, as produced by \code{\link{fit_hal}}.} \item{offset}{A vector of offsets. Must be provided if provided at training} \item{lambda}{A single lambda value or a vector of lambdas to use for prediction. 
If \code{NULL}, a value of lambda will be selected based on -cross-validation, using \code{\link[glmnet]{cv.glmnet}}.} +cross-validation, using \code{\link[glmnet]{cv.glmnet}}. NOTE that this +does NOT provide similar functionality to the equivalent argument in the +\pkg{glmnet} method of \code{\link[stats]{predict}}; rather, this argument +is used internally to screen lambdas (see \code{screen_lambda} argument of +\code{\link{fit_hal}} for details).} \item{...}{Additional arguments passed to \code{predict} as necessary.} @@ -46,3 +50,12 @@ Method for computing and extracting predictions from fits of the Highly Adaptive Lasso estimator, returned as a single S3 objects of class \code{hal9001}. } +\note{ +This prediction method does not function similarly to the equivalent + method from \pkg{glmnet}. In particular, specifying argument \code{lambda} + will not return a subset of the lambdas originally specified in the call to + \code{\link{fit_hal}} nor result in re-fitting. This prediction method will + return predictions for all lambdas specified in \code{\link{fit_hal}} when + \code{cv_select = FALSE}. When \code{cv_select = TRUE}, predictions will + only be returned for the value of lambda selected by cross-validation. 
+} diff --git a/tests/testthat/test-hal_screen2.R b/tests/testthat/test-hal_screen2.R index 2cfcdb0e..4b50743c 100644 --- a/tests/testthat/test-hal_screen2.R +++ b/tests/testthat/test-hal_screen2.R @@ -1,64 +1,64 @@ -devtools::uses_testthat() - context("Unit test for HAL screening procedure") - -library(hal9001) library(glmnet) set.seed(749125) n <- 100 p <- 5 x <- xmat <- matrix(rnorm(n * p), n, p) -y <- 10*x[, 1]+ 5*x[,2]+ 6*x[,1]*x[,2] + rnorm(n, mean = 0, sd = 0.2) +y <- 10 * x[, 1] + 5 * x[, 2] + 6 * x[, 1] * x[, 2] + + rnorm(n, mean = 0, sd = 0.2) testn <- 10000 testx <- xmat <- matrix(rnorm(testn * p), testn, p) -testy <- 10*testx[, 1]+ 5*testx[,2]+ 6*testx[,1]*testx[,2] + rnorm(n, mean = 0, sd = 0.2) +testy <- 10 * testx[, 1] + 5 * testx[, 2] + 6 * testx[, 1] * testx[, 2] + + rnorm(n, mean = 0, sd = 0.2) select_list <- 2 -select_rank1 <- hal_screen_rank(x, y, k=1, family = 'gaussian') +select_rank1 <- hal_screen_rank(x, y, k = 1, family = "gaussian") test_that("Rank function works properly with k(k!=NULL)", { - expect_equal(select_list, select_rank1)#k=length(select_list), equal + expect_equal(select_list, select_rank1) # k=length(select_list), equal }) -select_list <- c(2,3) -select_rank2 <- hal_screen_rank(x, y, family = 'gaussian') +select_list <- c(2, 3) +select_rank2 <- hal_screen_rank(x, y, family = "gaussian") test_that("Rank function works properly without k", { - expect_equal(select_list, select_rank2)#k=NULL, equal + expect_equal(select_list, select_rank2) # k=NULL, equal }) # x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction # x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists -x_basis_lists <- list(1,2,c(1,2)) -goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = NULL, family = 'gaussian') +x_basis_lists <- list(1, 2, c(1, 2)) +goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = NULL, + family = "gaussian") 
test_that("Goodbasis function works properly with interaction", { - x_basis_str <- lapply(x_basis_lists, paste, collapse=",") - goodbasis_str <- lapply(goodbasis, paste, collapse=",") - expect_setequal(x_basis_str, goodbasis_str)#when k=6, they must be equal, all columns would be selected + x_basis_str <- lapply(x_basis_lists, paste, collapse = ",") + goodbasis_str <- lapply(goodbasis, paste, collapse = ",") + # when k=6, they must be equal, all columns would be selected + expect_setequal(x_basis_str, goodbasis_str) }) -# +# # x_basis<-matrix(nrow = n, ncol = 1) -# +# # basis_list <- c() # for (i in seq_along(x_basis_lists)) { # col_list <- x_basis_lists[[i]] # basis_list <- c(basis_list,basis_list_cols(col_list, x)) -# +# # } -# +# # x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions -# +# # test_x_basis <- make_design_matrix(testx, basis_list) -hal_with_screening <- fit_hal(x,y,screen_basis=TRUE) -hal_without_screening <- fit_hal(x,y,screen_basis=FALSE) +hal_with_screening <- fit_hal(x, y, screen_basis = TRUE) +hal_without_screening <- fit_hal(x, y, screen_basis = FALSE) -preds <- predict(hal_with_screening, new_data=testx) +preds <- predict(hal_with_screening, new_data = testx) mse_w_screening <- mean((preds - testy)^2) -preds <- predict(hal_without_screening, new_data=testx) +preds <- predict(hal_without_screening, new_data = testx) mse_wo_screening <- mean((preds - testy)^2) hal_with_screening$times @@ -66,13 +66,11 @@ hal_without_screening$times test_that("screening makes things faster", { - with_time <- hal_with_screening$times["total","elapsed"] - wo_time <- hal_without_screening$times["total","elapsed"] - expect_lt(with_time,wo_time) + with_time <- hal_with_screening$times["total", "elapsed"] + wo_time <- hal_without_screening$times["total", "elapsed"] + expect_lt(with_time, wo_time) }) test_that("screening doesn't hurt mse too much", { - - expect_lt(mse_w_screening,mse_wo_screening*1.2) + expect_lt(mse_w_screening, mse_wo_screening 
* 1.2) }) - diff --git a/tests/testthat/test-stat_performance.R b/tests/testthat/test-stat_performance.R index 27bf95f6..66b0d30b 100644 --- a/tests/testthat/test-stat_performance.R +++ b/tests/testthat/test-stat_performance.R @@ -1,4 +1,5 @@ context("Verify Statistical Performance") +library(glmnet) # generate training and test data # adapted from https://github.com/tlverse/hal9001/issues/9 @@ -70,8 +71,10 @@ X <- as.matrix(X) # test <- hal_screen_basis(X, Y,family="gaussian", verbose=TRUE, main_terms = FALSE) halres9001 <- fit_hal( Y = Y, X = X, - yolo = FALSE, screen_basis = TRUE, - screen_lambda = TRUE + yolo = FALSE, + screen_basis = TRUE + # NOTE: hal_screen_goodbasis is broken + #screen_lambda = TRUE ) pred9001 <- predict(halres9001, new_data = testX) @@ -92,13 +95,11 @@ x_basis <- hal9001:::make_design_matrix(X, basis_list) copy_map <- hal9001:::make_copy_map(x_basis) unique_columns <- as.numeric(names(copy_map)) x_basis <- x_basis[, unique_columns] - nbasis9001 <- ncol(x_basis) set.seed(1234) # attempt to control randomness in cv.glmnet fold generation # try to match hal param -library(glmnet) hal_lasso <- glmnet::cv.glmnet( x = x_basis, y = Y, nlambda = 100, lambda.min.ratio = 0.001, nfolds = 10, From c0efccfffcda131e9a05447a85068ed05c701ae7 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Sun, 5 Apr 2020 17:02:55 -0700 Subject: [PATCH 08/19] run CI on devel --- .travis.yml | 1 + appveyor.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index ec02a3dc..1df0b2ec 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ branches: only: - master + - devel env: global: diff --git a/appveyor.yml b/appveyor.yml index c1fa65aa..b1b77681 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,6 +16,7 @@ cache: branches: only: - master + - devel environment: global: From 1e33946066f00d502067b77af32981aeb32c3503 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Mon, 6 Apr 2020 18:35:46 -0700 Subject: [PATCH 09/19] make pr --- R/hal.R | 
42 +++-- R/predict.R | 24 +-- R/screening.R | 3 +- R/{hal_screen2.R => screening_experimental.R} | 22 ++- README.md | 10 +- docs/404.html | 32 ++-- docs/CONTRIBUTING.html | 54 +++--- docs/LICENSE-text.html | 32 ++-- docs/articles/index.html | 33 ++-- docs/articles/intro_hal9001.html | 144 ++++++++-------- docs/authors.html | 40 +++-- docs/bootstrap-toc.css | 60 +++++++ docs/bootstrap-toc.js | 159 ++++++++++++++++++ docs/index.html | 119 ++++++------- docs/news/index.html | 41 ++--- docs/pkgdown.css | 121 ++++++++++++- docs/pkgdown.js | 5 - docs/pkgdown.yml | 3 +- docs/reference/SL.hal9001.html | 38 ++--- docs/reference/apply_copy_map.html | 39 +++-- docs/reference/as_dgCMatrix.html | 38 ++--- docs/reference/basis_list_cols.html | 38 ++--- docs/reference/basis_of_degree.html | 38 ++--- docs/reference/cv_lasso.html | 37 ++-- docs/reference/cv_lasso_early_stopping.html | 37 ++-- docs/reference/enumerate_basis.html | 39 +++-- docs/reference/evaluate_basis.html | 37 ++-- docs/reference/fit_hal.html | 40 +++-- docs/reference/hal9000.html | 36 ++-- docs/reference/hal9001.html | 36 ++-- docs/reference/hal_quotes.html | 37 ++-- docs/reference/index.html | 40 +++-- docs/reference/index_first_copy.html | 37 ++-- docs/reference/lassi.html | 37 ++-- docs/reference/lassi_fit_module.html | 36 ++-- docs/reference/lassi_origami.html | 37 ++-- docs/reference/make_basis_list.html | 38 ++--- docs/reference/make_copy_map.html | 39 +++-- docs/reference/make_design_matrix.html | 39 +++-- docs/reference/make_reduced_basis_map.html | 38 ++--- docs/reference/meets_basis.html | 37 ++-- docs/reference/predict.SL.hal9001.html | 38 ++--- docs/reference/predict.hal9001.html | 69 ++++---- docs/reference/predict.lassi.html | 37 ++-- docs/reference/screening.html | 37 ++-- docs/reference/squash_hal_fit.html | 39 +++-- man/predict.hal9001.Rd | 33 +--- tests/testthat/test-hal_screen2.R | 76 --------- tests/testthat/test-screen_experimental.R | 81 +++++++++ tests/testthat/test-single_lambda.R | 3 +- 
tests/testthat/test-stat_performance.R | 6 +- 51 files changed, 1293 insertions(+), 938 deletions(-) rename R/{hal_screen2.R => screening_experimental.R} (91%) create mode 100644 docs/bootstrap-toc.css create mode 100644 docs/bootstrap-toc.js delete mode 100644 tests/testthat/test-hal_screen2.R create mode 100644 tests/testthat/test-screen_experimental.R diff --git a/R/hal.R b/R/hal.R index 391d503f..be137297 100644 --- a/R/hal.R +++ b/R/hal.R @@ -123,20 +123,35 @@ fit_hal <- function(X, # catch dot arguments to stop misuse of glmnet's `lambda.min.ratio` dot_args <- list(...) - if ("lambda.min.ratio" %in% names(dot_args) & family == "binomial") { - msg <- paste( - "`glmnet` silently ignores `lambda.min.ratio` when", - "`family = 'binomial'`." - ) - stop(msg) - } + assertthat::assert_that(!("lambda.min.ratio" %in% names(dot_args) & + family == "binomial"), + msg = paste( + "`glmnet` silently ignores", + "`lambda.min.ratio` when", + "`family = 'binomial'`." + ) + ) # NOTE: NOT supporting binomial outcomes with lassi method currently - if (fit_type == "lassi" && family == "binomial") { - stop("For binary outcomes, please set argument 'fit_type' to 'glmnet'.") + assertthat::assert_that(!(fit_type == "lassi" && family == "binomial"), + msg = paste( + "For binary outcomes, please set", + "argument 'fit_type' to 'glmnet'." + ) + ) + assertthat::assert_that(!(fit_type == "lassi" && family == "cox"), + msg = paste( + "For Cox models, please set argument", + "'fit_type' to 'glmnet'." 
+ ) + ) + + # warn about screening functionality + if (screen_basis) { + warning("Basis screening functionality is currently experimental.") } - if (fit_type == "lassi" && family == "cox") { - stop("For Cox models, please set argument 'fit_type' to 'glmnet'.") + if (screen_lambda) { + warning("Lambda screening functionality is currently experimental.") } # cast X to matrix -- and don't start the timer until after @@ -165,8 +180,9 @@ fit_hal <- function(X, if (is.null(basis_list)) { if (screen_basis) { selected_cols <- hal_screen_goodbasis(X, Y, - actual_max_degree = max_degree, - k = NULL, family = "gaussian") + actual_max_degree = max_degree, + k = NULL, family = "gaussian" + ) basis_list <- c() for (i in seq_along(selected_cols)) { col_list <- selected_cols[[i]] diff --git a/R/predict.R b/R/predict.R index f2001656..0bf89f3e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -7,13 +7,6 @@ #' @param object An object of class \code{hal9001}, containing the results of #' fitting the Highly Adaptive Lasso, as produced by \code{\link{fit_hal}}. #' @param offset A vector of offsets. Must be provided if provided at training -#' @param lambda A single lambda value or a vector of lambdas to use for -#' prediction. If \code{NULL}, a value of lambda will be selected based on -#' cross-validation, using \code{\link[glmnet]{cv.glmnet}}. NOTE that this -#' does NOT provide similar functionality to the equivalent argument in the -#' \pkg{glmnet} method of \code{\link[stats]{predict}}; rather, this argument -#' is used internally to screen lambdas (see \code{screen_lambda} argument of -#' \code{\link{fit_hal}} for details). #' @param ... Additional arguments passed to \code{predict} as necessary. 
#' @param new_data A \code{matrix} or \code{data.frame} containing new data #' (observations NOT used in fitting the \code{hal9001} object passed in via @@ -30,18 +23,17 @@ #' @export #' #' @note This prediction method does not function similarly to the equivalent -#' method from \pkg{glmnet}. In particular, specifying argument \code{lambda} -#' will not return a subset of the lambdas originally specified in the call to -#' \code{\link{fit_hal}} nor result in re-fitting. This prediction method will -#' return predictions for all lambdas specified in \code{\link{fit_hal}} when -#' \code{cv_select = FALSE}. When \code{cv_select = TRUE}, predictions will -#' only be returned for the value of lambda selected by cross-validation. +#' method from \pkg{glmnet}. In particular, this procedure will NOT return a +#' subset of lambdas originally specified in calling \code{\link{fit_hal}} +#' nor result in re-fitting. Instead, it will return predictions for all of +#' the lambdas specified in the call to \code{\link{fit_hal}} that constructs +#' \code{object}, when \code{cv_select = FALSE}. When \code{cv_select = TRUE}, +#' predictions will only be returned for the value of lambda selected by +#' cross-validation. #' -#' @return A \code{numeric} vector of predictions from a fitted \code{hal9001} -#' object. +#' @return A \code{numeric} vector of predictions from a \code{hal9001} object.
predict.hal9001 <- function(object, offset = NULL, - lambda = NULL, ..., new_data, new_X_unpenalized = NULL) { diff --git a/R/screening.R b/R/screening.R index 1be13cdf..c73157d7 100644 --- a/R/screening.R +++ b/R/screening.R @@ -60,7 +60,8 @@ hal_screen_cols <- function(x, y, V = 5, family, col_lists = NULL, # TODO: subsample param # subsample_size <- min(max(100, n * 0.1), length(basis_list)) - # basis_subsample <- sort(sample(seq_along(basis_list), subsample_size, replace = FALSE)) + # basis_subsample <- sort(sample(seq_along(basis_list), subsample_size, + # replace = FALSE)) basis_subsample <- seq_along(basis_list) x_basis <- make_design_matrix(x, basis_list[basis_subsample]) diff --git a/R/hal_screen2.R b/R/screening_experimental.R similarity index 91% rename from R/hal_screen2.R rename to R/screening_experimental.R index 6e413469..d2a51ac2 100644 --- a/R/hal_screen2.R +++ b/R/screening_experimental.R @@ -19,8 +19,10 @@ hal_screen_rank <- function(x, y, family, k = NULL, foldid = NULL, if (is.null(offset)) { offset <- rep(mean(y), n) } - rank_basis <- cv.glmnet(x, y, family = family, foldid = foldid, - offset = offset) + rank_basis <- cv.glmnet(x, y, + family = family, foldid = foldid, + offset = offset + ) if (!is.null(k)) { coef_mat <- coef(rank_basis, rank_basis$lambda) @@ -75,7 +77,7 @@ hal_screen_goodbasis <- function(x, y, actual_max_degree, k = NULL, family, } } # get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] 
x_basis_lists <- as.list(matrix(0, ncol = length(col_lists) + - length(interaction_col_lists))) + length(interaction_col_lists))) for (i in 1:length(x_basis_lists)) { if (i <= length(col_lists)) { x_basis_lists[[i]] <- col_lists[[i]] @@ -144,16 +146,20 @@ hal_screen_output <- function(x, y, family, col_lists, foldid = NULL, # generate k*n basis functions x_basis <- make_design_matrix(x, basis_list) # do regular lasso for k*n basis functions - screen_goodcols <- cv.glmnet(x_basis, y, family = family, offset = offset, - foldid = foldid) + screen_goodcols <- cv.glmnet(x_basis, y, + family = family, offset = offset, + foldid = foldid + ) lambda_min <- screen_goodcols$lambda.min lambda_1se <- screen_goodcols$lambda.1se - coef <- coef.cv.glmnet(screen_goodcols, s = "lambda.1se") + coef <- stats::coef(screen_goodcols, s = "lambda.1se") coef_list <- list(which(!coef[-1] == 0)) # find non-zero column lists - pred <- predict(screen_goodcols, newx = x_basis, s = lambda_1se, - newoffset = offset) + pred <- predict(screen_goodcols, + newx = x_basis, s = lambda_1se, + newoffset = offset + ) mse <- mean((pred - y)^2) col_result <- list( diff --git a/README.md b/README.md index 7ffee9b3..8f1cedb4 100644 --- a/README.md +++ b/README.md @@ -98,12 +98,12 @@ hal_fit <- fit_hal(X = x, Y = y) #> [1] "I'm sorry, Dave. I'm afraid I can't do that." 
hal_fit$times #> user.self sys.self elapsed user.child sys.child -#> enumerate_basis 0.005 0.000 0.006 0 0 -#> design_matrix 0.005 0.000 0.005 0 0 -#> remove_duplicates 0.017 0.000 0.017 0 0 +#> enumerate_basis 0.001 0.000 0.002 0 0 +#> design_matrix 0.002 0.000 0.001 0 0 +#> remove_duplicates 0.005 0.000 0.005 0 0 #> reduce_basis 0.000 0.000 0.000 0 0 -#> lasso 0.261 0.005 0.270 0 0 -#> total 0.288 0.005 0.298 0 0 +#> lasso 0.275 0.007 0.284 0 0 +#> total 0.283 0.007 0.292 0 0 # training sample prediction preds <- predict(hal_fit, new_data = x) diff --git a/docs/404.html b/docs/404.html index de05a43f..58baf3f5 100644 --- a/docs/404.html +++ b/docs/404.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
@@ -123,6 +127,12 @@

Page not found (404)

+ + @@ -133,7 +143,7 @@

Page not found (404)

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.0.

diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index f4f90f1d..ab078250 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
@@ -132,19 +136,19 @@

Issues

When filing an issue, the most important thing is to include a minimal reproducible example so that we can quickly verify the problem, and then figure out how to fix it. There are three things you need to include to make your example reproducible: required packages, data, code.

    -
  1. Packages should be loaded at the top of the script, so it’s easy to see which ones the example needs.

  2. +
  3. Packages should be loaded at the top of the script, so it’s easy to see which ones the example needs.

  4. The easiest way to include data is to use dput() to generate the R code to recreate it.

  5. Spend a little bit of time ensuring that your code is easy for others to read:

      -
    • make sure you’ve used spaces and your variable names are concise, but informative

    • +
    • make sure you’ve used spaces and your variable names are concise, but informative

    • use comments to indicate where your problem lies

    • do your best to remove everything that is not related to the problem. The shorter your code is, the easier it is to understand.

You can check you have actually made a reproducible example by starting up a fresh R session and pasting your script in.

-

(Unless you’ve been specifically asked for it, please don’t include the output of sessionInfo().)

+

(Unless you’ve been specifically asked for it, please don’t include the output of sessionInfo().)

@@ -154,30 +158,36 @@

  • Create a branch in git and make your changes.
  • Push branch to GitHub and issue pull request (PR).
  • Discuss the pull request.
  • -
  • Iterate until either we accept the PR or decide that it’s not a good fit for hal9001.
  • +
  • Iterate until either we accept the PR or decide that it’s not a good fit for hal9001.
  • Each of these steps is described in more detail below. This might feel overwhelming the first time you get set up, but it gets easier with practice.

    -

    If you’re not familiar with git or GitHub, please start by reading http://r-pkgs.had.co.nz/git.html

    +

    If you’re not familiar with git or GitHub, please start by reading http://r-pkgs.had.co.nz/git.html

    Pull requests will be evaluated against a checklist:

    1. Motivation. Your pull request should clearly and concisely motivate the need for change. Please describe the problem your PR addresses and show how your pull request solves it as concisely as possible.
    -

    Also include this motivation in NEWS so that when a new release of hal9001 comes out it’s easy for users to see what’s changed. Add your item at the top of the file and use markdown for formatting. The news item should end with (@yourGithubUsername, #the_issue_number).

    +

    Also include this motivation in NEWS so that when a new release of hal9001 comes out it’s easy for users to see what’s changed. Add your item at the top of the file and use markdown for formatting. The news item should end with (@yourGithubUsername, #the_issue_number).

    1. -

      Only related changes. Before you submit your pull request, please check to make sure that you haven’t accidentally included any unrelated changes. These make it harder to see exactly what’s changed, and to evaluate any unexpected side effects.

      -

      Each PR corresponds to a git branch, so if you expect to submit multiple changes make sure to create multiple branches. If you have multiple changes that depend on each other, start with the first one and don’t submit any others until the first one has been processed.

      +

      Only related changes. Before you submit your pull request, please check to make sure that you haven’t accidentally included any unrelated changes. These make it harder to see exactly what’s changed, and to evaluate any unexpected side effects.

      +

      Each PR corresponds to a git branch, so if you expect to submit multiple changes make sure to create multiple branches. If you have multiple changes that depend on each other, start with the first one and don’t submit any others until the first one has been processed.

    2. -
    3. Use hal9001 coding style. To do so, please follow the official tidyverse style guide. Maintaining a consistent style across the whole code base makes it much easier to jump into the code. If you’re modifying existing hal9001 code that doesn’t follow the style guide, a separate pull request to fix the style would be greatly appreciated.

    4. -
    5. If you’re adding new parameters or a new function, you’ll also need to document them with roxygen2. Make sure to re-run devtools::document() on the code before submitting.

    6. +
    7. Use hal9001 coding style. To do so, please follow the official tidyverse style guide. Maintaining a consistent style across the whole code base makes it much easier to jump into the code. If you’re modifying existing hal9001 code that doesn’t follow the style guide, a separate pull request to fix the style would be greatly appreciated.

    8. +
    9. If you’re adding new parameters or a new function, you’ll also need to document them with roxygen2. Make sure to re-run devtools::document() on the code before submitting.

    -

    This seems like a lot of work but don’t worry if your pull request isn’t perfect. It’s a learning process. A pull request is a process, and unless you’ve submitted a few in the past it’s unlikely that your pull request will be accepted as is. Please don’t submit pull requests that change existing behaviour. Instead, think about how you can add a new feature in a minimally invasive way.

    +

    This seems like a lot of work but don’t worry if your pull request isn’t perfect. It’s a learning process. A pull request is a process, and unless you’ve submitted a few in the past it’s unlikely that your pull request will be accepted as is. Please don’t submit pull requests that change existing behaviour. Instead, think about how you can add a new feature in a minimally invasive way.

    + + @@ -188,7 +198,7 @@

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 0b823e59..9ba274f8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
    @@ -798,6 +802,12 @@

    License

    + + @@ -808,7 +818,7 @@

    License

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/articles/index.html b/docs/articles/index.html index b19b84a0..86aa8368 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
    @@ -123,9 +127,10 @@

    Articles

    All vignettes

    - +
    +
    Fitting the Highly Adaptive Lasso with `hal9001`
    +
    +
    @@ -137,7 +142,7 @@

    All vignettes

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/articles/intro_hal9001.html b/docs/articles/intro_hal9001.html index aa2eef9b..914afb80 100644 --- a/docs/articles/intro_hal9001.html +++ b/docs/articles/intro_hal9001.html @@ -6,13 +6,13 @@ Fitting the Highly Adaptive Lasso with `hal9001` • hal9001 - - - - + + + + + - - + @@ -148,17 +148,17 @@

    Fitting the model: glmnet

    HAL uses the popular glmnet R package for the lasso step:

    -
    hal_fit <- fit_hal(X = x, Y = y, fit_type = "glmnet")
    +
    hal_fit <- fit_hal(X = x, Y = y, fit_type = "glmnet")
    ## [1] "Without your space helmet, Dave. You're going to find that rather difficult."
    - +
    hal_fit$times
    ##                   user.self sys.self elapsed user.child sys.child
    -## enumerate_basis       0.016    0.000   0.016          0         0
    -## design_matrix         0.033    0.000   0.033          0         0
    -## remove_duplicates     0.010    0.000   0.010          0         0
    +## enumerate_basis       0.005    0.000   0.005          0         0
    +## design_matrix         0.021    0.000   0.022          0         0
    +## remove_duplicates     0.014    0.000   0.014          0         0
     ## reduce_basis          0.000    0.000   0.000          0         0
    -## lasso                 0.619    0.016   0.635          0         0
    -## total                 0.678    0.016   0.694          0         0
    - +## lasso 0.783 0.005 0.788 0 0 +## total 0.823 0.005 0.829 0 0 +
    hal_fit
    ## $call
     ## fit_hal(X = x, Y = y, fit_type = "glmnet")
     ## 
    @@ -16416,12 +16416,12 @@ 

    ## ## $times ## user.self sys.self elapsed user.child sys.child -## enumerate_basis 0.016 0.000 0.016 0 0 -## design_matrix 0.033 0.000 0.033 0 0 -## remove_duplicates 0.010 0.000 0.010 0 0 +## enumerate_basis 0.005 0.000 0.005 0 0 +## design_matrix 0.021 0.000 0.022 0 0 +## remove_duplicates 0.014 0.000 0.014 0 0 ## reduce_basis 0.000 0.000 0.000 0 0 -## lasso 0.619 0.016 0.635 0 0 -## total 0.678 0.016 0.694 0 0 +## lasso 0.783 0.005 0.788 0 0 +## total 0.823 0.005 0.829 0 0 ## ## $lambda_star ## [1] 0.01537728 @@ -16559,20 +16559,20 @@

    Reducing basis functions

    As described in Benkeser and van der Laan (2016), the HAL algorithm operates by first constructing a set of basis functions and subsequently fitting a Lasso model with this set of basis functions as the design matrix. Several approaches are considered for reducing this set of basis functions: 1. Removing duplicated basis functions (done by default in the fit_hal function), 2. Removing basis functions that correspond to only a small set of observations; a good rule of thumb is to scale with \(\frac{1}{\sqrt{n}}\).

    The second of these two options may be invoked by specifying the reduce_basis argument to the fit_hal function:

    -
    hal_fit_reduced <- fit_hal(X = x, Y = y, fit_type = "lassi",
    -                           reduce_basis = 1/sqrt(length(y)))
    +
    hal_fit_reduced <- fit_hal(X = x, Y = y, fit_type = "lassi",
    +                           reduce_basis = 1/sqrt(length(y)))
    ## [1] "Dave, although you took very thorough precautions in the pod against my hearing you, I could see your lips move."
    ## 'lassi' is experimental: fit_type='glmnet' is recommended in nearly all cases.
    - +
    hal_fit_reduced$times
    ##                   user.self sys.self elapsed user.child sys.child
    -## enumerate_basis       0.005        0   0.005          0         0
    -## design_matrix         0.020        0   0.020          0         0
    -## remove_duplicates     0.009        0   0.009          0         0
    +## enumerate_basis       0.005        0   0.006          0         0
    +## design_matrix         0.025        0   0.025          0         0
    +## remove_duplicates     0.010        0   0.010          0         0
     ## reduce_basis          0.007        0   0.007          0         0
    -## lasso                 1.975        0   1.977          0         0
    -## total                 2.009        0   2.011          0         0
    +## lasso 2.207 0 2.223 0 0 +## total 2.247 0 2.264 0 0

    In the above, all basis functions with fewer than 7.0710678% of observations meeting the criterion imposed are automatically removed prior to the Lasso step of fitting the HAL regression. The results appear below

    - +
    hal_fit_reduced
    ## $call
     ## fit_hal(X = x, Y = y, fit_type = "lassi", reduce_basis = 1/sqrt(length(y)))
     ## 
    @@ -32663,12 +32663,12 @@ 

    ## ## $times ## user.self sys.self elapsed user.child sys.child -## enumerate_basis 0.005 0 0.005 0 0 -## design_matrix 0.020 0 0.020 0 0 -## remove_duplicates 0.009 0 0.009 0 0 +## enumerate_basis 0.005 0 0.006 0 0 +## design_matrix 0.025 0 0.025 0 0 +## remove_duplicates 0.010 0 0.010 0 0 ## reduce_basis 0.007 0 0.007 0 0 -## lasso 1.975 0 1.977 0 0 -## total 2.009 0 2.011 0 0 +## lasso 2.207 0 2.223 0 0 +## total 2.247 0 2.264 0 0 ## ## $lambda_star ## [1] 0.01073374 @@ -33815,18 +33815,18 @@

    Obtaining model predictions

    - +
    # training sample prediction for HAL vs HAL9000
    +mse <- function(preds, y) {
    +    mean((preds - y)^2)
    +}
    +
    +preds_hal <- predict(object = hal_fit, new_data = x)
    +mse_hal <- mse(preds = preds_hal, y = y)
    +mse_hal
    ## [1] 0.02493478
    - +
    oob_hal <- predict(object = hal_fit, new_data = test_x)
    +oob_hal_mse <- mse(preds = oob_hal, y = test_y)
    +oob_hal_mse
    ## [1] 1.543119

    @@ -33845,19 +33845,11 @@

    - @@ -33868,7 +33860,7 @@

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/authors.html b/docs/authors.html index 9eb64cdd..dc3614f7 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
    @@ -122,13 +126,13 @@

    Citation

    Coyle JR, Hejazi NS, van der Laan MJ (2020). hal9001: The scalable highly adaptive lasso. -doi: 10.5281/zenodo.3558313, R package version 0.2.5, https://github.com/tlverse/hal9001. +doi: 10.5281/zenodo.3558313, R package version 0.2.6, https://github.com/tlverse/hal9001.

    @Manual{,
       title = {{hal9001}: The scalable highly adaptive lasso},
       author = {Jeremy R Coyle and Nima S Hejazi and Mark J {van der Laan}},
       year = {2020},
    -  note = {R package version 0.2.5},
    +  note = {R package version 0.2.6},
       doi = {10.5281/zenodo.3558313},
       url = {https://github.com/tlverse/hal9001},
     }
    @@ -139,15 +143,15 @@

    Authors

    • -

      Jeremy Coyle. Author, maintainer. ORCID +

      Jeremy Coyle. Author, maintainer.

    • -

      Nima Hejazi. Author. ORCID +

      Nima Hejazi. Author.

    • -

      David Benkeser. Contributor. ORCID +

      David Benkeser. Contributor.

    • @@ -155,11 +159,11 @@

      Authors

    • -

      Weixin Cai. Contributor. ORCID +

      Weixin Cai. Contributor.

    • -

      Mark van der Laan. Author, copyright holder, thesis advisor. ORCID +

      Mark van der Laan. Author, copyright holder, thesis advisor.

    @@ -176,7 +180,7 @@

    Authors

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 00000000..5a859415 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 
+nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 00000000..1cdd573b --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/index.html b/docs/index.html index 3aa71dd3..36b3372e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -6,10 +6,11 @@ The Scalable Highly Adaptive Lasso • hal9001 - - - - + + + + + - - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
    @@ -116,12 +120,12 @@
    -

    +

    hal9001 0.2.5 2020-03-05

      @@ -130,13 +134,10 @@

    - @@ -148,7 +149,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/pkgdown.css b/docs/pkgdown.css index 91459581..c01e5923 100644 --- a/docs/pkgdown.css +++ b/docs/pkgdown.css @@ -17,6 +17,10 @@ html, body { height: 100%; } +body { + position: relative; +} + body > .container { display: flex; height: 100%; @@ -67,6 +71,10 @@ summary { margin-top: calc(-60px + 1em); } +dd { + margin-left: 3em; +} + /* Section anchors ---------------------------------*/ a.anchor { @@ -100,29 +108,132 @@ a.anchor { margin-top: -40px; } +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + /* Sidebar --------------------------*/ -#sidebar { +#pkgdown-sidebar { margin-top: 30px; position: -webkit-sticky; position: sticky; top: 70px; } -#sidebar h2 { + +#pkgdown-sidebar h2 { font-size: 1.5em; margin-top: 1em; } -#sidebar h2:first-child { +#pkgdown-sidebar h2:first-child { margin-top: 0; } -#sidebar .list-unstyled li { +#pkgdown-sidebar .list-unstyled li { margin-bottom: 0.5em; } +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li 
> a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + .orcid { - height: 16px; + font-size: 16px; + color: #A6CE39; /* margins are required by official ORCID trademark and display guidelines */ margin-left:4px; margin-right:4px; diff --git a/docs/pkgdown.js b/docs/pkgdown.js index 087a7622..7e7048fa 100644 --- a/docs/pkgdown.js +++ b/docs/pkgdown.js @@ -9,11 +9,6 @@ $('body').css('padding-top', $('.navbar').height() + 10); }); - $('body').scrollspy({ - target: '#sidebar', - offset: 60 - }); - $('[data-toggle="tooltip"]').tooltip(); var cur_path = paths(location.pathname); diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 0b14c8fd..22598671 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,8 +1,9 @@ pandoc: 2.2.1 -pkgdown: 1.4.1 +pkgdown: 1.5.0 pkgdown_sha: ~ articles: intro_hal9001: intro_hal9001.html +last_built: 2020-04-06T23:07Z urls: reference: https://tlverse.org/hal9001/reference article: https://tlverse.org/hal9001/articles diff --git 
a/docs/reference/SL.hal9001.html b/docs/reference/SL.hal9001.html index 837b2611..46742bdf 100644 --- a/docs/reference/SL.hal9001.html +++ b/docs/reference/SL.hal9001.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -206,13 +209,10 @@

    Value

    object and corresponding predictions based on the input data.

    - @@ -223,7 +223,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/apply_copy_map.html b/docs/reference/apply_copy_map.html index 1270dbe2..fbf070e0 100644 --- a/docs/reference/apply_copy_map.html +++ b/docs/reference/apply_copy_map.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -171,14 +174,10 @@

    Examp x_basis_uniq <- apply_copy_map(x_basis, copy_map) # }

    - @@ -189,7 +188,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/as_dgCMatrix.html b/docs/reference/as_dgCMatrix.html index 861c4dcb..d0948c4a 100644 --- a/docs/reference/as_dgCMatrix.html +++ b/docs/reference/as_dgCMatrix.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -39,7 +43,6 @@ - @@ -67,7 +70,7 @@ - +
    @@ -148,13 +151,10 @@

    Value

    An object of class dgCMatrix, coerced from input XX_.

    - @@ -165,7 +165,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/basis_list_cols.html b/docs/reference/basis_list_cols.html index 015a88cf..f9a98adc 100644 --- a/docs/reference/basis_list_cols.html +++ b/docs/reference/basis_list_cols.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -152,13 +155,10 @@

    Value

    input columns.

    - @@ -169,7 +169,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/basis_of_degree.html b/docs/reference/basis_of_degree.html index 4bbc0bcf..62707c3f 100644 --- a/docs/reference/basis_of_degree.html +++ b/docs/reference/basis_of_degree.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -151,13 +154,10 @@

    Value

    a set of input columns up to a particular pre-specified degree.

    - @@ -168,7 +168,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/cv_lasso.html b/docs/reference/cv_lasso.html index 2ec283ef..2882e7bc 100644 --- a/docs/reference/cv_lasso.html +++ b/docs/reference/cv_lasso.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -164,12 +167,10 @@

    Arg

    - @@ -180,7 +181,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/cv_lasso_early_stopping.html b/docs/reference/cv_lasso_early_stopping.html index d276153a..996e9004 100644 --- a/docs/reference/cv_lasso_early_stopping.html +++ b/docs/reference/cv_lasso_early_stopping.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -159,12 +162,10 @@

    Arg

    - @@ -175,7 +176,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/enumerate_basis.html b/docs/reference/enumerate_basis.html index 89393528..8369441b 100644 --- a/docs/reference/enumerate_basis.html +++ b/docs/reference/enumerate_basis.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -172,14 +175,10 @@

    Examp basis_list <- enumerate_basis(X) # }

    - @@ -190,7 +189,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/evaluate_basis.html b/docs/reference/evaluate_basis.html index 9fe4cf13..67d6ac93 100644 --- a/docs/reference/evaluate_basis.html +++ b/docs/reference/evaluate_basis.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -152,12 +155,10 @@

    Arg

    - @@ -168,7 +169,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/fit_hal.html b/docs/reference/fit_hal.html index cb917b13..bd4e2e09 100644 --- a/docs/reference/fit_hal.html +++ b/docs/reference/fit_hal.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -309,15 +312,10 @@

    Examp preds <- predict(ml_hal_fit, new_data = x) # }

    - @@ -328,7 +326,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/hal9000.html b/docs/reference/hal9000.html index e55dc1c9..13925ad2 100644 --- a/docs/reference/hal9000.html +++ b/docs/reference/hal9000.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -132,11 +135,10 @@

    HAL 9000 Quotes

    - @@ -147,7 +149,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/hal9001.html b/docs/reference/hal9001.html index 82320096..ffc558ad 100644 --- a/docs/reference/hal9001.html +++ b/docs/reference/hal9001.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -131,11 +134,10 @@

    hal9001

    - @@ -146,7 +148,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/hal_quotes.html b/docs/reference/hal_quotes.html index 15e17320..7b38a97b 100644 --- a/docs/reference/hal_quotes.html +++ b/docs/reference/hal_quotes.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -137,12 +140,10 @@

    FormatA vector of quotes.

    - @@ -153,7 +154,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 52bd3f47..d8f1f313 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -63,7 +67,7 @@ - +
    @@ -134,6 +138,11 @@

    + + + + + @@ -277,11 +286,10 @@

    -

    Contents

    -
    +

    @@ -292,7 +300,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/index_first_copy.html b/docs/reference/index_first_copy.html index 668e1f55..4e1fdb0d 100644 --- a/docs/reference/index_first_copy.html +++ b/docs/reference/index_first_copy.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -142,12 +145,10 @@

    Arg

    - @@ -158,7 +159,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/lassi.html b/docs/reference/lassi.html index 1f2851a1..aec63281 100644 --- a/docs/reference/lassi.html +++ b/docs/reference/lassi.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -168,12 +171,10 @@

    Arg

    - @@ -184,7 +185,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/lassi_fit_module.html b/docs/reference/lassi_fit_module.html index 83542296..db09d179 100644 --- a/docs/reference/lassi_fit_module.html +++ b/docs/reference/lassi_fit_module.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -131,11 +134,10 @@

    Rcpp module: lassi_fit_module

    - @@ -146,7 +148,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/lassi_origami.html b/docs/reference/lassi_origami.html index 5e2da47e..16447845 100644 --- a/docs/reference/lassi_origami.html +++ b/docs/reference/lassi_origami.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -40,7 +44,6 @@ is meant to be called using cross_validate, which is done through cv_lasso. Note that this procedure is NOT meant to be invoked by itself. INTERNAL USE ONLY." /> - @@ -68,7 +71,7 @@ - +
    @@ -163,12 +166,10 @@

    Arg

    - @@ -179,7 +180,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/make_basis_list.html b/docs/reference/make_basis_list.html index d9297ff1..5e6054e8 100644 --- a/docs/reference/make_basis_list.html +++ b/docs/reference/make_basis_list.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -150,13 +153,10 @@

    Details equals cols.length() and each basis function is a list(cols, cutoffs).

    - @@ -167,7 +167,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/make_copy_map.html b/docs/reference/make_copy_map.html index af8c5e1f..e298fee3 100644 --- a/docs/reference/make_copy_map.html +++ b/docs/reference/make_copy_map.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -166,14 +169,10 @@

    Examp copy_map <- make_copy_map(x_basis) # }

    - @@ -184,7 +183,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/make_design_matrix.html b/docs/reference/make_design_matrix.html index ba722ea7..4f3b817e 100644 --- a/docs/reference/make_design_matrix.html +++ b/docs/reference/make_design_matrix.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -170,14 +173,10 @@

    Examp x_basis <- make_design_matrix(X, basis_list) # }

    - @@ -188,7 +187,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/make_reduced_basis_map.html b/docs/reference/make_reduced_basis_map.html index 873a4163..8afd594a 100644 --- a/docs/reference/make_reduced_basis_map.html +++ b/docs/reference/make_reduced_basis_map.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -39,7 +43,6 @@ - @@ -67,7 +70,7 @@ - +
    @@ -160,13 +163,10 @@

    Value

    a zero).

    - @@ -177,7 +177,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/meets_basis.html b/docs/reference/meets_basis.html index e2631ddb..71cbc82b 100644 --- a/docs/reference/meets_basis.html +++ b/docs/reference/meets_basis.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -38,7 +42,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -154,12 +157,10 @@

    Arg

    - @@ -170,7 +171,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/predict.SL.hal9001.html b/docs/reference/predict.SL.hal9001.html index 04aa0f0c..7544c240 100644 --- a/docs/reference/predict.SL.hal9001.html +++ b/docs/reference/predict.SL.hal9001.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -153,13 +156,10 @@

    Value

    object based on the provide newdata.

    - @@ -170,7 +170,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/predict.hal9001.html b/docs/reference/predict.hal9001.html index c5f86f5d..abdaafe0 100644 --- a/docs/reference/predict.hal9001.html +++ b/docs/reference/predict.hal9001.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -128,14 +131,7 @@

    Prediction from HAL fits

    # S3 method for hal9001
    -predict(
    -  object,
    -  offset = NULL,
    -  lambda = NULL,
    -  ...,
    -  new_data,
    -  new_X_unpenalized = NULL
    -)
    +predict(object, offset = NULL, ..., new_data, new_X_unpenalized = NULL)

    Arguments

    @@ -143,18 +139,12 @@

    Arg

    +fitting the Highly Adaptive Lasso, as produced by fit_hal.

    - - - - @@ -176,24 +166,29 @@

    Arg

    Value

    -

    A numeric vector of predictions from a fitted hal9001 - object.

    +

    A numeric vector of predictions from a hal9001 object.

    Details

    Method for computing and extracting predictions from fits of the Highly Adaptive Lasso estimator, returned as a single S3 objects of class hal9001.

    +

    Note

    - - + @@ -203,7 +198,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/predict.lassi.html b/docs/reference/predict.lassi.html index 85bb7c48..43e97b22 100644 --- a/docs/reference/predict.lassi.html +++ b/docs/reference/predict.lassi.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -149,12 +152,10 @@

    Arg

    - @@ -165,7 +166,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/screening.html b/docs/reference/screening.html index 7703fb94..69a3d5ea 100644 --- a/docs/reference/screening.html +++ b/docs/reference/screening.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -225,12 +228,10 @@

    Arg

    - @@ -241,7 +242,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/docs/reference/squash_hal_fit.html b/docs/reference/squash_hal_fit.html index 3048bcb3..4348290c 100644 --- a/docs/reference/squash_hal_fit.html +++ b/docs/reference/squash_hal_fit.html @@ -10,23 +10,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -37,7 +41,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -158,14 +161,10 @@

    Examp squashed <- squash_hal_fit(hal_fit) # }

    - @@ -176,7 +175,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.0.

    diff --git a/man/predict.hal9001.Rd b/man/predict.hal9001.Rd index 163242d5..a3f7d159 100644 --- a/man/predict.hal9001.Rd +++ b/man/predict.hal9001.Rd @@ -4,14 +4,7 @@ \alias{predict.hal9001} \title{Prediction from HAL fits} \usage{ -\method{predict}{hal9001}( - object, - offset = NULL, - lambda = NULL, - ..., - new_data, - new_X_unpenalized = NULL -) +\method{predict}{hal9001}(object, offset = NULL, ..., new_data, new_X_unpenalized = NULL) } \arguments{ \item{object}{An object of class \code{hal9001}, containing the results of @@ -19,14 +12,6 @@ fitting the Highly Adaptive Lasso, as produced by \code{\link{fit_hal}}.} \item{offset}{A vector of offsets. Must be provided if provided at training} -\item{lambda}{A single lambda value or a vector of lambdas to use for -prediction. If \code{NULL}, a value of lambda will be selected based on -cross-validation, using \code{\link[glmnet]{cv.glmnet}}. NOTE that this -does NOT provide similar functionality to the equivalent argument in the -\pkg{glmnet} method of \code{\link[stats]{predict}}; rather, this argument -is used internally to screen lambdas (see \code{screen_lambda} argument of -\code{\link{fit_hal}} for details).} - \item{...}{Additional arguments passed to \code{predict} as necessary.} \item{new_data}{A \code{matrix} or \code{data.frame} containing new data @@ -39,8 +24,7 @@ training, the user should also supply this matrix with the same number of observations as \code{new_data}. Optional.} } \value{ -A \code{numeric} vector of predictions from a fitted \code{hal9001} - object. +A \code{numeric} vector of predictions from a \code{hal9001} object. } \description{ Prediction from HAL fits @@ -52,10 +36,11 @@ Method for computing and extracting predictions from fits of the } \note{ This prediction method does not function similarly to the equivalent - method from \pkg{glmnet}. 
In particular, specifying argument \code{lambda} - will not return a subset of the lambdas originally specified in the call to - \code{\link{fit_hal}} nor result in re-fitting. This prediction method will - return predictions for all lambdas specified in \code{\link{fit_hal}} when - \code{cv_select = FALSE}. When \code{cv_select = TRUE}, predictions will - only be returned for the value of lambda selected by cross-validation. + method from \pkg{glmnet}. In particular, this procedure will NOT return a + subset of lambdas originally specified in calling \code{\link{fit_hal}} + nor result in re-fitting. Instead, it will return predictions for all of + the lambdas specified in the call to \code{\link{fit_hal}} that constructs + \code{object}, when \code{cv_select = FALSE}. When \code{cv_select = TRUE}, + predictions will only be returned for the value of lambda selected by + cross-validation. } diff --git a/tests/testthat/test-hal_screen2.R b/tests/testthat/test-hal_screen2.R deleted file mode 100644 index 4b50743c..00000000 --- a/tests/testthat/test-hal_screen2.R +++ /dev/null @@ -1,76 +0,0 @@ -context("Unit test for HAL screening procedure") -library(glmnet) -set.seed(749125) - -n <- 100 -p <- 5 -x <- xmat <- matrix(rnorm(n * p), n, p) -y <- 10 * x[, 1] + 5 * x[, 2] + 6 * x[, 1] * x[, 2] + - rnorm(n, mean = 0, sd = 0.2) - -testn <- 10000 -testx <- xmat <- matrix(rnorm(testn * p), testn, p) -testy <- 10 * testx[, 1] + 5 * testx[, 2] + 6 * testx[, 1] * testx[, 2] + - rnorm(n, mean = 0, sd = 0.2) - -select_list <- 2 -select_rank1 <- hal_screen_rank(x, y, k = 1, family = "gaussian") -test_that("Rank function works properly with k(k!=NULL)", { - expect_equal(select_list, select_rank1) # k=length(select_list), equal -}) - -select_list <- c(2, 3) -select_rank2 <- hal_screen_rank(x, y, family = "gaussian") - -test_that("Rank function works properly without k", { - expect_equal(select_list, select_rank2) # k=NULL, equal -}) - -# x_interaction_basis <- cbind(x, x[,1]*x[,2], 
x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction -# x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists -x_basis_lists <- list(1, 2, c(1, 2)) -goodbasis <- hal_screen_goodbasis(x, y, actual_max_degree = 2, k = NULL, - family = "gaussian") - - -test_that("Goodbasis function works properly with interaction", { - x_basis_str <- lapply(x_basis_lists, paste, collapse = ",") - goodbasis_str <- lapply(goodbasis, paste, collapse = ",") - # when k=6, they must be equal, all columns would be selected - expect_setequal(x_basis_str, goodbasis_str) -}) -# -# x_basis<-matrix(nrow = n, ncol = 1) -# -# basis_list <- c() -# for (i in seq_along(x_basis_lists)) { -# col_list <- x_basis_lists[[i]] -# basis_list <- c(basis_list,basis_list_cols(col_list, x)) -# -# } -# -# x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions -# -# test_x_basis <- make_design_matrix(testx, basis_list) - -hal_with_screening <- fit_hal(x, y, screen_basis = TRUE) -hal_without_screening <- fit_hal(x, y, screen_basis = FALSE) - -preds <- predict(hal_with_screening, new_data = testx) -mse_w_screening <- mean((preds - testy)^2) -preds <- predict(hal_without_screening, new_data = testx) -mse_wo_screening <- mean((preds - testy)^2) - -hal_with_screening$times -hal_without_screening$times - - -test_that("screening makes things faster", { - with_time <- hal_with_screening$times["total", "elapsed"] - wo_time <- hal_without_screening$times["total", "elapsed"] - expect_lt(with_time, wo_time) -}) - -test_that("screening doesn't hurt mse too much", { - expect_lt(mse_w_screening, mse_wo_screening * 1.2) -}) diff --git a/tests/testthat/test-screen_experimental.R b/tests/testthat/test-screen_experimental.R new file mode 100644 index 00000000..99e13cd1 --- /dev/null +++ b/tests/testthat/test-screen_experimental.R @@ -0,0 +1,81 @@ +# 06 April 2020 - test is failing, corresponding code needs review/re-haul +if (FALSE) { + context("Unit test for HAL screening 
procedure") + library(glmnet) + set.seed(749125) + + n <- 100 + p <- 5 + x <- xmat <- matrix(rnorm(n * p), n, p) + y <- 10 * x[, 1] + 5 * x[, 2] + 6 * x[, 1] * x[, 2] + + rnorm(n, mean = 0, sd = 0.2) + + testn <- 10000 + testx <- xmat <- matrix(rnorm(testn * p), testn, p) + testy <- 10 * testx[, 1] + 5 * testx[, 2] + 6 * testx[, 1] * testx[, 2] + + rnorm(n, mean = 0, sd = 0.2) + + select_list <- 2 + select_rank1 <- hal_screen_rank(x, y, k = 1, family = "gaussian") + test_that("Rank function works properly with k(k!=NULL)", { + expect_equal(select_list, select_rank1) # k=length(select_list), equal + }) + + select_list <- c(2, 3) + select_rank2 <- hal_screen_rank(x, y, family = "gaussian") + + test_that("Rank function works properly without k", { + expect_equal(select_list, select_rank2) # k=NULL, equal + }) + + # x_interaction_basis <- cbind(x, x[,1]*x[,2], x[,1]*x[,3], x[,2]*x[,3])# generate main terms and 2-way interaction + # x_basis_lists <- list(1, 2, 3, c(1,2), c(1,3), c(2,3))#generate the column lists + x_basis_lists <- list(1, 2, c(1, 2)) + goodbasis <- hal_screen_goodbasis(x, y, + actual_max_degree = 2, k = NULL, + family = "gaussian" + ) + + + test_that("Goodbasis function works properly with interaction", { + x_basis_str <- lapply(x_basis_lists, paste, collapse = ",") + goodbasis_str <- lapply(goodbasis, paste, collapse = ",") + # when k=6, they must be equal, all columns would be selected + expect_setequal(x_basis_str, goodbasis_str) + }) + # + # x_basis<-matrix(nrow = n, ncol = 1) + # + # basis_list <- c() + # for (i in seq_along(x_basis_lists)) { + # col_list <- x_basis_lists[[i]] + # basis_list <- c(basis_list,basis_list_cols(col_list, x)) + # + # } + # + # x_basis <- make_design_matrix(x, basis_list)#generate k*n basis functions + # + # test_x_basis <- make_design_matrix(testx, basis_list) + + hal_with_screening <- fit_hal(x, y, screen_basis = TRUE) + hal_without_screening <- fit_hal(x, y, screen_basis = FALSE) + + preds <- 
predict(hal_with_screening, new_data = testx) + mse_w_screening <- mean((preds - testy)^2) + preds <- predict(hal_without_screening, new_data = testx) + mse_wo_screening <- mean((preds - testy)^2) + + hal_with_screening$times + hal_without_screening$times + + + test_that("screening makes things faster", { + with_time <- hal_with_screening$times["total", "elapsed"] + wo_time <- hal_without_screening$times["total", "elapsed"] + expect_lt(with_time, wo_time) + }) + + test_that("screening doesn't hurt mse too much", { + expect_lt(mse_w_screening, mse_wo_screening * 1.2) + }) +} diff --git a/tests/testthat/test-single_lambda.R b/tests/testthat/test-single_lambda.R index 5d8638f5..e9a58ada 100644 --- a/tests/testthat/test-single_lambda.R +++ b/tests/testthat/test-single_lambda.R @@ -1,9 +1,8 @@ context("feed single lambda into hal9001 (glmnet version) will not error.") - set.seed(1234) n <- 100 x <- rnorm(n) -y <- as.numeric(plogis(2 * x + rnorm(n)) > .5) +y <- as.numeric(plogis(2 * x + rnorm(n)) > 0.5) wgt <- rep(1, n) # fit via call to glmnet::glmnet for a single value of lambda diff --git a/tests/testthat/test-stat_performance.R b/tests/testthat/test-stat_performance.R index 66b0d30b..81b6542f 100644 --- a/tests/testthat/test-stat_performance.R +++ b/tests/testthat/test-stat_performance.R @@ -71,10 +71,10 @@ X <- as.matrix(X) # test <- hal_screen_basis(X, Y,family="gaussian", verbose=TRUE, main_terms = FALSE) halres9001 <- fit_hal( Y = Y, X = X, - yolo = FALSE, - screen_basis = TRUE + yolo = FALSE # NOTE: hal_screen_goodbasis is broken - #screen_lambda = TRUE + # screen_basis = TRUE + # screen_lambda = TRUE ) pred9001 <- predict(halres9001, new_data = testX) From 7a561604e967d7a4ba933145f09b3ddfa94f4a39 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Wed, 8 Apr 2020 19:17:13 -0700 Subject: [PATCH 10/19] increase origami version as travis bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 
9c90939a..1e2969d3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,7 @@ Imports: utils, methods, assertthat, - origami (>= 0.8.1), + origami (>= 1.0.3), glmnet Suggests: testthat, From 96dda58b6bf6b58a16143624e8d45f436ec14d44 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Sat, 11 Apr 2020 14:03:29 -0700 Subject: [PATCH 11/19] readme tweak --- README.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.Rmd b/README.Rmd index 35e86b0b..a7b2a182 100644 --- a/README.Rmd +++ b/README.Rmd @@ -92,7 +92,7 @@ set.seed(385971) # simulate data n <- 100 p <- 3 -x <- xmat <- matrix(rnorm(n * p), n, p) +x <- matrix(rnorm(n * p), n, p) y <- x[, 1] * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) # fit the HAL regression From 58b75cef29a8b72a39db0895988fc89e22e2d8ce Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 9 Jun 2020 16:47:13 -0700 Subject: [PATCH 12/19] init paper --- paper/paper.md | 99 +++++++++++++++++++++++++++++++++++++++++++++ paper/refs.bib | 107 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 paper/paper.md create mode 100644 paper/refs.bib diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..5345a35a --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,99 @@ +--- +title: "`hal9001`: Scalable estimation with the highly adaptive lasso in `R`" +tags: + - machine learning + - targeted learning + - causal inference + - R +authors: + - name: Nima S. Hejazi + orcid: 0000-0002-7127-2789 + affiliation: 1, 4 + - name: Jeremy R. Coyle + orcid: 0000-0002-9874-6649 + affiliation: 2 + - name: Mark J. 
van der Laan + orcid: 0000-0003-1432-5511 + affiliation: 2, 3, 4 +affiliations: + - name: Graduate Group in Biostatistics, University of California, Berkeley + index: 1 + - name: Division of Epidemiology & Biostatistics, School of Public Health, University of California, Berkeley + index: 2 + - name: Department of Statistics, University of California, Berkeley + index: 3 + - name: Center for Computational Biology, University of California, Berkeley + index: 4 +date: 20 July 2020 +bibliography: refs.bib +--- + +# Summary + +A central problem in statistical learning theory and machine learning is the +development of robust prediction functions, both in terms of learning complex +functional forms and in the efficient estimation of low-dimensional +function(al)s of possibly complex data-generating processes. + + +Causal inference has traditionally focused on the effects of static +interventions, under which the magnitude of the treatment is set to a fixed, +prespecified value for each unit. The evaluation of such interventions faces +a host of issues, among them non-identification, violations of the assumption of +positivity, and inefficiency. Stochastic interventions provide a promising +solution to these fundamental issues by allowing for the target parameter to be +defined as the mean counterfactual outcome under a hypothetically shifted +version of the observed exposure distribution [@diaz2012population]. +Modified treatment policies, a particular class of such interventions, may be +interpreted as shifting the natural exposure level at the level of a given +observational unit [@haneuse2013estimation;@diaz2018stochastic]. + +Despite the promise of such advances in causal inference, real data analyses are +often further complicated by economic constraints, such as when the primary +variable of interest is far more expensive to collect than auxiliary covariates. 
+Two-phase sampling schemes are often used to bypass such limitations -- +unfortunately, their use produces side effects that require further adjustment +when formal statistical inference is the principal goal of a study. Among the +rich literature on two-phase designs, @rose2011targeted2sd stand out for +providing a study of nonparametric efficiency theory under such designs. Their +work can be used to construct efficient estimators of causal effects under +general two-phase sampling designs. + +Building on these prior works, @hejazi2020efficient outlined a novel approach +for use in such settings: augmented targeted minimum loss (TML) and one-step +estimators for the causal effects of stochastic interventions, with guarantees +of consistency, efficiency, and multiple robustness even in the presence of +two-phase sampling. These authors further outlined a technique that summarizes +the effect of shifting an exposure variable on the outcome of interest via +a nonparametric working marginal structural model, analogous to a dose-response +analysis. The `txshift` software package, for the `R` language and environment +for statistical computing [@R], implements this methodology. + + +`hal9001` is a scalable implementation of the highly adaptive lasso, built on +top of the extremely popular `glmnet` `R` package [@friedman2009glmnet]. The +`hal9001` `R` package includes tools + +for deploying these efficient estimators under two-phase +sampling designs, with two types of corrections: (1) a reweighting procedure +that introduces inverse probability of censoring weights directly into an +appropriate loss function, as discussed in @rose2011targeted2sd; as +well as (2) a correction based on the efficient influence function, studied more +thoroughly by @hejazi2020efficient. `txshift` +integrates with the [`sl3` package](https://github.com/tlverse/sl3) +[@coyle2020sl3] to allow for ensemble machine learning to be leveraged in the +estimation of nuisance parameters. 
What's more, the `txshift` package draws on +both the `hal9001` and `haldensify` `R` packages [@coyle2019hal9001; +@hejazi2020haldensify] to allow each of the estimators to be constructed in +a manner consistent with the theoretical results of @hejazi2020efficient. The +`txshift` package has been made publicly available via GitHub and will be +submitted to the Comprehensive `R` Archive Network in the near future. + +# Acknowledgments + +Nima Hejazi's contributions to this work were supported in part by a grant from +the National Institutes of Health: [T32 +LM012417-02](https://projectreporter.nih.gov/project_info_description.cfm?aid=9248418&icde=37849831&ddparam=&ddvalue=&ddsub=&cr=1&csb=default&cs=ASC&pball=). + +# References + diff --git a/paper/refs.bib b/paper/refs.bib new file mode 100644 index 00000000..8e4d6266 --- /dev/null +++ b/paper/refs.bib @@ -0,0 +1,107 @@ +@inproceedings{benkeser2016hal, + doi = {10.1109/dsaa.2016.93}, + url = {https://doi.org/10.1109/dsaa.2016.93}, + year = {2016}, + publisher = {{IEEE}}, + author = {Benkeser, David and {van der Laan}, Mark J}, + title = {The Highly Adaptive Lasso Estimator}, + booktitle = {2016 {IEEE} International Conference on Data Science and Advanced + Analytics ({DSAA})} +} + +@article{vdl2015generally, + author = {{van der Laan}, Mark J}, + publisher = {bepress}, + title = {A generally efficient targeted minimum loss based estimator}, + year = {2015} +} + +@article{vdl2017generally, + doi = {10.1515/ijb-2015-0097}, + url = {https://doi.org/10.1515/ijb-2015-0097}, + title = {A Generally Efficient Targeted Minimum Loss Based Estimator based on + the {Highly Adaptive Lasso}}, + author = {{van der Laan}, Mark J}, + journal = {The International Journal of Biostatistics}, + year = {2017}, + publisher = {De Gruyter} +} + +@article{vdl2017finite, + title = {Finite sample inference for {Targeted Learning}}, + author = {{van der Laan}, Mark J}, + journal = {ArXiv e-prints}, + archivePrefix = "arXiv", + eprint = 
{1708.09502}, + primaryClass = "math.ST", + keywords = {Mathematics - Statistics Theory}, + year = {2017} +} + +@article{bibaut2019fast, + author = {Bibaut, Aur{\'e}lien F and {van der Laan}, Mark J}, + journal = {arXiv preprint arXiv:1907.09244}, + title = {Fast rates for empirical risk minimization over + c\`{a}dl\`{a}g functions with bounded sectional variation norm}, + year = {2019} +} + +@article{bang2005doubly, + Author = {Bang, Heejung and Robins, James M}, + Journal = {Biometrics}, + Number = {4}, + Pages = {962--973}, + Publisher = {Wiley Online Library}, + Title = {Doubly robust estimation in missing data and causal inference + models}, + Volume = {61}, + Year = {2005} +} + +@article{vdl2019efficient, + Author = {{van der Laan}, Mark J and Benkeser, David and Cai, Weixin}, + Journal = {arXiv preprint arXiv:1908.05607}, + Title = {Efficient estimation of pathwise differentiable target parameters + with the undersmoothed highly adaptive lasso}, + Year = {2019} +} + +@article{vdl2017uniform, + Author = {{van der Laan}, Mark J and Bibaut, Aur{\'e}lien F}, + Journal = {arXiv preprint arXiv:1709.06256}, + Title = {Uniform Consistency of the Highly Adaptive Lasso Estimator of + Infinite-Dimensional Parameters}, + Year = {2017} +} + +@article{ertefaie2020nonparametric, + doi = {}, + url = {http://arxiv.org/abs/2005.11303}, + year = {2020}, + publisher = {}, + journal = {}, + volume = {}, + number = {}, + pages = {}, + author = {Ertefaie, Ashkan and Hejazi, Nima S and {van der Laan}, Mark J}, + title = {Nonparametric inverse probability weighted estimators based on the + highly adaptive lasso} +} + +@manual{R, + address = {Vienna, Austria}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + title = {\text{R}: A Language and Environment for Statistical Computing}, + url = {https://www.R-project.org/}, + year = {2020} +} + +@article{friedman2009glmnet, + title={glmnet: Lasso and elastic-net regularized generalized linear 
models}, + author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob}, + journal={R package version}, + volume={1}, + number={4}, + year={2009} +} From c9c641cb741d91c15bacfb999dc3262eb5194186 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Fri, 12 Jun 2020 19:01:32 -0700 Subject: [PATCH 13/19] add paper summary, begin background --- paper/paper.md | 96 +++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 5345a35a..8544cdef 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: "`hal9001`: Scalable estimation with the highly adaptive lasso in `R`" +title: "`hal9001`: Scalable highly adaptive lasso regression in `R`" tags: - machine learning - targeted learning @@ -24,70 +24,54 @@ affiliations: index: 3 - name: Center for Computational Biology, University of California, Berkeley index: 4 -date: 20 July 2020 +date: 22 June 2020 bibliography: refs.bib --- # Summary +The `hal9001` `R` package provides access to the _highly adaptive lasso_, +a highly flexible nonparametric regression and machine learning algorithm with +many desirable theoretical properties. `hal9001` provides the canonical +implementation of this algorithm, pairing the core statistical learning +methodology with an array of practical variable selection tools and sensible +defaults in order to improve the scalability of the procedure. By building off +of existing `R` packages for lasso regression and leveraging C++ in key internal +functions, the `hal9001` `R` package attempts to provide relatively optimized highly +adaptive lasso functionality, suitable for use both in data analysis tasks and +modern (computationally intensive) statistics research. 
+ +# Background + A central problem in statistical learning theory and machine learning is the -development of robust prediction functions, both in terms of learning complex -functional forms and in the efficient estimation of low-dimensional -function(al)s of possibly complex data-generating processes. - - -Causal inference has traditionally focused on the effects of static -interventions, under which the magnitude of the treatment is set to a fixed, -prespecified value for each unit. The evaluation of such interventions faces -a host of issues, among them non-identification, violations of the assumption of -positivity, and inefficiency. Stochastic interventions provide a promising -solution to these fundamental issues by allowing for the target parameter to be -defined as the mean counterfactual outcome under a hypothetically shifted -version of the observed exposure distribution [@diaz2012population]. -Modified treatment policies, a particular class of such interventions, may be -interpreted as shifting the natural exposure level at the level of a given -observational unit [@haneuse2013estimation;@diaz2018stochastic]. - -Despite the promise of such advances in causal inference, real data analyses are -often further complicated by economic constraints, such as when the primary -variable of interest is far more expensive to collect than auxiliary covariates. -Two-phase sampling schemes are often used to bypass such limitations -- -unfortunately, their use produces side effects that require further adjustment -when formal statistical inference is the principal goal of a study. Among the -rich literature on two-phase designs, @rose2011targeted2sd stand out for -providing a study of nonparametric efficiency theory under such designs. Their -work can be used to construct efficient estimators of causal effects under -general two-phase sampling designs. 
- -Building on these prior works, @hejazi2020efficient outlined a novel approach -for use in such settings: augmented targeted minimum loss (TML) and one-step -estimators for the causal effects of stochastic interventions, with guarantees -of consistency, efficiency, and multiple robustness even in the presence of -two-phase sampling. These authors further outlined a technique that summarizes -the effect of shifting an exposure variable on the outcome of interest via -a nonparametric working marginal structural model, analogous to a dose-response -analysis. The `txshift` software package, for the `R` language and environment -for statistical computing [@R], implements this methodology. - - -`hal9001` is a scalable implementation of the highly adaptive lasso, built on +development of efficient and robust prediction functions, which often require +the learning of complex functional forms or the construction of efficient +estimators of low-dimensional functionals of complex data-generating processes. +For example, one may be interested in a nonparametric regression function that +is unconstrained enough to (smoothly) estimate functions within a relatively +rich class, or to estimate causal parameters like the average treatment effect, +which require the consistent estimation of a limited set of nuisance functions. +Most often, strong assumptions are made about the functional forms of relevant +parts of the data-generating process, either out of convenience or due to +limited computational resources. + + +is a scalable implementation of the highly adaptive lasso, built on top of the extremely popular `glmnet` `R` package [@friedman2009glmnet]. 
The `hal9001` `R` package includes tools -for deploying these efficient estimators under two-phase -sampling designs, with two types of corrections: (1) a reweighting procedure -that introduces inverse probability of censoring weights directly into an -appropriate loss function, as discussed in @rose2011targeted2sd; as -well as (2) a correction based on the efficient influence function, studied more -thoroughly by @hejazi2020efficient. `txshift` -integrates with the [`sl3` package](https://github.com/tlverse/sl3) -[@coyle2020sl3] to allow for ensemble machine learning to be leveraged in the -estimation of nuisance parameters. What's more, the `txshift` package draws on -both the `hal9001` and `haldensify` `R` packages [@coyle2019hal9001; -@hejazi2020haldensify] to allow each of the estimators to be constructed in -a manner consistent with the theoretical results of @hejazi2020efficient. The -`txshift` package has been made publicly available via GitHub and will be -submitted to the Comprehensive `R` Archive Network in the near future. + +# `hal9001`'s Scope + +[TO FILL IN] + +# `hal9001`'s Functionality + +[TO FILL IN] + +# Future Work + +Spline HAL # Acknowledgments From 1d64eb350abc3509ac6e03405de499078553ffc1 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Mon, 22 Jun 2020 22:12:07 -0700 Subject: [PATCH 14/19] add HAL paper for JOSS --- paper/paper.md | 135 ++++++++++++++++++++++++++++++++----------------- paper/refs.bib | 80 +++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 45 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 8544cdef..66a03bd6 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,54 +30,99 @@ bibliography: refs.bib # Summary -The `hal9001` `R` package provides access to the _highly adaptive lasso_, -a highly flexible nonparametric regression and machine learning algorithm with -many desirable theoretical properties. 
`hal9001` provides the canonical -implementation of this algorithm, pairing the core statistical learning -methodology with an array of practical variable selection tools and sensible -defaults in order to improve the scalability of the procedure. By building off -of existing `R` packages for lasso regression and leveraging C++ in key internal -functions, the `hal9001` `R` attempts to provides relatively optimized highly -adaptive lasso functionality, suitable for use both in data analysis tasks and -modern (computationally intensive) statistics research. +The `hal9001` `R` package provides an efficient implementation of the _highly +adaptive lasso_ (HAL), a flexible nonparametric regression and machine learning +algorithm endowed with several theoretically convenient properties. `hal9001` +pairs an implementation of this estimator with an array of practical variable +selection tools and sensible defaults in order to improve the scalability of the +algorithm. By building on existing `R` packages for lasso regression and +leveraging compiled code in key internal functions, the `hal9001` `R` package +provides a family of highly adaptive lasso estimators suitable for use both for +modern data analysis tasks and computationally intensive statistics and machine +learning research. # Background -A central problem in statistical learning theory and machine learning is the -development of efficient and robust prediction functions, which often require -the learning of complex functional forms or the construction of efficient -estimators of low-dimensional functionals of complex data-generating processes. -For example, one may be interested in a nonparametric regression function that -is unconstrained enough to (smoothly) estimate functions within a relatively -rich class, or to estimate causal parameters like the average treatment effect, -which require the consistent estimation of a limited set of nuisance functions. 
-Most often, strong assumptions are made about the functional forms of relevant -parts of the data-generating process, either out of convenience or due to -limited computational resources. - - -is a scalable implementation of the highly adaptive lasso, built on -top of the extremely popular `glmnet` `R` package [@friedman2009glmnet]. The -`hal9001` `R` package includes tools - - -# `hal9001`'s Scope - -[TO FILL IN] - -# `hal9001`'s Functionality - -[TO FILL IN] - -# Future Work - -Spline HAL - -# Acknowledgments - -Nima Hejazi's contributions to this work were supported in part by a grant from -the National Institutes of Health: [T32 -LM012417-02](https://projectreporter.nih.gov/project_info_description.cfm?aid=9248418&icde=37849831&ddparam=&ddvalue=&ddsub=&cr=1&csb=default&cs=ASC&pball=). +The highly adaptive lasso (HAL) is a nonparametric regression function capable +of estimating complex (e.g., possibly infinite-dimensional) functional +parameters at a near-parametric $n^{-1/3}$ rate under only relatively mild +conditions [@vdl2017generally; @vdl2017uniform; @bibaut2019fast]. The `hal9001` +package implements a zeroth-order HAL estimator, which constructs and selects +(by lasso penalization) a linear combination of indicator basis functions to +minimize the expected value of a loss function under the constraint that the +$L_1$-norm of the vector of coefficients is bounded by a finite constant. +Importantly, the estimator is formulated such that this finite constant is the +sectional variation norm of the target function's HAL representation. + +To formalize, consider the space of $d$-variate real-valued càdlàg functions +(right-hand continuous with left-hand limits) on a cube $[0,\tau] \in +\mathbb{R}^d$, letting $\mathbb{D}[0,\tau]$ denote this Banach space. 
For an +arbitrary functional $f \in \mathbb{D}[0,\tau]$, let the supremum norm be +$\lVert f \rVert_{\infty} := \sup_{x \in [0, \tau]} \lvert f(x) \rvert$; +morever, for any subset $s \subset \{0, \ldots, d\}$, partition the cube $[0, +\tau]$ into $\{0\} \{\cup_s (0_s, \tau_s]\}$. The sectional variation norm of +$f$ is defined +\begin{equation*} + \lVert f \rVert^{\star}_\nu = \lvert f(0) \rvert + \sum_{s + \subset\{1, \ldots, d\}} \int_{0_s}^{\tau_s} \lvert df_s(u_s) \rvert, +\end{equation*} +with the sum being over all subsets of $\{0, \ldots, d\}$. Define $u_s = (u_j +: j \in s)$ and $u_{-s}$ as the complement of $u_s$, for a given subset $s +\subset \{0, \ldots, d\}$. Then, let $f_s(u_s) = f(u_s, 0_{-s})$, which yields +$f_s: [0_s, \tau_s] \rightarrow \mathbb{R}$. $f_s(u_s)$ is simply a section of +$f$ that sets the components in the complement of the subset $s$ to zero, i.e, +allowing $f_s$ to vary only along components in $u_s$. Interestingly, this +definition of variation norm corresponds closely with the notion of Hardy-Krause +variation [@qiu2020universal; @owen2005multidimensional]. + +For the purpose of estimation, the integral over the domain $[0_s, \tau_s]$ may +be approximated by applying a discrete measure that places mass on observations +$X_{s,i}$, for which coefficients $\beta_{s,i}$ are generated. Define the +indicator $\phi_{s,i}(c_s)= \mathbb{I}(x_{s,i} \leq c_s)$, where $x_{s,i}$ are +support points of the functional. Then, we may express the approximation as +$\lVert \hat{f} \rVert^{\star}_\nu \approx \lvert \beta_0 \rvert + \sum_{s +\subset\{1,\ldots,d\}} \sum_{i=1}^{n} \lvert \beta_{s,i} \rvert$, which +approximates the sectional variation norm of the target functional. A loss-based +HAL estimator is based on a choice of the penalization parameter $\lambda$ that +minimizes the empirical risk under an appropriately chosen loss function. 
A data +adaptively selected choice of $\lambda$, typically denoted $\lambda_n$, may be +made by a cross-validation selector [@vdl2003unified; @vdv2006oracle], though +alternative selection criteria may be more appropriate when the estimand +functional is itself a nuisance component of the target parameter of interest +[e.g., @vdl2019efficient; @ertefaie2020nonparametric]. + +# `hal9001`'s core functionality + +The `hal9001` package, for the `R` language and environment for statistical +computing [@R], aims to provide a scalable implementation of the HAL regression +function. To provide a single, unified interface, the principal user-facing +function is `fit_hal()`, which, at minimum, requires a matrix of predictors `X` +and an outcome `Y`. By default, invocation of `fit_hal()` will build a HAL model +using indicator basis functions for up to a limited number of interactions of +the variables in `X`, fitting the penalized regression model via the lasso +procedure available in the extremely popular `glmnet` `R` package +[@friedman2009glmnet]. As creation of the design matrix of indicator basis +functions can be computationally expensive, several helper functions (e.g., +`make_design_matrix()`, `make_basis_list()`, `make_copy_map()`) have been +written in C++ and integrated into the package via the `Rcpp` framework +[@eddelbuettel2011rcpp; @eddelbuettel2013seamless]. `hal9001` additionally +supports the fitting of standard (Gaussian), logistic, and Cox proportional +hazards models (argument `family`), including variations that accommodate +offsets (argument `offset`) and partially penalized linear models (argument +`X_unpenalized`). + +Over several years of development and use, it was found that the performance of +HAL regression can suffer in high-dimensional settings. To alleviate +computational aspects of this issue, several screening and filtering approaches +were investigated and implemented. 
These include screening of variables prior to +creating the design matrix and filtering of indicator basis functions (arguments +`screen_basis` and `reduce_basis`), as well as either filtering of penalization +parameters (argument `screen_lambda`) or early stopping when fitting the +sequence of HAL models in $\lambda$. Future software development efforts will +continue to improve upon the computational aspects and performance of the HAL +regression options supported by `hal9001`. Currently, stable releases of the +`hal9001` package are made available on the Comprehensive `R` Archive Network at +https://CRAN.R-project.org/package=hal9001. # References diff --git a/paper/refs.bib b/paper/refs.bib index 8e4d6266..a8594104 100644 --- a/paper/refs.bib +++ b/paper/refs.bib @@ -105,3 +105,83 @@ @article{friedman2009glmnet number={4}, year={2009} } + +@incollection{owen2005multidimensional, + title={Multidimensional variation for quasi-Monte Carlo}, + author={Owen, Art B}, + booktitle={Contemporary Multivariate Analysis And Design Of Experiments: In + Celebration of Professor Kai-Tai Fang's 65th Birthday}, + pages={49--74}, + year={2005}, + publisher={World Scientific} +} + +@article{qiu2020universal, + title={Universal sieve-based strategies for efficient estimation using + machine learning tools}, + author={Qiu, Hongxiang and Luedtke, Alex and Carone, Marco}, + journal={arXiv preprint arXiv:2003.01856}, + year={2020} +} + +@techreport{vdl2003unified, +author = {{van der Laan}, Mark J and Dudoit, Sandrine}, +institution = {Division of Biostatistics, University of California, Berkeley}, +keywords = {Superlearner}, +number = {130}, +title = {{Unified cross-validation methodology for selection among estimators + and a general cross-validated adaptive epsilon-net estimator: finite sample + oracle inequalities and examples}}, +year = {2003} +} + +@article{vdv2006oracle, +author = {{van der Vaart}, Aad W and Dudoit, Sandrine and {van der Laan}, Mark + J}, +title = {{Oracle 
inequalities for multi-fold cross validation}}, +journal = {Statistics \& Decisions}, +year = {2006}, +volume = {24}, +number = {3}, +pages = {351--371} +} + +@article{ertefaie2020nonparametric, + doi = {}, + url = {http://arxiv.org/abs/2005.11303}, + year = {2020}, + publisher = {}, + journal = {}, + volume = {}, + number = {}, + pages = {}, + author = {Ertefaie, Ashkan and Hejazi, Nima S and {van der Laan}, Mark J}, + title = {Nonparametric inverse probability weighted estimators based on the + highly adaptive lasso} +} + +@article{vdl2019efficient, + title={Efficient estimation of pathwise differentiable target parameters with + the undersmoothed highly adaptive lasso}, + author={{van der Laan}, Mark J and Benkeser, David and Cai, Weixin}, + journal={arXiv preprint arXiv:1908.05607}, + year={2019} +} + +@article{eddelbuettel2011rcpp, + title={Rcpp: Seamless R and C++ integration}, + author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain and Allaire, J and + Ushey, Kevin and Kou, Qiang and Russel, N and Chambers, John and Bates, D}, + journal={Journal of Statistical Software}, + volume={40}, + number={8}, + pages={1--18}, + year={2011} +} + +@book{eddelbuettel2013seamless, + title={Seamless R and C++ integration with Rcpp}, + author={Eddelbuettel, Dirk}, + year={2013}, + publisher={Springer} +} From b2e93b6c82db9352562c2d94306955929e50f6aa Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 23 Jun 2020 17:42:30 -0700 Subject: [PATCH 15/19] remove math from paper --- paper/paper.md | 76 ++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 66a03bd6..6213de76 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -24,7 +24,7 @@ affiliations: index: 3 - name: Center for Computational Biology, University of California, Berkeley index: 4 -date: 22 June 2020 +date: 24 June 2020 bibliography: refs.bib --- @@ -37,7 +37,7 @@ pairs an implementation of this estimator with an 
array of practical variable selection tools and sensible defaults in order to improve the scalability of the algorithm. By building on existing `R` packages for lasso regression and leveraging compiled code in key internal functions, the `hal9001` `R` package -provides a family of highly adaptive lasso estimators suitable for use both for +provides a family of highly adaptive lasso estimators suitable for use in both modern data analysis tasks and computationally intensive statistics and machine learning research. @@ -46,49 +46,39 @@ learning research. The highly adaptive lasso (HAL) is a nonparametric regression function capable of estimating complex (e.g., possibly infinite-dimensional) functional parameters at a near-parametric $n^{-1/3}$ rate under only relatively mild -conditions [@vdl2017generally; @vdl2017uniform; @bibaut2019fast]. The `hal9001` -package implements a zeroth-order HAL estimator, which constructs and selects -(by lasso penalization) a linear combination of indicator basis functions to -minimize the expected value of a loss function under the constraint that the -$L_1$-norm of the vector of coefficients is bounded by a finite constant. +conditions [@vdl2017generally; @vdl2017uniform; @bibaut2019fast]. HAL requires +that the space of the functional parameter be a subset of the set of càdlàg +(right-hand continuous with left-hand limits) functions with sectional +variation norm bounded by a constant. In contrast to the wealth of +data adaptive regression techniques that make strong local smoothness +assumptions on the true form of the target functional, HAL regression's +assumption of a finite sectional variation norm constitutes only a _global_ +smoothness assumption, making it a powerful and versatile approach. 
The +`hal9001` package implements a zeroth-order HAL estimator, which constructs and +selects (by lasso penalization) a linear combination of indicator basis +functions to minimize the loss-specific empirical risk under the constraint that +the $L_1$-norm of the vector of coefficients be bounded by a finite constant. Importantly, the estimator is formulated such that this finite constant is the -sectional variation norm of the target function's HAL representation. +sectional variation norm of the target functional. -To formalize, consider the space of $d$-variate real-valued càdlàg functions -(right-hand continuous with left-hand limits) on a cube $[0,\tau] \in -\mathbb{R}^d$, letting $\mathbb{D}[0,\tau]$ denote this Banach space. For an -arbitrary functional $f \in \mathbb{D}[0,\tau]$, let the supremum norm be -$\lVert f \rVert_{\infty} := \sup_{x \in [0, \tau]} \lvert f(x) \rvert$; -morever, for any subset $s \subset \{0, \ldots, d\}$, partition the cube $[0, -\tau]$ into $\{0\} \{\cup_s (0_s, \tau_s]\}$. The sectional variation norm of -$f$ is defined -\begin{equation*} - \lVert f \rVert^{\star}_\nu = \lvert f(0) \rvert + \sum_{s - \subset\{1, \ldots, d\}} \int_{0_s}^{\tau_s} \lvert df_s(u_s) \rvert, -\end{equation*} -with the sum being over all subsets of $\{0, \ldots, d\}$. Define $u_s = (u_j -: j \in s)$ and $u_{-s}$ as the complement of $u_s$, for a given subset $s -\subset \{0, \ldots, d\}$. Then, let $f_s(u_s) = f(u_s, 0_{-s})$, which yields -$f_s: [0_s, \tau_s] \rightarrow \mathbb{R}$. $f_s(u_s)$ is simply a section of -$f$ that sets the components in the complement of the subset $s$ to zero, i.e, -allowing $f_s$ to vary only along components in $u_s$. Interestingly, this -definition of variation norm corresponds closely with the notion of Hardy-Krause -variation [@qiu2020universal; @owen2005multidimensional]. 
- -For the purpose of estimation, the integral over the domain $[0_s, \tau_s]$ may -be approximated by applying a discrete measure that places mass on observations -$X_{s,i}$, for which coefficients $\beta_{s,i}$ are generated. Define the -indicator $\phi_{s,i}(c_s)= \mathbb{I}(x_{s,i} \leq c_s)$, where $x_{s,i}$ are -support points of the functional. Then, we may express the approximation as -$\lVert \hat{f} \rVert^{\star}_\nu \approx \lvert \beta_0 \rvert + \sum_{s -\subset\{1,\ldots,d\}} \sum_{i=1}^{n} \lvert \beta_{s,i} \rvert$, which -approximates the sectional variation norm of the target functional. A loss-based -HAL estimator is based on a choice of the penalization parameter $\lambda$ that -minimizes the empirical risk under an appropriately chosen loss function. A data -adaptively selected choice of $\lambda$, typically denoted $\lambda_n$, may be -made by a cross-validation selector [@vdl2003unified; @vdv2006oracle], though -alternative selection criteria may be more appropriate when the estimand -functional is itself a nuisance component of the target parameter of interest +Intuitively, construction of a HAL estimator proceeds in two steps. First, +a design matrix composed of basis functions is generated based on the available +set of covariates. The zeroth-order HAL makes use of indicator basis functions, +resulting in a large, sparse matrix with binary entries; higher-order HAL +estimators, which replace the use of indicator basis functions with splines, +have been formulated but remain unimplemented. This representation of the target +functional $f$ in terms of indicator basis functions partitions the support of +$f$ into knot points, with indicator basis functions placed over subsets of the +sections of $f$. Generally, very many basis functions are created, with an +appropriate set of indicator bases then selected through lasso penalization. 
+Thus, the second step of fitting a HAL model is performing $L_1$-penalized +regression on the large, sparse design matrix of indicator bases. The selected +HAL regression model approximates the sectional variation norm of the target +functional as the absolute sum of the estimated coefficients of indicator basis +functions. The $L_1$ penalization parameter $\lambda$ can be data adaptively +selected based on a cross-validation selector [@vdl2003unified; @vdv2006oracle]; +however, alternative selection criteria may be more appropriate when the +estimand functional is itself a nuisance component of the target parameter [e.g., @vdl2019efficient; @ertefaie2020nonparametric]. # `hal9001`'s core functionality From f61f115efb3d301f9b6a67db0b1ff048b7240aa8 Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 23 Jun 2020 17:59:24 -0700 Subject: [PATCH 16/19] remove experimental screening --- R/hal.R | 37 +----- R/screening.R | 229 ------------------------------------- R/screening_experimental.R | 174 ---------------------------- man/fit_hal.Rd | 8 -- man/screening.Rd | 81 ------------- 5 files changed, 1 insertion(+), 528 deletions(-) delete mode 100644 R/screening.R delete mode 100644 R/screening_experimental.R delete mode 100644 man/screening.Rd diff --git a/R/hal.R b/R/hal.R index be137297..9852d3ab 100644 --- a/R/hal.R +++ b/R/hal.R @@ -65,10 +65,6 @@ #' @param id a vector of ID values, used to generate cross-validation folds for #' cross-validated selection of the regularization parameter lambda. #' @param offset a vector of offset values, used in fitting. -#' @param screen_basis If \code{TRUE}, use a screening procedure to reduce the -#' number of basis functions fitted. -#' @param screen_lambda If \code{TRUE}, use a screening procedure to reduce the -#' number of lambda values evaluated. #' @param ... Other arguments passed to \code{\link[glmnet]{cv.glmnet}}. Please #' consult its documentation for a full list of options. 
#' @param yolo A \code{logical} indicating whether to print one of a curated @@ -112,8 +108,6 @@ fit_hal <- function(X, id = NULL, offset = NULL, cv_select = TRUE, - screen_basis = FALSE, - screen_lambda = FALSE, ..., yolo = TRUE) { # check arguments and catch function call @@ -146,14 +140,6 @@ fit_hal <- function(X, ) ) - # warn about screening functionality - if (screen_basis) { - warning("Basis screening functionality is currently experimental.") - } - if (screen_lambda) { - warning("Lambda screening functionality is currently experimental.") - } - # cast X to matrix -- and don't start the timer until after if (!is.matrix(X)) { X <- as.matrix(X) @@ -178,19 +164,7 @@ fit_hal <- function(X, # make design matrix for HAL if (is.null(basis_list)) { - if (screen_basis) { - selected_cols <- hal_screen_goodbasis(X, Y, - actual_max_degree = max_degree, - k = NULL, family = "gaussian" - ) - basis_list <- c() - for (i in seq_along(selected_cols)) { - col_list <- selected_cols[[i]] - basis_list <- c(basis_list, basis_list_cols(col_list, X)) - } - } else { - basis_list <- enumerate_basis(X, max_degree) - } + basis_list <- enumerate_basis(X, max_degree) } # generate a vector of col lists corresponding to the bases generated @@ -262,15 +236,6 @@ fit_hal <- function(X, coefs <- hal_lasso$betas_mat[, "lambda_1se"] } } else if (fit_type == "glmnet") { - if ((screen_lambda) && (length(lambda) != 1)) { - # reduce the set of lambdas to fit - lambda <- hal_screen_lambda(x_basis, Y, - family = family, - lambda = lambda, - foldid = foldid, - offset = offset - ) - } # just use the standard implementation available in glmnet if (!cv_select) { hal_lasso <- glmnet::glmnet( diff --git a/R/screening.R b/R/screening.R deleted file mode 100644 index c73157d7..00000000 --- a/R/screening.R +++ /dev/null @@ -1,229 +0,0 @@ -#' Screen HAL Columns, Basis Functions, and lambda -#' -#' Smart Screening Stuff. 
TODO: Document fully -#' -#' @param x An input \code{matrix} containing observations of covariates. -#' @param y A \code{numeric} vector of obervations of the outcome variable. -#' @param V A \code{numeric} of the number of folds to use in cross-validation. -#' Defaults to five. If \code{foldid} is not specified, this is used. -#' @param max_degree The highest order of interaction terms for which the basis -#' functions ought to be generated. The default (\code{NULL}) corresponds to -#' generating basis functions for the full dimensionality of the input matrix. -#' @param family A \code{character} corresponding to the error family for a -#' generalized linear model. Options are limited to "gaussian" for fitting a -#' standard general linear model and "binomial" for logistic regression. -#' @param lambda A user-specified array of values of the lambda tuning -#' parameter of the Lasso L1 regression. \code{\link[glmnet]{cv.glmnet}} will -#' be used when set to \code{NULL}, automatically selecting an optimal value -#' based on a cross-validated fit criterion (e.g., MSE). If specified, Lasso -#' L1 regression model will be fit via \code{\link[glmnet]{glmnet}}, returning -#' regularized coefficient values for each value in the input array. -#' @param offset A vector of offset values, used in fitting. -#' @param foldid A vector of fold IDs, as in \code{\link[glmnet]{cv.glmnet}}. -#' @param verbose If \code{TRUE}, print details of screening steps. -#' @param col_lists A list of lists of column number, indicating which basis -#' columns to screen. -#' @param x_basis An \code{x_basis} sparse matrix. 
-#' @param main_terms If \code{TRUE}, only screen interactions for siginficant -#' main terms -#' -#' @importFrom glmnet cv.glmnet -#' @importFrom utils combn -#' -#' @name screening -#' -#' @keywords internal -hal_screen_cols <- function(x, y, V = 5, family, col_lists = NULL, - foldid = NULL, offset = NULL, verbose = FALSE) { - n <- length(y) - p <- ncol(x) - - if (is.null(col_lists)) { - col_lists <- as.list(seq_len(p)) - } - - if (is.null(foldid)) { - foldid <- sample(seq_len(V), n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - - - null_risk <- NA - col_results <- list() - thresh <- 1 / n - for (i in seq_along(col_lists)) { - col_list <- col_lists[[i]] - basis_list <- basis_list_cols(col_list, x) - - # TODO: subsample param - # subsample_size <- min(max(100, n * 0.1), length(basis_list)) - # basis_subsample <- sort(sample(seq_along(basis_list), subsample_size, - # replace = FALSE)) - basis_subsample <- seq_along(basis_list) - x_basis <- make_design_matrix(x, basis_list[basis_subsample]) - - screen_glmnet <- try( - { - glmnet::cv.glmnet( - x = x_basis, y = y, family = family, intercept = FALSE, - offset = offset, maxit = 10, thresh = thresh, foldid = - foldid, nlambda = 20 - ) - }, - silent = TRUE - ) - - if (inherits(screen_glmnet, "try-error")) { - reduction <- 0 - lambda_min <- NA - lambda_1se <- NA - } else { - if (is.na(null_risk)) { - null_risk <- screen_glmnet$cvm[1] - old_risk <- null_risk - } - - old_risk <- screen_glmnet$cvm[1] - new_risk <- min(screen_glmnet$cvm) - reduction <- (old_risk - new_risk) / null_risk - lambda_min <- screen_glmnet$lambda.min - lambda_1se <- screen_glmnet$lambda.1se - } - - if (verbose) { - print(sprintf( - "screening col %s -- null risk: %0.2f, old risk: %0.2f, new risk: %0.2f, percent reduction:%0.2f, min lambda: %0.3f", - paste0(col_list, collapse = ","), - null_risk, - old_risk, - new_risk, - 100 * reduction, - lambda_min - )) - } - - keep <- (reduction > thresh) - if (keep) { - 
new_offset <- predict(screen_glmnet, s = "lambda.min", x_basis, newoffset = offset) - offset <- new_offset - old_risk <- new_risk - } - - col_result <- list( - col_list = list(col_list), - reduction = reduction, - null_risk = null_risk, - old_risk = old_risk, - risk = new_risk, - lambda_min = lambda_min, - lambda_1se = lambda_1se, - selected = keep - ) - - col_results <- c(col_results, list(col_result)) - } - - individual_results <- data.table::rbindlist(col_results) - results <- list( - individual_results = individual_results, - final_offset = offset, - selected_cols = individual_results$col_list[individual_results$selected == TRUE] - ) - - return(results) -} - -#' @name screening -#' @keywords internal -hal_screen_basis <- function(x, y, family, foldid = NULL, offset = NULL, - verbose = FALSE, max_degree = NULL, main_terms = - NULL) { - n <- length(y) - p <- ncol(x) - - if (is.null(max_degree)) { - max_degree <- p - } - - if (is.null(main_terms)) { - main_terms <- (p > 10) - } - - # screen 1-d basis functions - col_lists <- as.list(seq_len(p)) - screened <- hal_screen_cols(x, y, - family = family, - foldid = foldid, - offset = offset, - col_lists = col_lists, - verbose = verbose - ) - - # limit to significant main terms if enabled - if (main_terms) { - good_cols <- unlist(screened$selected_cols) - } else { - good_cols <- unlist(col_lists) - } - - # construct all basis up to max based on selected columns - actual_max_degree <- min(max_degree, length(good_cols)) - - interaction_col_lists <- list() - if (actual_max_degree >= 2) { - for (degree in 2:actual_max_degree) { - combs <- utils::combn(length(good_cols), degree) - degree_lists <- lapply(seq_len(ncol(combs)), function(col) { - good_cols[combs[, col]] - }) - interaction_col_lists <- c(interaction_col_lists, degree_lists) - } - - interaction_screened <- hal_screen_cols(x, y, - family = family, - foldid = foldid, - offset = screened$final_offset, - col_lists = interaction_col_lists, - verbose = verbose - ) - - 
good_basis <- c(as.list(good_cols), interaction_screened$selected_cols) - } else { - good_basis <- as.list(good_cols) - } - - return(good_basis) -} - -#' @name screening -#' @keywords internal -hal_screen_lambda <- function(x_basis, y, family, offset = NULL, foldid = NULL, - lambda = NULL) { - if (!is.null(lambda)) { - # TODO: maybe downsample lambda here? - nlamba <- length(lambda) - } else { - nlambda <- 100 - } - - screen_glmnet <- glmnet::cv.glmnet( - x = x_basis, y = y, - family = family, - offset = offset, - foldid = foldid, - lambda = lambda, nlambda = nlambda, - maxit = 1, thresh = 1 - ) - - lambda_0 <- screen_glmnet$lambda[1] - lambda_min <- screen_glmnet$lambda[which.min(screen_glmnet$cvm)] - thresh <- min(screen_glmnet$cvm + screen_glmnet$cvsd) - lambda_1se_smaller <- min(screen_glmnet$lambda[screen_glmnet$cvm < thresh]) - screened_lambda <- screen_glmnet$lambda - selected_lambda <- screened_lambda[screened_lambda >= lambda_1se_smaller] - - return(selected_lambda) -} diff --git a/R/screening_experimental.R b/R/screening_experimental.R deleted file mode 100644 index d2a51ac2..00000000 --- a/R/screening_experimental.R +++ /dev/null @@ -1,174 +0,0 @@ -# step1:do regular lasso for main term functions and their interaction functions -# step2:rank those basis functions based on their speed to become zero and choose k top basis functions -# step3:generate K*n basis functions and do regular lasso -# step4:output the fitting results, mean square error and the running time of step3 - -# hal_screen_goodbasis is aimed to screen main term functions and their interaction functions x1*x2,x1*x2*x3,etc -# hal_screen_rank is aimed to rank all the covariates based on their speed to become zero -# hal_screen_output is aimed to do regular lasso for K*n basis function and output the fitting performance and running time - -hal_screen_rank <- function(x, y, family, k = NULL, foldid = NULL, - offset = NULL) { - n <- length(y) # length of y - p <- ncol(x) # column number of x - - 
if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - rank_basis <- cv.glmnet(x, y, - family = family, foldid = foldid, - offset = offset - ) - - if (!is.null(k)) { - coef_mat <- coef(rank_basis, rank_basis$lambda) - coef_mat <- coef_mat[-1, ] - first_nz_lambda <- apply(coef_mat != 0, 1, function(x) which(x)[1]) - rank_col <- order(first_nz_lambda) - select_col <- rank_col[1:k] - } else { - select_coefs <- coef(rank_basis, rank_basis$lambda.min) - select_coefs <- select_coefs[-1] - select_col <- which(select_coefs != 0) - } - - return(select_col) -} - - -hal_screen_goodbasis <- function(x, y, actual_max_degree, k = NULL, family, - col_lists = NULL, foldid = NULL, - offset = NULL, verbose = FALSE) { - n <- length(y) - p <- ncol(x) - - if (is.null(col_lists)) { - col_lists <- as.list(seq_len(p)) # seq_len=(1,2,...,p) - } - - if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - - good_cols <- unlist(col_lists) - interaction_col_lists <- list() - x_interaction_basis <- x - if (actual_max_degree >= 2) { - for (degree in 2:actual_max_degree) { - combs <- utils::combn(length(good_cols), degree) - degree_lists <- lapply(seq_len(ncol(combs)), function(col) { - good_cols[combs[, col]] - }) - interaction_col_lists <- c(interaction_col_lists, degree_lists) - for (col in seq_len(ncol(combs))) { - x_interaction <- matrix(1, ncol = 1, nrow = n) - for (row in combs[, col]) { - x_interaction <- x_interaction * x[, row] - } - x_interaction_basis <- cbind(x_interaction_basis, x_interaction) - } - } # get matrix[x1,x2,..,x1*x2,..,x1*x2*x3,..] 
- x_basis_lists <- as.list(matrix(0, ncol = length(col_lists) + - length(interaction_col_lists))) - for (i in 1:length(x_basis_lists)) { - if (i <= length(col_lists)) { - x_basis_lists[[i]] <- col_lists[[i]] - } else { - x_basis_lists[[i]] <- interaction_col_lists[[i - length(col_lists)]] - } - } # get list((1,..)(12,13,...)(123,..)) - screened_rank <- hal_screen_rank(x_interaction_basis, y, - k = k, - family = family, - foldid = foldid, - offset = offset - ) - screened_col <- lapply(screened_rank, function(x) x_basis_lists[[x]]) - set_interaction <- list() - set_mainterm <- list() - if (length(screened_col) > 0) { - for (i in seq_along(screened_col)) { - if (length(screened_col[[i]]) > 1) { - set_interaction <- c(set_interaction, as.list(screened_col[[i]])) - } else { - set_mainterm <- c(set_mainterm, as.list(screened_col[[i]])) - } # get set of main terms - } - } - # get set of main terms that build all the interaction - set_interaction <- set_interaction[!duplicated(set_interaction)] - # include all the main terms that build the interaction terms - screened_col <- c(screened_col, setdiff(set_interaction, set_mainterm)) - } else { - screened_rank <- hal_screen_rank(x, y, - k = k, - family = family, - foldid = foldid, - offset = offset - ) - screened_col <- lapply(screened_rank, function(x) col_lists[[x]]) - } - return(screened_col) -} - -# find the K basis function -# generate K*n basis function and do regular lasso -hal_screen_output <- function(x, y, family, col_lists, foldid = NULL, - offset = NULL) { - n <- length(y) # length of y - p <- ncol(x) # column number of x - - if (is.null(foldid)) { - foldid <- sample(1:5, n, replace = TRUE) - } - - if (is.null(offset)) { - offset <- rep(mean(y), n) - } - - col_results <- list() - - basis_list <- c() - - for (i in seq_along(col_lists)) { # i from 1 to p - col_list <- col_lists[[i]] - # one by one generate basis_list - basis_list <- c(basis_list, basis_list_cols(col_list, x)) - } - # generate k*n basis functions - 
x_basis <- make_design_matrix(x, basis_list) - # do regular lasso for k*n basis functions - screen_goodcols <- cv.glmnet(x_basis, y, - family = family, offset = offset, - foldid = foldid - ) - - lambda_min <- screen_goodcols$lambda.min - lambda_1se <- screen_goodcols$lambda.1se - coef <- stats::coef(screen_goodcols, s = "lambda.1se") - coef_list <- list(which(!coef[-1] == 0)) # find non-zero column lists - - pred <- predict(screen_goodcols, - newx = x_basis, s = lambda_1se, - newoffset = offset - ) - mse <- mean((pred - y)^2) - - col_result <- list( - coef_list = list(coef_list), - lambda_min = lambda_min, - lambda_1se = lambda_1se, - fit_performance = mse, - time = proc.time() - # TODO: calculate running time - ) - return(col_result) -} diff --git a/man/fit_hal.Rd b/man/fit_hal.Rd index 0e025322..4cca2223 100644 --- a/man/fit_hal.Rd +++ b/man/fit_hal.Rd @@ -22,8 +22,6 @@ fit_hal( id = NULL, offset = NULL, cv_select = TRUE, - screen_basis = FALSE, - screen_lambda = FALSE, ..., yolo = TRUE ) @@ -102,12 +100,6 @@ pick the optimal value (based on cross-validation) (when set to \code{TRUE}) or to simply fit along the sequence of values (or single value) using \code{\link[glmnet]{glmnet}} (when set to \code{FALSE}).} -\item{screen_basis}{If \code{TRUE}, use a screening procedure to reduce the -number of basis functions fitted.} - -\item{screen_lambda}{If \code{TRUE}, use a screening procedure to reduce the -number of lambda values evaluated.} - \item{...}{Other arguments passed to \code{\link[glmnet]{cv.glmnet}}. 
Please consult its documentation for a full list of options.} diff --git a/man/screening.Rd b/man/screening.Rd deleted file mode 100644 index d1e03aee..00000000 --- a/man/screening.Rd +++ /dev/null @@ -1,81 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/screening.R -\name{screening} -\alias{screening} -\alias{hal_screen_cols} -\alias{hal_screen_basis} -\alias{hal_screen_lambda} -\title{Screen HAL Columns, Basis Functions, and lambda} -\usage{ -hal_screen_cols( - x, - y, - V = 5, - family, - col_lists = NULL, - foldid = NULL, - offset = NULL, - verbose = FALSE -) - -hal_screen_basis( - x, - y, - family, - foldid = NULL, - offset = NULL, - verbose = FALSE, - max_degree = NULL, - main_terms = NULL -) - -hal_screen_lambda( - x_basis, - y, - family, - offset = NULL, - foldid = NULL, - lambda = NULL -) -} -\arguments{ -\item{x}{An input \code{matrix} containing observations of covariates.} - -\item{y}{A \code{numeric} vector of obervations of the outcome variable.} - -\item{V}{A \code{numeric} of the number of folds to use in cross-validation. -Defaults to five. If \code{foldid} is not specified, this is used.} - -\item{family}{A \code{character} corresponding to the error family for a -generalized linear model. Options are limited to "gaussian" for fitting a -standard general linear model and "binomial" for logistic regression.} - -\item{col_lists}{A list of lists of column number, indicating which basis -columns to screen.} - -\item{foldid}{A vector of fold IDs, as in \code{\link[glmnet]{cv.glmnet}}.} - -\item{offset}{A vector of offset values, used in fitting.} - -\item{verbose}{If \code{TRUE}, print details of screening steps.} - -\item{max_degree}{The highest order of interaction terms for which the basis -functions ought to be generated. 
The default (\code{NULL}) corresponds to -generating basis functions for the full dimensionality of the input matrix.} - -\item{main_terms}{If \code{TRUE}, only screen interactions for siginficant -main terms} - -\item{x_basis}{An \code{x_basis} sparse matrix.} - -\item{lambda}{A user-specified array of values of the lambda tuning -parameter of the Lasso L1 regression. \code{\link[glmnet]{cv.glmnet}} will -be used when set to \code{NULL}, automatically selecting an optimal value -based on a cross-validated fit criterion (e.g., MSE). If specified, Lasso -L1 regression model will be fit via \code{\link[glmnet]{glmnet}}, returning -regularized coefficient values for each value in the input array.} -} -\description{ -Smart Screening Stuff. TODO: Document fully -} -\keyword{internal} From 05b3529ff148081c9d01664df9f4774e471bf62f Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 23 Jun 2020 18:01:41 -0700 Subject: [PATCH 17/19] minor detail improvement --- paper/paper.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 6213de76..827e5fa9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -76,9 +76,9 @@ regression on the large, sparse design matrix of indicator bases. The selected HAL regression model approximates the sectional variation norm of the target functional as the absolute sum of the estimated coefficients of indicator basis functions. The $L_1$ penalization parameter $\lambda$ can be data adaptively -selected based on a cross-validation selector [@vdl2003unified; @vdv2006oracle]; +chosen via a cross-validation selector [@vdl2003unified; @vdv2006oracle]; however, alternative selection criteria may be more appropriate when the -estimand functional is itself a nuisance component of the target parameter +estimand functional is not the target parameter but instead a nuisance function [e.g., @vdl2019efficient; @ertefaie2020nonparametric]. 
# `hal9001`'s core functionality From 2de9011a08f5755d70cc483a3706052c3c3170ac Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 23 Jun 2020 18:11:59 -0700 Subject: [PATCH 18/19] remove bit on experimental filtering --- paper/paper.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 827e5fa9..ceb74789 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -105,14 +105,15 @@ Over several years of development and use, it was found that the performance of HAL regression can suffer in high-dimensional settings. To alleviate computational aspects of this issue, several screening and filtering approaches were investigated and implemented. These include screening of variables prior to -creating the design matrix and filtering of indicator basis functions (arguments -`screen_basis` and `reduce_basis`), as well as either filtering of penalization -parameters (argument `screen_lambda`) or early stopping when fitting the -sequence of HAL models in $\lambda$. Future software development efforts will -continue to improve upon the computational aspects and performance of the HAL -regression options supported by `hal9001`. Currently, stable releases of the -`hal9001` package are made available on the Comprehensive `R` Archive Network at -https://CRAN.R-project.org/package=hal9001. +creating the design matrix and filtering of indicator basis functions (argument +`reduce_basis`) as well as early stopping when fitting the sequence of HAL +models in $\lambda$. Future software development efforts will continue to +improve upon the computational aspects and performance of the HAL regression +options supported by `hal9001`. 
Currently, stable releases of the `hal9001` +package are made available on the Comprehensive `R` Archive Network at +https://CRAN.R-project.org/package=hal9001, while both stable (branch `master`) +and development (branch `devel`) versions of the package are hosted at +https://github.com/tlverse/hal9001. # References From d05ca809920964f952be52dfd84a56981939c0ac Mon Sep 17 00:00:00 2001 From: Nima Hejazi Date: Tue, 23 Jun 2020 18:41:01 -0700 Subject: [PATCH 19/19] rebuild docs for release --- .Rbuildignore | 1 + README.Rmd | 36 ++++----- README.md | 87 ++++++++++++++------- docs/404.html | 2 +- docs/CONTRIBUTING.html | 24 +++--- docs/LICENSE-text.html | 2 +- docs/articles/index.html | 2 +- docs/articles/intro_hal9001.html | 48 ++++++------ docs/authors.html | 2 +- docs/index.html | 51 +++++++----- docs/news/index.html | 2 +- docs/pkgdown.yml | 4 +- docs/reference/SL.hal9001.html | 2 +- docs/reference/apply_copy_map.html | 2 +- docs/reference/as_dgCMatrix.html | 2 +- docs/reference/basis_list_cols.html | 2 +- docs/reference/basis_of_degree.html | 2 +- docs/reference/cv_lasso.html | 2 +- docs/reference/cv_lasso_early_stopping.html | 2 +- docs/reference/enumerate_basis.html | 2 +- docs/reference/evaluate_basis.html | 2 +- docs/reference/fit_hal.html | 14 +--- docs/reference/hal9000.html | 2 +- docs/reference/hal9001.html | 2 +- docs/reference/hal_quotes.html | 2 +- docs/reference/index.html | 2 +- docs/reference/index_first_copy.html | 2 +- docs/reference/lassi.html | 2 +- docs/reference/lassi_fit_module.html | 2 +- docs/reference/lassi_origami.html | 2 +- docs/reference/make_basis_list.html | 2 +- docs/reference/make_copy_map.html | 2 +- docs/reference/make_design_matrix.html | 2 +- docs/reference/make_reduced_basis_map.html | 2 +- docs/reference/meets_basis.html | 2 +- docs/reference/predict.SL.hal9001.html | 2 +- docs/reference/predict.hal9001.html | 2 +- docs/reference/predict.lassi.html | 2 +- docs/reference/squash_hal_fit.html | 2 +- docs/sitemap.xml | 3 - 
inst/REFERENCES.bib | 38 +++++++++ vignettes/intro_hal9001.Rmd | 2 +- 42 files changed, 217 insertions(+), 153 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index f4625516..a13360db 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,6 +18,7 @@ ^Makefile$ ^LICENSE$ ^sandbox$ +^paper$ ^docs$ ^_pkgdown\.yml$ ^CRAN-RELEASE$ diff --git a/README.Rmd b/README.Rmd index a7b2a182..46bcddbb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -37,19 +37,18 @@ Laan](https://vanderlaan-lab.org/) `hal9001` is an R package providing an implementation of the scalable _highly adaptive lasso_ (HAL), a nonparametric regression estimator that applies -L1-regularized regression (i.e., the lasso) to a design matrix composed of -indicator functions corresponding to a set of covariates and interactions -thereof. Recent theoretical results show that HAL is endowed with several -important optimality properties, making it well-suited for the estimation of -highly complex functional forms while attaining fast convergence rates -($n^(1/4)$ and better) when used in the estimation of nuisance functions. HAL -has been quite successfully used in the construction of estimators at the -intersection of semiparametric theory and nonparametric causal inference (e.g., -the construction of efficient one-step or targeted minimum loss estimators). - -For detailed discussions of the highly adaptive lasso estimator, consider -consulting @benkeser2016hal, @vdl2017generally, and @vdl2017finite, among other -recent works. +L1-regularized lasso regression to a design matrix composed of indicator +functions corresponding to the support of the functional over a set of +covariates and interactions thereof. HAL regression allows for arbitrarily +complex functional forms to be estimated at fast (near-parametric) convergence +rates under only global smoothness assumptions [@vdl2017generally; +@bibaut2019fast]. 
For detailed theoretical discussions of the highly adaptive +lasso estimator, consider consulting, for example, @vdl2017generally, +@vdl2017finite, and @vdl2017uniform. For a computational demonstration of the +versatility of HAL regression, see @benkeser2016hal. Recent theoretical works +have demonstrated success in building efficient estimators of complex +parameters when particular variations of HAL regression are used to estimate +nuisance parameters [e.g., @vdl2019efficient; @ertefaie2020nonparametric]. --- @@ -80,9 +79,8 @@ issue](https://github.com/tlverse/hal9001/issues). ## Example -This minimal example shows how to use `hal9001` to obtain predictions based on -the Highly Adaptive Lasso. For details on the properties of the estimator, the -interested reader is referred to @benkeser2016hal and @vdl2017generally. +Consider the following minimal example in using `hal9001` to generate +predictions via Highly Adaptive Lasso regression: ```{r example} # load the package and set a seed @@ -119,13 +117,13 @@ prior to submitting a pull request. 
After using the `hal9001` R package, please cite the following: - @manual{coyle2019hal9001, + @manual{coyle2020hal9001, author = {Coyle, Jeremy R and Hejazi, Nima S and {van der Laan}, Mark J}, title = {{hal9001}: The scalable highly adaptive lasso}, - year = {2019}, + year = {2020}, howpublished = {\url{https://github.com/tlverse/hal9001}}, - note = {{R} package version 0.2.5}, + note = {{R} package version 0.2.6}, url = {https://doi.org/10.5281/zenodo.3558313}, doi = {10.5281/zenodo.3558313} } diff --git a/README.md b/README.md index 8f1cedb4..4c87252b 100644 --- a/README.md +++ b/README.md @@ -31,20 +31,21 @@ Laan](https://vanderlaan-lab.org/) `hal9001` is an R package providing an implementation of the scalable *highly adaptive lasso* (HAL), a nonparametric regression estimator that -applies L1-regularized regression (i.e., the lasso) to a design matrix -composed of indicator functions corresponding to a set of covariates and -interactions thereof. Recent theoretical results show that HAL is -endowed with several important optimality properties, making it -well-suited for the estimation of highly complex functional forms while -attaining fast convergence rates (\(n^(1/4)\) and better) when used in -the estimation of nuisance functions. HAL has been quite successfully -used in the construction of estimators at the intersection of -semiparametric theory and nonparametric causal inference (e.g., the -construction of efficient one-step or targeted minimum loss estimators). - -For detailed discussions of the highly adaptive lasso estimator, -consider consulting Benkeser and van der Laan (2016), van der Laan -(2017a), and van der Laan (2017b), among other recent works. +applies L1-regularized lasso regression to a design matrix composed of +indicator functions corresponding to the support of the functional over +a set of covariates and interactions thereof. 
HAL regression allows for +arbitrarily complex functional forms to be estimated at fast +(near-parametric) convergence rates under only global smoothness +assumptions (van der Laan 2017a; Bibaut and van der Laan 2019). For +detailed theoretical discussions of the highly adaptive lasso estimator, +consider consulting, for example, van der Laan (2017a), van der Laan +(2017b), and van der Laan and Bibaut (2017). For a computational +demonstration of the versatility of HAL regression, see Benkeser and van +der Laan (2016). Recent theoretical works have demonstrated success in +building efficient estimators of complex parameters when particular +variations of HAL regression are used to estimate nuisance parameters +(e.g., van der Laan, Benkeser, and Cai 2019; Ertefaie, Hejazi, and van +der Laan 2020). ----- @@ -75,22 +76,20 @@ If you encounter any bugs or have any specific feature requests, please ## Example -This minimal example shows how to use `hal9001` to obtain predictions -based on the Highly Adaptive Lasso. For details on the properties of the -estimator, the interested reader is referred to Benkeser and van der -Laan (2016) and van der Laan (2017a). +Consider the following minimal example in using `hal9001` to generate +predictions via Highly Adaptive Lasso regression: ``` r # load the package and set a seed library(hal9001) #> Loading required package: Rcpp -#> hal9001 v0.2.5: The Scalable Highly Adaptive Lasso +#> hal9001 v0.2.6: The Scalable Highly Adaptive Lasso set.seed(385971) # simulate data n <- 100 p <- 3 -x <- xmat <- matrix(rnorm(n * p), n, p) +x <- matrix(rnorm(n * p), n, p) y <- x[, 1] * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2) # fit the HAL regression @@ -98,12 +97,12 @@ hal_fit <- fit_hal(X = x, Y = y) #> [1] "I'm sorry, Dave. I'm afraid I can't do that." 
hal_fit$times #> user.self sys.self elapsed user.child sys.child -#> enumerate_basis 0.001 0.000 0.002 0 0 -#> design_matrix 0.002 0.000 0.001 0 0 -#> remove_duplicates 0.005 0.000 0.005 0 0 +#> enumerate_basis 0.001 0.001 0.001 0 0 +#> design_matrix 0.001 0.000 0.002 0 0 +#> remove_duplicates 0.005 0.000 0.004 0 0 #> reduce_basis 0.000 0.000 0.000 0 0 -#> lasso 0.275 0.007 0.284 0 0 -#> total 0.283 0.007 0.292 0 0 +#> lasso 0.257 0.004 0.261 0 0 +#> total 0.264 0.005 0.268 0 0 # training sample prediction preds <- predict(hal_fit, new_data = x) @@ -127,13 +126,13 @@ prior to submitting a pull request. After using the `hal9001` R package, please cite the following: ``` - @manual{coyle2019hal9001, + @manual{coyle2020hal9001, author = {Coyle, Jeremy R and Hejazi, Nima S and {van der Laan}, Mark J}, title = {{hal9001}: The scalable highly adaptive lasso}, - year = {2019}, + year = {2020}, howpublished = {\url{https://github.com/tlverse/hal9001}}, - note = {{R} package version 0.2.5}, + note = {{R} package version 0.2.6}, url = {https://doi.org/10.5281/zenodo.3558313}, doi = {10.5281/zenodo.3558313} } @@ -164,6 +163,22 @@ and Advanced Analytics (DSAA)*. IEEE. +
    + +Bibaut, Aurélien F, and Mark J van der Laan. 2019. “Fast Rates for +Empirical Risk Minimization over Càdlàg Functions with Bounded Sectional +Variation Norm.” *arXiv Preprint arXiv:1907.09244*. + +
    + +
    + +Ertefaie, Ashkan, Nima S Hejazi, and Mark J van der Laan. 2020. +“Nonparametric Inverse Probability Weighted Estimators Based on the +Highly Adaptive Lasso.” . + +
    +
    van der Laan, Mark J. 2017a. “A Generally Efficient Targeted Minimum @@ -180,4 +195,20 @@ E-Prints*.
    +
    + +van der Laan, Mark J, David Benkeser, and Weixin Cai. 2019. “Efficient +Estimation of Pathwise Differentiable Target Parameters with the +Undersmoothed Highly Adaptive Lasso.” *arXiv Preprint arXiv:1908.05607*. + +
    + +
    + +van der Laan, Mark J, and Aurélien F Bibaut. 2017. “Uniform Consistency +of the Highly Adaptive Lasso Estimator of Infinite-Dimensional +Parameters.” *arXiv Preprint arXiv:1709.06256*. + +
    + diff --git a/docs/404.html b/docs/404.html index 58baf3f5..51361783 100644 --- a/docs/404.html +++ b/docs/404.html @@ -143,7 +143,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index ab078250..d59a6652 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -136,19 +136,19 @@

    Issues

    When filing an issue, the most important thing is to include a minimal reproducible example so that we can quickly verify the problem, and then figure out how to fix it. There are three things you need to include to make your example reproducible: required packages, data, code.

      -
    1. Packages should be loaded at the top of the script, so it’s easy to see which ones the example needs.

    2. +
    3. Packages should be loaded at the top of the script, so it’s easy to see which ones the example needs.

    4. The easiest way to include data is to use dput() to generate the R code to recreate it.

    5. Spend a little bit of time ensuring that your code is easy for others to read:

        -
      • make sure you’ve used spaces and your variable names are concise, but informative

      • +
      • make sure you’ve used spaces and your variable names are concise, but informative

      • use comments to indicate where your problem lies

      • do your best to remove everything that is not related to the problem. The shorter your code is, the easier it is to understand.

    You can check you have actually made a reproducible example by starting up a fresh R session and pasting your script in.

    -

    (Unless you’ve been specifically asked for it, please don’t include the output of sessionInfo().)

    +

    (Unless you’ve been specifically asked for it, please don’t include the output of sessionInfo().)

    @@ -158,25 +158,25 @@

  • Create a branch in git and make your changes.
  • Push branch to GitHub and issue pull request (PR).
  • Discuss the pull request.
  • -
  • Iterate until either we accept the PR or decide that it’s not a good fit for hal9001.
  • +
  • Iterate until either we accept the PR or decide that it’s not a good fit for hal9001.
  • Each of these steps is described in more detail below. This might feel overwhelming the first time you get set up, but it gets easier with practice.

    -

    If you’re not familiar with git or GitHub, please start by reading http://r-pkgs.had.co.nz/git.html

    +

    If you’re not familiar with git or GitHub, please start by reading http://r-pkgs.had.co.nz/git.html

    Pull requests will be evaluated against a checklist:

    1. Motivation. Your pull request should clearly and concisely motivate the need for change. Please describe the problem your PR addresses and show how your pull request solves it as concisely as possible.
    -

    Also include this motivation in NEWS so that when a new release of hal9001 comes out it’s easy for users to see what’s changed. Add your item at the top of the file and use markdown for formatting. The news item should end with (@yourGithubUsername, #the_issue_number).

    +

    Also include this motivation in NEWS so that when a new release of hal9001 comes out it’s easy for users to see what’s changed. Add your item at the top of the file and use markdown for formatting. The news item should end with (@yourGithubUsername, #the_issue_number).

    1. -

      Only related changes. Before you submit your pull request, please check to make sure that you haven’t accidentally included any unrelated changes. These make it harder to see exactly what’s changed, and to evaluate any unexpected side effects.

      -

      Each PR corresponds to a git branch, so if you expect to submit multiple changes make sure to create multiple branches. If you have multiple changes that depend on each other, start with the first one and don’t submit any others until the first one has been processed.

      +

      Only related changes. Before you submit your pull request, please check to make sure that you haven’t accidentally included any unrelated changes. These make it harder to see exactly what’s changed, and to evaluate any unexpected side effects.

      +

      Each PR corresponds to a git branch, so if you expect to submit multiple changes make sure to create multiple branches. If you have multiple changes that depend on each other, start with the first one and don’t submit any others until the first one has been processed.

    2. -
    3. Use hal9001 coding style. To do so, please follow the official tidyverse style guide. Maintaining a consistent style across the whole code base makes it much easier to jump into the code. If you’re modifying existing hal9001 code that doesn’t follow the style guide, a separate pull request to fix the style would be greatly appreciated.

    4. -
    5. If you’re adding new parameters or a new function, you’ll also need to document them with roxygen2. Make sure to re-run devtools::document() on the code before submitting.

    6. +
    7. Use hal9001 coding style. To do so, please follow the official tidyverse style guide. Maintaining a consistent style across the whole code base makes it much easier to jump into the code. If you’re modifying existing hal9001 code that doesn’t follow the style guide, a separate pull request to fix the style would be greatly appreciated.

    8. +
    9. If you’re adding new parameters or a new function, you’ll also need to document them with roxygen2. Make sure to re-run devtools::document() on the code before submitting.

    -

    This seems like a lot of work but don’t worry if your pull request isn’t perfect. It’s a learning process. A pull request is a process, and unless you’ve submitted a few in the past it’s unlikely that your pull request will be accepted as is. Please don’t submit pull requests that change existing behaviour. Instead, think about how you can add a new feature in a minimally invasive way.

    +

    This seems like a lot of work but don’t worry if your pull request isn’t perfect. It’s a learning process. A pull request is a process, and unless you’ve submitted a few in the past it’s unlikely that your pull request will be accepted as is. Please don’t submit pull requests that change existing behaviour. Instead, think about how you can add a new feature in a minimally invasive way.

    @@ -198,7 +198,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 9ba274f8..4549c471 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -818,7 +818,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/articles/index.html b/docs/articles/index.html index 86aa8368..e1d56adb 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -142,7 +142,7 @@

    All vignettes

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/articles/intro_hal9001.html b/docs/articles/intro_hal9001.html index 914afb80..8d5cfb12 100644 --- a/docs/articles/intro_hal9001.html +++ b/docs/articles/intro_hal9001.html @@ -82,7 +82,7 @@

    Nima Hejazi and Jeremy Coyle

    -

    2020-04-06

    +

    2020-06-23

    Source: vignettes/intro_hal9001.Rmd @@ -152,12 +152,12 @@

    ## [1] "Without your space helmet, Dave. You're going to find that rather difficult."
    hal_fit$times
    ##                   user.self sys.self elapsed user.child sys.child
    -## enumerate_basis       0.005    0.000   0.005          0         0
    -## design_matrix         0.021    0.000   0.022          0         0
    -## remove_duplicates     0.014    0.000   0.014          0         0
    +## enumerate_basis       0.004    0.000   0.004          0         0
    +## design_matrix         0.017    0.001   0.017          0         0
    +## remove_duplicates     0.009    0.000   0.009          0         0
     ## reduce_basis          0.000    0.000   0.000          0         0
    -## lasso                 0.783    0.005   0.788          0         0
    -## total                 0.823    0.005   0.829          0         0
    +## lasso 0.773 0.007 0.781 0 0 +## total 0.803 0.008 0.811 0 0
    hal_fit
    ## $call
     ## fit_hal(X = x, Y = y, fit_type = "glmnet")
    @@ -16416,12 +16416,12 @@ 

    ## ## $times ## user.self sys.self elapsed user.child sys.child -## enumerate_basis 0.005 0.000 0.005 0 0 -## design_matrix 0.021 0.000 0.022 0 0 -## remove_duplicates 0.014 0.000 0.014 0 0 +## enumerate_basis 0.004 0.000 0.004 0 0 +## design_matrix 0.017 0.001 0.017 0 0 +## remove_duplicates 0.009 0.000 0.009 0 0 ## reduce_basis 0.000 0.000 0.000 0 0 -## lasso 0.783 0.005 0.788 0 0 -## total 0.823 0.005 0.829 0 0 +## lasso 0.773 0.007 0.781 0 0 +## total 0.803 0.008 0.811 0 0 ## ## $lambda_star ## [1] 0.01537728 @@ -16565,12 +16565,12 @@

    ## 'lassi' is experimental: fit_type='glmnet' is recommended in nearly all cases.
    hal_fit_reduced$times
    ##                   user.self sys.self elapsed user.child sys.child
    -## enumerate_basis       0.005        0   0.006          0         0
    -## design_matrix         0.025        0   0.025          0         0
    -## remove_duplicates     0.010        0   0.010          0         0
    -## reduce_basis          0.007        0   0.007          0         0
    -## lasso                 2.207        0   2.223          0         0
    -## total                 2.247        0   2.264          0         0
    +## enumerate_basis 0.004 0 0.004 0 0 +## design_matrix 0.015 0 0.016 0 0 +## remove_duplicates 0.006 0 0.006 0 0 +## reduce_basis 0.005 0 0.004 0 0 +## lasso 1.967 0 1.967 0 0 +## total 1.992 0 1.993 0 0

    In the above, all basis functions with fewer than 7.0710678% of observations meeting the criterion imposed are automatically removed prior to the Lasso step of fitting the HAL regression. The results appear below

    hal_fit_reduced
    ## $call
    @@ -32663,12 +32663,12 @@ 

    ## ## $times ## user.self sys.self elapsed user.child sys.child -## enumerate_basis 0.005 0 0.006 0 0 -## design_matrix 0.025 0 0.025 0 0 -## remove_duplicates 0.010 0 0.010 0 0 -## reduce_basis 0.007 0 0.007 0 0 -## lasso 2.207 0 2.223 0 0 -## total 2.247 0 2.264 0 0 +## enumerate_basis 0.004 0 0.004 0 0 +## design_matrix 0.015 0 0.016 0 0 +## remove_duplicates 0.006 0 0.006 0 0 +## reduce_basis 0.005 0 0.004 0 0 +## lasso 1.967 0 1.967 0 0 +## total 1.992 0 1.993 0 0 ## ## $lambda_star ## [1] 0.01073374 @@ -33860,7 +33860,7 @@

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/authors.html b/docs/authors.html index dc3614f7..94f28a63 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -180,7 +180,7 @@

    Authors

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/index.html b/docs/index.html index 36b3372e..33ceb348 100644 --- a/docs/index.html +++ b/docs/index.html @@ -95,9 +95,8 @@

    -What’s hal9001?

    -

    hal9001 is an R package providing an implementation of the scalable highly adaptive lasso (HAL), a nonparametric regression estimator that applies L1-regularized regression (i.e., the lasso) to a design matrix composed of indicator functions corresponding to a set of covariates and interactions thereof. Recent theoretical results show that HAL is endowed with several important optimality properties, making it well-suited for the estimation of highly complex functional forms while attaining fast convergence rates ($n^{-1/4}$ and better) when used in the estimation of nuisance functions. HAL has been quite successfully used in the construction of estimators at the intersection of semiparametric theory and nonparametric causal inference (e.g., the construction of efficient one-step or targeted minimum loss estimators).

    -

    For detailed discussions of the highly adaptive lasso estimator, consider consulting Benkeser and van der Laan (2016), van der Laan (2017a), and van der Laan (2017b), among other recent works.

    +What’s hal9001?

    +

    hal9001 is an R package providing an implementation of the scalable highly adaptive lasso (HAL), a nonparametric regression estimator that applies L1-regularized lasso regression to a design matrix composed of indicator functions corresponding to the support of the functional over a set of covariates and interactions thereof. HAL regression allows for arbitrarily complex functional forms to be estimated at fast (near-parametric) convergence rates under only global smoothness assumptions (van der Laan 2017a; Bibaut and van der Laan 2019). For detailed theoretical discussions of the highly adaptive lasso estimator, consider consulting, for example, van der Laan (2017a), van der Laan (2017b), and van der Laan and Bibaut (2017). For a computational demonstration of the versatility of HAL regression, see Benkeser and van der Laan (2016). Recent theoretical works have demonstrated success in building efficient estimators of complex parameters when particular variations of HAL regression are used to estimate nuisance parameters (e.g., van der Laan, Benkeser, and Cai 2019; Ertefaie, Hejazi, and van der Laan 2020).


    @@ -118,17 +117,17 @@

    Example

    -

    This minimal example shows how to use hal9001 to obtain predictions based on the Highly Adaptive Lasso. For details on the properties of the estimator, the interested reader is referred to Benkeser and van der Laan (2016) and van der Laan (2017a).

    +

    Consider the following minimal example of using hal9001 to generate predictions via Highly Adaptive Lasso regression:

    # load the package and set a seed
     library(hal9001)
     #> Loading required package: Rcpp
    -#> hal9001 v0.2.5: The Scalable Highly Adaptive Lasso
    +#> hal9001 v0.2.6: The Scalable Highly Adaptive Lasso
     set.seed(385971)
     
     # simulate data
     n <- 100
     p <- 3
    -x <- xmat <- matrix(rnorm(n * p), n, p)
    +x <- matrix(rnorm(n * p), n, p)
     y <- x[, 1] * sin(x[, 2]) + rnorm(n, mean = 0, sd = 0.2)
     
     # fit the HAL regression
    @@ -136,12 +135,12 @@ 

    #> [1] "I'm sorry, Dave. I'm afraid I can't do that." hal_fit$times #> user.self sys.self elapsed user.child sys.child -#> enumerate_basis 0.001 0.000 0.002 0 0 -#> design_matrix 0.002 0.000 0.001 0 0 -#> remove_duplicates 0.005 0.000 0.005 0 0 +#> enumerate_basis 0.001 0.001 0.001 0 0 +#> design_matrix 0.001 0.000 0.002 0 0 +#> remove_duplicates 0.005 0.000 0.004 0 0 #> reduce_basis 0.000 0.000 0.000 0 0 -#> lasso 0.275 0.007 0.284 0 0 -#> total 0.283 0.007 0.292 0 0 +#> lasso 0.257 0.004 0.261 0 0 +#> total 0.264 0.005 0.268 0 0 # training sample prediction preds <- predict(hal_fit, new_data = x) @@ -159,13 +158,13 @@

    Citation

    After using the hal9001 R package, please cite the following:

    -
        @manual{coyle2019hal9001,
    +
        @manual{coyle2020hal9001,
           author = {Coyle, Jeremy R and Hejazi, Nima S and {van der Laan}, Mark
             J},
           title = {{hal9001}: The scalable highly adaptive lasso},
    -      year  = {2019},
    +      year  = {2020},
           howpublished = {\url{https://github.com/tlverse/hal9001}},
    -      note = {{R} package version 0.2.5},
    +      note = {{R} package version 0.2.6},
           url = {https://doi.org/10.5281/zenodo.3558313},
           doi = {10.5281/zenodo.3558313}
         }
    @@ -174,7 +173,7 @@

    License

    -

    © 2017-2020 Jeremy R. Coyle & Nima S. Hejazi

    +

    © 2017-2020 Jeremy R. Coyle & Nima S. Hejazi

    The contents of this repository are distributed under the GPL-3 license. See file LICENSE for details.


    @@ -183,13 +182,25 @@

    References

    -

    Benkeser, David, and Mark J van der Laan. 2016. “The Highly Adaptive Lasso Estimator.” In 2016 IEEE International Conference on Data Science and Advanced Analytics (DSAA). IEEE. https://doi.org/10.1109/dsaa.2016.93.

    +

    Benkeser, David, and Mark J van der Laan. 2016. “The Highly Adaptive Lasso Estimator.” In 2016 IEEE International Conference on Data Science and Advanced Analytics (DSAA). IEEE. https://doi.org/10.1109/dsaa.2016.93.

    +
    +
    +

    Bibaut, Aurélien F, and Mark J van der Laan. 2019. “Fast Rates for Empirical Risk Minimization over Càdlàg Functions with Bounded Sectional Variation Norm.” arXiv Preprint arXiv:1907.09244.

    +
    +
    +

    Ertefaie, Ashkan, Nima S Hejazi, and Mark J van der Laan. 2020. “Nonparametric Inverse Probability Weighted Estimators Based on the Highly Adaptive Lasso.” http://arxiv.org/abs/2005.11303.

    -

    van der Laan, Mark J. 2017a. “A Generally Efficient Targeted Minimum Loss Based Estimator Based on the Highly Adaptive Lasso.” The International Journal of Biostatistics. De Gruyter. https://doi.org/10.1515/ijb-2015-0097.

    +

    van der Laan, Mark J. 2017a. “A Generally Efficient Targeted Minimum Loss Based Estimator Based on the Highly Adaptive Lasso.” The International Journal of Biostatistics. De Gruyter. https://doi.org/10.1515/ijb-2015-0097.

    -

    ———. 2017b. “Finite Sample Inference for Targeted Learning.” ArXiv E-Prints.

    +

    ———. 2017b. “Finite Sample Inference for Targeted Learning.” ArXiv E-Prints.

    +
    +
    +

    van der Laan, Mark J, David Benkeser, and Weixin Cai. 2019. “Efficient Estimation of Pathwise Differentiable Target Parameters with the Undersmoothed Highly Adaptive Lasso.” arXiv Preprint arXiv:1908.05607.

    +
    +
    +

    van der Laan, Mark J, and Aurélien F Bibaut. 2017. “Uniform Consistency of the Highly Adaptive Lasso Estimator of Infinite-Dimensional Parameters.” arXiv Preprint arXiv:1709.06256.

    @@ -244,7 +255,7 @@

    Dev status

  • Coverage Status
  • CRAN
  • CRAN downloads
  • -
  • Project Status: Active – The project has reached a stable, usable state and is being actively developed.
  • +
  • Project Status: Active – The project has reached a stable, usable state and is being actively developed.
  • License: GPL v3
  • DOI
  • @@ -258,7 +269,7 @@

    Dev status

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/news/index.html b/docs/news/index.html index fe805c34..a00f95db 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -149,7 +149,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 22598671..24ec6222 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,9 +1,9 @@ pandoc: 2.2.1 -pkgdown: 1.5.0 +pkgdown: 1.5.1 pkgdown_sha: ~ articles: intro_hal9001: intro_hal9001.html -last_built: 2020-04-06T23:07Z +last_built: 2020-06-24T01:40Z urls: reference: https://tlverse.org/hal9001/reference article: https://tlverse.org/hal9001/articles diff --git a/docs/reference/SL.hal9001.html b/docs/reference/SL.hal9001.html index 46742bdf..b1396f2e 100644 --- a/docs/reference/SL.hal9001.html +++ b/docs/reference/SL.hal9001.html @@ -223,7 +223,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/apply_copy_map.html b/docs/reference/apply_copy_map.html index fbf070e0..bdb621e0 100644 --- a/docs/reference/apply_copy_map.html +++ b/docs/reference/apply_copy_map.html @@ -188,7 +188,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/as_dgCMatrix.html b/docs/reference/as_dgCMatrix.html index d0948c4a..386145cf 100644 --- a/docs/reference/as_dgCMatrix.html +++ b/docs/reference/as_dgCMatrix.html @@ -165,7 +165,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/basis_list_cols.html b/docs/reference/basis_list_cols.html index f9a98adc..19d4ea7c 100644 --- a/docs/reference/basis_list_cols.html +++ b/docs/reference/basis_list_cols.html @@ -169,7 +169,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/basis_of_degree.html b/docs/reference/basis_of_degree.html index 62707c3f..cf66b80a 100644 --- a/docs/reference/basis_of_degree.html +++ b/docs/reference/basis_of_degree.html @@ -168,7 +168,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/cv_lasso.html b/docs/reference/cv_lasso.html index 2882e7bc..3a1c67d3 100644 --- a/docs/reference/cv_lasso.html +++ b/docs/reference/cv_lasso.html @@ -181,7 +181,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/cv_lasso_early_stopping.html b/docs/reference/cv_lasso_early_stopping.html index 996e9004..d3261ec1 100644 --- a/docs/reference/cv_lasso_early_stopping.html +++ b/docs/reference/cv_lasso_early_stopping.html @@ -176,7 +176,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/enumerate_basis.html b/docs/reference/enumerate_basis.html index 8369441b..73b67991 100644 --- a/docs/reference/enumerate_basis.html +++ b/docs/reference/enumerate_basis.html @@ -189,7 +189,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/evaluate_basis.html b/docs/reference/evaluate_basis.html index 67d6ac93..951d6275 100644 --- a/docs/reference/evaluate_basis.html +++ b/docs/reference/evaluate_basis.html @@ -169,7 +169,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/fit_hal.html b/docs/reference/fit_hal.html index bd4e2e09..70b99fe7 100644 --- a/docs/reference/fit_hal.html +++ b/docs/reference/fit_hal.html @@ -148,8 +148,6 @@

    HAL: The Highly Adaptive Lasso

    id = NULL, offset = NULL, cv_select = TRUE, - screen_basis = FALSE, - screen_lambda = FALSE, ..., yolo = TRUE ) @@ -263,16 +261,6 @@

    Arg pick the optimal value (based on cross-validation) (when set to TRUE) or to simply fit along the sequence of values (or single value) using glmnet (when set to FALSE).

    -

    - - - - - - - @@ -326,7 +314,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/hal9000.html b/docs/reference/hal9000.html index 13925ad2..eba37244 100644 --- a/docs/reference/hal9000.html +++ b/docs/reference/hal9000.html @@ -149,7 +149,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/hal9001.html b/docs/reference/hal9001.html index ffc558ad..dc481d27 100644 --- a/docs/reference/hal9001.html +++ b/docs/reference/hal9001.html @@ -148,7 +148,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/hal_quotes.html b/docs/reference/hal_quotes.html index 7b38a97b..a6ecd2c8 100644 --- a/docs/reference/hal_quotes.html +++ b/docs/reference/hal_quotes.html @@ -154,7 +154,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index d8f1f313..cc104c37 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -300,7 +300,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/index_first_copy.html b/docs/reference/index_first_copy.html index 4e1fdb0d..74a6e8c4 100644 --- a/docs/reference/index_first_copy.html +++ b/docs/reference/index_first_copy.html @@ -159,7 +159,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/lassi.html b/docs/reference/lassi.html index aec63281..c11431c0 100644 --- a/docs/reference/lassi.html +++ b/docs/reference/lassi.html @@ -185,7 +185,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/lassi_fit_module.html b/docs/reference/lassi_fit_module.html index db09d179..b21f83c9 100644 --- a/docs/reference/lassi_fit_module.html +++ b/docs/reference/lassi_fit_module.html @@ -148,7 +148,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/lassi_origami.html b/docs/reference/lassi_origami.html index 16447845..b00bbf97 100644 --- a/docs/reference/lassi_origami.html +++ b/docs/reference/lassi_origami.html @@ -180,7 +180,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/make_basis_list.html b/docs/reference/make_basis_list.html index 5e6054e8..57fccaa3 100644 --- a/docs/reference/make_basis_list.html +++ b/docs/reference/make_basis_list.html @@ -167,7 +167,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/make_copy_map.html b/docs/reference/make_copy_map.html index e298fee3..d21213c7 100644 --- a/docs/reference/make_copy_map.html +++ b/docs/reference/make_copy_map.html @@ -183,7 +183,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/make_design_matrix.html b/docs/reference/make_design_matrix.html index 4f3b817e..e05c5fcd 100644 --- a/docs/reference/make_design_matrix.html +++ b/docs/reference/make_design_matrix.html @@ -187,7 +187,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/make_reduced_basis_map.html b/docs/reference/make_reduced_basis_map.html index 8afd594a..e14718dd 100644 --- a/docs/reference/make_reduced_basis_map.html +++ b/docs/reference/make_reduced_basis_map.html @@ -177,7 +177,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/meets_basis.html b/docs/reference/meets_basis.html index 71cbc82b..b230e46b 100644 --- a/docs/reference/meets_basis.html +++ b/docs/reference/meets_basis.html @@ -171,7 +171,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/predict.SL.hal9001.html b/docs/reference/predict.SL.hal9001.html index 7544c240..9ede4271 100644 --- a/docs/reference/predict.SL.hal9001.html +++ b/docs/reference/predict.SL.hal9001.html @@ -170,7 +170,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/predict.hal9001.html b/docs/reference/predict.hal9001.html index abdaafe0..5fa0575a 100644 --- a/docs/reference/predict.hal9001.html +++ b/docs/reference/predict.hal9001.html @@ -198,7 +198,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/predict.lassi.html b/docs/reference/predict.lassi.html index 43e97b22..28d59338 100644 --- a/docs/reference/predict.lassi.html +++ b/docs/reference/predict.lassi.html @@ -166,7 +166,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/squash_hal_fit.html b/docs/reference/squash_hal_fit.html index 4348290c..7a2e41d7 100644 --- a/docs/reference/squash_hal_fit.html +++ b/docs/reference/squash_hal_fit.html @@ -175,7 +175,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 73394752..4a4d0de8 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -78,9 +78,6 @@ https://tlverse.org/hal9001/reference/predict.lassi.html - - https://tlverse.org/hal9001/reference/screening.html - https://tlverse.org/hal9001/reference/squash_hal_fit.html diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 722c1dd8..675c7a9f 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -30,3 +30,41 @@ @article{vdl2017finite keywords = {Mathematics - Statistics Theory}, year = {2017} } + +@article{bibaut2019fast, + author = {Bibaut, Aur{\'e}lien F and {van der Laan}, Mark J}, + journal = {arXiv preprint arXiv:1907.09244}, + title = {Fast rates for empirical risk minimization over + c\`{a}dl\`{a}g functions with bounded sectional variation norm}, + year = {2019} +} + +@article{vdl2019efficient, + Author = {{van der Laan}, Mark J and Benkeser, David and Cai, Weixin}, + Journal = {arXiv preprint arXiv:1908.05607}, + Title = {Efficient estimation of pathwise differentiable target parameters + with the undersmoothed highly adaptive lasso}, + Year = {2019} +} + +@article{vdl2017uniform, + Author = {{van der Laan}, Mark J and Bibaut, Aur{\'e}lien F}, + Journal = {arXiv preprint arXiv:1709.06256}, + Title = {Uniform Consistency of the Highly Adaptive Lasso Estimator of + Infinite-Dimensional Parameters}, + Year = {2017} +} + +@article{ertefaie2020nonparametric, + doi = {}, + url = {http://arxiv.org/abs/2005.11303}, + year = {2020}, + publisher = {}, + journal = {}, + volume = {}, + number = {}, + pages = {}, + author = {Ertefaie, Ashkan and Hejazi, Nima S and {van der Laan}, Mark J}, + title = {Nonparametric inverse probability weighted estimators based on the + highly adaptive lasso} +} diff --git a/vignettes/intro_hal9001.Rmd b/vignettes/intro_hal9001.Rmd index 4da3686d..9b3b6391 100644 --- a/vignettes/intro_hal9001.Rmd +++ b/vignettes/intro_hal9001.Rmd @@ -6,7 +6,7 @@ date: "`r Sys.Date()`" 
output: rmarkdown::html_vignette bibliography: ../inst/REFERENCES.bib vignette: > - %\VignetteIndexEntry{Introduction to the HAL estimator} + %\VignetteIndexEntry{Fitting the Highly Adaptive Lasso with hal9001} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---
    object

    An object of class hal9001, containing the results of -fitting the Highly Adaptive Lasso, as produced by a call to fit_hal.

    offset

    A vector of offsets. Must be provided if an offset was provided at training.

    lambda

    A single lambda value or a vector of lambdas to use for -prediction. If NULL, a value of lambda will be selected based on -cross-validation, using cv.glmnet.

    ...

    Additional arguments passed to predict as necessary.

    screen_basis

    If TRUE, use a screening procedure to reduce the -number of basis functions fitted.

    screen_lambda

    If TRUE, use a screening procedure to reduce the -number of lambda values evaluated.

    ...