Merge pull request #427 from tlverse/devel

Merge devel into master
tlverse · Apr 29, 2024 · b794bb1 · b794bb1
2 parents 6544257 + fdfe83f
commit b794bb1
Show file tree

Hide file tree

Showing 359 changed files with 9,497 additions and 44,463 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -30,3 +30,4 @@ deploy.sh
 ^LICENSE$
 man-roxygen
 ^_pkgdown\.yml$
+^pkgdown$
diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml
@@ -29,18 +29,18 @@ jobs:
 
     steps:
       - name: Checkout repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: Setup R
-        uses: r-lib/actions/setup-r@master
+        uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
 
       - name: Install pandoc
-        uses: r-lib/actions/setup-pandoc@v1
+        uses: r-lib/actions/setup-pandoc@v2
 
       - name: Install tinyTeX
-        uses: r-lib/actions/setup-tinytex@v1
+        uses: r-lib/actions/setup-tinytex@v2
 
       - name: Install system dependencies
         if: runner.os == 'Linux'
@@ -50,8 +50,7 @@ jobs:
 
       - name: Install package dependencies
         run: |
-          install.packages(c("remotes", "rcmdcheck", "covr", "sessioninfo"))
-          if(Sys.info()["sysname"] == "Windows") install.packages("igraph", type = "binary")
+          install.packages(c("remotes", "devtools", "rcmdcheck", "covr", "sessioninfo"))
           remotes::install_deps(dependencies = TRUE)
         shell: Rscript {0}
 

diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@ README.html
 .DS_Store
 doc
 Meta
+docs
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: sl3
 Title: Pipelines for Machine Learning and Super Learning
-Version: 1.4.4
+Version: 1.4.5
 Authors@R: c(
     person("Jeremy", "Coyle", email = "jeremyrcoyle@gmail.com",
            role = c("aut", "cre", "cph"),
@@ -27,9 +27,9 @@ Authors@R: c(
   )
 Maintainer: Jeremy Coyle <jeremyrcoyle@gmail.com>
 Description: A modern implementation of the Super Learner prediction algorithm,
-    coupled with a general-purpose framework for composing arbitrary pipelines
+    coupled with a general purpose framework for composing arbitrary pipelines
     for machine learning tasks.
-Depends: R (>= 3.1.0)
+Depends: R (>= 3.6.0)
 Imports:
     data.table,
     assertthat,
@@ -44,7 +44,6 @@ Imports:
     ggplot2,
     digest,
     Rdpack,
-    imputeMissings,
     dplyr,
     caret,
     ROCR
@@ -69,10 +68,9 @@ Suggests:
     glmnet,
     grf,
     gbm,
-    hal9001 (>= 0.4.0),
+    hal9001 (>= 0.4.4),
     h2o,
     keras,
-    kerasR,
     nloptr,
     nnls,
     randomForest,
@@ -87,10 +85,12 @@ Suggests:
     lightgbm,
     dbarts,
     gam (>= 1.15.0),
-    haldensify (>= 0.1.5),
+    haldensify (>= 0.2.3),
     mgcv,
     hts,
-    GA
+    GA,
+    SIS,
+    partykit
 Remotes:
     github::tlverse/origami,
     github::tlverse/hal9001@devel,
@@ -106,5 +106,5 @@ VignetteBuilder:
     knitr,
     R.rsp
 Roxygen: list(markdown = TRUE, old_usage = TRUE, r6 = FALSE)
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
 RdMacros: Rdpack
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,14 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method("[",sl3_Task)
-export(CV_lrnr_sl)
 export(Custom_chain)
 export(Lrnr_HarmonicReg)
 export(Lrnr_arima)
 export(Lrnr_bartMachine)
 export(Lrnr_base)
 export(Lrnr_bayesglm)
-export(Lrnr_bilstm)
 export(Lrnr_bound)
 export(Lrnr_caret)
 export(Lrnr_cv)
@@ -25,8 +23,11 @@ export(Lrnr_gam)
 export(Lrnr_gbm)
 export(Lrnr_glm)
 export(Lrnr_glm_fast)
+export(Lrnr_glm_semiparametric)
 export(Lrnr_glmnet)
+export(Lrnr_glmtree)
 export(Lrnr_grf)
+export(Lrnr_grfcate)
 export(Lrnr_gru_keras)
 export(Lrnr_gts)
 export(Lrnr_h2o_classifier)
@@ -76,6 +77,7 @@ export(Variable_Type)
 export(args_to_list)
 export(custom_ROCR_risk)
 export(customize_chain)
+export(cv_sl)
 export(debug_predict)
 export(debug_train)
 export(debugonce_predict)
@@ -113,6 +115,7 @@ export(pack_predictions)
 export(pooled_hazard_task)
 export(predict_classes)
 export(prediction_plot)
+export(process_data)
 export(risk)
 export(safe_dim)
 export(sl3Options)
@@ -161,19 +164,20 @@ importFrom(ggplot2,geom_point)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,labs)
 importFrom(ggplot2,scale_x_discrete)
-importFrom(imputeMissings,impute)
 importFrom(methods,is)
 importFrom(origami,combiner_c)
 importFrom(origami,cross_validate)
 importFrom(origami,fold_index)
 importFrom(origami,folds2foldvec)
+importFrom(origami,folds_vfold)
 importFrom(origami,id_folds_to_folds)
 importFrom(origami,make_folds)
 importFrom(origami,training)
 importFrom(origami,validation)
 importFrom(stats,aggregate)
 importFrom(stats,arima)
 importFrom(stats,binomial)
+importFrom(stats,coef)
 importFrom(stats,family)
 importFrom(stats,gaussian)
 importFrom(stats,glm)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,96 @@
+# sl3 1.4.5
+* Changed `CV_lrnr_sl` to `cv_sl`
+* Added `Lrnr_glmtree`, which uses the `partykit` R package to fit recursive
+  partitioning and regression trees in a generalized linear model.
+* Added fold-specific SL coefficients to the output of `cv_sl`, and removed
+  the coefficients column from the returned `cv_risk` table.
+* Added `get_sl_revere_risk` argument to `Lrnr_sl`'s `cv_risk` method to 
+  provide the option (with default of `FALSE`) to add a super learner's
+  revere-based risk (not a true cross-validated risk) to `cv_risk` output.
+* Changed default metalearner to `Lrnr_nnls` for binary and continuous outcomes.
+* Added `cv_control` argument to `Lrnr_sl`, which allows users to define 
+  specific cross-validation structures for fitting the super learner. This is
+  intended for use in a nested cross-validation scheme (such as cross-validated 
+  super learner, `cv_sl`, or when `Lrnr_sl` is considered in the list of 
+  candidate `learners` in another `Lrnr_sl`). In addition to constructing 
+  clustered cross-validation with respect to `id`, `cv_control` also 
+  can be used to construct stratified cross-validation folds for `Lrnr_sl`.
+* `Lrnr_caret` now works for binary and categorical outcomes. Previous versions 
+  state that these discrete outcome types are supported by `Lrnr_caret`, but 
+  the functionality would break. 
+* Added public function for `sl3_Task`, `get_folds`, which takes in 
+  `origami::make_folds` arguments and returns the folds. This function is 
+  now called by `task$folds` and it can be called in train as well, to obtain 
+  folds from a task that have a non-default fold structure. 
+* Learners that use CV internally (i.e., as part of their procedure to select
+  tuning parameters), including `Lrnr_caret`, `Lrnr_glmnet`, `Lrnr_hal9001`, 
+  and `Lrnr_sl`, use `task$get_folds` to create folds. The learners' folds 
+  respect the default CV fold structure in `sl3` tasks (clustered CV when `id` 
+  is supplied in the task; and stratified CV when outcomes are categorical or 
+  binary, and when `id` are nested in strata if `id` supplied to task). However, 
+  `V` can be modified according to the learner-specific parameters. (`Lrnr_sl`
+  has a few extra CV tuning arguments, which are thoroughly documented in 
+  `cv_control` and modifications are only recommended for advanced use of 
+  `Lrnr_sl`.)
+* Fixed learner parameter `formula` bug, which was causing formulas with "." to 
+  return an empty task, and therefore learners with these formulas to fail. 
+* Fixed bug in `Lrnr_cv_selector` metalearner, which was using the wrong folds 
+  to calculate the cross-validated risk estimate. This impacted
+  `Lrnr_cv_selector` when `eval_function` was not a loss function, e.g. AUC.
+  By calling `task$folds` on the metalearner's training task, we were deriving 
+  folds from the matrix of cross-validated predictions, and not using the folds 
+  for cross-validating the candidates. We now require the folds for cross-
+  validating the candidates (i.e., the folds in task for training `Lrnr_sl`) to 
+  be supplied when `Lrnr_cv_selector`'s `eval_function` is not a loss function.
+* `Lrnr_caret` and `Lrnr_rpart` factor binary outcomes in their `train` methods,
+  thereby considering a classification prediction problem. To avoid this 
+  behavior and consider a regression prediction problem with a binary outcome 
+  (e.g., to minimize the squared error or negative log likelihood loss in a 
+  binary outcome prediction problem), users can set 
+  `factor_binary_outcome = FALSE` when they instantiate the learner. 
+* Tasks can be created without an outcome. This comes in handy when creating 
+  a task that is used only for prediction, not for training, and leads 
+  to the task's outcome type being set to "none" if it's not supplied. 
+* When the variable type of the outcome (i.e., `outcome_type`) is necessary for 
+  a learner's `predict` method (e.g., if categorical outcome predictions need to 
+  be "packed" together), the outcome type in the **training task** should be 
+  used. That is, `private$.training_outcome_type` should be used to obtain
+  the outcome type in a learner's `predict` method; the task supplied to 
+  `predict` should not be used. The following learners were referring to the
+  task supplied to `predict` in order to retain the outcome type, and they were 
+  modified to use the training task's outcome type instead: `Lrnr_svm`, 
+  `Lrnr_randomForest`, `Lrnr_ranger`, `Lrnr_rpart`, `Lrnr_polspline`. The 
+  issue with pulling the outcome type from the task supplied to `predict` is 
+  that the outcome type of that task might be "none", if the `outcome` argument 
+  is not supplied to it.
+* Updated the learner template (inst/templates/Lrnr_template.R) to reflect the 
+  new formatting guidelines for learner documentation.
+* Updated documentation for `sl3_Task` parameters (man-roxygen/sl3_Task_extra.R). 
+  Specifically, `drop_missing_outcome` and `flag` were added; `offset` 
+  description was fixed; description of `folds` was added, including how to 
+  modify it and the default; and description of how the default cross-validation 
+  structure considers `id` and discrete (binary and categorical) outcome types 
+  to construct clustered and stratified cross-validation schemes, respectively,
+  was added.
+* Added documentation for the function `process_data` (R/process_data.R), which
+  is called when instantiating a task, to process the covariates and identify 
+  missingness in the outcome.
+* Added `Lrnr_grfcate`, a prediction function estimator for conditional average 
+  treatment effect (CATE), which uses the `causal_forest` function in `grf` 
+  package. This learner is intended for use in the `tmle3mopttx` package, where 
+  CATE estimation and prediction is required.
+* Added flexibility and error handling to optional `sl3_Task` argument
+  `outcome_type`. Either `"binomial"`, `"binary"` or `binomial()` can be 
+  supplied for a binary outcome; `"continuous"`,`"gaussian"`, or `gaussian()` 
+  for a continuous outcome; `"categorical"`, `"multinomial"`, or `multinomial()` 
+  for a categorical outcome. As before, when `outcome_type` is not supplied, we 
+  will try to detect it from the outcome values. If the supplied `outcome_type` 
+  differs from the detected one, a warning is now thrown. If `outcome_type` is 
+  supplied but invalid, then an error is thrown upon `sl3_Task` instantiation, 
+  as opposed to during learner training.
+* Cross-validated super learner (`cv_sl`) returns the cross-validated 
+  predictions for the super learner and its candidates. 
+
 # sl3 1.4.4
 * Updates to `Lrnr_nnls` to support binary outcomes, including support for
   convexity of the resultant model fit and warnings on prediction quality.

diff --git a/R/CV_Lrnr_sl.R b/R/CV_Lrnr_sl.R
diff --git a/R/Lrnr_arima.R b/R/Lrnr_arima.R
@@ -87,8 +87,8 @@ Lrnr_arima <- R6Class(
         if (length(rm_idx) > 0) {
           params$xreg <- as.matrix(task$X[, -rm_idx, with = FALSE])
           print(paste(c(
-            "ARIMA requires matrix of external regressors to not be rank ",
-            "deficient. The following covariates were removed to counter the ",
+            "ARIMA requires matrix of external regressors to not be rank",
+            "deficient. The following covariates were removed to counter the",
             "linear combinations:", names(task$X)[rm_idx]
           ), collapse = " "))
         } else {