Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rm impute missings #426

Merged
merged 4 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ Imports:
ggplot2,
digest,
Rdpack,
imputeMissings,
dplyr,
caret,
ROCR
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ importFrom(ggplot2,geom_point)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_x_discrete)
importFrom(imputeMissings,impute)
importFrom(methods,is)
importFrom(origami,combiner_c)
importFrom(origami,cross_validate)
Expand Down
74 changes: 56 additions & 18 deletions R/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,35 +51,34 @@
#' when creating the \code{\link{sl3_Task}} by setting
#' \code{drop_missing_outcome = TRUE}.
#'
#' @importFrom imputeMissings impute
#'
#' @return A list of processed data, nodes and column names
#'
#' @export
process_data <- function(data, nodes, column_names, flag = TRUE,
drop_missing_outcome = FALSE) {

# force a copy so we can mutate data in place w/o distrupting a user's data
if (inherits(data, "data.table")) {
data <- data.table::copy(data)
} else {
data <- as.data.table(data)
}

all_nodes <- unlist(nodes)


if (length(all_nodes) == 0) {
return(list(data = data, nodes = nodes, column_names = column_names))
}
node_columns <- unlist(column_names[all_nodes])
covariates_columns <- unlist(column_names[nodes$covariates])
outcome_columns <- unlist(column_names[nodes$outcome])

factorized <- FALSE
dropped <- FALSE
imputed <- FALSE

# process characters
is_character <- which(data[, sapply(.SD, is.character), .SDcols = node_columns])
char_cols <- node_columns[is_character]
Expand All @@ -89,23 +88,23 @@ process_data <- function(data, nodes, column_names, flag = TRUE,
"Character variables found: %s;\nConverting these to factors",
paste0(char_vars, collapse = ", ")
))

# convert data
for (char_col in char_cols) {
set(data, , char_col, as.factor(unlist(data[, char_col, with = FALSE])))
}
factorized <- TRUE
}

# process missing
has_missing <- data[, sapply(.SD, function(x) any(is.na(x))), .SDcols = node_columns]
miss_cols <- node_columns[has_missing]
miss_vars <- all_nodes[has_missing]

missing_Y <- any(nodes$outcome %in% miss_vars)
missing_covar_cols <- intersect(miss_cols, covariates_columns)
missing_covar_vars <- intersect(miss_vars, nodes$covariates)

if (length(miss_cols) > 0) {
if (missing_Y && drop_missing_outcome) {
if (flag) {
Expand All @@ -114,36 +113,75 @@ process_data <- function(data, nodes, column_names, flag = TRUE,
keep_rows <- stats::complete.cases(data[, outcome_columns, with = FALSE])
data <- data[keep_rows, ]
}

if (length(missing_covar_cols) > 0) {
warning(sprintf(
"Imputing missing values and adding missingness indicators for the following covariates with missing values: %s. See documentation of the process_data function for details.",
paste0(missing_covar_cols, collapse = ", ")
))
# make indicators and add to data
missing_indicators <- data[, lapply(.SD, function(x) as.numeric(!is.na(x))),
.SDcols = missing_covar_cols
.SDcols = missing_covar_cols
]

missing_indicator_cols <- sprintf("delta_%s", missing_covar_cols)
missing_indicator_vars <- sprintf("delta_%s", missing_covar_vars)
setnames(missing_indicators, missing_indicator_cols)
set(data, , missing_indicator_cols, missing_indicators)

# add inidicators to column map and covariate list
column_names[missing_indicator_vars] <- missing_indicator_cols
nodes$covariates <- c(nodes$covariates, missing_indicator_vars)
}
# impute covariates
imputed <- impute(data[, missing_covar_cols, with = FALSE], flag = FALSE)

data_missing_covars <- data[, missing_covar_cols, with = FALSE]
imputed <- impute(data.frame(data_missing_covars))

# update data
set(data, , missing_covar_cols, imputed)
}

na_Y <- (!is.null(nodes$outcome) && any(is.na(data[, outcome_columns, with = F])))
if (na_Y && flag) {
warning("Missing outcome data detected. This is okay for prediction, but will likely break training. \n You can drop observations with missing outcomes by setting drop_missing_outcome=TRUE in make_sl3_Task.")
}
list(data = data, nodes = nodes, column_names = column_names)
}

#' Impute missing values with the median/mode
#' based on imputeMissings R package (removed from CRAN)
#'
#' Character vectors and factors are imputed with the mode.
#' Numeric and integer vectors are imputed with median.
#'
#' @param data A data frame with dummies or numeric variables.
#'
#' @keywords internal
impute <- function(data){
compute <- function (data){
Mode <- function(x) {
xtab <- table(x)
xmode <- names(which(xtab == max(xtab)))
return(xmode[1])
}
values <- sapply(data, function(x) {
if (class(x) %in% c("character", "factor"))
Mode(x)
else if (class(x) %in% c("numeric", "integer"))
median(x, na.rm = TRUE)
}, simplify = FALSE)
values
}
object <- compute(data)
if (!identical(colnames(data), names(object))){
stop('Variable names and variable positions need to be identical in compute and impute')
}
data <- data.frame(sapply(1:ncol(data), function(i) {
fact <- is.factor(data[,i])
if (fact) data[,i] <- as.character(data[,i])
data[is.na(data[,i]),i] <- object[[i]]
if (fact) data[,i] <- as.factor(data[,i])
return(data[,i,drop=FALSE])
}, simplify = FALSE))
data
}
1 change: 1 addition & 0 deletions man/Lrnr_density_hse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/Lrnr_grf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/Lrnr_h2o_glm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_multivariate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_augment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_coefs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_correlation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_subset_covariates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/impute.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading