Skip to content

Commit

Permalink
correct windowpane
Browse files Browse the repository at this point in the history
  • Loading branch information
emdelponte committed Oct 20, 2024
1 parent 45f5860 commit 694c63b
Show file tree
Hide file tree
Showing 9 changed files with 313 additions and 165 deletions.
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ export(theme_r4pde)
export(windowpane)
import(car)
import(dplyr)
import(lubridate)
import(progress)
import(tidyr)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
Expand Down
196 changes: 99 additions & 97 deletions R/windowpane.R
Original file line number Diff line number Diff line change
@@ -1,83 +1,89 @@
#' Windowpane Analysis of Epidemiological Data
#' Window Pane for Epidemiological Analysis
#'
#' This function calculates rolling statistics (e.g., mean, sum) within defined time windows
#' for a specified variable in epidemiological data.
#' This function calculates summary statistics within specified windows around a given end date
#' in a dataset, facilitating epidemiological analysis. It allows backward or forward window
#' calculations based on a user-defined variable and window lengths.
#'
#' @param data A dataframe containing the data.
#' @param end_date_col A character string specifying the column name for the end date.
#' @param date_col A character string specifying the column name for the date.
#' @param variable A character string specifying the name of the variable to analyze.
#' @param study_col A character string specifying the column name for study IDs.
#' @param summary_type A character string specifying the type of summary ("mean", "sum", "above_threshold").
#' @param threshold A numeric value for the threshold (used only if summary_type is "above_threshold").
#' @param window_lengths A numeric vector specifying the window lengths to use (default is c(2, 4, 8, 16)).
#' @param direction A character string specifying the direction of the rolling window ("backward" or "forward").
#' @param data A dataframe containing the input data.
#' @param end_date_col The name of the column representing the end date.
#' @param date_col The name of the column representing the date variable.
#' @param variable The name of the column for which summary statistics are calculated.
#' @param summary_type The type of summary to calculate. Options are "mean", "sum",
#' or "above_threshold".
#' @param threshold Optional numeric value used when summary_type is "above_threshold".
#' @param window_lengths A vector of window lengths (in days) for the calculations.
#' @param direction The direction of the window. Options are "backward" (default) or "forward".
#' @param group_by_cols Optional vector of column names for grouping the data.
#' @param date_format The format of the date columns. Default is "%Y-%m-%d".
#'
#' @return A dataframe with the calculated summary values for each window.
#'
#' @return A dataframe with the results of the windowpane analysis in wide format.
#' @import dplyr tidyr lubridate progress
#' @examples
#' # Simulated data example
#' set.seed(123)
#' n <- 100
#' sim_data <- data.frame(
#' study = sample(1:5, n, replace = TRUE),
#' heading = as.Date('2024-01-01') + sample(0:30, n, replace = TRUE),
#' YYYYMMDD = as.Date('2024-01-01') + sample(0:30, n, replace = TRUE),
#' T2M = runif(n, 15, 30)
#' )
#' result <- windowpane(
#' data = sim_data,
#' end_date_col = "heading",
#' date_col = "YYYYMMDD",
#' variable = "T2M",
#' study_col = "study",
#' summary_type = "mean",
#' window_lengths = c(2, 4, 8, 16),
#' direction = "backward"
#' # Example usage
#' data <- data.frame(
#' study = rep(1, 10),
#' date = as.Date("2023-01-01") + 0:9,
#' value = rnorm(10),
#' end_date = rep(as.Date("2023-01-10"), 10)
#' )
#' print(result)
#' windowpane(data, end_date_col = end_date, date_col = date,
#' variable = value, summary_type = "mean",
#' window_lengths = c(3, 5), direction = "backward")
#'
#' @import dplyr tidyr
#' @export
windowpane <- function(data,
end_date_col,
date_col,
variable,
study_col,
summary_type,
threshold = NULL,
window_lengths = c(2, 4, 8, 16),
direction = "backward") {
variable_name <- deparse(substitute(variable))

results_list <- list() # Initialize list to store results

# Unique combinations of study and end_date
unique_combinations <- data %>%
distinct(!!sym(study_col), !!sym(end_date_col))
window_lengths,
direction = "backward",
group_by_cols = NULL,
date_format = "%Y-%m-%d") {

# Convert columns to symbols for tidy evaluation
end_date_col <- enquo(end_date_col)
date_col <- enquo(date_col)
variable <- enquo(variable)

# Prepare data and convert date columns
data <- data %>%
mutate(
{{ date_col }} := as.Date(as.character({{ date_col }}), format = date_format),
{{ end_date_col }} := as.Date(as.character({{ end_date_col }}), format = date_format)
)

# Calculate total steps accurately based on window_lengths
total_steps <- 0
for (window_size in window_lengths) {
total_steps <- total_steps + (max(window_lengths) - window_size + 1)
# Handle grouping
if (!is.null(group_by_cols)) {
grouped_data <- data %>% group_by(across(all_of(group_by_cols)))
} else {
grouped_data <- data
}
total_steps <- total_steps * nrow(unique_combinations)

# Initialize the progress bar
pb <- progress_bar$new(
format = " Processing [:bar] :percent in :elapsed seconds",
total = total_steps,
clear = FALSE, width = 60
)
# Get unique combinations of grouping variables and end_date_col
unique_combinations <- grouped_data %>%
distinct(across(c(group_by_cols, !!end_date_col))) %>%
ungroup()

# Iterate through each combination of study and end_date
for (i in 1:nrow(unique_combinations)) {
study_id <- unique_combinations[[study_col]][i]
end_date <- unique_combinations[[end_date_col]][i]
results_list <- list()

subset_data <- data %>%
filter(!!sym(study_col) == study_id, !!sym(end_date_col) == end_date)
# Iterate over each unique combination
for (i in seq_len(nrow(unique_combinations))) {
current_group <- unique_combinations[i, ]
end_date <- pull(current_group, !!end_date_col)

start_date <- end_date - days(max(window_lengths))
# Filter data for the current group
subset_data <- data
if (!is.null(group_by_cols)) {
for (col in group_by_cols) {
subset_data <- subset_data %>% filter(.data[[col]] == current_group[[col]])
}
}
subset_data <- subset_data %>% filter((!!end_date_col) == end_date)

start_date <- end_date - max(window_lengths)

results <- tibble(
start_day = numeric(),
Expand All @@ -90,26 +96,30 @@ windowpane <- function(data,
for (j in 0:(max(window_lengths) - window_size)) {

if (direction == "backward") {
window_end_date <- end_date - days(j)
window_start_date <- window_end_date - days(window_size - 1)
window_end_date <- end_date - j
window_start_date <- window_end_date - (window_size - 1)
} else {
window_start_date <- start_date + days(j)
window_end_date <- window_start_date + days(window_size - 1)
window_start_date <- start_date + j
window_end_date <- window_start_date + (window_size - 1)
}

# Filter data for the current window
window_data <- subset_data %>%
filter(!!sym(date_col) >= window_start_date & !!sym(date_col) <= window_end_date)

# Calculate summary value based on the summary_type
if (summary_type == "mean") {
value <- mean(window_data[[variable]], na.rm = TRUE)
} else if (summary_type == "sum") {
value <- sum(window_data[[variable]], na.rm = TRUE)
} else if (summary_type == "above_threshold" & !is.null(threshold)) {
value <- sum(window_data[[variable]] > threshold, na.rm = TRUE)
filter((!!date_col) >= window_start_date & (!!date_col) <= window_end_date)

# Calculate summary value
if (nrow(window_data) == 0) {
value <- 0
} else {
value <- NA
if (summary_type == "mean") {
value <- mean(pull(window_data, !!variable), na.rm = TRUE)
} else if (summary_type == "sum") {
value <- sum(pull(window_data, !!variable), na.rm = TRUE)
} else if (summary_type == "above_threshold" & !is.null(threshold)) {
value <- sum(pull(window_data, !!variable) > threshold, na.rm = TRUE)
} else {
value <- NA
}
}

# Save results
Expand All @@ -118,9 +128,6 @@ windowpane <- function(data,
end_day = ifelse(direction == "backward", -(j + window_size - 1), j + window_size - 1),
value = value
))

# Update progress bar after each iteration
pb$tick()
}
}

Expand All @@ -129,32 +136,27 @@ windowpane <- function(data,
unite("column_name", start_day, end_day, sep = "_") %>%
pivot_wider(names_from = column_name, values_from = value)

# Append variable name to column names
colnames(results_wide) <- paste0(variable_name, "_", colnames(results_wide))
# Identify grouping columns present in results_wide
existing_group_cols <- intersect(names(results_wide), c(group_by_cols, as_label(end_date_col)))

# Convert 'end_date' to Date format if it is numeric
if (is.numeric(end_date)) {
end_date <- as.Date(end_date, origin = "1970-01-01")
}

# Add end_date and study information
# Add the variable name as a prefix to all non-grouping columns
results_wide <- results_wide %>%
mutate(!!end_date_col := end_date,
!!study_col := study_id) %>%
relocate(!!sym(end_date_col), .before = everything()) %>%
relocate(!!sym(study_col), .before = everything())
rename_with(~ paste0(as_label(variable), "_", .), .cols = -all_of(existing_group_cols))

results_list[[paste0(study_id, "_", end_date)]] <- results_wide
# Add grouping columns and end_date column
if (!is.null(group_by_cols)) {
for (col in group_by_cols) {
results_wide[[col]] <- current_group[[col]]
}
}
results_wide[[as_label(end_date_col)]] <- as.character(end_date)

# Store the results for the current group
results_list[[i]] <- results_wide
}

# Combine all results into a single dataframe
final_results <- bind_rows(results_list)

return(final_results)
}






2 changes: 1 addition & 1 deletion docs/pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ pandoc: 2.19.2
pkgdown: 2.0.7
pkgdown_sha: ~
articles: {}
last_built: 2024-10-19T01:31Z
last_built: 2024-10-20T14:53Z

Loading

0 comments on commit 694c63b

Please sign in to comment.