correct windowpane

emdelponte · Oct 20, 2024 · 694c63b · 694c63b
1 parent 45f5860
commit 694c63b
Show file tree

Hide file tree

Showing 9 changed files with 313 additions and 165 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,8 +17,6 @@ export(theme_r4pde)
 export(windowpane)
 import(car)
 import(dplyr)
-import(lubridate)
-import(progress)
 import(tidyr)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)

diff --git a/R/windowpane.R b/R/windowpane.R
@@ -1,83 +1,89 @@
-#' Windowpane Analysis of Epidemiological Data
+#' Window Pane for Epidemiological Analysis
 #'
-#' This function calculates rolling statistics (e.g., mean, sum) within defined time windows
-#' for a specified variable in epidemiological data.
+#' This function calculates summary statistics within specified windows around a given end date
+#' in a dataset, facilitating epidemiological analysis. It allows backward or forward window
+#' calculations based on a user-defined variable and window lengths.
 #'
-#' @param data A dataframe containing the data.
-#' @param end_date_col A character string specifying the column name for the end date.
-#' @param date_col A character string specifying the column name for the date.
-#' @param variable A character string specifying the name of the variable to analyze.
-#' @param study_col A character string specifying the column name for study IDs.
-#' @param summary_type A character string specifying the type of summary ("mean", "sum", "above_threshold").
-#' @param threshold A numeric value for the threshold (used only if summary_type is "above_threshold").
-#' @param window_lengths A numeric vector specifying the window lengths to use (default is c(2, 4, 8, 16)).
-#' @param direction A character string specifying the direction of the rolling window ("backward" or "forward").
+#' @param data A dataframe containing the input data.
+#' @param end_date_col The name of the column representing the end date.
+#' @param date_col The name of the column representing the date variable.
+#' @param variable The name of the column for which summary statistics are calculated.
+#' @param summary_type The type of summary to calculate. Options are "mean", "sum",
+#'   or "above_threshold".
+#' @param threshold Optional numeric value used when summary_type is "above_threshold".
+#' @param window_lengths A vector of window lengths (in days) for the calculations.
+#' @param direction The direction of the window. Options are "backward" (default) or "forward".
+#' @param group_by_cols Optional vector of column names for grouping the data.
+#' @param date_format The format of the date columns. Default is "%Y-%m-%d".
+#'
+#' @return A dataframe with the calculated summary values for each window.
 #'
-#' @return A dataframe with the results of the windowpane analysis in wide format.
-#' @import dplyr tidyr lubridate progress
 #' @examples
-#' # Simulated data example
-#' set.seed(123)
-#' n <- 100
-#' sim_data <- data.frame(
-#'   study = sample(1:5, n, replace = TRUE),
-#'   heading = as.Date('2024-01-01') + sample(0:30, n, replace = TRUE),
-#'   YYYYMMDD = as.Date('2024-01-01') + sample(0:30, n, replace = TRUE),
-#'   T2M = runif(n, 15, 30)
-#' )
-#' result <- windowpane(
-#'   data = sim_data,
-#'   end_date_col = "heading",
-#'   date_col = "YYYYMMDD",
-#'   variable = "T2M",
-#'   study_col = "study",
-#'   summary_type = "mean",
-#'   window_lengths = c(2, 4, 8, 16),
-#'   direction = "backward"
+#' # Example usage
+#' data <- data.frame(
+#'   study = rep(1, 10),
+#'   date = as.Date("2023-01-01") + 0:9,
+#'   value = rnorm(10),
+#'   end_date = rep(as.Date("2023-01-10"), 10)
 #' )
-#' print(result)
+#' windowpane(data, end_date_col = end_date, date_col = date,
+#'            variable = value, summary_type = "mean",
+#'            window_lengths = c(3, 5), direction = "backward")
 #'
+#' @import dplyr tidyr
 #' @export
 windowpane <- function(data,
                        end_date_col,
                        date_col,
                        variable,
-                       study_col,
                        summary_type,
                        threshold = NULL,
-                       window_lengths = c(2, 4, 8, 16),
-                       direction = "backward") {
-  variable_name <- deparse(substitute(variable))
-
-  results_list <- list()  # Initialize list to store results
-
-  # Unique combinations of study and end_date
-  unique_combinations <- data %>%
-    distinct(!!sym(study_col), !!sym(end_date_col))
+                       window_lengths,
+                       direction = "backward",
+                       group_by_cols = NULL,
+                       date_format = "%Y-%m-%d") {
+
+  # Convert columns to symbols for tidy evaluation
+  end_date_col <- enquo(end_date_col)
+  date_col <- enquo(date_col)
+  variable <- enquo(variable)
+
+  # Prepare data and convert date columns
+  data <- data %>%
+    mutate(
+      {{ date_col }} := as.Date(as.character({{ date_col }}), format = date_format),
+      {{ end_date_col }} := as.Date(as.character({{ end_date_col }}), format = date_format)
+    )
 
-  # Calculate total steps accurately based on window_lengths
-  total_steps <- 0
-  for (window_size in window_lengths) {
-    total_steps <- total_steps + (max(window_lengths) - window_size + 1)
+  # Handle grouping
+  if (!is.null(group_by_cols)) {
+    grouped_data <- data %>% group_by(across(all_of(group_by_cols)))
+  } else {
+    grouped_data <- data
   }
-  total_steps <- total_steps * nrow(unique_combinations)
 
-  # Initialize the progress bar
-  pb <- progress_bar$new(
-    format = "  Processing [:bar] :percent in :elapsed seconds",
-    total = total_steps,
-    clear = FALSE, width = 60
-  )
+  # Get unique combinations of grouping variables and end_date_col
+  unique_combinations <- grouped_data %>%
+    distinct(across(c(group_by_cols, !!end_date_col))) %>%
+    ungroup()
 
-  # Iterate through each combination of study and end_date
-  for (i in 1:nrow(unique_combinations)) {
-    study_id <- unique_combinations[[study_col]][i]
-    end_date <- unique_combinations[[end_date_col]][i]
+  results_list <- list()
 
-    subset_data <- data %>%
-      filter(!!sym(study_col) == study_id, !!sym(end_date_col) == end_date)
+  # Iterate over each unique combination
+  for (i in seq_len(nrow(unique_combinations))) {
+    current_group <- unique_combinations[i, ]
+    end_date <- pull(current_group, !!end_date_col)
 
-    start_date <- end_date - days(max(window_lengths))
+    # Filter data for the current group
+    subset_data <- data
+    if (!is.null(group_by_cols)) {
+      for (col in group_by_cols) {
+        subset_data <- subset_data %>% filter(.data[[col]] == current_group[[col]])
+      }
+    }
+    subset_data <- subset_data %>% filter((!!end_date_col) == end_date)
+
+    start_date <- end_date - max(window_lengths)
 
     results <- tibble(
       start_day = numeric(),
@@ -90,26 +96,30 @@ windowpane <- function(data,
       for (j in 0:(max(window_lengths) - window_size)) {
 
         if (direction == "backward") {
-          window_end_date <- end_date - days(j)
-          window_start_date <- window_end_date - days(window_size - 1)
+          window_end_date <- end_date - j
+          window_start_date <- window_end_date - (window_size - 1)
         } else {
-          window_start_date <- start_date + days(j)
-          window_end_date <- window_start_date + days(window_size - 1)
+          window_start_date <- start_date + j
+          window_end_date <- window_start_date + (window_size - 1)
         }
 
         # Filter data for the current window
         window_data <- subset_data %>%
-          filter(!!sym(date_col) >= window_start_date & !!sym(date_col) <= window_end_date)
-
-        # Calculate summary value based on the summary_type
-        if (summary_type == "mean") {
-          value <- mean(window_data[[variable]], na.rm = TRUE)
-        } else if (summary_type == "sum") {
-          value <- sum(window_data[[variable]], na.rm = TRUE)
-        } else if (summary_type == "above_threshold" & !is.null(threshold)) {
-          value <- sum(window_data[[variable]] > threshold, na.rm = TRUE)
+          filter((!!date_col) >= window_start_date & (!!date_col) <= window_end_date)
+
+        # Calculate summary value
+        if (nrow(window_data) == 0) {
+          value <- 0
         } else {
-          value <- NA
+          if (summary_type == "mean") {
+            value <- mean(pull(window_data, !!variable), na.rm = TRUE)
+          } else if (summary_type == "sum") {
+            value <- sum(pull(window_data, !!variable), na.rm = TRUE)
+          } else if (summary_type == "above_threshold" & !is.null(threshold)) {
+            value <- sum(pull(window_data, !!variable) > threshold, na.rm = TRUE)
+          } else {
+            value <- NA
+          }
         }
 
         # Save results
@@ -118,9 +128,6 @@ windowpane <- function(data,
           end_day = ifelse(direction == "backward", -(j + window_size - 1), j + window_size - 1),
           value = value
         ))
-
-        # Update progress bar after each iteration
-        pb$tick()
       }
     }
 
@@ -129,32 +136,27 @@ windowpane <- function(data,
       unite("column_name", start_day, end_day, sep = "_") %>%
       pivot_wider(names_from = column_name, values_from = value)
 
-    # Append variable name to column names
-    colnames(results_wide) <- paste0(variable_name, "_", colnames(results_wide))
+    # Identify grouping columns present in results_wide
+    existing_group_cols <- intersect(names(results_wide), c(group_by_cols, as_label(end_date_col)))
 
-    # Convert 'end_date' to Date format if it is numeric
-    if (is.numeric(end_date)) {
-      end_date <- as.Date(end_date, origin = "1970-01-01")
-    }
-
-    # Add end_date and study information
+    # Add the variable name as a prefix to all non-grouping columns
     results_wide <- results_wide %>%
-      mutate(!!end_date_col := end_date,
-             !!study_col := study_id) %>%
-      relocate(!!sym(end_date_col), .before = everything()) %>%
-      relocate(!!sym(study_col), .before = everything())
+      rename_with(~ paste0(as_label(variable), "_", .), .cols = -all_of(existing_group_cols))
 
-    results_list[[paste0(study_id, "_", end_date)]] <- results_wide
+    # Add grouping columns and end_date column
+    if (!is.null(group_by_cols)) {
+      for (col in group_by_cols) {
+        results_wide[[col]] <- current_group[[col]]
+      }
+    }
+    results_wide[[as_label(end_date_col)]] <- as.character(end_date)
+
+    # Store the results for the current group
+    results_list[[i]] <- results_wide
   }
 
   # Combine all results into a single dataframe
   final_results <- bind_rows(results_list)
 
   return(final_results)
 }
-
-
-
-
-
-
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -2,5 +2,5 @@ pandoc: 2.19.2
 pkgdown: 2.0.7
 pkgdown_sha: ~
 articles: {}
-last_built: 2024-10-19T01:31Z
+last_built: 2024-10-20T14:53Z