diff --git a/DESCRIPTION b/DESCRIPTION index 97831eff..041fb8ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: sitrep Title: Report templates and helper functions for EPI -Version: 0.1.0 +Version: 0.1.2 Authors@R: c(person(given = "Dirk", family = "Schumacher", @@ -65,7 +65,8 @@ Imports: srvyr, stats, utils, - apyramid + apyramid, + msfdict Suggests: testthat (>= 2.1.0), sessioninfo, @@ -73,7 +74,8 @@ Suggests: covr Remotes: reconhub/linelist, - R4EPI/apyramid + R4EPI/apyramid, + R4EPI/msfdict Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) @@ -89,20 +91,18 @@ Collate: 'descriptive-table.R' 'find_breaks.R' 'find_date_cause.R' - 'gen_data.R' 'gen_eligible_interviewed.R' - 'gen_polygon.R' 'gen_population.R' 'helpers.R' 'inline_fun.R' - 'msf_dict_survey.R' + 'msf_dict_rename_helper.R' + 'msfdict_exports.R' 'prettify_tabulation.R' 'relabel_proportions.R' 'sample-size.R' 'tab_descriptive.R' 'tab_univariate.R' 'tabulate_survey.R' - 'template_data_frame.R' 'transpose_pretty.R' 'two_by_two_funs.R' 'unite_ci.R' diff --git a/NAMESPACE b/NAMESPACE index c4d786f9..17919db2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,12 +45,10 @@ importFrom(dplyr,funs) importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,mutate_at) -importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarise_all) importFrom(dplyr,ungroup) -importFrom(epitrix,clean_labels) importFrom(ggplot2,aes) importFrom(ggplot2,expand_scale) importFrom(ggplot2,geom_density) @@ -62,26 +60,20 @@ importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,stat_density) importFrom(ggplot2,stat_function) importFrom(glue,glue) -importFrom(rio,import) +importFrom(msfdict,gen_data) +importFrom(msfdict,gen_polygon) +importFrom(msfdict,msf_dict) +importFrom(msfdict,msf_dict_survey) importFrom(rlang,"!!") importFrom(rlang,".data") importFrom(rlang,":=") importFrom(rlang,sym) importFrom(scales,percent_format) -importFrom(sf,st_intersection) -importFrom(sf,st_make_grid) -importFrom(sf,st_polygon) -importFrom(sf,st_set_crs) -importFrom(sf,st_sf) importFrom(srvyr,survey_mean) importFrom(srvyr,survey_total) -importFrom(stats,aggregate) -importFrom(stats,runif) importFrom(stats,setNames) importFrom(tibble,as_tibble) importFrom(tidyr,complete) -importFrom(tidyr,fill) importFrom(tidyr,gather) importFrom(tidyr,spread) importFrom(tidyr,unite) -importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 4295f701..c4e1b0b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# sitrep 0.1.2 + +* Import {msfdict}. The `msf_dict()`, `msf_dict_survey()`, `gen_data()` and + `gen_polygon()` functions (and all associated internal functions) were moved + to the {msfdict} package and re-exported (@zkamvar, #228) + +# sitrep 0.1.1 + +* Import {apyramid}. Code from `plot_age_pyramid()` was moved to the {apyramid} + package, which is currently on GitHub, but soon to be on CRAN (@zkamvar, #225) +* Removed erroneous emoji in comments that was causing errors in Windows + installations (@zkamvar, #227) +* NOTE: this release on GitHub was not associated with a formal version change. + # sitrep 0.1.0 * Added a `NEWS.md` file to track changes to the package. diff --git a/R/gen_data.R b/R/gen_data.R deleted file mode 100644 index 9ce84ac2..00000000 --- a/R/gen_data.R +++ /dev/null @@ -1,649 +0,0 @@ -#' MSF data dictionaries and dummy datasets -#' -#' These function reads in MSF data dictionaries and produces randomised -#' datasets based on values defined in the dictionaries. The randomised -#' dataset produced should mimic an excel export from DHIS2. -#' -#' @param disease Specify which disease you would like to use. -#' Currently supports "Cholera", "Measles" and "Meningitis". -#' -#' @param name the name of the dictionary stored in the package. -#' -#' @param dictionary Specify which dictionary you would like to use. -#' Currently supports "Cholera", "Measles", "Meningitis", "AJS" and "Mortality". -#' -#' @param varnames Specify name of column that contains varnames. Currently -#' default set to "Item". (this can probably be deleted once dictionaries -#' standardise) If `dictionary` is "Mortality", `varnames` needs to be "column_name"`. -#' -#' @param numcases For fake data, specify the number of cases you want (default is 300 -#' -#' @param tibble Return data dictionary as a tidyverse tibble (default is TRUE) -#' -#' @param compact if `TRUE` (default), then a nested data frame is returned -#' where each row represents a single variable and a nested data frame column -#' called "options", which can be expanded with [tidyr::unnest()]. This only -#' works if `long = TRUE`. -#' -#' @param long If TRUE (default), the returned data dictionary is in long format with -#' each option getting one row. If `FALSE`, then two data frames are returned, -#' one with variables and the other with content options. -#' -#' @param copy_to_clipboard if `TRUE` (default), the rename template will be -#' copied to the user's clipboard with [clipr::write_clip()]. If `FALSE`, the -#' rename template will be printed to the user's console. -#' -#' @importFrom rio import -#' @importFrom epitrix clean_labels -#' @importFrom tibble as_tibble -#' @importFrom stats aggregate runif -#' @importFrom utils read.csv -#' @seealso [linelist::clean_variable_spelling()] -#' @export -#' @examples -#' -#' if (require('dplyr') & require('linelist')) { withAutoprint({ -#' # You will often want to use MSF dictionaries to translate codes to human- -#' # readable variables. Here, we generate a data set of 20 cases: -#' dat <- gen_data(dictionary = "Cholera", varnames = "data_element_shortname", -#' numcases = 20) -#' print(dat) -#' -#' # We want the expanded dictionary, so we will select `compact = FALSE` -#' dict <- msf_dict(disease = "Cholera", long = TRUE, compact = FALSE, tibble = TRUE) -#' print(dict) -#' -#' # We can use linelist's clean_variable_spelling to translate the codes. First, -#' # we want to reorder the columns of the dictionary like so: -#' # -#' # - 1st column: option codes -#' # - 2nd column: translations -#' # - 3rd column: data column name -#' # - 4th column: order of options -#' # -#' # we also want to make sure to filter out any columns that are blank for -#' # the option codes, because this means that they don't have a fixed number of -#' # options -#' dict <- dict %>% -#' select(option_code, option_name, data_element_shortname, option_order_in_set) %>% -#' filter(!is.na(option_code)) -#' print(dict) -#' -#' # Now we can use linelist to filter the data: -#' dat_clean <- clean_variable_spelling(dat, dict) -#' print(dat_clean) -#' })} - - -# function to pull outbreak data dicationaries together -msf_dict <- function(disease, name = "MSF-outbreak-dict.xlsx", tibble = TRUE, - compact = TRUE, long = TRUE) { - - # get excel file path (need to specify the file name) - path <- system.file("extdata", name, package = "sitrep") - - # read in categorical variable content options - dat_opts <- rio::import(path, which = "OptionCodes") - - # read in data set - pasting the disease name for sheet - dat_dict <- rio::import(path, which = disease) - - # clean col names - colnames(dat_dict) <- epitrix::clean_labels(colnames(dat_dict)) - colnames(dat_opts) <- epitrix::clean_labels(colnames(dat_opts)) - - # clean future var names - # excel names (data element shortname) - # csv names (data_element_name) - dat_dict$data_element_shortname <- epitrix::clean_labels(dat_dict$data_element_shortname) - dat_dict$data_element_name <- epitrix::clean_labels(dat_dict$data_element_name) - - # Adding hardcoded var types to options list - # 2 types added to - BOOLEAN, TRUE_ONLY - BOOLEAN <- data.frame(option_code = c(1, 0), - option_name = c("[1] Yes", "[0] No"), - option_uid = c(NA, NA), - option_order_in_set = c(1,2), - optionset_uid = c("BOOLEAN", "BOOLEAN") - ) - - TRUE_ONLY <- data.frame(option_code = c(1, "NA"), - option_name = c("[1] TRUE", "[NA] Not TRUE"), - option_uid = c(NA, NA), - option_order_in_set = c(1,2), - optionset_uid = c("TRUE_ONLY", "TRUE_ONLY") - ) - - # bind these on to the bottom of dat_opts (option list) as rows - dat_opts <- do.call("rbind", list(dat_opts, BOOLEAN, TRUE_ONLY)) - - - - # add the unique identifier to link above three in dictionary to options list - for (i in c("BOOLEAN", "TRUE_ONLY")) { - dat_dict$used_optionset_uid[dat_dict$data_element_valuetype == i] <- i - } - - # remove back end codes from front end var in the options list - dat_opts$option_name <- gsub(".*] ", "", dat_opts$option_name) - - if (long) { - - outtie <- dplyr::left_join(dat_dict, dat_opts, - by = c("used_optionset_uid" = "optionset_uid")) - - outtie <- if (tibble) tibble::as_tibble(outtie) else outtie - - } - - # produce clean compact data dictionary for use in gen_data - if (long && compact == TRUE) { - - squished <- dplyr::group_by(outtie, !! quote(data_element_shortname)) - - if (utils::packageVersion("tidyr") > "0.8.99") { - squished <- tidyr::nest(squished, options = dplyr::starts_with("option_")) - } else { - squished <- tidyr::nest(squished, dplyr::starts_with("option_"), .key = "options") - outtie <- dplyr::select(outtie, -dplyr::starts_with("option_")) - outtie <- dplyr::distinct(outtie) - squished <- dplyr::left_join(outtie, squished, by = "data_element_shortname") - } - - return(dplyr::ungroup(squished)) - - } - - # Return second option: a list with data dictionary and value options seperate - if (!long) { - - if (tibble == TRUE) { - outtie <- list(dictionary = tibble::as_tibble(dat_dict), - options = tibble::as_tibble(dat_opts) - ) - } - - if (tibble == FALSE) { - outtie <- list(dictionary = dat_dict, - options = dat_opts) - } - - } - - # return dictionary dataset - outtie -} - -#' @export -#' @rdname msf_dict -msf_dict_rename_helper <- function(disease, varnames = "data_element_shortname", copy_to_clipboard = TRUE) { - # get msf disease specific data dictionary - dat_dict <- msf_dict(disease = disease, compact = TRUE) - # get the outbreak Rmd to check if the variable is optional or required - outbreak_file <- available_sitrep_templates(recursive = TRUE, pattern = ".Rmd", full.names = TRUE) - outbreak_file <- grep(tolower(disease), outbreak_file, value = TRUE)[[1]] - outbreak_file <- readLines(outbreak_file) - - dat_dict[["var_required"]] <- vapply(dat_dict[[varnames]], - FUN = function(i, o) if (any(grepl(paste0("^[^#]*", i), o))) "REQUIRED" else "optional", - FUN.VALUE = character(1), - o = outbreak_file - ) - dat_dict <- dat_dict[order(dat_dict[["var_required"]] != "REQUIRED", - dat_dict[[varnames]]), ] - msg <- "## Add the appropriate column names after the equals signs\n\n" - msg <- paste0(msg, "linelist_cleaned <- rename(linelist_cleaned,\n") - the_renames <- sprintf(" %s = , # %s (%s)", - format(dat_dict[[varnames]]), - format(dat_dict[["data_element_valuetype"]]), - dat_dict[["var_required"]] - ) - the_renames[length(the_renames)] <- gsub(",", " ", the_renames[length(the_renames)]) - msg <- paste0(msg, paste(the_renames, collapse = "\n"), "\n)\n") - if (copy_to_clipboard) { - x <- try(clipr::write_clip(msg), silent = TRUE) - if (inherits(x, "try-error")) { - if (interactive()) cat(msg) - return(invisible()) - } - message("rename template copied to clipboard. Paste the contents to your RMarkdown file and enter in the column names from your data set.") - } else { - cat(msg) - } -} - - -# function to generate fake dataset based on data dictionary -#' @export -#' @rdname msf_dict -gen_data <- function(dictionary, varnames = "data_element_shortname", numcases = 300) { - - # Three datasets: - # 1) dat_dict = msf data dicationary generated by (msf_dict) - # 2) dat_output = formatting of data dictionary to make use for sampling - # 3) dis_output = dictionary dataset generated from sampling (exported) - - # define which ones are outbreaks and which ones are survey datasets - SURVEYS <- c("Mortality", "Nutrition", "Vaccination") - OUTBREAKS <- c("Cholera", "Measles", "Meningitis", "AJS") - - # get msf dictionary specific data dictionary - if (dictionary %in% SURVEYS) { - dat_dict <- msf_dict_survey(disease = dictionary, tibble = FALSE, compact = TRUE) - } else if (dictionary %in% OUTBREAKS) { - dat_dict <- msf_dict(disease = dictionary, tibble = FALSE, compact = TRUE) - } else { - stop("'dictionary' must be one of: 'Cholera', 'Measles', 'Meningitis', 'AJS', 'Mortality', 'Nutrition', 'Vaccination'") - } - - - # # drop extra columns (keep varnames and code options) - # varcol <- which(names(dat_dict) == varnames) - # codecol <- grep("Code", names(dat_dict)) - # dat_output <- dat_dict[, c(varcol, codecol), drop = FALSE] - - - # # use the var names as rows - # row.names(dat_output) <- dat_output[[varnames]] - # # remove the var names column - # dat_output <- dat_output[-1] - # # flip the dataset - # dat_output <- data.frame(t(dat_output)) - # # remove rownames - # row.names(dat_output) <- NULL - - # # define variables that do not have any contents in the data dictionary - - # # create a NEW empty dataframe with the names from the data dictionary - # dis_output <- data.frame(matrix(ncol = ncol(dat_output), nrow = numcases) ) - # colnames(dis_output) <- colnames(dat_output) - - # # take samples for vars with defined options (non empties) - # categories <- lapply(dat_output, function(i) i[!is.na(i)]) - # categories <- categories[lengths(categories) > 0] - # for (i in names(categories)) { - # dis_output[[i]] <- sample(categories[[i]], numcases, replace = TRUE) - # } - - dis_output <- template_data_frame_categories(dat_dict, numcases, varnames, dictionary %in% SURVEYS) - - # Use data dictionary to define which vars are multiple choice - # ZNK 2019-05-01 ---- - # These type of columns are currently only present in the survey data sets - # multivars <- dat_dict[dat_dict$data_element_valuetype == "MULTI", varnames] - - # if (length(multivars) > 0) { - # sample_multivars <- lapply(multivars, sample_cats) - # sample_multivars <- do.call(cbind, sample_multivars) - - # dis_output[, multivars] <- NULL - # dis_output <- cbind(dis_output, sample_multivars) - # } - - - # Use data dictionary to define which vars are dates - datevars <- dat_dict[[varnames]][dat_dict$data_element_valuetype == "DATE"] - - # sample between two dates - posidates <- seq(as.Date("2018-01-01"), as.Date("2018-04-30"), by = "day") - - # fill the date columns with dates - for (i in datevars) { - dis_output[[i]] <- sample(posidates, numcases, replace = TRUE) - } - - if (dictionary %in% OUTBREAKS) { - # Fix DATES - # exit dates before date of entry - # just add 20 to admission.... (was easiest...) - dis_output <- enforce_timing(dis_output, - first = "date_of_consultation_admission", - second = "date_of_exit", - 20) - - # lab sample dates before admission - # add 2 to admission.... - dis_output <- enforce_timing(dis_output, - first = "date_of_consultation_admission", - second = "date_lab_sample_taken", - 2) - # vaccination dates after admission - # minus 20 to admission... - dis_output <- enforce_timing(dis_output, - first = "date_of_consultation_admission", - second = "date_of_last_vaccination", - 20) - -# dis_output$date_of_last_vaccination[dis_output$date_of_exit > -# dis_output$date_of_consultation_admission] <- -# dis_output$date_of_consultation_admission[dis_output$date_of_exit > -# dis_output$date_of_consultation_admission] - 20 - # symptom onset after admission - # minus 20 to admission... - dis_output <- enforce_timing(dis_output, - first = "date_of_consultation_admission", - second = "date_of_onset", - 20) - # dis_output$date_of_onset[dis_output$date_of_onset > - # dis_output$date_of_consultation_admission] <- - # dis_output$date_of_consultation_admission[dis_output$date_of_onset > - # dis_output$date_of_consultation_admission] - 20 - - - # Patient identifiers - dis_output$case_number <- sprintf("A%d", seq(numcases)) - - # treatment site facility - dis_output$treatment_facility_site <- sample(1:50, - numcases, replace = TRUE) - - # patient origin (categorical from a dropdown) - dis_output$patient_origin <- sample(c("Village A", "Village B", - "Village C", "Village D"), - numcases, replace = TRUE) - - # treatment location (categorical from a dropdown) - dis_output$treatment_location <- sample(c("Ward A", "Ward B", - "Ward C", "Ward D"), - numcases, replace = TRUE) - - # patient origin free text - dis_output$patient_origin_free_text <- sample(c("Messy location A", "Messy location B", - "Messy location C", "Messy location D"), - numcases, replace = TRUE) - } - - # sample age_month and age_days if appropriate - age_year_var <- grep("age.*year", names(dis_output), value = TRUE)[1] - age_month_var <- grep("age.*month", names(dis_output), value = TRUE)[1] - age_day_var <- grep("age.*day", names(dis_output), value = TRUE)[1] - - # set_age_na controlls if age_year_var should be set to NA if age_month_var is sampled - # same is done for age_month_var and age_day_var - set_age_na <- TRUE - if (dictionary == "Mortality") { - set_age_na <- FALSE - } - - if (!is.na(age_year_var)) { - # sample 0:120 - dis_output[, age_year_var] <- sample(0:120L, numcases, replace = TRUE) - U2_YEARS <- which(dis_output[, age_year_var] <= 2) - if (set_age_na) { - dis_output[U2_YEARS, age_year_var] <- NA_integer_ - } - - U2_MONTHS <- integer(0) - if (!is.na(age_month_var)) { - # age_month - if (length(U2_YEARS) > 0) { - dis_output[U2_YEARS, age_month_var] <- sample(0:24L, - length(U2_YEARS), - replace = TRUE) - U2_MONTHS <- which(dis_output[, age_month_var] <= 2) - if (set_age_na) { - dis_output[U2_MONTHS, age_month_var] <- NA_integer_ - } - } - - if (!is.na(age_day_var)) { - # age_days - if (length(U2_MONTHS) > 0) { - dis_output[U2_MONTHS, age_day_var] <- sample(0:60L, - length(U2_MONTHS), - replace = TRUE) - } - } - } - } - - - - if (dictionary == "Cholera" | dictionary == "Measles" | - dictionary == "AJS") { - # fix pregnancy stuff - dis_output$pregnant[dis_output$sex != "F"] <- "NA" - PREGNANT_FEMALE <- dis_output$sex != "F" | dis_output$pregnant != "Y" - - dis_output$foetus_alive_at_admission[PREGNANT_FEMALE] <- NA - dis_output$trimester[PREGNANT_FEMALE] <- NA - dis_output$delivery_event[PREGNANT_FEMALE] <- "NA" - dis_output$pregnancy_outcome_at_exit[PREGNANT_FEMALE] <- NA - dis_output$pregnancy_outcome_at_exit[dis_output$delivery_event != "1"] <- NA - } - - - if (dictionary == "Cholera") { - dis_output$ors_consumed_litres <- sample(1:10, numcases, replace = TRUE) - dis_output$iv_fluids_received_litres <- sample(1:10, numcases, replace = TRUE) - } - - if (dictionary == "Measles") { - dis_output$baby_born_with_complications[PREGNANT_FEMALE & - dis_output$delivery_event != "1"] <- NA - - # fix vaccine stuff among non vaccinated - NOTVACC <- which(!dis_output$previously_vaccinated %in% c("C", "V")) - - dis_output$previous_vaccine_doses_received[NOTVACC] <- NA - dis_output$date_of_last_vaccination[NOTVACC] <- NA - - } - - if (dictionary == "Meningitis") { - # T1 lab sample dates before admission - # add 2 to admission.... - - dis_output <- enforce_timing(dis_output, - first = "date_of_consultation_admission", - second = "date_ti_sample_sent", - 2) - # dis_output$date_ti_sample_sent[dis_output$date_ti_sample_sent <= - # dis_output$date_of_consultation_admission] <- - # dis_output$date_of_consultation_admission[dis_output$date_ti_sample_sent <= - # dis_output$date_of_consultation_admission] + 2 - - # fix pregnancy delivery - dis_output$delivery_event[dis_output$sex != "F"] <- "NA" - - # fix vaccine stuff among not vaccinated - NOTVACC <- which(!dis_output$vaccinated_meningitis_routine %in% c("C", "V") & - !dis_output$vaccinated_meningitis_mvc %in% c("C", "V")) - - dis_output$name_meningitis_vaccine[NOTVACC] <- NA - dis_output$date_of_last_vaccination[NOTVACC] <- NA - } - - if (dictionary == "Mortality") { - - - # sample villages - dis_output$village <- sample(c("Village A", "Village B", - "Village C", "Village D"), - numcases, replace = TRUE) - - # make two health districts - dis_output$health_district <- ifelse(dis_output$village == "Village A" | - dis_output$village == "Village B", - "District A", "District B") - - # cluster ID (based on village) - dis_output$cluster_number <- as.numeric(factor(dis_output$village)) - - # q65_iq4 household ID (the GPS point number) - (numbering starts again for each cluster) - for (i in unique(dis_output$cluster_number)) { - - nums <- nrow(dis_output[dis_output$cluster_number == i,]) - - dis_output[dis_output$cluster_number == i, "q65_iq4"] <- sample(1:(as.integer(nums/5) + 1), nums, replace = TRUE) - } - - - - dis_output <- gen_eligible_interviewed(dis_output, - household = "q65_iq4", - cluster = "cluster_number" - ) - - # use household num as a standin for fact_0_id for now - dis_output$fact_0_id <- dis_output$q65_iq4 - - # q53_cq4a ("Why is no occupant agreeing to participate?") shoud be NA if - # Head of Household answers the questions (q49_cq3) - dis_output$q53_cq4a[dis_output$q49_cq3 == "Yes"] <- factor(NA, levels(dis_output$q53_cq4a)) - # assume person is not born during study when age > 1 - dis_output$q87_q32_born[dis_output$q155_q5_age_year > 1] <- factor("No", levels(dis_output$q87_q32_born)) - dis_output$q88_q33_born_date[dis_output$q155_q5_age_year > 1] <- NA - # pregnancy set to NA for males - dis_output$q152_q7_pregnant[dis_output$q4_q6_sex == "Male"] <- NA - - # resample death yes/no to have lower death rates - dis_output$q136_q34_died <- sample(c("Yes", "No"), nrow(dis_output), prob = c(0.05, 0.95), replace = TRUE) - # set Columns that are relate to "death" as NA if "q136_q34_died" is "No" - died <- dis_output$q136_q34_died == "No" - dis_output[died, c("q137_q35_died_date", "q138_q36_died_cause", - "q141_q37_died_violence", "q143_q41_died_place", - "q145_q43_died_country")] <- NA - # pregnancy related cause of death n.a. for too old/young and for males - no_pregnancy <- dis_output$q138_q36_died_cause == "Pregnancy-related" & - (dis_output$q4_q6_sex == "Male" | - dis_output$q155_q5_age_year >= 50 | - dis_output$q155_q5_age_year < 12 - ) - - no_pregnancy[is.na(no_pregnancy)] <- FALSE # replace NAs - dis_output[no_pregnancy, "q138_q36_died_cause"] <- "Unknown" - - # fix arrival/leave dates - dis_output$q41_q25_hh_arrive_date <- - pmin(dis_output$q41_q25_hh_arrive_date, - dis_output$q45_q29_hh_leave_date, - dis_output$q88_q33_born_date, na.rm = TRUE) - - # leave date - chn_date <- dis_output$q45_q29_hh_leave_date <= dis_output$q41_q25_hh_arrive_date - chn_date2 <- dis_output$q45_q29_hh_leave_date < dis_output$q88_q33_born_date - chn_date[is.na(chn_date)] <- FALSE - chn_date2[is.na(chn_date2)] <- FALSE - - dis_output$q45_q29_hh_leave_date[chn_date] <- dis_output$q41_q25_hh_arrive_date[chn_date] + sample(5:30, sum(chn_date), replace = TRUE) - dis_output$q45_q29_hh_leave_date[chn_date2] <- dis_output$q88_q33_born_date[chn_date2] + sample(5:30, sum(chn_date2), replace = TRUE) - - # died date - chn_date <- dis_output$q137_q35_died_date <= dis_output$q41_q25_hh_arrive_date - chn_date2 <- dis_output$q137_q35_died_date < dis_output$q88_q33_born_date - chn_date[is.na(chn_date)] <- FALSE - chn_date2[is.na(chn_date2)] <- FALSE - dis_output$q137_q35_died_date[chn_date] <- dis_output$q41_q25_hh_arrive_date[chn_date] + sample(5:30, sum(chn_date), replace = TRUE) - dis_output$q137_q35_died_date[chn_date2] <- dis_output$q88_q33_born_date[chn_date2] + sample(5:30, sum(chn_date2), replace = TRUE) - - dis_output$q45_q29_hh_leave_date[!is.na(dis_output$q137_q35_died_date)] <- NA - - # more plausibility checks of generated data might be implemented in the future - } - - if (dictionary == "Nutrition") { - - # sample villages - dis_output$village <- sample(c("Village A", "Village B", - "Village C", "Village D"), - numcases, replace = TRUE) - - # make two health districts - dis_output$health_district <- ifelse(dis_output$village == "Village A" | - dis_output$village == "Village B", - "District A", "District B") - - # cluster ID (based on village) - dis_output$cluster_number <- as.numeric(factor(dis_output$village)) - - - # household ID (numbering starts again for each cluster) - for (i in unique(dis_output$cluster_number)) { - - nums <- nrow(dis_output[dis_output$cluster_number == i,]) - - dis_output[dis_output$cluster_number == i, "household_id"] <- sample(1:(as.integer(nums/5) + 1), nums, replace = TRUE) - } - - ## create a var for eligible and interviewed - - dis_output <- gen_eligible_interviewed(dis_output, - household = "household_id", - cluster = "cluster_number") - - # use household num as a standin for fact_0_id for now - dis_output$fact_0_id <- dis_output$household_id - - - # age in months (1 to 60 - i.e. under 5 years) - dis_output$age_month <- sample(1:60L, numcases, replace = TRUE) - - # height in cm - dis_output$height <- round( - runif(numcases, 40, 120), - digits = 1) - - # weight in kg - dis_output$weight <- round( - runif(numcases, 2, 30), - digits = 1) - - # MUAC in mm - dis_output$muac_mm_left_arm <- sample(80:190, numcases, replace = TRUE) - - } - if (dictionary == "Vaccination") { - - - # sample villages - dis_output$village <- sample(c("Village A", "Village B", - "Village C", "Village D"), - numcases, replace = TRUE) - - # make two health districts - dis_output$health_district <- ifelse(dis_output$village == "Village A" | - dis_output$village == "Village B", - "District A", "District B") - - # cluster ID (based on village) - dis_output$q77_what_is_the_cluster_number <- as.numeric(factor(dis_output$village)) - - # household ID (numbering starts again for each cluster) - for (i in unique(dis_output$q77_what_is_the_cluster_number)) { - - nums <- nrow(dis_output[dis_output$q77_what_is_the_cluster_number == i,]) - - dis_output[dis_output$q77_what_is_the_cluster_number == i, "q14_hh_no"] <- sample(1:(as.integer(nums/5) + 1), nums, replace = TRUE) - } - - ## create a var for eligible and interviewed - - dis_output <- gen_eligible_interviewed(dis_output, - household = "q14_hh_no", - cluster = "q77_what_is_the_cluster_number") - - # use household num as a standin for fact_0_id for now - dis_output$fact_0_id <- dis_output$q14_hh_no - - - # age in yr (0 to 14) - assuming doing vaccination coverage among those aged less than 15 yrs - dis_output$q10_age_yr <- sample(0:14L, numcases, replace = TRUE) - - # age in mth (0 to 11) - dis_output$q55_age_mth[dis_output$q10_age_yr < 1] <- sample(0:11L, - nrow(dis_output[dis_output$q10_age_yr < 1,]), - replace = TRUE) - - } - - - # return dataset as a tibble - dplyr::as_tibble(dis_output) - -} - - - - - diff --git a/R/gen_polygon.R b/R/gen_polygon.R deleted file mode 100644 index b2ad40d9..00000000 --- a/R/gen_polygon.R +++ /dev/null @@ -1,51 +0,0 @@ -#' Fake spatial data as polygons -#' This function returns a polygon which is split in to regions based on a -#' supplied vector of names -#' @param regions A string of names for each region to label the polygon with -#' @importFrom sf st_polygon st_make_grid st_intersection st_sf st_set_crs -#' @references The coordinates used for the polygon are of Vienna, Austria. -#' based off government data (see [metadata](https://www.data.gv.at/katalog/dataset/stadt-wien_bezirksgrenzenwien)) -#' @export - -gen_polygon <- function(regions) { - - # get file path - path <- system.file("extdata", "coords.csv", package = "sitrep") - - # read in coordinates as matrix - coords <- as.matrix(read.csv(path)) - # change to list - coords <- list(coords) - - # create a polygon from coordinates - original_poly <- sf::st_polygon(coords) - - # define how many regions we want in our polygon - # high <- ceiling(length(regions) / 2) - high <- ceiling(sqrt(length(regions))) - # change polygon to grid (subdivisions as squares) - gridding <- sf::st_make_grid(original_poly, n = c(high, high), - square = TRUE , what = "polygons") - - # only keep grid parts inside original boundary - geometry <- sf::st_intersection(gridding, original_poly) - - # check if regions is less that grid produces (for odd nums regions) - squares <- length(geometry) - length(regions) - - # create labels for regions - labeling <- regions - - # fix names for those with umatched regions - if (squares > 0) { - labeling <- c(regions, rep.int(NA, squares)) - } - - # polygon in to a list column which can be used as simple features for plot - output_poly <- sf::st_sf(tibble::tibble(name = labeling, geometry = geometry)) - - # Sets coordinate reference systwem to WGS84 - output_poly <- sf::st_set_crs(output_poly, value = 4326) - - output_poly -} diff --git a/R/msf_dict_rename_helper.R b/R/msf_dict_rename_helper.R new file mode 100644 index 00000000..4092663e --- /dev/null +++ b/R/msf_dict_rename_helper.R @@ -0,0 +1,52 @@ +#' Helper for aligning your data to the dictionary +#' +#' @export +#' @param disease Specify which disease you would like to use. +#' Currently supports "Cholera", "Measles" and "Meningitis", "AJS". +#' +#' @param varnames Specify name of column that contains varnames. Currently +#' default set to "Item". (this can probably be deleted once dictionaries +#' standardise) If `dictionary` is "Mortality", `varnames` needs to be "column_name"`. +#' +#' @param copy_to_clipboard if `TRUE` (default), the rename template will be +#' copied to the user's clipboard with [clipr::write_clip()]. If `FALSE`, the +#' rename template will be printed to the user's console. +#' +#' @return a dplyr command used to rename columns in your data frame according +#' to the dictionary +msf_dict_rename_helper <- function(disease, varnames = "data_element_shortname", copy_to_clipboard = TRUE) { + # get msf disease specific data dictionary + dat_dict <- msf_dict(disease = disease, compact = TRUE) + # get the outbreak Rmd to check if the variable is optional or required + outbreak_file <- available_sitrep_templates(recursive = TRUE, pattern = ".Rmd", full.names = TRUE) + outbreak_file <- grep(tolower(disease), outbreak_file, value = TRUE)[[1]] + outbreak_file <- readLines(outbreak_file) + + dat_dict[["var_required"]] <- vapply(dat_dict[[varnames]], + FUN = function(i, o) if (any(grepl(paste0("^[^#]*", i), o))) "REQUIRED" else "optional", + FUN.VALUE = character(1), + o = outbreak_file + ) + dat_dict <- dat_dict[order(dat_dict[["var_required"]] != "REQUIRED", + dat_dict[[varnames]]), ] + msg <- "## Add the appropriate column names after the equals signs\n\n" + msg <- paste0(msg, "linelist_cleaned <- rename(linelist_cleaned,\n") + the_renames <- sprintf(" %s = , # %s (%s)", + format(dat_dict[[varnames]]), + format(dat_dict[["data_element_valuetype"]]), + dat_dict[["var_required"]] + ) + the_renames[length(the_renames)] <- gsub(",", " ", the_renames[length(the_renames)]) + msg <- paste0(msg, paste(the_renames, collapse = "\n"), "\n)\n") + if (copy_to_clipboard) { + x <- try(clipr::write_clip(msg), silent = TRUE) + if (inherits(x, "try-error")) { + if (interactive()) cat(msg) + return(invisible()) + } + message("rename template copied to clipboard. Paste the contents to your RMarkdown file and enter in the column names from your data set.") + } else { + cat(msg) + } +} + diff --git a/R/msf_dict_survey.R b/R/msf_dict_survey.R deleted file mode 100644 index e54c61d7..00000000 --- a/R/msf_dict_survey.R +++ /dev/null @@ -1,94 +0,0 @@ -# function to load MSF data dictionary for mortality surveys - -#' @importFrom rio import -#' @importFrom epitrix clean_labels -#' @importFrom tibble as_tibble -#' @importFrom tidyr fill spread -#' @importFrom dplyr mutate group_by row_number ungroup -#' @export -#' @rdname msf_dict -msf_dict_survey <- function(disease, name = "MSF-survey-dict.xlsx", - tibble = TRUE, - compact = FALSE) { - - # get excel file path (need to specify the file name) - path <- system.file("extdata", name, package = "sitrep") - - # read in data set - pasting the disease name for sheet - dat_dict <- rio::import(path, which = disease) - - # clean col names - colnames(dat_dict) <- epitrix::clean_labels(colnames(dat_dict)) - - # fill NA values with previous non-NA value, replace "." in codes and names - dat_dict <- tidyr::fill(dat_dict, colnames(dat_dict), .direction = "down") - dat_dict <- dplyr::rename_at(dat_dict, - dplyr::vars(dplyr::starts_with("choice_")), - .funs = ~gsub("choice", "option", .) - ) - - # minor tidying, e.g.: create "CodeX" assignments - dat_dict$option_code[dat_dict$option_code == "."] <- NA - dat_dict$option_name[dat_dict$option_name == "."] <- NA - dat_dict$type <- gsub(pattern = "Question", - replacement = "", - x = dat_dict$type) - # dat_dict <- dplyr::group_by(dat_dict, .data$column_name) - # dat_dict <- dplyr::mutate(dat_dict, - # code = paste0("Code", dplyr::row_number())) - # dat_dict <- dplyr::ungroup(dat_dict) - # dat_dict$code <- factor(dat_dict$code, unique(dat_dict$code)) - - # transform dat_dict to wide format (like outbreak dictionary) - dat_dict <- dplyr::select(dat_dict, - !! quote(level), - !! quote(column_name), - !! quote(description), - !! quote(type), - dplyr::starts_with("option_")) - dat_dict <- dplyr::group_by(dat_dict, !! quote(column_name)) - - dat_dict <- dplyr::mutate(dat_dict, option_order_in_set = seq(dplyr::n())) - - if (compact) { - if (utils::packageVersion("tidyr") > "0.8.99") { - dat_dict <- tidyr::nest(dat_dict, options = dplyr::starts_with("option_")) - } else { - squished <- tidyr::nest(dat_dict, dplyr::starts_with("option_"), .key = "options") - dat_dict <- dplyr::select(dat_dict, -dplyr::starts_with("option_")) - dat_dict <- dplyr::distinct(dat_dict) - dat_dict <- dplyr::left_join(dat_dict, squished, by = "column_name") - } - } - dat_dict <- dplyr::ungroup(dat_dict) - - - dat_dict$type <- dplyr::case_when( - dat_dict$type == "Integer" ~ "INTEGER_POSITIVE", - dat_dict$type == "Binary" ~ "TEXT", - dat_dict$type == "ChoiceMulti" ~ "MULTI", - dat_dict$type == "Text" ~ "LONG_TEXT", - dat_dict$type == "Geo" ~ "LONG_TEXT", - dat_dict$type == "Date" ~ "DATE", - dat_dict$type == "Choice" ~ "TEXT", - dat_dict$type == "Number" ~ "INTEGER_POSITIVE" - ) - - dat_dict <- dplyr::rename(dat_dict, "data_element_valuetype" = "type") - - # clean future var names - # excel names (data element shortname) - # csv names (data_element_name) - dat_dict$column_name <- epitrix::clean_labels(dat_dict$column_name) - - dat_dict <- data.frame(dat_dict, stringsAsFactors = FALSE) - - # return a tibble - if (compact || tibble) { - dat_dict <- tibble::as_tibble(dat_dict) - } - - dat_dict -} - - diff --git a/R/msfdict_exports.R b/R/msfdict_exports.R new file mode 100644 index 00000000..6de4fed5 --- /dev/null +++ b/R/msfdict_exports.R @@ -0,0 +1,34 @@ +# These functions have been re-exported from the {msfdict} package. They were +# previously part of the sitrep package, but have been moved into their own +# separate package for maintenance. By re-exporting them, the user does not +# see much of a difference. This is different than the {apyramid} package where +# we use the function internally as it has modified syntax. + +#' Functions re-expored from {msfdict} +#' +#' @seealso Dictionaries: [msfdict::msf_dict()], [msfdict::msf_dict_survey()]\cr +#' Generators: [msfdict::gen_data()], [msfdict::gen_polygon()] +#' @name msf_dict_survey +#' @importFrom msfdict msf_dict_survey +#' @export +#' @rdname msf_dict +"msf_dict_survey" + +#' @name msf_dict +#' @importFrom msfdict msf_dict +#' @export +#' @rdname msf_dict +"msf_dict" + +#' @name gen_data +#' @importFrom msfdict gen_data +#' @export +#' @rdname msf_dict +"gen_data" + +#' @name gen_polygon +#' @importFrom msfdict gen_polygon +#' @export +#' @rdname msf_dict +"gen_polygon" + diff --git a/R/template_data_frame.R b/R/template_data_frame.R deleted file mode 100644 index de1303d9..00000000 --- a/R/template_data_frame.R +++ /dev/null @@ -1,76 +0,0 @@ -template_data_frame_categories <- function(dat_dict, numcases, varnames, survey = FALSE) { - - dat_output <- dat_dict[, c(varnames, "options"), drop = FALSE] - - # create a NEW empty dataframe with the names from the data dictionary - dis_output <- data.frame(matrix(ncol = nrow(dat_output), nrow = numcases)) - - colnames(dis_output) <- dat_dict[[varnames]] - - - if (utils::packageVersion("tidyr") > "0.8.99") { - categories <- tidyr::unnest(dat_dict, cols = "options") - } else { - categories <- tidyr::unnest(dat_dict) - } - categories <- dplyr::filter(categories, !is.na(!! quote(option_name))) - - # take samples for vars with defined options (non empties) - for (i in unique(categories[[varnames]])) { - vals <- categories[categories[[varnames]] == i, ] - if (survey) { - vals <- factor(vals$option_name, vals$option_name[vals$option_order_in_set]) - } else { - vals <- factor(vals$option_code, vals$option_code[vals$option_order_in_set]) - } - dis_output[[i]] <- sample(vals, numcases, replace = TRUE) - } - - multivars <- dat_dict[[varnames]][dat_dict$data_element_valuetype == "MULTI"] - - if (length(multivars) > 0) { - sample_multivars <- lapply(multivars, sample_cats, numcases = numcases, - df = categories, varnames = varnames) - - dis_output[, multivars] <- NULL - dis_output <- dplyr::bind_cols(dis_output, sample_multivars) - } - - dis_output -} - - -# Enforces timing between two columns in a data frame. -# -# The data in the first column must come before the second column. If the timing -# isn't correct, then force the timing to be correct by making the second column -# bigger than the first by `add`. -enforce_timing <- function(x, first, second, add = 2) { - - mistakes <- x[[second]] <= x[[first]] - x[[second]][mistakes] <- x[[first]][mistakes] + add - x - -} - -# sample of a single value and NA -sample_single <- function(x, size, prob = 0.1) { - sample(c(x, NA), size = size, prob = c(prob, 1 - prob), replace = TRUE) -} - - -# random data for one single "MULTI" variable (split into multiple columns) -sample_cats <- function(category, numcases, df, varnames) { - - dat <- df[df[[varnames]] == category, , drop = FALSE] - - lvls <- dat$option_name - # define suffixes for column names, e.g. 000, 001, 002, ... - suffixes <- formatC((seq_along(lvls)) - 1, width = 3, format = "d", flag = "0") - - # create columns with randomized lvls with randomized probability - extra_cols <- vapply(lvls, sample_single, FUN.VALUE = character(numcases), - size = numcases, prob = sample(5:15, 1) / 100) - colnames(extra_cols) <- paste0(category, "_", suffixes) - as.data.frame(extra_cols, stringsAsFactors = FALSE) -} diff --git a/man/gen_polygon.Rd b/man/gen_polygon.Rd deleted file mode 100644 index bb5bf245..00000000 --- a/man/gen_polygon.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gen_polygon.R -\name{gen_polygon} -\alias{gen_polygon} -\title{Fake spatial data as polygons -This function returns a polygon which is split in to regions based on a -supplied vector of names} -\usage{ -gen_polygon(regions) -} -\arguments{ -\item{regions}{A string of names for each region to label the polygon with} -} -\description{ -Fake spatial data as polygons -This function returns a polygon which is split in to regions based on a -supplied vector of names -} -\references{ -The coordinates used for the polygon are of Vienna, Austria. -based off government data (see \href{https://www.data.gv.at/katalog/dataset/stadt-wien_bezirksgrenzenwien}{metadata}) -} diff --git a/man/msf_dict.Rd b/man/msf_dict.Rd index 9302ba4a..3ab64500 100644 --- a/man/msf_dict.Rd +++ b/man/msf_dict.Rd @@ -1,104 +1,27 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gen_data.R, R/msf_dict_survey.R -\name{msf_dict} +% Please edit documentation in R/msfdict_exports.R +\docType{data} +\name{msf_dict_survey} +\alias{msf_dict_survey} \alias{msf_dict} -\alias{msf_dict_rename_helper} \alias{gen_data} -\alias{msf_dict_survey} -\title{MSF data dictionaries and dummy datasets} +\alias{gen_polygon} +\title{Functions re-expored from {msfdict}} +\format{An object of class \code{function} of length 1.} \usage{ -msf_dict( - disease, - name = "MSF-outbreak-dict.xlsx", - tibble = TRUE, - compact = TRUE, - long = TRUE -) - -msf_dict_rename_helper( - disease, - varnames = "data_element_shortname", - copy_to_clipboard = TRUE -) - -gen_data(dictionary, varnames = "data_element_shortname", numcases = 300) - -msf_dict_survey( - disease, - name = "MSF-survey-dict.xlsx", - tibble = TRUE, - compact = FALSE -) -} -\arguments{ -\item{disease}{Specify which disease you would like to use. -Currently supports "Cholera", "Measles" and "Meningitis".} - -\item{name}{the name of the dictionary stored in the package.} - -\item{tibble}{Return data dictionary as a tidyverse tibble (default is TRUE)} +msf_dict_survey -\item{compact}{if \code{TRUE} (default), then a nested data frame is returned -where each row represents a single variable and a nested data frame column -called "options", which can be expanded with \code{\link[tidyr:unnest]{tidyr::unnest()}}. This only -works if \code{long = TRUE}.} +msf_dict -\item{long}{If TRUE (default), the returned data dictionary is in long format with -each option getting one row. If \code{FALSE}, then two data frames are returned, -one with variables and the other with content options.} +gen_data -\item{varnames}{Specify name of column that contains varnames. Currently -default set to "Item". (this can probably be deleted once dictionaries -standardise) If \code{dictionary} is "Mortality", \code{varnames} needs to be "column_name"`.} - -\item{copy_to_clipboard}{if \code{TRUE} (default), the rename template will be -copied to the user's clipboard with \code{\link[clipr:write_clip]{clipr::write_clip()}}. If \code{FALSE}, the -rename template will be printed to the user's console.} - -\item{dictionary}{Specify which dictionary you would like to use. -Currently supports "Cholera", "Measles", "Meningitis", "AJS" and "Mortality".} - -\item{numcases}{For fake data, specify the number of cases you want (default is 300} +gen_polygon } \description{ -These function reads in MSF data dictionaries and produces randomised -datasets based on values defined in the dictionaries. The randomised -dataset produced should mimic an excel export from DHIS2. -} -\examples{ - -if (require('dplyr') & require('linelist')) { withAutoprint({ -# You will often want to use MSF dictionaries to translate codes to human- -# readable variables. Here, we generate a data set of 20 cases: -dat <- gen_data(dictionary = "Cholera", varnames = "data_element_shortname", - numcases = 20) -print(dat) - -# We want the expanded dictionary, so we will select `compact = FALSE` -dict <- msf_dict(disease = "Cholera", long = TRUE, compact = FALSE, tibble = TRUE) -print(dict) - -# We can use linelist's clean_variable_spelling to translate the codes. First, -# we want to reorder the columns of the dictionary like so: -# -# - 1st column: option codes -# - 2nd column: translations -# - 3rd column: data column name -# - 4th column: order of options -# -# we also want to make sure to filter out any columns that are blank for -# the option codes, because this means that they don't have a fixed number of -# options -dict <- dict \%>\% - select(option_code, option_name, data_element_shortname, option_order_in_set) \%>\% - filter(!is.na(option_code)) -print(dict) - -# Now we can use linelist to filter the data: -dat_clean <- clean_variable_spelling(dat, dict) -print(dat_clean) -})} +Functions re-expored from {msfdict} } \seealso{ -\code{\link[linelist:clean_variable_spelling]{linelist::clean_variable_spelling()}} +Dictionaries: \code{\link[msfdict:msf_dict]{msfdict::msf_dict()}}, \code{\link[msfdict:msf_dict_survey]{msfdict::msf_dict_survey()}}\cr +Generators: \code{\link[msfdict:gen_data]{msfdict::gen_data()}}, \code{\link[msfdict:gen_polygon]{msfdict::gen_polygon()}} } +\keyword{datasets} diff --git a/man/msf_dict_rename_helper.Rd b/man/msf_dict_rename_helper.Rd new file mode 100644 index 00000000..9aeff734 --- /dev/null +++ b/man/msf_dict_rename_helper.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msf_dict_rename_helper.R +\name{msf_dict_rename_helper} +\alias{msf_dict_rename_helper} +\title{Helper for aligning your data to the dictionary} +\usage{ +msf_dict_rename_helper( + disease, + varnames = "data_element_shortname", + copy_to_clipboard = TRUE +) +} +\arguments{ +\item{disease}{Specify which disease you would like to use. +Currently supports "Cholera", "Measles" and "Meningitis", "AJS".} + +\item{varnames}{Specify name of column that contains varnames. Currently +default set to "Item". (this can probably be deleted once dictionaries +standardise) If \code{dictionary} is "Mortality", \code{varnames} needs to be "column_name"`.} + +\item{copy_to_clipboard}{if \code{TRUE} (default), the rename template will be +copied to the user's clipboard with \code{\link[clipr:write_clip]{clipr::write_clip()}}. If \code{FALSE}, the +rename template will be printed to the user's console.} +} +\value{ +a dplyr command used to rename columns in your data frame according +to the dictionary +} +\description{ +Helper for aligning your data to the dictionary +}