From 95c2250557cd3b5301453e99072f971769d19ed7 Mon Sep 17 00:00:00 2001 From: Steffen Ehrmann Date: Mon, 21 Oct 2024 10:32:00 +0200 Subject: [PATCH] . --- DESCRIPTION | 140 +- NAMESPACE | 224 +-- R/find.R | 180 +- R/getIDVars.R | 298 ++-- R/getObsVars.R | 362 ++-- R/helpers.R | 1914 +++++++++++----------- R/reorganise.R | 204 +-- R/reportProblems.R | 158 +- R/schema.R | 1418 ++++++++-------- R/setClusters.R | 196 +-- R/setFilter.R | 166 +- R/setFormat.R | 156 +- R/setGroups.R | 106 +- R/setIDVar.R | 190 +-- R/setObsVar.R | 188 +-- R/sum.R | 112 +- R/validateInput.R | 276 ++-- R/validateSchema.R | 542 +++--- R/zzz.R | 8 +- man/dot-sum.Rd | 74 +- man/setGroups.Rd | 78 +- man/setIDVar.Rd | 158 +- tests/testthat/test-02_column_mismatch.R | 120 +- tests/testthat/test-03_wide_id.R | 264 +-- tests/testthat/test-08_groups.R | 136 +- tests/testthat/test-sum.R | 44 +- vignettes/tabshiftr.Rmd | 1042 ++++++------ 27 files changed, 4377 insertions(+), 4377 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4a65a35..eeef618 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,70 +1,70 @@ -Package: tabshiftr -Title: Reshape Disorganised Messy Data -Version: 0.5.1 -Authors@R: - c(person(given = "Steffen", - family = "Ehrmann", - role = c("aut", "cre"), - email = "steffen.ehrmann@posteo.de", - comment = c(ORCID = "0000-0002-2958-0796")), - person(given = "Tsvetelina", - family = "Tomova", - role = "ctb", - email = "tsvetelina.tomova@gmail.com"), - person(given = "Carsten", - family = "Meyer", - role = "aut", - email = "carsten.meyer@idiv.de", - comment = c(ORCID = "0000-0003-3927-5856")), - person(given = "Abdualmaged", - family = "Alhemiary", - role = "ctb"), - person(given = "Amelie", - family = "Haas", - role = "ctb"), - person(given = "Annika", - family = "Ertel", - role = "ctb"), - person(given = "Arne", - family = "Rümmler", - role = "ctb", - email = "arne.ruemmler@tu-dresden.de", - comment = c(ORCID = "0000-0001-8637-9071")), - person(given = "Caroline", - family = "Busse", - role = "ctb")) -Description: Helps the user to build and register schema descriptions of - disorganised (messy) tables. Disorganised tables are tables that are - not in a topologically coherent form, where packages such as 'tidyr' could - be used for reshaping. The schema description documents the arrangement of - input tables and is used to reshape them into a standardised (tidy) output - format. 
-URL: https://luckinet.github.io/tabshiftr/, https://github.com/luckinet/tabshiftr -BugReports: https://github.com/luckinet/tabshiftr/issues -Depends: - R (>= 2.10) -Language: en-gb -License: GPL-3 -Encoding: UTF-8 -LazyData: true -Imports: - checkmate, - rlang, - tibble, - dplyr, - tidyr, - magrittr, - tidyselect, - testthat, - crayon, - methods, - purrr, - stringr, - lubridate -RoxygenNote: 7.3.2 -Suggests: - knitr, - rmarkdown, - bookdown, - readr -VignetteBuilder: knitr +Package: tabshiftr +Title: Reshape Disorganised Messy Data +Version: 0.5.1 +Authors@R: + c(person(given = "Steffen", + family = "Ehrmann", + role = c("aut", "cre"), + email = "steffen.ehrmann@posteo.de", + comment = c(ORCID = "0000-0002-2958-0796")), + person(given = "Tsvetelina", + family = "Tomova", + role = "ctb", + email = "tsvetelina.tomova@gmail.com"), + person(given = "Carsten", + family = "Meyer", + role = "aut", + email = "carsten.meyer@idiv.de", + comment = c(ORCID = "0000-0003-3927-5856")), + person(given = "Abdualmaged", + family = "Alhemiary", + role = "ctb"), + person(given = "Amelie", + family = "Haas", + role = "ctb"), + person(given = "Annika", + family = "Ertel", + role = "ctb"), + person(given = "Arne", + family = "Rümmler", + role = "ctb", + email = "arne.ruemmler@tu-dresden.de", + comment = c(ORCID = "0000-0001-8637-9071")), + person(given = "Caroline", + family = "Busse", + role = "ctb")) +Description: Helps the user to build and register schema descriptions of + disorganised (messy) tables. Disorganised tables are tables that are + not in a topologically coherent form, where packages such as 'tidyr' could + be used for reshaping. The schema description documents the arrangement of + input tables and is used to reshape them into a standardised (tidy) output + format. 
+URL: https://luckinet.github.io/tabshiftr/, https://github.com/luckinet/tabshiftr +BugReports: https://github.com/luckinet/tabshiftr/issues +Depends: + R (>= 2.10) +Language: en-gb +License: GPL-3 +Encoding: UTF-8 +LazyData: true +Imports: + checkmate, + rlang, + tibble, + dplyr, + tidyr, + magrittr, + tidyselect, + testthat, + crayon, + methods, + purrr, + stringr, + lubridate +RoxygenNote: 7.3.2 +Suggests: + knitr, + rmarkdown, + bookdown, + readr +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index f2e47cb..f0ca08b 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,112 +1,112 @@ -# Generated by roxygen2: do not edit by hand - -export("%>%") -export(.find) -export(.sum) -export(getClusterVar) -export(getGroupVar) -export(getIDVars) -export(getObsVars) -export(reorganise) -export(setCluster) -export(setFilter) -export(setFormat) -export(setGroups) -export(setIDVar) -export(setObsVar) -export(validateInput) -export(validateSchema) -importFrom(checkmate,assert) -importFrom(checkmate,assertCharacter) -importFrom(checkmate,assertChoice) -importFrom(checkmate,assertClass) -importFrom(checkmate,assertDataFrame) -importFrom(checkmate,assertFunction) -importFrom(checkmate,assertIntegerish) -importFrom(checkmate,assertList) -importFrom(checkmate,assertLogical) -importFrom(checkmate,assertNames) -importFrom(checkmate,assertNumeric) -importFrom(checkmate,assertSetEqual) -importFrom(checkmate,assertSubset) -importFrom(checkmate,assertTRUE) -importFrom(checkmate,expect_list) -importFrom(checkmate,expect_names) -importFrom(checkmate,expect_tibble) -importFrom(checkmate,testCharacter) -importFrom(checkmate,testClass) -importFrom(checkmate,testFunction) -importFrom(checkmate,testIntegerish) -importFrom(checkmate,testList) -importFrom(crayon,yellow) -importFrom(dplyr,across) -importFrom(dplyr,add_row) -importFrom(dplyr,arrange) -importFrom(dplyr,arrange_at) -importFrom(dplyr,bind_cols) -importFrom(dplyr,bind_rows) -importFrom(dplyr,case_when) -importFrom(dplyr,distinct) -importFrom(dplyr,everything) -importFrom(dplyr,filter) -importFrom(dplyr,full_join) -importFrom(dplyr,group_by) -importFrom(dplyr,if_any) -importFrom(dplyr,if_else) -importFrom(dplyr,left_join) -importFrom(dplyr,mutate) -importFrom(dplyr,mutate_all) -importFrom(dplyr,n) -importFrom(dplyr,na_if) -importFrom(dplyr,pull) -importFrom(dplyr,right_join) -importFrom(dplyr,row_number) -importFrom(dplyr,select) -importFrom(dplyr,slice) -importFrom(dplyr,summarise) -importFrom(dplyr,ungroup) -importFrom(lubridate,is.Date) -importFrom(magrittr,"%>%") -importFrom(methods,new) -importFrom(purrr,map) -importFrom(purrr,map_chr) -importFrom(purrr,map_dfc) -importFrom(purrr,map_int) -importFrom(purrr,map_lgl) -importFrom(purrr,reduce) -importFrom(purrr,set_names) -importFrom(rlang,`:=`) -importFrom(rlang,enquo) -importFrom(rlang,enquos) -importFrom(rlang,eval_tidy) -importFrom(rlang,is_integerish) -importFrom(rlang,is_primitive) -importFrom(rlang,is_quosure) -importFrom(rlang,prim_name) -importFrom(stats,na.omit) -importFrom(stringr,coll) -importFrom(stringr,str_c) -importFrom(stringr,str_count) -importFrom(stringr,str_detect) -importFrom(stringr,str_extract_all) -importFrom(stringr,str_remove_all) -importFrom(stringr,str_split) -importFrom(stringr,str_sub) -importFrom(testthat,expect_identical) -importFrom(tibble,as_tibble) -importFrom(tibble,as_tibble_row) -importFrom(tibble,rownames_to_column) -importFrom(tibble,tibble) -importFrom(tidyr,everything) -importFrom(tidyr,extract) -importFrom(tidyr,fill) -importFrom(tidyr,pivot_longer) 
-importFrom(tidyr,pivot_wider) -importFrom(tidyr,replace_na) -importFrom(tidyr,separate) -importFrom(tidyr,separate_longer_delim) -importFrom(tidyr,unite) -importFrom(tidyselect,all_of) -importFrom(tidyselect,everything) -importFrom(tidyselect,starts_with) -importFrom(tidyselect,where) +# Generated by roxygen2: do not edit by hand + +export("%>%") +export(.find) +export(.sum) +export(getClusterVar) +export(getGroupVar) +export(getIDVars) +export(getObsVars) +export(reorganise) +export(setCluster) +export(setFilter) +export(setFormat) +export(setGroups) +export(setIDVar) +export(setObsVar) +export(validateInput) +export(validateSchema) +importFrom(checkmate,assert) +importFrom(checkmate,assertCharacter) +importFrom(checkmate,assertChoice) +importFrom(checkmate,assertClass) +importFrom(checkmate,assertDataFrame) +importFrom(checkmate,assertFunction) +importFrom(checkmate,assertIntegerish) +importFrom(checkmate,assertList) +importFrom(checkmate,assertLogical) +importFrom(checkmate,assertNames) +importFrom(checkmate,assertNumeric) +importFrom(checkmate,assertSetEqual) +importFrom(checkmate,assertSubset) +importFrom(checkmate,assertTRUE) +importFrom(checkmate,expect_list) +importFrom(checkmate,expect_names) +importFrom(checkmate,expect_tibble) +importFrom(checkmate,testCharacter) +importFrom(checkmate,testClass) +importFrom(checkmate,testFunction) +importFrom(checkmate,testIntegerish) +importFrom(checkmate,testList) +importFrom(crayon,yellow) +importFrom(dplyr,across) +importFrom(dplyr,add_row) +importFrom(dplyr,arrange) +importFrom(dplyr,arrange_at) +importFrom(dplyr,bind_cols) +importFrom(dplyr,bind_rows) +importFrom(dplyr,case_when) +importFrom(dplyr,distinct) +importFrom(dplyr,everything) +importFrom(dplyr,filter) +importFrom(dplyr,full_join) +importFrom(dplyr,group_by) +importFrom(dplyr,if_any) +importFrom(dplyr,if_else) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(dplyr,mutate_all) +importFrom(dplyr,n) +importFrom(dplyr,na_if) +importFrom(dplyr,pull) +importFrom(dplyr,right_join) +importFrom(dplyr,row_number) +importFrom(dplyr,select) +importFrom(dplyr,slice) +importFrom(dplyr,summarise) +importFrom(dplyr,ungroup) +importFrom(lubridate,is.Date) +importFrom(magrittr,"%>%") +importFrom(methods,new) +importFrom(purrr,map) +importFrom(purrr,map_chr) +importFrom(purrr,map_dfc) +importFrom(purrr,map_int) +importFrom(purrr,map_lgl) +importFrom(purrr,reduce) +importFrom(purrr,set_names) +importFrom(rlang,`:=`) +importFrom(rlang,enquo) +importFrom(rlang,enquos) +importFrom(rlang,eval_tidy) +importFrom(rlang,is_integerish) +importFrom(rlang,is_primitive) +importFrom(rlang,is_quosure) +importFrom(rlang,prim_name) +importFrom(stats,na.omit) +importFrom(stringr,coll) +importFrom(stringr,str_c) +importFrom(stringr,str_count) +importFrom(stringr,str_detect) +importFrom(stringr,str_extract_all) +importFrom(stringr,str_remove_all) +importFrom(stringr,str_split) +importFrom(stringr,str_sub) +importFrom(testthat,expect_identical) +importFrom(tibble,as_tibble) +importFrom(tibble,as_tibble_row) +importFrom(tibble,rownames_to_column) +importFrom(tibble,tibble) +importFrom(tidyr,everything) +importFrom(tidyr,extract) +importFrom(tidyr,fill) +importFrom(tidyr,pivot_longer) +importFrom(tidyr,pivot_wider) +importFrom(tidyr,replace_na) +importFrom(tidyr,separate) +importFrom(tidyr,separate_longer_delim) +importFrom(tidyr,unite) +importFrom(tidyselect,all_of) +importFrom(tidyselect,everything) +importFrom(tidyselect,starts_with) +importFrom(tidyselect,where) diff --git a/R/find.R b/R/find.R 
index f3d2cfd..78a4b1b 100644 --- a/R/find.R +++ b/R/find.R @@ -1,91 +1,91 @@ -#' Determine row or column on the fly -#' -#' Find the location of a variable not based on it's columns/rows, but based on -#' a regular expression or function -#' @param fun [\code{character(1)}]\cr function to identify columns or rows in -#' the input table on the fly. -#' @param pattern [\code{character(1)}]\cr character string containing a regular -#' expression to identify columns or rows in the input table on the fly. -#' @param col [\code{integerish(1)}]\cr optionally, in case this function should -#' only be applied to certain columns, provides this here. -#' @param row [\code{integerish(1)}]\cr optionally, in case this function should -#' only be applied to certain rows, provides this here. -#' @param invert [\code{logical(1)}]\cr whether or not the identified columns or -#' rows should be inverted, i.e., all other columns or rows should be -#' selected. -#' @param relative [\code{logical(1)}]\cr whether or not the values provided in -#' \code{col} or \code{row} are relative to the cluster position(s) or whether -#' they are absolute positions, i.e, refer to the overall table. -#' @details This functions is basically a wild-card for when columns or rows are -#' not known ad-hoc, but have to be assigned on the fly. This can be very -#' helpful when several tables contain the same variables, but the arrangement -#' may be slightly different. -#' @section How does this work: The first step in using any schema is validating -#' it via the function \code{\link{validateSchema}}. This happens by default -#' in \code{\link{reorganise}}, but can also be done manually, for example -#' when debugging complicated schema descriptions. -#' -#' In case that function encounters a schema that wants to find columns or -#' rows on the fly via \code{.find}, it combines all cells of columns and all -#' cells of rows into one character string and matches the regular expression -#' or function on those. Columns/rows that have a match are returned as the -#' respective column/row value. -#' @return the index values where the target was found. 
-#' @examples -#' # use regular expressions to find cell positions -#' (input <- tabs2shift$clusters_messy) -#' -#' schema <- setCluster(id = "territories", -#' left = .find(pattern = "comm*"), top = .find(pattern = "comm*")) %>% -#' setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% -#' setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% -#' setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% -#' setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% -#' setObsVar(name = "production", columns = c(3, 3, 6)) -#' -#' schema -#' validateSchema(schema = schema, input = input) -#' -#' # use a function to find rows -#' (input <- tabs2shift$messy_rows) -#' -#' schema <- -#' setFilter(rows = .find(fun = is.numeric, col = 1, invert = TRUE)) %>% -#' setIDVar(name = "territories", columns = 1) %>% -#' setIDVar(name = "year", columns = 2) %>% -#' setIDVar(name = "commodities", columns = 3) %>% -#' setObsVar(name = "harvested", columns = 5) %>% -#' setObsVar(name = "production", columns = 6) -#' -#' reorganise(schema = schema, input = input) -#' @importFrom checkmate testCharacter testFunction assert assertLogical -#' @importFrom purrr map_chr -#' @importFrom rlang enquo -#' @export - -.find <- function(fun = NULL, pattern = NULL, col = NULL, row = NULL, - invert = FALSE, relative = FALSE){ - - # assertions ---- - assertFunction(x = fun, null.ok = TRUE) - assertCharacter(x = pattern, null.ok = TRUE) - assertLogical(x = invert, len = 1) - assertLogical(x = relative, len = 1) - - if(!is.null(fun) & !is.null(pattern)){ - stop("please specifiy either 'fun' or 'pattern', but not both.") - } - - if(!is.null(fun)){ - temp <- enquo(fun) - } else if(!is.null(pattern)){ - temp <- enquo(pattern) - } else { - temp <- NULL - } - - out <- list(find = list(by = temp, col = col, row = row, invert = invert, relative = relative)) - - return(out) - +#' Determine row or column on the fly +#' +#' Find the location of a variable not based on it's columns/rows, but based on +#' a regular expression or function +#' @param fun [\code{character(1)}]\cr function to identify columns or rows in +#' the input table on the fly. +#' @param pattern [\code{character(1)}]\cr character string containing a regular +#' expression to identify columns or rows in the input table on the fly. +#' @param col [\code{integerish(1)}]\cr optionally, in case this function should +#' only be applied to certain columns, provides this here. +#' @param row [\code{integerish(1)}]\cr optionally, in case this function should +#' only be applied to certain rows, provides this here. +#' @param invert [\code{logical(1)}]\cr whether or not the identified columns or +#' rows should be inverted, i.e., all other columns or rows should be +#' selected. +#' @param relative [\code{logical(1)}]\cr whether or not the values provided in +#' \code{col} or \code{row} are relative to the cluster position(s) or whether +#' they are absolute positions, i.e, refer to the overall table. +#' @details This functions is basically a wild-card for when columns or rows are +#' not known ad-hoc, but have to be assigned on the fly. This can be very +#' helpful when several tables contain the same variables, but the arrangement +#' may be slightly different. +#' @section How does this work: The first step in using any schema is validating +#' it via the function \code{\link{validateSchema}}. 
This happens by default +#' in \code{\link{reorganise}}, but can also be done manually, for example +#' when debugging complicated schema descriptions. +#' +#' In case that function encounters a schema that wants to find columns or +#' rows on the fly via \code{.find}, it combines all cells of columns and all +#' cells of rows into one character string and matches the regular expression +#' or function on those. Columns/rows that have a match are returned as the +#' respective column/row value. +#' @return the index values where the target was found. +#' @examples +#' # use regular expressions to find cell positions +#' (input <- tabs2shift$clusters_messy) +#' +#' schema <- setCluster(id = "territories", +#' left = .find(pattern = "comm*"), top = .find(pattern = "comm*")) %>% +#' setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% +#' setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% +#' setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% +#' setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% +#' setObsVar(name = "production", columns = c(3, 3, 6)) +#' +#' schema +#' validateSchema(schema = schema, input = input) +#' +#' # use a function to find rows +#' (input <- tabs2shift$messy_rows) +#' +#' schema <- +#' setFilter(rows = .find(fun = is.numeric, col = 1, invert = TRUE)) %>% +#' setIDVar(name = "territories", columns = 1) %>% +#' setIDVar(name = "year", columns = 2) %>% +#' setIDVar(name = "commodities", columns = 3) %>% +#' setObsVar(name = "harvested", columns = 5) %>% +#' setObsVar(name = "production", columns = 6) +#' +#' reorganise(schema = schema, input = input) +#' @importFrom checkmate testCharacter testFunction assert assertLogical +#' @importFrom purrr map_chr +#' @importFrom rlang enquo +#' @export + +.find <- function(fun = NULL, pattern = NULL, col = NULL, row = NULL, + invert = FALSE, relative = FALSE){ + + # assertions ---- + assertFunction(x = fun, null.ok = TRUE) + assertCharacter(x = pattern, null.ok = TRUE) + assertLogical(x = invert, len = 1) + assertLogical(x = relative, len = 1) + + if(!is.null(fun) & !is.null(pattern)){ + stop("please specifiy either 'fun' or 'pattern', but not both.") + } + + if(!is.null(fun)){ + temp <- enquo(fun) + } else if(!is.null(pattern)){ + temp <- enquo(pattern) + } else { + temp <- NULL + } + + out <- list(find = list(by = temp, col = col, row = row, invert = invert, relative = relative)) + + return(out) + } \ No newline at end of file diff --git a/R/getIDVars.R b/R/getIDVars.R index b322e14..c09aa8a 100644 --- a/R/getIDVars.R +++ b/R/getIDVars.R @@ -1,150 +1,150 @@ -#' Extract identifying variables -#' -#' This function extracts the identifying variables from a table by applying a -#' schema description to it. -#' @param schema [\code{character(1)}]\cr the (validated) schema description of -#' \code{input}. -#' @param input [\code{character(1)}]\cr table to reorganise. 
-#' @return a list per cluster with values of the identifying variables -#' @examples -#' input <- tabs2shift$clusters_nested -#' schema <- setCluster(id = "sublevel", -#' group = "territories", member = c(1, 1, 2), -#' left = 1, top = c(3, 8, 15)) %>% -#' setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% -#' setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% -#' setIDVar(name = "year", columns = 7) %>% -#' setIDVar(name = "commodities", columns = 2) %>% -#' setObsVar(name = "harvested", columns = 5) %>% -#' setObsVar(name = "production", columns = 6) -#' -#' validateSchema(schema = schema, input = input) %>% -#' getIDVars(input = input) -#' @importFrom checkmate assertTRUE -#' @importFrom tibble tibble -#' @importFrom purrr map set_names map_dfc -#' @importFrom dplyr row_number filter select -#' @importFrom tidyr extract unite fill -#' @importFrom tidyselect all_of -#' @export - -getIDVars <- function(schema = NULL, input = NULL){ - - assertTRUE(x = schema@validated) - - clusters <- schema@clusters - nClusters <- max(lengths(clusters)) - - variables <- schema@variables - filter <- schema@filter - - idVars <- map(.x = seq_along(variables), .f = function(ix){ - # unselect those id variables that are also cluster or group id - if(variables[[ix]]$vartype == "id" & !names(variables)[ix] %in% c(clusters$id, clusters$group)){ - variables[ix] - } - }) - idVars <- unlist(idVars, recursive = FALSE) - - if(length(idVars) != 0){ - - out <- map(.x = 1:nClusters, .f = function(ix){ - vars <- NULL - for(i in 1:length(idVars)){ - - tempVar <- idVars[[i]] - varRow <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) - - if(!is.null(tempVar$value)){ - temp <- tibble(X = tempVar$value) - } else { - - if(!is.null(tempVar$row[ix])){ - if(!tempVar$dist){ - # in case a row value is set, this means we deal with a variable that is not tidy ... - temp <- input[tempVar$row[ix], tempVar$col] - rowFilter <- NULL - if(!is.null(filter$col)){ - colFilter <- colnames(temp)[tempVar$col %in% filter$col] - } else { - colFilter <- NULL - } - } else { - # ... or distinct from clusters - temp <- input[unique(tempVar$row), unique(tempVar$col)] - rowFilter <- NULL - colFilter <- NULL - } - } else { - - if(!is.null(tempVar$merge)){ - temp <- input[varRow, tempVar$col] - rowFilter <- filter$row - colFilter <- NULL - } else { - temp <- input[varRow, tempVar$col[ix]] - rowFilter <- which(varRow %in% filter$row) - colFilter <- NULL - } - - } - - # apply a row filter ... - if(!is.null(rowFilter)){ - temp <- temp %>% - filter(row_number() %in% rowFilter) - } - - # ... and column filter - if(!is.null(colFilter)){ - temp <- temp %>% - select(all_of(colFilter)) - } - - # copy missing values downwards - if(anyNA(temp[1])){ - message("filling NA-values in variable '", names(idVars[i]),"'.") - temp <- temp %>% - fill(1, .direction = "down") - } - - # split ... - if(!is.null(tempVar$split)){ - # need to distinguish between one and several columns - if(dim(temp)[2] == 1){ - temp <- temp %>% - extract(col = 1, into = names(temp), regex = tempVar$split) - } else { - temp <- map(.x = seq_along(temp), .f = function(iy){ - temp %>% - select(all_of(iy)) %>% - tidyr::extract(col = 1, into = names(temp)[iy], regex = tempVar$split) - }) %>% bind_cols(.name_repair = "check_unique") - } - } - - # ... 
or merge the variable - if(!is.null(tempVar$merge)){ - newName <- paste0(names(temp), collapse = tempVar$merge) - temp <- temp %>% - unite(col = !!newName, sep = tempVar$merge) - } - - } - - vars <- c(vars, set_names(x = list(temp), nm = names(idVars)[i])) - - } - return(vars) - - }) - - - - } else { - out <- NULL - } - - return(out) - +#' Extract identifying variables +#' +#' This function extracts the identifying variables from a table by applying a +#' schema description to it. +#' @param schema [\code{character(1)}]\cr the (validated) schema description of +#' \code{input}. +#' @param input [\code{character(1)}]\cr table to reorganise. +#' @return a list per cluster with values of the identifying variables +#' @examples +#' input <- tabs2shift$clusters_nested +#' schema <- setCluster(id = "sublevel", +#' group = "territories", member = c(1, 1, 2), +#' left = 1, top = c(3, 8, 15)) %>% +#' setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% +#' setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% +#' setIDVar(name = "year", columns = 7) %>% +#' setIDVar(name = "commodities", columns = 2) %>% +#' setObsVar(name = "harvested", columns = 5) %>% +#' setObsVar(name = "production", columns = 6) +#' +#' validateSchema(schema = schema, input = input) %>% +#' getIDVars(input = input) +#' @importFrom checkmate assertTRUE +#' @importFrom tibble tibble +#' @importFrom purrr map set_names map_dfc +#' @importFrom dplyr row_number filter select +#' @importFrom tidyr extract unite fill +#' @importFrom tidyselect all_of +#' @export + +getIDVars <- function(schema = NULL, input = NULL){ + + assertTRUE(x = schema@validated) + + clusters <- schema@clusters + nClusters <- max(lengths(clusters)) + + variables <- schema@variables + filter <- schema@filter + + idVars <- map(.x = seq_along(variables), .f = function(ix){ + # unselect those id variables that are also cluster or group id + if(variables[[ix]]$vartype == "id" & !names(variables)[ix] %in% c(clusters$id, clusters$group)){ + variables[ix] + } + }) + idVars <- unlist(idVars, recursive = FALSE) + + if(length(idVars) != 0){ + + out <- map(.x = 1:nClusters, .f = function(ix){ + vars <- NULL + for(i in 1:length(idVars)){ + + tempVar <- idVars[[i]] + varRow <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) + + if(!is.null(tempVar$value)){ + temp <- tibble(X = tempVar$value) + } else { + + if(!is.null(tempVar$row[ix])){ + if(!tempVar$dist){ + # in case a row value is set, this means we deal with a variable that is not tidy ... + temp <- input[tempVar$row[ix], tempVar$col] + rowFilter <- NULL + if(!is.null(filter$col)){ + colFilter <- colnames(temp)[tempVar$col %in% filter$col] + } else { + colFilter <- NULL + } + } else { + # ... or distinct from clusters + temp <- input[unique(tempVar$row), unique(tempVar$col)] + rowFilter <- NULL + colFilter <- NULL + } + } else { + + if(!is.null(tempVar$merge)){ + temp <- input[varRow, tempVar$col] + rowFilter <- filter$row + colFilter <- NULL + } else { + temp <- input[varRow, tempVar$col[ix]] + rowFilter <- which(varRow %in% filter$row) + colFilter <- NULL + } + + } + + # apply a row filter ... + if(!is.null(rowFilter)){ + temp <- temp %>% + filter(row_number() %in% rowFilter) + } + + # ... and column filter + if(!is.null(colFilter)){ + temp <- temp %>% + select(all_of(colFilter)) + } + + # copy missing values downwards + if(anyNA(temp[1])){ + message("filling NA-values in variable '", names(idVars[i]),"'.") + temp <- temp %>% + fill(1, .direction = "down") + } + + # split ... 
+ if(!is.null(tempVar$split)){ + # need to distinguish between one and several columns + if(dim(temp)[2] == 1){ + temp <- temp %>% + extract(col = 1, into = names(temp), regex = tempVar$split) + } else { + temp <- map(.x = seq_along(temp), .f = function(iy){ + temp %>% + select(all_of(iy)) %>% + tidyr::extract(col = 1, into = names(temp)[iy], regex = tempVar$split) + }) %>% bind_cols(.name_repair = "check_unique") + } + } + + # ... or merge the variable + if(!is.null(tempVar$merge)){ + newName <- paste0(names(temp), collapse = tempVar$merge) + temp <- temp %>% + unite(col = !!newName, sep = tempVar$merge) + } + + } + + vars <- c(vars, set_names(x = list(temp), nm = names(idVars)[i])) + + } + return(vars) + + }) + + + + } else { + out <- NULL + } + + return(out) + } \ No newline at end of file diff --git a/R/getObsVars.R b/R/getObsVars.R index 42f8810..95a7302 100644 --- a/R/getObsVars.R +++ b/R/getObsVars.R @@ -1,181 +1,181 @@ -#' Extract observed variables -#' -#' This function extracts the observed variables from a table by applying a -#' schema description to it. -#' @param schema [\code{character(1)}]\cr the (validated) schema description of -#' \code{input}. -#' @param input [\code{character(1)}]\cr table to reorganise. -#' @return a list per cluster with values of the observed variables -#' @examples -#' input <- tabs2shift$clusters_nested -#' schema <- setCluster(id = "sublevel", -#' group = "territories", member = c(1, 1, 2), -#' left = 1, top = c(3, 8, 15)) %>% -#' setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% -#' setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% -#' setIDVar(name = "year", columns = 7) %>% -#' setIDVar(name = "commodities", columns = 2) %>% -#' setObsVar(name = "harvested", columns = 5) %>% -#' setObsVar(name = "production", columns = 6) -#' -#' validateSchema(schema = schema, input = input) %>% -#' getObsVars(input = input) -#' @importFrom checkmate assertTRUE -#' @importFrom purrr map set_names map_chr reduce -#' @importFrom dplyr row_number filter -#' @export - -getObsVars <- function(schema = NULL, input = NULL){ - - assertTRUE(x = schema@validated) - - clusters <- schema@clusters - nClusters <- max(lengths(clusters)) - - variables <- schema@variables - filter <- schema@filter - - obsVars <- map(.x = seq_along(variables), .f = function(ix){ - if(variables[[ix]]$vartype == "observed"){ - variables[ix] - } - }) - obsVars <- unlist(obsVars, recursive = FALSE) - - # if there are listed observed variables, act as if they were clusters - listedObs <- map(.x = seq_along(variables), .f = function(ix){ - theVar <- variables[[ix]] - if(theVar$vartype == "observed"){ - if(is.numeric(theVar$key) | is.list(theVar$key)){ - if(!any(0 %in% theVar$key)){ - c(theVar$key, theVar$col) - } - } - } - }) - listedObs <- listedObs[lengths(listedObs) != 0] - - if(length(obsVars) != 0){ - - out <- map(.x = 1:nClusters, .f = function(ix){ - vars <- NULL - if(length(listedObs) != 0){ - - if(is.list(listedObs[[1]][1])){ - listedObs[[1]][1] <- .eval_find(input = input, col = listedObs[[1]][1], clusters = clusters) - names(listedObs[[1]]) <- NULL - listedObs[[1]] <- unlist(listedObs[[1]]) - } - - listedCols <- reduce(.x = listedObs, .f = function(x,y) if (identical(x,y)) x else FALSE) - - varRows <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) - if(isFALSE(listedCols)){ - stop("implement case where not all observed variables are listed.") - } - temp <- input[varRows, listedCols] - names(temp)[1] <- "key" - rowFilter <- which(varRows %in% 
filter$row) - colFilter <- NULL - - # apply a row filter ... - if(!is.null(rowFilter)){ - temp <- temp %>% - filter(row_number() %in% rowFilter) - } - - # ... and column filter - if(!is.null(colFilter)){ - temp <- temp %>% - select(all_of(colFilter)) - } - - # replace keys with their variable name - old <- map_chr(.x = seq_along(obsVars), .f = function(iy){ - obsVars[[iy]]$value - }) - new <- names(obsVars) - temp$key[temp$key %in% old] <- new[match(temp$key, old, nomatch = 0)] - - vars <- c(vars, set_names(x = list(temp), nm = "listed")) - - } else { - for(i in 1:length(obsVars)){ - - tempVar <- obsVars[[i]] - varRows <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) - - if(!is.null(tempVar$key)){ - if(tempVar$key == "cluster"){ - if(tempVar$value != ix){ - next - } - if(length(unique(tempVar$col)) == 1){ - temp <- input[varRows, tempVar$col[ix]] - } else { - temp <- input[varRows, tempVar$col] - } - rowFilter <- which(varRows %in% filter$row) - colFilter <- NULL - } else if(is.numeric(tempVar$key)){ - temp <- input[varRows, tempVar$col] - if(!tempVar$key == 0){ - rowFilter <- NULL - colFilter <- NULL - } else { - rowFilter <- which(varRows %in% filter$row) - colFilter <- NULL - } - } - } else { - - if(!is.null(tempVar$row[ix])){ - if(nClusters != 1){ - temp <- input[varRows, tempVar$col[ix]] - rowFilter <- which(varRows %in% filter$row) - colFilter <- NULL - } else { - temp <- input[varRows, tempVar$col] - rowFilter <- which(varRows %in% filter$row) - if(!is.null(filter$col)){ - colFilter <- colnames(temp)[tempVar$col %in% filter$col] - } else{ - colFilter <- NULL - } - } - } else { - temp <- input[varRows, tempVar$col[ix]] - rowFilter <- which(varRows %in% filter$row) - colFilter <- NULL - } - - } - - # apply a row filter ... - if(!is.null(rowFilter)){ - temp <- temp %>% - filter(row_number() %in% rowFilter) - } - - # ... and column filter - if(!is.null(colFilter)){ - temp <- temp %>% - select(all_of(colFilter)) - } - - vars <- c(vars, set_names(x = list(temp), nm = names(obsVars)[i])) - - } - } - - return(vars) - - }) - - } else { - out <- NULL - } - - return(out) - -} +#' Extract observed variables +#' +#' This function extracts the observed variables from a table by applying a +#' schema description to it. +#' @param schema [\code{character(1)}]\cr the (validated) schema description of +#' \code{input}. +#' @param input [\code{character(1)}]\cr table to reorganise. 
+#' @return a list per cluster with values of the observed variables +#' @examples +#' input <- tabs2shift$clusters_nested +#' schema <- setCluster(id = "sublevel", +#' group = "territories", member = c(1, 1, 2), +#' left = 1, top = c(3, 8, 15)) %>% +#' setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% +#' setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% +#' setIDVar(name = "year", columns = 7) %>% +#' setIDVar(name = "commodities", columns = 2) %>% +#' setObsVar(name = "harvested", columns = 5) %>% +#' setObsVar(name = "production", columns = 6) +#' +#' validateSchema(schema = schema, input = input) %>% +#' getObsVars(input = input) +#' @importFrom checkmate assertTRUE +#' @importFrom purrr map set_names map_chr reduce +#' @importFrom dplyr row_number filter +#' @export + +getObsVars <- function(schema = NULL, input = NULL){ + + assertTRUE(x = schema@validated) + + clusters <- schema@clusters + nClusters <- max(lengths(clusters)) + + variables <- schema@variables + filter <- schema@filter + + obsVars <- map(.x = seq_along(variables), .f = function(ix){ + if(variables[[ix]]$vartype == "observed"){ + variables[ix] + } + }) + obsVars <- unlist(obsVars, recursive = FALSE) + + # if there are listed observed variables, act as if they were clusters + listedObs <- map(.x = seq_along(variables), .f = function(ix){ + theVar <- variables[[ix]] + if(theVar$vartype == "observed"){ + if(is.numeric(theVar$key) | is.list(theVar$key)){ + if(!any(0 %in% theVar$key)){ + c(theVar$key, theVar$col) + } + } + } + }) + listedObs <- listedObs[lengths(listedObs) != 0] + + if(length(obsVars) != 0){ + + out <- map(.x = 1:nClusters, .f = function(ix){ + vars <- NULL + if(length(listedObs) != 0){ + + if(is.list(listedObs[[1]][1])){ + listedObs[[1]][1] <- .eval_find(input = input, col = listedObs[[1]][1], clusters = clusters) + names(listedObs[[1]]) <- NULL + listedObs[[1]] <- unlist(listedObs[[1]]) + } + + listedCols <- reduce(.x = listedObs, .f = function(x,y) if (identical(x,y)) x else FALSE) + + varRows <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) + if(isFALSE(listedCols)){ + stop("implement case where not all observed variables are listed.") + } + temp <- input[varRows, listedCols] + names(temp)[1] <- "key" + rowFilter <- which(varRows %in% filter$row) + colFilter <- NULL + + # apply a row filter ... + if(!is.null(rowFilter)){ + temp <- temp %>% + filter(row_number() %in% rowFilter) + } + + # ... 
and column filter + if(!is.null(colFilter)){ + temp <- temp %>% + select(all_of(colFilter)) + } + + # replace keys with their variable name + old <- map_chr(.x = seq_along(obsVars), .f = function(iy){ + obsVars[[iy]]$value + }) + new <- names(obsVars) + temp$key[temp$key %in% old] <- new[match(temp$key, old, nomatch = 0)] + + vars <- c(vars, set_names(x = list(temp), nm = "listed")) + + } else { + for(i in 1:length(obsVars)){ + + tempVar <- obsVars[[i]] + varRows <- clusters$row[ix]:(clusters$row[ix]+clusters$height[ix] - 1) + + if(!is.null(tempVar$key)){ + if(tempVar$key == "cluster"){ + if(tempVar$value != ix){ + next + } + if(length(unique(tempVar$col)) == 1){ + temp <- input[varRows, tempVar$col[ix]] + } else { + temp <- input[varRows, tempVar$col] + } + rowFilter <- which(varRows %in% filter$row) + colFilter <- NULL + } else if(is.numeric(tempVar$key)){ + temp <- input[varRows, tempVar$col] + if(!tempVar$key == 0){ + rowFilter <- NULL + colFilter <- NULL + } else { + rowFilter <- which(varRows %in% filter$row) + colFilter <- NULL + } + } + } else { + + if(!is.null(tempVar$row[ix])){ + if(nClusters != 1){ + temp <- input[varRows, tempVar$col[ix]] + rowFilter <- which(varRows %in% filter$row) + colFilter <- NULL + } else { + temp <- input[varRows, tempVar$col] + rowFilter <- which(varRows %in% filter$row) + if(!is.null(filter$col)){ + colFilter <- colnames(temp)[tempVar$col %in% filter$col] + } else{ + colFilter <- NULL + } + } + } else { + temp <- input[varRows, tempVar$col[ix]] + rowFilter <- which(varRows %in% filter$row) + colFilter <- NULL + } + + } + + # apply a row filter ... + if(!is.null(rowFilter)){ + temp <- temp %>% + filter(row_number() %in% rowFilter) + } + + # ... and column filter + if(!is.null(colFilter)){ + temp <- temp %>% + select(all_of(colFilter)) + } + + vars <- c(vars, set_names(x = list(temp), nm = names(obsVars)[i])) + + } + } + + return(vars) + + }) + + } else { + out <- NULL + } + + return(out) + +} diff --git a/R/helpers.R b/R/helpers.R index db045ca..f46dc08 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -1,958 +1,958 @@ -#' Update the formating of a table -#' -#' This function updates the format of a table by applying a schema description -#' to it. -#' @param input [\code{character(1)}]\cr table to reorganise. -#' @param schema [\code{character(1)}]\cr the schema description of -#' \code{input}. 
-#' @importFrom purrr map -#' @importFrom dplyr row_number arrange_at -#' @importFrom stringr str_remove_all str_extract_all coll -#' @importFrom tidyselect starts_with - -.updateFormat <- function(input = NULL, schema = NULL){ - - clusters <- schema@clusters - variables <- schema@variables - format <- schema@format - - if(!is.null(format$del)){ - if(format$del == "."){ - format$del <- "[.]" - } - } - - if(!is.null(format$dec)){ - if(format$dec == "."){ - format$dec <- "[.]" - } - } - - idVars <- map(.x = seq_along(variables), .f = function(ix){ - if(variables[[ix]]$vartype == "id"){ - variables[ix] - } - }) - idVars <- unlist(idVars, recursive = FALSE) - - for(i in seq_along(idVars)){ - # set the desired type - - if(idVars[[i]]$datype == "Date"){ - input[[which(names(idVars)[i] == names(input))]] <- as.Date(input[[which(names(idVars)[i] == names(input))]], tryFormats = c("%Y-%m-%d"), optional = TRUE) - } else { - class(input[[which(names(idVars)[i] == names(input))]]) <- idVars[[i]]$datype - } - - } - - obsVars <- map(.x = seq_along(variables), .f = function(ix){ - if(variables[[ix]]$vartype == "observed"){ - variables[ix] - } - }) - obsVars <- unlist(obsVars, recursive = FALSE) - - # set all observed variables to the correct format - for(i in seq_along(obsVars)){ - theVar <- input[[which(names(obsVars)[i] == names(input))]] - - # capture flags - if(length(format$flags$flag) != 0){ - theFlags <- map(seq_along(theVar), function(ix){ - temp <- str_extract_all(string = theVar[[ix]], pattern = coll(paste0(format$flags$flag, collapse = ""))) %>% - unlist() - if(length(temp) == 0){ - NA - } else { - temp - } - }) - } - - theVar <- map(seq_along(theVar), function(ix){ - tmp <- theVar[[ix]] - - if(length(tmp) != 0){ - # replace white-spaces - tmp <- gsub(" |\xe2\x80\x80|\xe2\x80\x81|\xe2\x80\x82|\xe2\x80\x83|\xe2\x80\x84|\xe2\x80\x85|\xe2\x80\x86|\xe2\x80\x87|\xe2\x80\x88|\xe2\x80\x89|\xe2\x80\x8a|\xe2\x80\x8b|\xe2\x80\x8c|\xe2\x80\x8d|", "", tmp) - - # replace NA values - if(!is.null(format$na)){ - tmp[tmp %in% format$na] <- NA - } - - # replace thousands seperator - if(!is.null(format$del)){ - tmp <- gsub(format$del, "", tmp) - } - - # replace decimal seperator - if(!is.null(format$dec)){ - tmp <- gsub(format$dec, ".", tmp) - } - - # remove flags - if(length(format$flags$flag) != 0){ - tmp <- str_remove_all(string = tmp, pattern = paste0("[", paste0(format$flags$flag, collapse = ""), "]")) - } - - # multiply with factor - if(!all(is.na(as.numeric(tmp)))){ - tmp <- suppressWarnings(as.numeric(tmp)) * obsVars[[i]]$factor - } - - # apply function to aggregate duplicated values - if(length(tmp) > 1){ - tmp <- sum(tmp, na.rm = TRUE) - } - return(tmp) - } else { - NA - } - - }) - - if(length(format$flags$flag) != 0){ - input <- input %>% bind_cols(tibble(!!paste0("flag_", names(obsVars)[i]) := unlist(theFlags))) - } - - input[[which(names(obsVars)[i] == names(input))]] <- unlist(theVar) - - # ... also set the desired type - class(input[[which(names(obsVars)[i] == names(input))]]) <- obsVars[[i]]$datype - } - - if(length(format$flags$flag) != 0){ - input <- input %>% - unite(col = "flag", starts_with("flag_"), sep = ", ", na.rm = TRUE) - } - - out <- input #%>% - # arrange_at(.vars = names(idVars)) - - return(out) - -} - - -#' Convenience wrapper around tidyr::fill() -#' -#' @param x [\code{data.frame(1)}]\cr table in which to fill NA values. 
-#' @param direction [\code{character(3)}]\cr direction in which to fill missing values, -#' possible values are "down", "up" and "right"; if several directions are -#' required, provide them in the order required. -#' @importFrom checkmate assertDataFrame assertCharacter -#' @importFrom tidyr pivot_longer pivot_wider fill -#' @importFrom dplyr group_by everything ungroup - -.fill <- function(x = NULL, direction = TRUE){ - - assertDataFrame(x = x) - assertCharacter(x = direction, len = 1) - - if(direction == "down"){ - - out <- x |> - fill(everything(), .direction = "down") - - } else if(direction == "up"){ - - out <- x |> - fill(everything(), .direction = "up") - - } else if(direction == "right"){ - - out <- x |> - rownames_to_column("rn") |> - pivot_longer(!rn) %>% - group_by(rn) %>% - fill(value) %>% - pivot_wider(names_from = name, values_from = value) %>% - ungroup() |> - select(-rn) - - } - - return(out) - -} - - -#' Get the column types of a tibble -#' -#' @param input [\code{data.frame(1)}]\cr table of which to get the column -#' types. -#' @param collapse [\code{logical(1)}]\cr whether or not to paste all column -#' types into one string. -#' @importFrom checkmate assertDataFrame assertLogical -#' @importFrom tibble tibble -#' @importFrom purrr map -#' @importFrom dplyr left_join pull -#' @importFrom stringr str_c - -.getColTypes <- function(input = NULL, collapse = TRUE){ - - assertDataFrame(x = input) - assertLogical(x = collapse, len = 1) - - types <- tibble(col_type = c("character", "integer", "numeric", "double", "logical", "Date", "units", "sfc_POLYGON", "arrow_binary"), - code = c("c", "i", "n", "d", "l", "D", "u", "g", "a")) - - out <- map(1:dim(input)[2], function(ix){ - class(input[[ix]])[1] - }) %>% - unlist() %>% - tibble(col_type = .) %>% - left_join(y = types, by = "col_type") %>% - pull("code") - - if(collapse){ - out <- out %>% - str_c(collapse = "") - } - - return(out) - -} - - -#' Match variables -#' -#' This function matches id and observed variables and reshapes them accordingly -#' @param ids list of id variables -#' @param obs list of observed variables -#' @param clust list of cluster variables -#' @param grp list of group variables -#' @return a symmetric list of variables (all with the same dimensions) -#' @importFrom checkmate assertSetEqual -#' @importFrom purrr reduce map_int map set_names -#' @importFrom tidyr pivot_longer pivot_wider fill separate separate_longer_delim -#' @importFrom dplyr distinct select bind_cols if_any full_join -#' @importFrom tidyselect all_of everything -#' @importFrom rlang `:=` - -.tidyVars <- function(ids = NULL, obs = NULL, clust = NULL, grp = NULL){ - - outIDs <- ids - outObs <- obs - - uniqueIDs <- map(.x = seq_along(ids), .f = function(ix){ - unique(unlist(ids[[ix]])) - }) - - idCols <- map(.x = seq_along(ids), .f = function(ix){ - names(ids[[ix]]) - }) - - # first, assess whether there are any wide observed variables involved - widthObs <- map_int(.x = seq_along(obs), .f = function(ix){ - dim(obs[[ix]])[[2]] - }) - - if(!all(1 == widthObs)){ - # if yes, ... - - wideObs <- obs[which(widthObs != 1)] - nonWideObs <- obs[which(widthObs == 1)] - - # ... identify for each wide observed variable ... - for(i in seq_along(wideObs)){ - - temp <- wideObs[[i]] - varName <- names(wideObs[i]) - - # ... 
the corresponding wide id variables that gives the column names - wideID <- map(.x = seq_along(idCols), .f = function(ix){ - if(any(idCols[[ix]] %in% names(temp))){ - ids[ix] - } else { - # if no names are matching, reuse names of previous iteration - if(length(names(temp)) == length(idCols[[ix]])){ - ind <- which(lengths(wideID) == length(names(temp))) - repNames <- tibble(old = names(wideID[[ind]]), new = names(temp)) - tempWideID <- map(.x = seq_along(wideID), .f = function(iy){ - newNames <- repNames$new[which(repNames$old %in% names(wideID[[iy]]))] - names(wideID[[iy]]) <- newNames - wideID[iy] - }) - tempWideID <- unlist(tempWideID, recursive = FALSE) - } - } - }) - wideID <- unlist(wideID, recursive = FALSE) - - # in case one of the columns in temp contains only NA-values, remove that empty column - naCol <- NULL - for(j in 1:dim(temp)[2]){ - if(all(is.na(temp[,j]))){ - naCol <- c(naCol, j) - } - } - if(!is.null(naCol)){ - temp <- temp %>% - select(-all_of(naCol)) - if(varName == "listed"){ - wideID[[1]] <- wideID[[1]] %>% - select(-all_of(naCol-1)) - } else { - wideID[[1]] <- wideID[[1]] %>% - select(-all_of(naCol)) - } - - } - tempDim <- dim(temp) - - # build a tibble for joining with the column names of temp - wideColnames <- map(.x = seq_along(wideID), .f = function(jx){ - wideID[[jx]] %>% - pivot_longer(cols = everything(), - names_to = "name", - values_to = names(wideID)[jx]) - }) - - # sort the resulting list by the length of the tables, so that longer - # tables are at the beginning of the following "join" sequence - tempDims <- map_int(.x = seq_along(wideColnames), .f = function(jx){ - dim(wideColnames[[jx]])[[1]] - }) - wideColnames <- wideColnames[order(tempDims, decreasing = TRUE)] - - if(!is.null(wideID)){ - wideColnames <- reduce(wideColnames, full_join, by = "name") %>% - fill(everything()) - } - - - if(varName == "listed"){ - - obsNames <- unique(outObs$listed$key) - obsNames <- obsNames[!is.na(obsNames)] - idNames <- names(ids) - - equalID <- map(.x = seq_along(outIDs), .f = function(ix){ - if(tempDim[1] == dim(ids[[ix]])[1]){ - set_names(outIDs[[ix]], idNames[ix]) - } else if(all(dim(ids[[ix]]) == c(1, 1))){ - set_names(list(tibble(!!names(ids[ix]) := rep(ids[[ix]][[1]], tempDim[1]))), names(ids[ix])) - } - }) - - if(length(wideID) > 1){ - wideNames <- wideColnames %>% - select(all_of(names(wideID)), everything()) %>% - unite(col = "new", !name, sep = "-_-_", na.rm = TRUE) %>% - pivot_wider(names_from = "name", values_from = "new") %>% - unlist(use.names = FALSE) - wideName <- paste0(names(wideID), collapse = "-_-_") - } else { - wideNames <- unlist(wideID, use.names = FALSE) - wideName <- names(wideID) - } - - # remove columns that are both in equalID and tempObs (for example, when an id-variable is used as key) - # dupEqualIDs <- which(!as.list(bind_cols(equalID)) %in% as.list(bind_cols(tempObs))) - - tempObs <- list(temp) - if(!is.null(wideID)){ - names(tempObs[[1]]) <- c("key", wideNames) - # newObs <- bind_cols(c(equalID[dupEqualIDs], tempObs), .name_repair = "minimal") %>% - newObs <- bind_cols(c(equalID, tempObs), .name_repair = "minimal") %>% - pivot_longer(cols = all_of(wideNames), names_to = wideName) - valueNames <- "value" - } else { - newObs <- bind_cols(c(equalID, tempObs), .name_repair = "minimal") - valueNames <- names(newObs)[!names(newObs) %in% c(idNames, "key")] - } - - # dupObs <- newObs %>% - # pivot_wider(names_from = "key", - # values_from = all_of(valueNames), - # values_fn = length) %>% - # mutate(row = row_number()) %>% - # 
filter(if_any(all_of(obsNames), ~ . != 1)) - # - # if(dim(dupObs)[1] != 0){ - # warning("rows(", paste0(dupObs$row, collapse = ", "), ") are summarised from several values.", call. = FALSE) - # } - - newObs <- newObs %>% - pivot_wider(names_from = "key", - values_from = all_of(valueNames), - values_fn = ~ paste(.x, collapse = " | ")) |> - separate_longer_delim(cols = all_of(obsNames), delim = " | ") - - if(length(wideID) > 1){ - newObs <- newObs %>% - separate(col = wideName, - into = names(wideID), - sep = "-_-_") - } - - } else { - - obsNames <- names(obs) - idNames <- names(ids) - - wideColnames <- wideColnames %>% - select(all_of(names(wideID)), everything()) - - # find the correct name by joining via the column names - tempColnames <- temp %>% - pivot_longer(cols = everything(), names_to = "name", values_to = varName) - wideNames <- left_join(tempColnames, wideColnames, by = "name") %>% - select(-all_of(varName)) %>% - distinct() %>% - unite(col = "new", !name, sep = "-_-_", na.rm = TRUE) %>% - pivot_wider(names_from = "name", values_from = "new") %>% - unlist(use.names = FALSE) - - assertSetEqual(x = length(wideNames), y = tempDim[2]) - names(temp) <- wideNames - - # ... all id variables that have the same length - equalID <- map(.x = seq_along(ids), .f = function(ix){ - if(!names(ids[ix]) %in% names(wideID)){ - if(tempDim[1] == dim(ids[[ix]])[1]){ - bla <- ids[ix] - names(bla[[1]]) <- names(ids[ix]) - return(bla) - } else if(all(dim(ids[[ix]]) == c(1, 1))){ - set_names(list(tibble(!!names(ids[ix]) := rep(ids[[ix]][[1]], tempDim[1]))), names(ids[ix])) - } - } - - }) - equalID <- unlist(equalID, recursive = FALSE) - temp <- bind_cols(temp, equalID, .name_repair = "minimal") - - # and pivot those into longer form - temp <- pivot_longer(data = temp, - cols = all_of(wideNames), - names_to = paste0(names(wideID), collapse = "-_-_"), - values_to = varName) %>% - separate(col = paste0(names(wideID), collapse = "-_-_"), into = names(wideID), sep = "-_-_") - - if(i != 1){ - newObs <- suppressMessages(temp %>% - left_join(newObs)) - } else { - newObs <- temp - } - } - - } - - # for(i in seq_along(nonWideObs)){ - # - # } - - targetRows <- dim(newObs)[1] - - # sort the resulting tibble into the previous lists 'ids' and 'obs' - outIDs <- map(.x = seq_along(idNames), .f = function(ix) { - newObs[idNames[ix]] - }) - names(outIDs) <- idNames - - outObs <- map(.x = seq_along(obsNames), .f = function(ix) { - newObs[obsNames[ix]] - }) - names(outObs) <- obsNames - - } else { - targetRows <- unique(map_int(.x = seq_along(outObs), .f = function(ix){ - dim(outObs[[ix]])[1] - })) - assertIntegerish(x = targetRows, len = 1) - } - - # take care of variables that are too wide - widthsIDs <- map_int(.x = seq_along(outIDs), .f = function(ix){ - dim(outIDs[[ix]])[[2]] - }) - - if(!all(1 == widthsIDs)){ - - wideIDs <- ids[which(widthsIDs != 1)] - newIDs <- map(.x = seq_along(wideIDs), .f = function(ix){ - temp <- wideIDs[ix] - temp <- pivot_longer(data = temp[[1]], cols = everything(), values_to = paste0(names(temp[[1]]), collapse = " ")) %>% - select(-name) - }) - newIDs <- set_names(x = newIDs, nm = names(wideIDs)) - outIDs[which(widthsIDs != 1)] <- newIDs - - } - - # take care of variables that are not long enough - lengthIDs <- map_int(.x = seq_along(outIDs), .f = function(ix){ - dim(outIDs[[ix]])[[1]] - }) - - if(any(lengthIDs != targetRows)){ - - tooShort <- which(lengthIDs != targetRows) - for(i in seq_along(tooShort)){ - ind <- tooShort[i] - if(lengthIDs[ind] == 1){ - outIDs[[ind]] <- 
tibble(!!names(ids[[ind]]) := rep(ids[[ind]][[1]], targetRows)) - } - } - } - - outGrp <- NULL - if(!is.null(grp)){ - dims <- dim(grp[[1]]) - - nrRows <- targetRows * length(unique(unlist(grp))) - - if(all(dims == 1)){ - temp <- tibble(X = rep(unlist(grp, use.names = FALSE), nrRows)) - } else { - temp <- tibble(X = unlist(grp, use.names = FALSE)) - } - outGrp <- set_names(x = list(temp), nm = names(grp)) - - } - - outClust <- NULL - if(!is.null(clust)){ - if(is.list(clust)){ - dims <- dim(clust[[1]]) - - nrRows <- targetRows * length(unique(unlist(clust))) - - if(all(dims == 1)){ - temp <- tibble(X = rep(unlist(clust, use.names = FALSE), nrRows)) - } else { - if(dims[1] != nrRows){ - temp <- tibble(X = rep(unlist(clust, use.names = FALSE), nrRows/dims[1])) - } else { - temp <- tibble(X = unlist(clust, use.names = FALSE)) - } - } - outClust <- set_names(x = list(temp), nm = names(clust)) - - } - } - - return(c(outGrp, outClust, outIDs, outObs)) - -} - -#' Evaluate .sum constructs -#' -#' @param input [\code{data.frame(1)}]\cr table to reorganise. -#' @param groups [\code{list(3)}]\cr the groups-slot from a schema. -#' @param data [\code{integerish(.)}]\cr the cell column or row that should be -#' adapted to groupings. -#' @return the position of the evaluated position -#' @importFrom dplyr mutate row_number if_else group_by summarise n ungroup -#' left_join pull -#' @importFrom rlang eval_tidy - -.eval_sum <- function(input = NULL, groups = NULL, data = NULL){ - - out <- data - - if(!is.null(data)){ - - rowGroups <- input %>% - mutate(rn = as.double(row_number()), - rn_new = rn) - - if(!is.null(groups$rows)){ - - for(i in seq_along(groups$rows)){ - - temp <- groups$rows[[i]] - targetRows <- eval_tidy(temp$groups[[1]]) - - rowGroups <- rowGroups %>% - mutate(rn_new = if_else(rn_new %in% targetRows, min(targetRows), rn_new)) - - } - nrs <- rowGroups %>% - group_by(rn_new) %>% - summarise(nr = n()) %>% - ungroup() %>% - mutate(id = row_number()) - rowGroups$rn_new <- rep(nrs$id, nrs$nr) - - out <- rowGroups %>% - left_join(tibble(rn = out), ., by = "rn") %>% - pull(rn_new) - - } - - # if(!is.null(groups$cols)){ - # - # } - - } - - return(out) - -} - -#' Evaluate .find constructs -#' -#' @param input [\code{data.frame(1)}]\cr table to reorganise. -#' @param col [\code{list(2)}]\cr the output of the respective .find construct -#' used to match in columns. -#' @param row [\code{list(2)}]\cr the output of the respective .find construct -#' used to match in rows. -#' @param clusters [\code{list(7)}]\cr the cluster slot of the schema. 
-#' @return the columns or rows of the evaluated position -#' @importFrom checkmate assertNumeric assertList assertDataFrame -#' @importFrom rlang eval_tidy prim_name -#' @importFrom purrr map_int map_lgl -#' @importFrom tibble rownames_to_column -#' @importFrom tidyr pivot_longer pivot_wider -#' @importFrom dplyr select mutate across pull if_else -#' @importFrom stringr str_count str_detect - -.eval_find <- function(input = NULL, col = NULL, row = NULL, clusters = NULL){ - - assertDataFrame(x = input) - assertList(x = row, min.len = 1, null.ok = TRUE) - assertList(x = col, min.len = 1, null.ok = TRUE) - assertList(x = clusters, len = 7, null.ok = TRUE) - - # in case to look for columns - if(!is.null(col)){ - - theCols <- NULL - for(i in seq_along(col)){ - - if(names(col)[i] == "position"){ - cols <- as.numeric(1:dim(input)[2] %in% col[[i]]) - } else if(names(col)[i] == "find"){ - - theCol <- col[[i]] - term <- eval_tidy(theCol$by) - - if(!is.null(term)){ - - if(is.function(term)){ - - if(!is.null(theCol$row)){ - assertNumeric(x = theCol$row, len = 1, any.missing = FALSE) - if(theCol$relative){ - subset <- input[unique(theCol$row + clusters$row - 1),] - } else { - subset <- input[unique(theCol$row),] - } - } else { - subset <- input - } - - # make a subset table that contains numbers when possible - subset <- subset %>% - mutate(across(.cols = where(function(x) suppressWarnings(!anyNA(as.numeric(x[!is.na(x)]))) & !all(is.na(x))), .fns = as.numeric)) - - cols <- map_int(.x = 1:dim(input)[2], .f = function(ix){ - if(prim_name(term) != "is.na"){ - map(subset[,ix], term)[[1]] & !all(is.na(subset[,ix])) - } else { - map(subset[,ix], term)[[1]] - } - }) - - } else { - cols <- map_int(.x = 1:dim(input)[2], .f = function(ix){ - if(!is.null(theCol$row)){ - str_count(string = paste(input[[ix]][theCol$row], collapse = " "), pattern = term) - } else { - str_count(string = paste(input[[ix]], collapse = " "), pattern = term) - } - }) - } - - } else if(theCol$relative) { - - targetCols <- theCol$col + clusters$col - 1 - cols <- rep(0, dim(input)[2]) - cols[as.numeric(names(table(targetCols)))] <- table(targetCols) - - } else { - stop("not yet implemented") - } - - if(theCol$invert){ - temp <- cols - temp[cols == 0] <- 1 - temp[cols != 0] <- 0 - cols <- temp - } - - } else { - next - } - - theCols <- c(theCols, list(cols)) - - if(i != 1){ - theCols <- reduce(theCols, col[[i - 1]]) - theCols <- list(theCols) - } else { - if(length(col) == 1){ - theCols <- theCols[[1]] - } - } - - } - - theCols <- unlist(theCols) - out <- rep(seq_along(theCols), theCols) - - } - - # in case to look for rows - if(!is.null(row)){ - - theRows <- NULL - for(i in seq_along(row)){ - - if(names(row)[i] == "position"){ - inVal <- 1 - outVal <- 0 - - if(any(names(row) %in% "invert")){ - if(row$invert){ - inVal <- 0 - outVal <- 1 - } - } - - rows <- input %>% - mutate(it = if_else(row_number() %in% row[[i]], inVal, outVal)) %>% - pull(it) - } else if(names(row)[i] == "find"){ - - theRow <- row[[i]] - term <- eval_tidy(theRow$by) - - if(!is.null(term)){ - - if(is.function(term)){ - - if(!is.null(theRow$col)){ - assertNumeric(x = theRow$col, min.len = 1, any.missing = FALSE) - subset <- input[,unique(theRow$col)] - } else { - subset <- input - } - - # make a subset table that contains numbers when possible - subset <- subset %>% - rownames_to_column() %>% - pivot_longer(cols = -rowname, names_to = 'variable', values_to = 'value') %>% - pivot_wider(id_cols = variable, names_from = rowname, values_from = value) %>% - select(-variable) 
%>% - mutate(across(.cols = where(function(x) suppressWarnings(!anyNA(as.numeric(x)))), .fns = as.numeric)) - - rows <- map_int(.x = 1:dim(subset)[2], .f = function(ix){ - all(map(subset[,ix], term)[[1]]) - }) - } else { - - if(length(term) > 1){ - term <- paste0(term, collapse = "|") - } - - if(!is.null(theRow$col)){ - rows <- input %>% - mutate(it = if_else(if_any(theRow$col, ~ grepl(x = .x, pattern = term)), 1, 0)) %>% - pull(it) - } else { - rows <- input %>% - unite(col = all, everything(), sep = " ", na.rm = TRUE) - - # if the theRow is the clusters row, count elements, otherwise detect them - if(is.list(clusters$row)){ - clustRows <- clusters$row$find - } else { - clustRows <- NULL - } - if(identical(theRow, clustRows)){ - rows <- rows %>% - mutate(it = str_count(string = all, pattern = term)) - } else { - rows <- rows %>% - mutate(it = as.numeric(str_detect(string = all, pattern = term))) - } - rows <- rows %>% - pull(it) - } - - } - } else if(theRow$relative) { - - targetRows <- theRow$row + clusters$row - 1 - rows <- input %>% - mutate(rn = row_number()) %>% - left_join(., tibble(rn = targetRows), by = "rn") %>% - mutate(it = if_else(rn %in% targetRows, 1, 0)) %>% - group_by(rn) %>% - summarise(it = sum(it, na.rm = TRUE)) %>% - pull(it) - - } else { - stop("not yet implemented") - } - - - if(theRow$invert){ - temp <- rows - temp[rows == 0] <- 1 - temp[rows != 0] <- 0 - rows <- temp - } - - } else { - next - } - - theRows <- c(theRows, list(rows)) - - if(i != 1){ - theRows <- reduce(theRows, row[[i - 1]]) - theRows <- list(theRows) - } else { - if(length(row) == 1){ - theRows <- theRows[[1]] - } - } - - } - - theRows <- unlist(theRows) - out <- rep(seq_along(theRows), theRows) - - } - - return(out) -} - - -#' Test for a valid table -#' -#' This function is a collection of expectations which ensure that the output of -#' \code{\link{reorganise}} is formally and contentwise correct. It is used in -#' the tests of this package. -#' @param x a table to test. -#' @param units the number of units in the output table (from 1 to 3) -#' @param variables the variables that should be in the output table (either -#' "harvested" or "production") -#' @param groups whether or not groups are in the test table. -#' @param flags whether or not flags are in the test table. -#' @return Either an error message of the invalid expectations, or the output of -#' the last successful expectation. 
-#' @importFrom testthat expect_identical -#' @importFrom checkmate expect_names expect_tibble expect_list assertChoice - -.expect_valid_table <- function(x = NULL, units = 1, variables = NULL, - groups = FALSE, flags = FALSE){ - - assertChoice(x = units, choices = c(1:3)) - assertChoice(x = variables, choices = c("harvested", "production"), null.ok = TRUE) - - if(units == 1){ - - if(groups){ - expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1")) - } - expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1")) - expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2")) - expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean")) - if(is.null(variables)){ - if(groups){ - expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 6) - expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) - } else { - if(flags){ - expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 6) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 5) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) - } - } - expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211)) - expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212)) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 4) - expect_names(x = colnames(x), permutation.of = c("territories", "year", "commodities", variables) ) - if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211)) - if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212)) - } - - } else if(units == 2){ - - if(groups){ - expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1")) - expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2")) - } else { - expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2")) - } - expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2")) - expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean")) - if(is.null(variables)){ - if(groups){ - expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 6) - expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) - } else { - if(flags){ - expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 6) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 5) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) - } - } - - expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211)) - expect_identical(object = x$production, expected = c(1122, 1112, 
1222, 1212, 2122, 2112, 2222, 2212)) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 4) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", variables) ) - if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211)) - if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212)) - } - - - } else if(units == 3){ - - if(groups){ - expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 2", "group 2", "group 2", "group 2")) - expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2", "unit 3", "unit 3", "unit 3", "unit 3")) - } else { - expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2", "unit 3", "unit 3", "unit 3", "unit 3")) - } - expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2")) - expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean")) - if(is.null(variables)){ - if(groups){ - expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 6) - expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) - } else { - if(flags){ - expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 6) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 5) - expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) - } - } - expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211, 3121, 3111, 3221, 3211)) - expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212, 3122, 3112, 3222, 3212)) - } else { - expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 4) - expect_names(x = colnames(x), permutation.of = c("territories", "year", "commodities", variables) ) - if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211, 3121, 3111, 3221, 3211)) - if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212, 3122, 3112, 3222, 3212)) - } - } +#' Update the formatting of a table +#' +#' This function updates the format of a table by applying a schema description +#' to it. +#' @param input [\code{data.frame(1)}]\cr table to reorganise. +#' @param schema [\code{symbol(1)}]\cr the schema description of +#' \code{input}.
+#' @importFrom purrr map +#' @importFrom dplyr row_number arrange_at +#' @importFrom stringr str_remove_all str_extract_all coll +#' @importFrom tidyselect starts_with + +.updateFormat <- function(input = NULL, schema = NULL){ + + clusters <- schema@clusters + variables <- schema@variables + format <- schema@format + + if(!is.null(format$del)){ + if(format$del == "."){ + format$del <- "[.]" + } + } + + if(!is.null(format$dec)){ + if(format$dec == "."){ + format$dec <- "[.]" + } + } + + idVars <- map(.x = seq_along(variables), .f = function(ix){ + if(variables[[ix]]$vartype == "id"){ + variables[ix] + } + }) + idVars <- unlist(idVars, recursive = FALSE) + + for(i in seq_along(idVars)){ + # set the desired type + + if(idVars[[i]]$datype == "Date"){ + input[[which(names(idVars)[i] == names(input))]] <- as.Date(input[[which(names(idVars)[i] == names(input))]], tryFormats = c("%Y-%m-%d"), optional = TRUE) + } else { + class(input[[which(names(idVars)[i] == names(input))]]) <- idVars[[i]]$datype + } + + } + + obsVars <- map(.x = seq_along(variables), .f = function(ix){ + if(variables[[ix]]$vartype == "observed"){ + variables[ix] + } + }) + obsVars <- unlist(obsVars, recursive = FALSE) + + # set all observed variables to the correct format + for(i in seq_along(obsVars)){ + theVar <- input[[which(names(obsVars)[i] == names(input))]] + + # capture flags + if(length(format$flags$flag) != 0){ + theFlags <- map(seq_along(theVar), function(ix){ + temp <- str_extract_all(string = theVar[[ix]], pattern = coll(paste0(format$flags$flag, collapse = ""))) %>% + unlist() + if(length(temp) == 0){ + NA + } else { + temp + } + }) + } + + theVar <- map(seq_along(theVar), function(ix){ + tmp <- theVar[[ix]] + + if(length(tmp) != 0){ + # replace white-spaces + tmp <- gsub(" |\xe2\x80\x80|\xe2\x80\x81|\xe2\x80\x82|\xe2\x80\x83|\xe2\x80\x84|\xe2\x80\x85|\xe2\x80\x86|\xe2\x80\x87|\xe2\x80\x88|\xe2\x80\x89|\xe2\x80\x8a|\xe2\x80\x8b|\xe2\x80\x8c|\xe2\x80\x8d|", "", tmp) + + # replace NA values + if(!is.null(format$na)){ + tmp[tmp %in% format$na] <- NA + } + + # replace thousands seperator + if(!is.null(format$del)){ + tmp <- gsub(format$del, "", tmp) + } + + # replace decimal seperator + if(!is.null(format$dec)){ + tmp <- gsub(format$dec, ".", tmp) + } + + # remove flags + if(length(format$flags$flag) != 0){ + tmp <- str_remove_all(string = tmp, pattern = paste0("[", paste0(format$flags$flag, collapse = ""), "]")) + } + + # multiply with factor + if(!all(is.na(as.numeric(tmp)))){ + tmp <- suppressWarnings(as.numeric(tmp)) * obsVars[[i]]$factor + } + + # apply function to aggregate duplicated values + if(length(tmp) > 1){ + tmp <- sum(tmp, na.rm = TRUE) + } + return(tmp) + } else { + NA + } + + }) + + if(length(format$flags$flag) != 0){ + input <- input %>% bind_cols(tibble(!!paste0("flag_", names(obsVars)[i]) := unlist(theFlags))) + } + + input[[which(names(obsVars)[i] == names(input))]] <- unlist(theVar) + + # ... also set the desired type + class(input[[which(names(obsVars)[i] == names(input))]]) <- obsVars[[i]]$datype + } + + if(length(format$flags$flag) != 0){ + input <- input %>% + unite(col = "flag", starts_with("flag_"), sep = ", ", na.rm = TRUE) + } + + out <- input #%>% + # arrange_at(.vars = names(idVars)) + + return(out) + +} + + +#' Convenience wrapper around tidyr::fill() +#' +#' @param x [\code{data.frame(1)}]\cr table in which to fill NA values. 
+#' @param direction [\code{character(1)}]\cr direction in which to fill missing values; +#' possible values are "down", "up" and "right". +#' @importFrom checkmate assertDataFrame assertCharacter +#' @importFrom tibble rownames_to_column +#' @importFrom tidyr pivot_longer pivot_wider fill +#' @importFrom dplyr group_by everything ungroup select + +.fill <- function(x = NULL, direction = "down"){ + + assertDataFrame(x = x) + assertCharacter(x = direction, len = 1) + + if(direction == "down"){ + + out <- x |> + fill(everything(), .direction = "down") + + } else if(direction == "up"){ + + out <- x |> + fill(everything(), .direction = "up") + + } else if(direction == "right"){ + + out <- x |> + rownames_to_column("rn") |> + pivot_longer(!rn) %>% + group_by(rn) %>% + fill(value) %>% + pivot_wider(names_from = name, values_from = value) %>% + ungroup() |> + select(-rn) + + } + + return(out) + +} + + +#' Get the column types of a tibble +#' +#' @param input [\code{data.frame(1)}]\cr table of which to get the column +#' types. +#' @param collapse [\code{logical(1)}]\cr whether or not to paste all column +#' types into one string. +#' @importFrom checkmate assertDataFrame assertLogical +#' @importFrom tibble tibble +#' @importFrom purrr map +#' @importFrom dplyr left_join pull +#' @importFrom stringr str_c + +.getColTypes <- function(input = NULL, collapse = TRUE){ + + assertDataFrame(x = input) + assertLogical(x = collapse, len = 1) + + types <- tibble(col_type = c("character", "integer", "numeric", "double", "logical", "Date", "units", "sfc_POLYGON", "arrow_binary"), + code = c("c", "i", "n", "d", "l", "D", "u", "g", "a")) + + out <- map(1:dim(input)[2], function(ix){ + class(input[[ix]])[1] + }) %>% + unlist() %>% + tibble(col_type = .) %>% + left_join(y = types, by = "col_type") %>% + pull("code") + + if(collapse){ + out <- out %>% + str_c(collapse = "") + } + + return(out) + +} + + +#' Match variables +#' +#' This function matches id and observed variables and reshapes them accordingly. +#' @param ids list of id variables +#' @param obs list of observed variables +#' @param clust list of cluster variables +#' @param grp list of group variables +#' @return a symmetric list of variables (all with the same dimensions) +#' @importFrom checkmate assertSetEqual +#' @importFrom purrr reduce map_int map set_names +#' @importFrom tidyr pivot_longer pivot_wider fill separate separate_longer_delim +#' @importFrom dplyr distinct select bind_cols if_any full_join +#' @importFrom tidyselect all_of everything +#' @importFrom rlang `:=` + +.tidyVars <- function(ids = NULL, obs = NULL, clust = NULL, grp = NULL){ + + outIDs <- ids + outObs <- obs + + uniqueIDs <- map(.x = seq_along(ids), .f = function(ix){ + unique(unlist(ids[[ix]])) + }) + + idCols <- map(.x = seq_along(ids), .f = function(ix){ + names(ids[[ix]]) + }) + + # first, assess whether there are any wide observed variables involved + widthObs <- map_int(.x = seq_along(obs), .f = function(ix){ + dim(obs[[ix]])[[2]] + }) + + if(!all(1 == widthObs)){ + # if yes, ... + + wideObs <- obs[which(widthObs != 1)] + nonWideObs <- obs[which(widthObs == 1)] + + # ... identify for each wide observed variable ... + for(i in seq_along(wideObs)){ + + temp <- wideObs[[i]] + varName <- names(wideObs[i]) + + # ...
the corresponding wide id variables that gives the column names + wideID <- map(.x = seq_along(idCols), .f = function(ix){ + if(any(idCols[[ix]] %in% names(temp))){ + ids[ix] + } else { + # if no names are matching, reuse names of previous iteration + if(length(names(temp)) == length(idCols[[ix]])){ + ind <- which(lengths(wideID) == length(names(temp))) + repNames <- tibble(old = names(wideID[[ind]]), new = names(temp)) + tempWideID <- map(.x = seq_along(wideID), .f = function(iy){ + newNames <- repNames$new[which(repNames$old %in% names(wideID[[iy]]))] + names(wideID[[iy]]) <- newNames + wideID[iy] + }) + tempWideID <- unlist(tempWideID, recursive = FALSE) + } + } + }) + wideID <- unlist(wideID, recursive = FALSE) + + # in case one of the columns in temp contains only NA-values, remove that empty column + naCol <- NULL + for(j in 1:dim(temp)[2]){ + if(all(is.na(temp[,j]))){ + naCol <- c(naCol, j) + } + } + if(!is.null(naCol)){ + temp <- temp %>% + select(-all_of(naCol)) + if(varName == "listed"){ + wideID[[1]] <- wideID[[1]] %>% + select(-all_of(naCol-1)) + } else { + wideID[[1]] <- wideID[[1]] %>% + select(-all_of(naCol)) + } + + } + tempDim <- dim(temp) + + # build a tibble for joining with the column names of temp + wideColnames <- map(.x = seq_along(wideID), .f = function(jx){ + wideID[[jx]] %>% + pivot_longer(cols = everything(), + names_to = "name", + values_to = names(wideID)[jx]) + }) + + # sort the resulting list by the length of the tables, so that longer + # tables are at the beginning of the following "join" sequence + tempDims <- map_int(.x = seq_along(wideColnames), .f = function(jx){ + dim(wideColnames[[jx]])[[1]] + }) + wideColnames <- wideColnames[order(tempDims, decreasing = TRUE)] + + if(!is.null(wideID)){ + wideColnames <- reduce(wideColnames, full_join, by = "name") %>% + fill(everything()) + } + + + if(varName == "listed"){ + + obsNames <- unique(outObs$listed$key) + obsNames <- obsNames[!is.na(obsNames)] + idNames <- names(ids) + + equalID <- map(.x = seq_along(outIDs), .f = function(ix){ + if(tempDim[1] == dim(ids[[ix]])[1]){ + set_names(outIDs[[ix]], idNames[ix]) + } else if(all(dim(ids[[ix]]) == c(1, 1))){ + set_names(list(tibble(!!names(ids[ix]) := rep(ids[[ix]][[1]], tempDim[1]))), names(ids[ix])) + } + }) + + if(length(wideID) > 1){ + wideNames <- wideColnames %>% + select(all_of(names(wideID)), everything()) %>% + unite(col = "new", !name, sep = "-_-_", na.rm = TRUE) %>% + pivot_wider(names_from = "name", values_from = "new") %>% + unlist(use.names = FALSE) + wideName <- paste0(names(wideID), collapse = "-_-_") + } else { + wideNames <- unlist(wideID, use.names = FALSE) + wideName <- names(wideID) + } + + # remove columns that are both in equalID and tempObs (for example, when an id-variable is used as key) + # dupEqualIDs <- which(!as.list(bind_cols(equalID)) %in% as.list(bind_cols(tempObs))) + + tempObs <- list(temp) + if(!is.null(wideID)){ + names(tempObs[[1]]) <- c("key", wideNames) + # newObs <- bind_cols(c(equalID[dupEqualIDs], tempObs), .name_repair = "minimal") %>% + newObs <- bind_cols(c(equalID, tempObs), .name_repair = "minimal") %>% + pivot_longer(cols = all_of(wideNames), names_to = wideName) + valueNames <- "value" + } else { + newObs <- bind_cols(c(equalID, tempObs), .name_repair = "minimal") + valueNames <- names(newObs)[!names(newObs) %in% c(idNames, "key")] + } + + # dupObs <- newObs %>% + # pivot_wider(names_from = "key", + # values_from = all_of(valueNames), + # values_fn = length) %>% + # mutate(row = row_number()) %>% + # 
filter(if_any(all_of(obsNames), ~ . != 1)) + # + # if(dim(dupObs)[1] != 0){ + # warning("rows(", paste0(dupObs$row, collapse = ", "), ") are summarised from several values.", call. = FALSE) + # } + + newObs <- newObs %>% + pivot_wider(names_from = "key", + values_from = all_of(valueNames), + values_fn = ~ paste(.x, collapse = " | ")) |> + separate_longer_delim(cols = all_of(obsNames), delim = " | ") + + if(length(wideID) > 1){ + newObs <- newObs %>% + separate(col = wideName, + into = names(wideID), + sep = "-_-_") + } + + } else { + + obsNames <- names(obs) + idNames <- names(ids) + + wideColnames <- wideColnames %>% + select(all_of(names(wideID)), everything()) + + # find the correct name by joining via the column names + tempColnames <- temp %>% + pivot_longer(cols = everything(), names_to = "name", values_to = varName) + wideNames <- left_join(tempColnames, wideColnames, by = "name") %>% + select(-all_of(varName)) %>% + distinct() %>% + unite(col = "new", !name, sep = "-_-_", na.rm = TRUE) %>% + pivot_wider(names_from = "name", values_from = "new") %>% + unlist(use.names = FALSE) + + assertSetEqual(x = length(wideNames), y = tempDim[2]) + names(temp) <- wideNames + + # ... all id variables that have the same length + equalID <- map(.x = seq_along(ids), .f = function(ix){ + if(!names(ids[ix]) %in% names(wideID)){ + if(tempDim[1] == dim(ids[[ix]])[1]){ + bla <- ids[ix] + names(bla[[1]]) <- names(ids[ix]) + return(bla) + } else if(all(dim(ids[[ix]]) == c(1, 1))){ + set_names(list(tibble(!!names(ids[ix]) := rep(ids[[ix]][[1]], tempDim[1]))), names(ids[ix])) + } + } + + }) + equalID <- unlist(equalID, recursive = FALSE) + temp <- bind_cols(temp, equalID, .name_repair = "minimal") + + # and pivot those into longer form + temp <- pivot_longer(data = temp, + cols = all_of(wideNames), + names_to = paste0(names(wideID), collapse = "-_-_"), + values_to = varName) %>% + separate(col = paste0(names(wideID), collapse = "-_-_"), into = names(wideID), sep = "-_-_") + + if(i != 1){ + newObs <- suppressMessages(temp %>% + left_join(newObs)) + } else { + newObs <- temp + } + } + + } + + # for(i in seq_along(nonWideObs)){ + # + # } + + targetRows <- dim(newObs)[1] + + # sort the resulting tibble into the previous lists 'ids' and 'obs' + outIDs <- map(.x = seq_along(idNames), .f = function(ix) { + newObs[idNames[ix]] + }) + names(outIDs) <- idNames + + outObs <- map(.x = seq_along(obsNames), .f = function(ix) { + newObs[obsNames[ix]] + }) + names(outObs) <- obsNames + + } else { + targetRows <- unique(map_int(.x = seq_along(outObs), .f = function(ix){ + dim(outObs[[ix]])[1] + })) + assertIntegerish(x = targetRows, len = 1) + } + + # take care of variables that are too wide + widthsIDs <- map_int(.x = seq_along(outIDs), .f = function(ix){ + dim(outIDs[[ix]])[[2]] + }) + + if(!all(1 == widthsIDs)){ + + wideIDs <- ids[which(widthsIDs != 1)] + newIDs <- map(.x = seq_along(wideIDs), .f = function(ix){ + temp <- wideIDs[ix] + temp <- pivot_longer(data = temp[[1]], cols = everything(), values_to = paste0(names(temp[[1]]), collapse = " ")) %>% + select(-name) + }) + newIDs <- set_names(x = newIDs, nm = names(wideIDs)) + outIDs[which(widthsIDs != 1)] <- newIDs + + } + + # take care of variables that are not long enough + lengthIDs <- map_int(.x = seq_along(outIDs), .f = function(ix){ + dim(outIDs[[ix]])[[1]] + }) + + if(any(lengthIDs != targetRows)){ + + tooShort <- which(lengthIDs != targetRows) + for(i in seq_along(tooShort)){ + ind <- tooShort[i] + if(lengthIDs[ind] == 1){ + outIDs[[ind]] <- 
tibble(!!names(ids[[ind]]) := rep(ids[[ind]][[1]], targetRows)) + } + } + } + + outGrp <- NULL + if(!is.null(grp)){ + dims <- dim(grp[[1]]) + + nrRows <- targetRows * length(unique(unlist(grp))) + + if(all(dims == 1)){ + temp <- tibble(X = rep(unlist(grp, use.names = FALSE), nrRows)) + } else { + temp <- tibble(X = unlist(grp, use.names = FALSE)) + } + outGrp <- set_names(x = list(temp), nm = names(grp)) + + } + + outClust <- NULL + if(!is.null(clust)){ + if(is.list(clust)){ + dims <- dim(clust[[1]]) + + nrRows <- targetRows * length(unique(unlist(clust))) + + if(all(dims == 1)){ + temp <- tibble(X = rep(unlist(clust, use.names = FALSE), nrRows)) + } else { + if(dims[1] != nrRows){ + temp <- tibble(X = rep(unlist(clust, use.names = FALSE), nrRows/dims[1])) + } else { + temp <- tibble(X = unlist(clust, use.names = FALSE)) + } + } + outClust <- set_names(x = list(temp), nm = names(clust)) + + } + } + + return(c(outGrp, outClust, outIDs, outObs)) + +} + +#' Evaluate .sum constructs +#' +#' @param input [\code{data.frame(1)}]\cr table to reorganise. +#' @param groups [\code{list(3)}]\cr the groups-slot from a schema. +#' @param data [\code{integerish(.)}]\cr the cell column or row that should be +#' adapted to groupings. +#' @return the position of the evaluated position +#' @importFrom dplyr mutate row_number if_else group_by summarise n ungroup +#' left_join pull +#' @importFrom rlang eval_tidy + +.eval_sum <- function(input = NULL, groups = NULL, data = NULL){ + + out <- data + + if(!is.null(data)){ + + rowGroups <- input %>% + mutate(rn = as.double(row_number()), + rn_new = rn) + + if(!is.null(groups$rows)){ + + for(i in seq_along(groups$rows)){ + + temp <- groups$rows[[i]] + targetRows <- eval_tidy(temp$groups[[1]]) + + rowGroups <- rowGroups %>% + mutate(rn_new = if_else(rn_new %in% targetRows, min(targetRows), rn_new)) + + } + nrs <- rowGroups %>% + group_by(rn_new) %>% + summarise(nr = n()) %>% + ungroup() %>% + mutate(id = row_number()) + rowGroups$rn_new <- rep(nrs$id, nrs$nr) + + out <- rowGroups %>% + left_join(tibble(rn = out), ., by = "rn") %>% + pull(rn_new) + + } + + # if(!is.null(groups$cols)){ + # + # } + + } + + return(out) + +} + +#' Evaluate .find constructs +#' +#' @param input [\code{data.frame(1)}]\cr table to reorganise. +#' @param col [\code{list(2)}]\cr the output of the respective .find construct +#' used to match in columns. +#' @param row [\code{list(2)}]\cr the output of the respective .find construct +#' used to match in rows. +#' @param clusters [\code{list(7)}]\cr the cluster slot of the schema. 
+#' @return the columns or rows of the evaluated position +#' @importFrom checkmate assertNumeric assertList assertDataFrame +#' @importFrom rlang eval_tidy prim_name +#' @importFrom purrr map_int map_lgl +#' @importFrom tibble rownames_to_column +#' @importFrom tidyr pivot_longer pivot_wider +#' @importFrom dplyr select mutate across pull if_else +#' @importFrom stringr str_count str_detect + +.eval_find <- function(input = NULL, col = NULL, row = NULL, clusters = NULL){ + + assertDataFrame(x = input) + assertList(x = row, min.len = 1, null.ok = TRUE) + assertList(x = col, min.len = 1, null.ok = TRUE) + assertList(x = clusters, len = 7, null.ok = TRUE) + + # in case to look for columns + if(!is.null(col)){ + + theCols <- NULL + for(i in seq_along(col)){ + + if(names(col)[i] == "position"){ + cols <- as.numeric(1:dim(input)[2] %in% col[[i]]) + } else if(names(col)[i] == "find"){ + + theCol <- col[[i]] + term <- eval_tidy(theCol$by) + + if(!is.null(term)){ + + if(is.function(term)){ + + if(!is.null(theCol$row)){ + assertNumeric(x = theCol$row, len = 1, any.missing = FALSE) + if(theCol$relative){ + subset <- input[unique(theCol$row + clusters$row - 1),] + } else { + subset <- input[unique(theCol$row),] + } + } else { + subset <- input + } + + # make a subset table that contains numbers when possible + subset <- subset %>% + mutate(across(.cols = where(function(x) suppressWarnings(!anyNA(as.numeric(x[!is.na(x)]))) & !all(is.na(x))), .fns = as.numeric)) + + cols <- map_int(.x = 1:dim(input)[2], .f = function(ix){ + if(prim_name(term) != "is.na"){ + map(subset[,ix], term)[[1]] & !all(is.na(subset[,ix])) + } else { + map(subset[,ix], term)[[1]] + } + }) + + } else { + cols <- map_int(.x = 1:dim(input)[2], .f = function(ix){ + if(!is.null(theCol$row)){ + str_count(string = paste(input[[ix]][theCol$row], collapse = " "), pattern = term) + } else { + str_count(string = paste(input[[ix]], collapse = " "), pattern = term) + } + }) + } + + } else if(theCol$relative) { + + targetCols <- theCol$col + clusters$col - 1 + cols <- rep(0, dim(input)[2]) + cols[as.numeric(names(table(targetCols)))] <- table(targetCols) + + } else { + stop("not yet implemented") + } + + if(theCol$invert){ + temp <- cols + temp[cols == 0] <- 1 + temp[cols != 0] <- 0 + cols <- temp + } + + } else { + next + } + + theCols <- c(theCols, list(cols)) + + if(i != 1){ + theCols <- reduce(theCols, col[[i - 1]]) + theCols <- list(theCols) + } else { + if(length(col) == 1){ + theCols <- theCols[[1]] + } + } + + } + + theCols <- unlist(theCols) + out <- rep(seq_along(theCols), theCols) + + } + + # in case to look for rows + if(!is.null(row)){ + + theRows <- NULL + for(i in seq_along(row)){ + + if(names(row)[i] == "position"){ + inVal <- 1 + outVal <- 0 + + if(any(names(row) %in% "invert")){ + if(row$invert){ + inVal <- 0 + outVal <- 1 + } + } + + rows <- input %>% + mutate(it = if_else(row_number() %in% row[[i]], inVal, outVal)) %>% + pull(it) + } else if(names(row)[i] == "find"){ + + theRow <- row[[i]] + term <- eval_tidy(theRow$by) + + if(!is.null(term)){ + + if(is.function(term)){ + + if(!is.null(theRow$col)){ + assertNumeric(x = theRow$col, min.len = 1, any.missing = FALSE) + subset <- input[,unique(theRow$col)] + } else { + subset <- input + } + + # make a subset table that contains numbers when possible + subset <- subset %>% + rownames_to_column() %>% + pivot_longer(cols = -rowname, names_to = 'variable', values_to = 'value') %>% + pivot_wider(id_cols = variable, names_from = rowname, values_from = value) %>% + select(-variable) 
%>% + mutate(across(.cols = where(function(x) suppressWarnings(!anyNA(as.numeric(x)))), .fns = as.numeric)) + + rows <- map_int(.x = 1:dim(subset)[2], .f = function(ix){ + all(map(subset[,ix], term)[[1]]) + }) + } else { + + if(length(term) > 1){ + term <- paste0(term, collapse = "|") + } + + if(!is.null(theRow$col)){ + rows <- input %>% + mutate(it = if_else(if_any(theRow$col, ~ grepl(x = .x, pattern = term)), 1, 0)) %>% + pull(it) + } else { + rows <- input %>% + unite(col = all, everything(), sep = " ", na.rm = TRUE) + + # if the theRow is the clusters row, count elements, otherwise detect them + if(is.list(clusters$row)){ + clustRows <- clusters$row$find + } else { + clustRows <- NULL + } + if(identical(theRow, clustRows)){ + rows <- rows %>% + mutate(it = str_count(string = all, pattern = term)) + } else { + rows <- rows %>% + mutate(it = as.numeric(str_detect(string = all, pattern = term))) + } + rows <- rows %>% + pull(it) + } + + } + } else if(theRow$relative) { + + targetRows <- theRow$row + clusters$row - 1 + rows <- input %>% + mutate(rn = row_number()) %>% + left_join(., tibble(rn = targetRows), by = "rn") %>% + mutate(it = if_else(rn %in% targetRows, 1, 0)) %>% + group_by(rn) %>% + summarise(it = sum(it, na.rm = TRUE)) %>% + pull(it) + + } else { + stop("not yet implemented") + } + + + if(theRow$invert){ + temp <- rows + temp[rows == 0] <- 1 + temp[rows != 0] <- 0 + rows <- temp + } + + } else { + next + } + + theRows <- c(theRows, list(rows)) + + if(i != 1){ + theRows <- reduce(theRows, row[[i - 1]]) + theRows <- list(theRows) + } else { + if(length(row) == 1){ + theRows <- theRows[[1]] + } + } + + } + + theRows <- unlist(theRows) + out <- rep(seq_along(theRows), theRows) + + } + + return(out) +} + + +#' Test for a valid table +#' +#' This function is a collection of expectations which ensure that the output of +#' \code{\link{reorganise}} is formally and contentwise correct. It is used in +#' the tests of this package. +#' @param x a table to test. +#' @param units the number of units in the output table (from 1 to 3) +#' @param variables the variables that should be in the output table (either +#' "harvested" or "production") +#' @param groups whether or not groups are in the test table. +#' @param flags whether or not flags are in the test table. +#' @return Either an error message of the invalid expectations, or the output of +#' the last successful expectation. 
+#' @importFrom testthat expect_identical +#' @importFrom checkmate expect_names expect_tibble expect_list assertChoice + +.expect_valid_table <- function(x = NULL, units = 1, variables = NULL, + groups = FALSE, flags = FALSE){ + + assertChoice(x = units, choices = c(1:3)) + assertChoice(x = variables, choices = c("harvested", "production"), null.ok = TRUE) + + if(units == 1){ + + if(groups){ + expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1")) + } + expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1")) + expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2")) + expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean")) + if(is.null(variables)){ + if(groups){ + expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 6) + expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) + } else { + if(flags){ + expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 6) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 5) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) + } + } + expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211)) + expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212)) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 4, ncols = 4) + expect_names(x = colnames(x), permutation.of = c("territories", "year", "commodities", variables) ) + if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211)) + if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212)) + } + + } else if(units == 2){ + + if(groups){ + expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1")) + expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2")) + } else { + expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2")) + } + expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2")) + expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean")) + if(is.null(variables)){ + if(groups){ + expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 6) + expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) + } else { + if(flags){ + expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 6) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 5) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) + } + } + + expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211)) + expect_identical(object = x$production, expected = c(1122, 1112, 
1222, 1212, 2122, 2112, 2222, 2212)) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 8, ncols = 4) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", variables) ) + if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211)) + if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212)) + } + + + } else if(units == 3){ + + if(groups){ + expect_identical(object = x$region, expected = c("group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 1", "group 2", "group 2", "group 2", "group 2")) + expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2", "unit 3", "unit 3", "unit 3", "unit 3")) + } else { + expect_identical(object = x$territories, expected = c("unit 1", "unit 1", "unit 1", "unit 1", "unit 2", "unit 2", "unit 2", "unit 2", "unit 3", "unit 3", "unit 3", "unit 3")) + } + expect_identical(object = x$year, expected = c("year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2", "year 1", "year 1", "year 2", "year 2")) + expect_identical(object = x$commodities, expected = c("maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean", "maize", "soybean")) + if(is.null(variables)){ + if(groups){ + expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 6) + expect_names(x = colnames(x), permutation.of = c("region", "territories", "year", "commodities", "harvested", "production") ) + } else { + if(flags){ + expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 6) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production", "flag") ) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 5) + expect_names(x = colnames(x), permutation.of =c("territories", "year", "commodities", "harvested", "production") ) + } + } + expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211, 3121, 3111, 3221, 3211)) + expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212, 3122, 3112, 3222, 3212)) + } else { + expect_tibble(x = x, any.missing = FALSE, nrows = 12, ncols = 4) + expect_names(x = colnames(x), permutation.of = c("territories", "year", "commodities", variables) ) + if(variables == "harvested") expect_identical(object = x$harvested, expected = c(1121, 1111, 1221, 1211, 2121, 2111, 2221, 2211, 3121, 3111, 3221, 3211)) + if(variables == "production") expect_identical(object = x$production, expected = c(1122, 1112, 1222, 1212, 2122, 2112, 2222, 2212, 3122, 3112, 3222, 3212)) + } + } } \ No newline at end of file diff --git a/R/reorganise.R b/R/reorganise.R index 8d1a8ba..b5a137c 100755 --- a/R/reorganise.R +++ b/R/reorganise.R @@ -1,102 +1,102 @@ -#' Reorganise a table -#' -#' This function takes a disorganised messy table and rearranges columns and -#' rows into a tidy table based on a schema description. -#' @param input [\code{data.frame(1)}]\cr table to reorganise. -#' @param schema [\code{symbol(1)}]\cr the schema description of \code{input}. -#' @return A (tidy) table which is the result of reorganising \code{input} based -#' on \code{schema}. 
-#' @examples -#' # a rather disorganised table with messy clusters and a distinct variable -#' (input <- tabs2shift$clusters_messy) -#' -#' # put together schema description by ... -#' # ... identifying cluster positions -#' schema <- setCluster(id = "territories", left = c(1, 1, 4), top = c(1, 8, 8)) -#' -#' # ... specifying the cluster ID as id variable (obligatory) -#' schema <- schema %>% -#' setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) -#' -#' # ... specifying the distinct variable (explicit position) -#' schema <- schema %>% -#' setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) -#' -#' # ... specifying a tidy variable (by giving the column values) -#' schema <- schema %>% -#' setIDVar(name = "commodities", columns = c(1, 1, 4)) -#' -#' # ... identifying the (tidy) observed variables -#' schema <- schema %>% -#' setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% -#' setObsVar(name = "production", columns = c(3, 3, 6)) -#' -#' # get the tidy output -#' reorganise(input, schema) -#' -#' @importFrom checkmate assertDataFrame assertIntegerish -#' @importFrom dplyr bind_rows select bind_cols left_join mutate_all -#' @importFrom tibble tibble -#' @importFrom purrr reduce map map_int -#' @export - -reorganise <- function(input = NULL, schema = NULL){ - - # check validity of arguments - assertDataFrame(x = input) - - # 1. add missing information in schema ---- - schema <- validateSchema(input = input, schema = schema) - - # 2. select data ---- - input <- validateInput(input = input, schema = schema) - - # if a cluster id has been specified, extract the variable values - clusterVar <- getClusterVar(input = input, schema = schema) - - # if a group id has been specified, extract the variable values - groupVar <- getGroupVar(input = input, schema = schema) - - # select the id variables - idVars <- getIDVars(input = input, schema = schema) - - # select the observed variables - obsVars <- getObsVars(input = input, schema = schema) - - nClusters <- length(idVars) - - theValues <- list() - for(i in 1:nClusters){ - - # match all of the readily available variables - # ids = idVars[[i]]; obs = obsVars[[i]]; clust = clusterVar[i]; grp = groupVar[i] - tidyVars <- .tidyVars(ids = idVars[[i]], obs = obsVars[[i]], - clust = clusterVar[i], grp = groupVar[i]) - - # put together the table - theTable <- bind_cols(tidyVars, .name_repair = "minimal") - names(theTable) <- names(tidyVars) - - # append the data to the overall output list - theValues <- c(theValues, list(theTable)) - - } - - clustNames <- map(.x = seq_along(theValues), .f = function(ix){ - names(theValues[[ix]]) - }) - - differentNames <- isFALSE(reduce(.x = clustNames, .f = function(x,y) if (identical(x,y)) x else FALSE)) - - if(differentNames){ - out <- suppressMessages(reduce(theValues, left_join)) - } else { - out <- bind_rows(theValues) - } - - out <- out %>% - select(names(schema@variables)) %>% - .updateFormat(schema = schema) - - return(out) -} +#' Reorganise a table +#' +#' This function takes a disorganised messy table and rearranges columns and +#' rows into a tidy table based on a schema description. +#' @param input [\code{data.frame(1)}]\cr table to reorganise. +#' @param schema [\code{symbol(1)}]\cr the schema description of \code{input}. +#' @return A (tidy) table which is the result of reorganising \code{input} based +#' on \code{schema}. 
+#' @examples +#' # a rather disorganised table with messy clusters and a distinct variable +#' (input <- tabs2shift$clusters_messy) +#' +#' # put together schema description by ... +#' # ... identifying cluster positions +#' schema <- setCluster(id = "territories", left = c(1, 1, 4), top = c(1, 8, 8)) +#' +#' # ... specifying the cluster ID as id variable (obligatory) +#' schema <- schema %>% +#' setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) +#' +#' # ... specifying the distinct variable (explicit position) +#' schema <- schema %>% +#' setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) +#' +#' # ... specifying a tidy variable (by giving the column values) +#' schema <- schema %>% +#' setIDVar(name = "commodities", columns = c(1, 1, 4)) +#' +#' # ... identifying the (tidy) observed variables +#' schema <- schema %>% +#' setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% +#' setObsVar(name = "production", columns = c(3, 3, 6)) +#' +#' # get the tidy output +#' reorganise(input, schema) +#' +#' @importFrom checkmate assertDataFrame assertIntegerish +#' @importFrom dplyr bind_rows select bind_cols left_join mutate_all +#' @importFrom tibble tibble +#' @importFrom purrr reduce map map_int +#' @export + +reorganise <- function(input = NULL, schema = NULL){ + + # check validity of arguments + assertDataFrame(x = input) + + # 1. add missing information in schema ---- + schema <- validateSchema(input = input, schema = schema) + + # 2. select data ---- + input <- validateInput(input = input, schema = schema) + + # if a cluster id has been specified, extract the variable values + clusterVar <- getClusterVar(input = input, schema = schema) + + # if a group id has been specified, extract the variable values + groupVar <- getGroupVar(input = input, schema = schema) + + # select the id variables + idVars <- getIDVars(input = input, schema = schema) + + # select the observed variables + obsVars <- getObsVars(input = input, schema = schema) + + nClusters <- length(idVars) + + theValues <- list() + for(i in 1:nClusters){ + + # match all of the readily available variables + # ids = idVars[[i]]; obs = obsVars[[i]]; clust = clusterVar[i]; grp = groupVar[i] + tidyVars <- .tidyVars(ids = idVars[[i]], obs = obsVars[[i]], + clust = clusterVar[i], grp = groupVar[i]) + + # put together the table + theTable <- bind_cols(tidyVars, .name_repair = "minimal") + names(theTable) <- names(tidyVars) + + # append the data to the overall output list + theValues <- c(theValues, list(theTable)) + + } + + clustNames <- map(.x = seq_along(theValues), .f = function(ix){ + names(theValues[[ix]]) + }) + + differentNames <- isFALSE(reduce(.x = clustNames, .f = function(x,y) if (identical(x,y)) x else FALSE)) + + if(differentNames){ + out <- suppressMessages(reduce(theValues, left_join)) + } else { + out <- bind_rows(theValues) + } + + out <- out %>% + select(names(schema@variables)) %>% + .updateFormat(schema = schema) + + return(out) +} diff --git a/R/reportProblems.R b/R/reportProblems.R index 32d2736..88f6dbd 100644 --- a/R/reportProblems.R +++ b/R/reportProblems.R @@ -1,80 +1,80 @@ -#' Catch and report problems in a schema description -#' -#' This function checks the current setup of a schema and reports problems that -#' will lead to an error of \code{reorganise} if not fixed. -#' @param schema [\code{character(1)}]\cr the schema description to check. 
- -reportProblems <- function(schema = NULL){ - - clusters <- schema@clusters - format <- schema@format - groups <- schema@groups - filter <- schema@filter - variables <- schema@variables - - # obvious errors - # 1. a wide observed variable is defined, but no matching wide id variable - # in setObsVar: - # - ensure that 'value' is actually a value of the provided column in 'key' - # in setIDVar: - # - ensure that if dist = TRUE, values are absolute and the defined fields contain valid values - # - ensure that relative values are still within the cluster - # - all the below - # # if(!is.null(columns)){ - # # ensure that a row is set, in case the variable is contained in several columns - # if(nClusters == 1){ - # if(length(columns) > 1){ - # if(is.null(rows)){ - # if(is.null(merge)){ - # message(" -> the variable '", name, "' is wide (i.e., in several columns), but no row with the names, nor the merge option is set.") - # } - # } - # } else{ - # if(!is.null(rows)){ - # message(" -> 'row' is set for the variable '", name, "', even though it is not needed.") - # } - # } - # } - # # ensure that a split expression is set, in case the variable is contained in a column that already contains another variable - # if(!colQuo){ - # if(any(columns %in% prevIDcols)){ - # if(is.null(split)){ - # message(" -> the variable '", name, "' is in a column (", paste(columns, collapse = ", "), ") that already contains another variable, but no split-expression is set.") - # } - # } - # } - # } else{ - # # if(!is.null(rows)){ - # # message(" -> 'rows' is set for the variable '", name, "', even though it is not needed.") - # # } - # } - - # ensure that split results in a non-empty value - # if(!is.null(split)){ - # if(is.null(columns)){ - # message(" -> the variable '", name, "' has a split-expression, but no column is set.") - # } else { - # # test that the split expression doesn't lead to an empty value - # # recently not yet defined to have the input table in an environment for those "in-situ" tests - # } - # } - - # ensure that when not using 'value', either columns or rows is set - # if(is.null(value)){ - # if(is.null(columns) & is.null(rows)){ - # message(" -> for the variable '", name, "' there is neither an explicit 'value' set, nor are there any column(s) (and rows).") - # } - # } - - # in case the user thought that it's sufficient to specify a row - # if(!is.null(rows)){ - # if(is.null(columns)){ - # message(" -> in case the variable '", name, "' is in several columns, set first those columns and then the row of the variable names.") - # } else{ - # # test that the column/row combination (here the variable names should be) leads to non-empty character values - # - # } - # } - - +#' Catch and report problems in a schema description +#' +#' This function checks the current setup of a schema and reports problems that +#' will lead to an error of \code{reorganise} if not fixed. +#' @param schema [\code{character(1)}]\cr the schema description to check. + +reportProblems <- function(schema = NULL){ + + clusters <- schema@clusters + format <- schema@format + groups <- schema@groups + filter <- schema@filter + variables <- schema@variables + + # obvious errors + # 1. 
a wide observed variable is defined, but no matching wide id variable + # in setObsVar: + # - ensure that 'value' is actually a value of the provided column in 'key' + # in setIDVar: + # - ensure that if dist = TRUE, values are absolute and the defined fields contain valid values + # - ensure that relative values are still within the cluster + # - all the below + # # if(!is.null(columns)){ + # # ensure that a row is set, in case the variable is contained in several columns + # if(nClusters == 1){ + # if(length(columns) > 1){ + # if(is.null(rows)){ + # if(is.null(merge)){ + # message(" -> the variable '", name, "' is wide (i.e., in several columns), but no row with the names, nor the merge option is set.") + # } + # } + # } else{ + # if(!is.null(rows)){ + # message(" -> 'row' is set for the variable '", name, "', even though it is not needed.") + # } + # } + # } + # # ensure that a split expression is set, in case the variable is contained in a column that already contains another variable + # if(!colQuo){ + # if(any(columns %in% prevIDcols)){ + # if(is.null(split)){ + # message(" -> the variable '", name, "' is in a column (", paste(columns, collapse = ", "), ") that already contains another variable, but no split-expression is set.") + # } + # } + # } + # } else{ + # # if(!is.null(rows)){ + # # message(" -> 'rows' is set for the variable '", name, "', even though it is not needed.") + # # } + # } + + # ensure that split results in a non-empty value + # if(!is.null(split)){ + # if(is.null(columns)){ + # message(" -> the variable '", name, "' has a split-expression, but no column is set.") + # } else { + # # test that the split expression doesn't lead to an empty value + # # recently not yet defined to have the input table in an environment for those "in-situ" tests + # } + # } + + # ensure that when not using 'value', either columns or rows is set + # if(is.null(value)){ + # if(is.null(columns) & is.null(rows)){ + # message(" -> for the variable '", name, "' there is neither an explicit 'value' set, nor are there any column(s) (and rows).") + # } + # } + + # in case the user thought that it's sufficient to specify a row + # if(!is.null(rows)){ + # if(is.null(columns)){ + # message(" -> in case the variable '", name, "' is in several columns, set first those columns and then the row of the variable names.") + # } else{ + # # test that the column/row combination (here the variable names should be) leads to non-empty character values + # + # } + # } + + } \ No newline at end of file diff --git a/R/schema.R b/R/schema.R index f7954c3..96d6ef2 100755 --- a/R/schema.R +++ b/R/schema.R @@ -1,709 +1,709 @@ -#' The \code{schema} class (S4) and its methods -#' -#' A \code{schema} stores the information of where which information is stored -#' in a table of data. -#' @slot cluster [\code{list(1)}]\cr description of -#' \code{\link[=setCluster]{clusters}} in the table. -#' @slot format [\code{list(1)}]\cr description of the table -#' \code{\link[=setFormat]{format}} -#' @slot variables [\code{named list(.)}]\cr description of -#' \code{\link[=setIDVar]{identifying}} and \code{\link[=setObsVar]{observed}} -#' variables. -#' @section Setting up schema descriptions: This section outlines the currently -#' recommended strategy for setting up schema descriptions. For example tables -#' and the respective schemas, see the vignette. -#' -#' \enumerate{ \item \emph{Variables}: Clarify which are the identifying -#' variables and which are the observed variables. 
Make sure not to mistake a -#' listed observed variable as identifying variable. -#' -#' \item \emph{Clusters}: Determine whether there are clusters and if so, find -#' the origin (top left cell) of each cluster and provide the required -#' information in \code{\link[=setCluster]{setCluster}(top = ..., left = -#' ...)}. It is advised to treat a table that contains meta-data in the top -#' rows as cluster, as this is often the case with implicit variables. All -#' variables need to be specified in each cluster (in case clusters are all -#' organised in the same arrangement), or \code{relative = TRUE} can be used. -#' Data may be organised into clusters a) whenever a set of variables occurs -#' more than once in the same table, nested into another variable, or b) when -#' the data are organised into separate spreadsheets or files according to one -#' of the variables (depending on the context, these issues can also be solved -#' differently). In both cases the variable responsible for clustering (the -#' cluster ID) can be either an identifying variable, or a categorical -#' observed variable: \itemize{ -#' -#' \item in case the cluster ID is an identifying variable, provide its name -#' in \code{\link[=setCluster]{setCluster(id = ...)}} and specify it as an -#' identifying variable (\code{\link{setIDVar}}) -#' -#' \item in case it is a observed variable, provide simply -#' \code{\link[=setCluster]{setCluster}(..., id = "observed")}. } -#' -#' \item \emph{Meta-data}: Provide potentially information about the format -#' (\code{\link{setFormat}}). -#' -#' \item \emph{Identifying variables}: Determine the following: \itemize{ -#' -#' \item is the variable available at all? This is particularly important when -#' the data are split up into tables that are in spreadsheets or files. Often -#' the variable that splits up the data (and thus identifies the clusters) is -#' not explicitly available in the table anymore. In such a case, provide the -#' value in \code{\link[=setIDVar]{setIDVar}(..., value = ...)}. -#' -#' \item all columns in which the variable values sit. -#' -#' \item in case the variable is in several columns, determine additionally -#' the row in which its values sit. In this case, the values will look like -#' they are part of a header. -#' -#' \item in case the variable must be split off of another column, provide a -#' regular expression that results in the target subset via -#' \code{\link[=setIDVar]{setIDVar}(..., split = ...)}. -#' -#' \item in case the variable is distinct from the main table, provide the -#' explicit (non-relative) position and set -#' \code{\link[=setIDVar]{setIDVar}(..., distinct = TRUE)}. } -#' -#' \item \emph{Observed variable}: Determine the following: \itemize{ -#' -#' \item all columns in which the values of the variable sit. -#' -#' \item the conversion factor. -#' -#' \item in case the variable is not tidy, go through the following cases one -#' after the other: \itemize{ -#' -#' \item in case the variable is nested in a wide identifying variable, -#' determine in addition to the columns in which the values sit also the rows -#' in which the \emph{variable name} sits. -#' -#' \item in case the names of the variable are given as a value of an -#' identifying variable, give the column name as -#' \code{\link[=setObsVar]{setObsVar}(..., key = ...)}, together with the name -#' of the respective observed variable (as it appears in the table) in -#' \code{values}. 
-#' -#' \item in case the name of the variable is the ID of clusters, specify -#' \code{\link[=setObsVar]{setObsVar}(..., key = "cluster", value = ...)}, -#' where \code{values} has the cluster number the variable refers to. } } } -#' @importFrom rlang is_integerish -#' @importFrom stringr str_sub - -schema <- setClass(Class = "schema", - slots = c(clusters = "list", - format = "list", - groups = "list", - filter = "list", - variables = "list", - validated = "logical" - ) -) - -setValidity(Class = "schema", function(object){ - - errors <- character() - - if(!.hasSlot(object = object, name = "clusters")){ - errors <- c(errors, "the schema does not have a 'clusters' slot.") - } else { - if(!is.list(object@clusters)){ - errors <- c(errors, "the slot 'clusters' is not a list.") - } - if(!all(names(object@clusters) %in% c("id", "group", "row", "col", "width", "height", "member"))){ - errors <- c(errors, "'names(schema$clusters)' must be a permutation of set {id,group,row,col,width,height,member}") - } - if(!is.null(object@clusters$row)){ - if(!is.numeric(object@clusters$row)){ - errors <- c(errors, "'schema$clusters$row' must have a numeric value.") - } - } - if(!is.null(object@clusters$col)){ - if(!is.numeric(object@clusters$col)){ - errors <- c(errors, "'schema$clusters$col' must have a numeric value.") - } - } - if(!is.null(object@clusters$width)){ - if(!is.numeric(object@clusters$width)){ - errors <- c(errors, "'schema$clusters$width' must have a numeric value.") - } - } - if(!is.null(object@clusters$height)){ - if(!is.numeric(object@clusters$height)){ - errors <- c(errors, "'schema$clusters$height' must have a numeric value.") - } - } - if(!is.null(object@clusters$id)){ - if(!is.character(object@clusters$id)){ - errors <- c(errors, "'schema$clusters$id' must have a character value.") - } - } - if(!is.null(object@clusters$group)){ - if(!is.character(object@clusters$group)){ - errors <- c(errors, "'schema$clusters$group' must have a character value.") - } - } - if(!is.null(object@clusters$member)){ - if(!is.numeric(object@clusters$member)){ - errors <- c(errors, "'schema$clusters$member' must have a numeric value.") - } - } - } - - if(!.hasSlot(object = object, name = "format")){ - errors <- c(errors, "the schema does not have a 'format' slot.") - } else { - if(!is.list(object@format)){ - errors <- c(errors, "the slot 'format' is not a list.") - } - if(length(object@format) == 0){ - errors <- c(errors, "the slot 'format' does not contain any entries.") - } - if(!all(names(object@format) %in% c("header", "del", "dec", "na", "flags"))){ - errors <- c(errors, "'names(schema$format)' must be a permutation of set {header,del,dec,na,flags}") - } - if(!is.null(object@format$header)){ - if(!is_integerish(object@format$header)){ - errors <- c(errors, "'schema$format$header' must must have a integer value.") - } - } - if(!is.null(object@format$del)){ - if(!is.character(object@format$del)){ - errors <- c(errors, "'schema$format$del' must have a character value.") - } - } - if(!is.null(object@format$dec)){ - if(!is.character(object@format$dec)){ - errors <- c(errors, "'schema$format$dec' must have a character value.") - } - } - if(!is.null(object@format$na)){ - if(!is.character(object@format$na)){ - errors <- c(errors, "'schema$format$na' must have a character value.") - } - } - if(!is.null(object@format$flags)){ - if(!is.data.frame(object@format$flags)){ - errors <- c(errors, "'schema$format$flags' must be a data.frame with column names 'flag' and 'value'.") - } - } - } - - if(!.hasSlot(object = 
object, name = "groups")){ - errors <- c(errors, "the schema does not have a 'groups' slot.") - } else { - if(!is.list(object@groups)){ - errors <- c(errors, "the slot 'groups' is not a list.") - } - if(!all(names(object@groups) %in% c("rows", "cols", "clusters"))){ - errors <- c(errors, "'names(schema$groups)' must be a permutation of set {rows,cols,clusters}") - } - if(!is.null(object@groups$rows)){ - if(!is.list(object@groups$rows)){ - errors <- c(errors, "'object@groups$rows' must be a list.") - } - } - if(!is.null(object@groups$cols)){ - if(!is.list(object@groups$cols)){ - errors <- c(errors, "'object@groups$cols' must be a list.") - } - } - if(!is.null(object@groups$clusters)){ - if(!is.list(object@groups$clusters)){ - errors <- c(errors, "'object@groups$clusters' must be a list.") - } - } - } - - if(!.hasSlot(object = object, name = "filter")){ - errors <- c(errors, "the schema does not have a 'filter' slot.") - } else { - if(!is.list(object@filter)){ - errors <- c(errors, "the slot 'filter' is not a list.") - } - if(length(object@filter) == 0){ - errors <- c(errors, "the slot 'filter' does not contain any entries.") - } - if(!all(names(object@filter) %in% c("row", "col"))){ - errors <- c(errors, "'names(schema$filter)' must be a permutation of set {row,col}") - } - if(!is.null(object@filter$row)){ - if(!is.numeric(object@filter$row)){ - errors <- c(errors, "'schema$filter$row' must have a numeric value.") - } - } - if(!is.null(object@filter$col)){ - if(!is.numeric(object@filter$col)){ - errors <- c(errors, "'schema$filter$col' must have a numeric value.") - } - } - } - - if(!.hasSlot(object = object, name = "variables")){ - errors <- c(errors, "the schema does not have a 'variables' slot.") - } else { - if(!is.list(object@variables)){ - errors <- c(errors, "the slot 'variables' is not a list.") - } - if(length(object@variables) == 0){ - errors <- c(errors, "the slot 'variables' does not contain any entries.") - } - for(i in seq_along(object@variables)){ - theVariable <- object@variables[[i]] - theName <- names(object@variables)[i] - - if(!theVariable$vartype %in% c("id", "observed")){ - errors <- c(errors, paste0("the variables '", theName, "' does must be of type 'id' or 'observed'.")) - return(paste0("\n", errors)) - } - - if(theVariable$vartype == "id"){ - if(!all(names(theVariable) %in% c("vartype", "datype", "value", "row", "col", "split", "dist", "merge"))){ - errors <- c(errors, paste0("'names(", theName, ")' must be a permutation of set {vartype,datype,value,row,col,split,merge,dist}")) - } - if(!is.null(theVariable$datype)){ - if(!is.character(theVariable$datype)){ - errors <- c(errors, paste0("'", theName, "$datype' must have a character value.")) - } - } - # if(!is.null(theVariable$value)){ - # if(!is.character(theVariable$value)){ - # errors <- c(errors, paste0("'", theName, "$value' must have a character value.")) - # } - # } - if(!is.null(theVariable$split)){ - if(!is.character(theVariable$split)){ - errors <- c(errors, paste0("'", theName, "$split' must have a character value.")) - } - } - if(!is.null(theVariable$row)){ - if(!(is.numeric(theVariable$row) | testClass(x = theVariable$row, classes = "quosure"))){ - errors <- c(errors, paste0("'", theName, "$row' must have a numeric value.")) - } - } - if(!is.null(theVariable$col)){ - if(!(is.numeric(theVariable$col) | testClass(x = theVariable$col, classes = "quosure"))){ - errors <- c(errors, paste0("'", theName, "$col' must have a numeric value.")) - } - } - if(!is.logical(theVariable$dist)){ - errors <- c(errors, 
paste0("'", theName, "$dist' must either be 'TRUE' or 'FALSE'.")) - } - - } else { - if(!all(names(theVariable) %in% c("vartype", "datype", "factor", "row", "col", "dist", "key", "value"))){ - errors <- c(errors, paste0("'names(", theName, ")' must be a permutation of set {vartype,datype,factor,row,col,dist,key,value}")) - } - if(!is.null(theVariable$datype)){ - if(!is.character(theVariable$datype)){ - errors <- c(errors, paste0("'", theName, "$datype' must have a character value.")) - } - } - if(!is.null(theVariable$factor)){ - if(!is.numeric(theVariable$factor)){ - errors <- c(errors, paste0("'", theName, "$factor' must have a numeric value.")) - } - } - if(!is.null(theVariable$row)){ - if(!(is.numeric(theVariable$row) | testClass(x = theVariable$row, classes = "quosure"))){ - errors <- c(errors, paste0("'", theName, "$row' must have a numeric value.")) - } - } - if(!is.null(theVariable$col)){ - if(!(is.numeric(theVariable$col) | testClass(x = theVariable$col, classes = "quosure"))){ - errors <- c(errors, paste0("'", theName, "$col' must have a numeric value.")) - } - } - if(!is.logical(theVariable$dist)){ - errors <- c(errors, paste0("'", theName, "$dist' must either be 'TRUE' or 'FALSE'.")) - } - if(!is.null(theVariable$value)){ - if(theVariable$key == "cluster"){ - if(!rlang::is_integerish(theVariable$value)){ - errors <- c(errors, paste0("'", theName, "$value' must have an integer value.")) - } - } else { - if(!is.character(theVariable$value)){ - errors <- c(errors, paste0("'", theName, "$value' must have a character value.")) - } - } - } - - } - - } - } - - if(!.hasSlot(object = object, name = "validated")){ - errors <- c(errors, "the schema does not have a 'validated' slot.") - } else { - if(!is.logical(object@validated)){ - errors <- c(errors, "the slot 'validated' is not a logical value.") - } - } - - - if(length(errors) == 0){ - return(TRUE) - } else { - return(paste0("\n", errors)) - } -}) - -#' Print the \code{schema} -#' -#' @param object [\code{schema}]\cr the schema to print. 
-#' @importFrom crayon yellow -#' @importFrom rlang is_primitive -#' @importFrom stringr str_split -#' @importFrom rlang eval_tidy is_quosure prim_name - -setMethod(f = "show", - signature = "schema", - definition = function(object){ - clusters <- object@clusters - filter <- object@filter - variables <- object@variables - - nClusters <- ifelse(length(clusters$row) == 0, 1, length(clusters$row)) - nvars <- length(variables) - theNames <- names(variables) - nClustName <- ifelse(nClusters > 1, "clusters", "cluster") - - # make and print cluster info ---- - if(is.null(clusters$row) & is.null(clusters$col) & is.null(clusters$width) & is.null(clusters$height)){ - clusterSpecs <- paste0(" (whole spreadsheet)") - } else { - if(is.null(clusters$col)){ - left <- 1 - } else { - left <- clusters$col - } - if(is.null(clusters$row)){ - top <- 1 - } else { - top <- clusters$row - } - clusterSpecs <- paste0("\n origin : ", paste(top, left, collapse = ", ", sep = "|"), " (row|col)", - ifelse(!is.null(clusters$group), paste0("\n groups : ", clusters$group), ""), - ifelse(!is.null(clusters$id), paste0("\n id : ", clusters$id), "")) - } - cat(paste0(" ", nClusters, " ", nClustName, clusterSpecs, "\n\n")) - - # make and print filter info ---- - if(is.null(filter$col) & is.null(filter$row)){ - filterSpecs <- paste0("") - } else { - - filterSpecs <- paste0(" filter ", - # ifelse(!is.null(filter$col), paste0("\n col: [", ifelse(length(filter$col) > 10, paste0(c(filter$col[1:10], "..."), collapse = ", "), paste0(filter$col, collapse = ", ")), "]"), ""), - paste0(" [", - ifelse(is.list(filter$row), - paste0("by '", as.character(filter$row$by[2]), "' in column ", filter$row$col), - paste0("rows ", ifelse(length(filter$row) > 10, paste0(c(filter$row[1:10], "..."), collapse = ", "), paste0(filter$row, collapse = ", ")))), - "]"), "\n\n") - } - cat(filterSpecs) - - # make and print variable info ---- - included <- c(TRUE, TRUE) - theNames <- sapply(seq_along(variables), function(x){ - names(variables)[x] - }) - nNames <- sapply(seq_along(theNames), function(x){ - ifelse(test = is.null(theNames[[x]]) , yes = 0, no = nchar(theNames[x])) - }) - maxNames <- ifelse(any(nNames > 8), max(nNames), 8) - theTypes <- sapply(seq_along(variables), function(x){ - variables[[x]]$vartype - }) - - # rows - theRows <- sapply(seq_along(variables), function(x){ - if(variables[[x]]$vartype == "id"){ - if(is.null(variables[[x]]$row)){ - "" - } else if(is.list(variables[[x]]$row)){ - if(names(variables[[x]]$row) == "find"){ - eval_tidy(variables[[x]]$row$find$by) - } - } else { - temp <- unique(variables[[x]]$row) - # make a short sequence of 'theRows' - dists <- temp - c(temp[1]-1, temp)[-(length(temp)+1)] - if(all(dists == 1) & length(temp) > 1){ - paste0(min(temp), ":", max(temp)) - } else { - temp - } - } - } else { - "" - } - - }) - nRow <- sapply(seq_along(theRows), function(x){ - ifelse(test = is.null(theRows[[x]]) , yes = 0, no = nchar(paste0(theRows[[x]], collapse = ", "))) - }) - maxRows <- ifelse(any(nRow > 3), max(nRow), 3) - if(any(nRow != 0)){ - included <- c(included, TRUE) - } else { - included <- c(included, FALSE) - } - - theTops <- sapply(seq_along(variables), function(x){ - if(variables[[x]]$vartype == "observed"){ - if(is.null(variables[[x]]$row)){ - "" - } else if(is.list(variables[[x]]$row)){ - if(names(variables[[x]]$row) == "find"){ - eval_tidy(variables[[x]]$row$find$by) - } - } else { - if(all(variables[[x]]$value != "{all_rows}")){ - temp <- unique(variables[[x]]$row) - # make a short sequence of 'theRows' - dists 
<- temp - c(temp[1]-1, temp)[-(length(temp)+1)] - if(all(dists == 1) & length(temp) > 1){ - paste0(min(temp), ":", max(temp)) - } else { - temp - } - } else { - "" - } - } - } else { - "" - } - - }) - nTop <- sapply(seq_along(theTops), function(x){ - ifelse(test = is.null(theTops[[x]]) , yes = 0, no = nchar(paste0(theTops[[x]], collapse = ", "))) - }) - maxTops <- ifelse(any(nTop > 3), max(nTop), 3) - if(any(nTop != 0)){ - included <- c(included, TRUE) - } else { - included <- c(included, FALSE) - } - - # columns - theCols <- sapply(seq_along(variables), function(x){ - if(is.null(variables[[x]]$col)){ - "" - } else if(is.list(variables[[x]]$col)){ - if(names(variables[[x]]$col) == "find"){ - temp <- eval_tidy(variables[[x]]$col$find$by) - if(is_primitive(temp)){ - prim_name(temp) - } else { - temp - } - } - } else { - temp <- unique(variables[[x]]$col) - # make a short sequence of 'theCols' - if(is.numeric(temp)){ - dists <- temp - c(temp[1]-1, temp)[-(length(temp)+1)] - } else { - dists <- 0 - } - if(all(dists == 1) & length(temp) > 1){ - paste0(min(temp), ":", max(temp)) - } else { - temp - } - } - }) - nCols <- sapply(seq_along(theCols), function(x){ - ifelse(test = is.null(theCols[[x]]) | is.function(theCols[[x]]), yes = 0, no = nchar(paste0(theCols[[x]], collapse = ", "))) - }) - maxCols <- ifelse(any(nCols > 3), max(nCols), 3) - if(any(nCols != 0)){ - included <- c(included, TRUE) - } else { - included <- c(included, FALSE) - } - - # keys - theKeys <- sapply(seq_along(variables), function(x){ - if(variables[[x]]$vartype == "id"){ - NULL - } else { - if(!is.null(variables[[x]]$key)){ - if(grepl(pattern = "\n", variables[[x]]$key)){ - paste0(str_split(string = variables[[x]]$key, pattern = "\n", simplify = TRUE)[1], " ...") - } else { - variables[[x]]$key - } - } else { - NULL - } - - } - }) - nKeys <- sapply(seq_along(theKeys), function(x){ - ifelse(test = is.null(theKeys[[x]]) , yes = 0, no = nchar(theKeys[x])) - }) - maxKeys <- ifelse(any(nKeys > 3), max(nKeys), 3) - if(any(nKeys != 0)){ - included <- c(included, TRUE) - } else { - included <- c(included, FALSE) - } - - # values - theValues <- sapply(seq_along(variables), function(x){ - if(variables[[x]]$vartype == "id"){ - NULL - } else { - if(!is.null(variables[[x]]$value)){ - if(grepl(pattern = "\n", variables[[x]]$value)){ - paste0(str_split(string = variables[[x]]$value, pattern = "\n", simplify = TRUE)[1], " ...") - } else { - variables[[x]]$value - } - } else { - NULL - } - } - }) - nVals <- sapply(seq_along(theValues), function(x){ - ifelse(test = is.null(theValues[[x]]) , yes = 0, no = nchar(theValues[x])) - }) - maxVals <- ifelse(any(nVals > 5), max(nVals), 5) - if(any(nVals != 0)){ - included <- c(included, TRUE) - } else { - included <- c(included, FALSE) - } - - # whether variables are relative - # theRels <- sapply(seq_along(variables), function(x){ - # str_sub(as.character(variables[[x]]$rel), 1, 1) - # }) - # if(all(theRels == "F")){ - included <- c(included, FALSE) - # } else { - # included <- c(included, TRUE) - # } - - # whether variables are distinct - theDist <- sapply(seq_along(variables), function(x){ - str_sub(as.character(variables[[x]]$dist), 1, 1) - }) - if(all(theDist == "F")){ - included <- c(included, FALSE) - } else { - included <- c(included, TRUE) - } - - for(i in 1:(length(variables)+1)){ - - if(i == 1){ - - head1 <- paste0(" ", "variable", paste0(rep(" ", times = maxNames-5), collapse = "")) - line1 <- paste0(c(rep("-", maxNames+2), " "), collapse = "") - head2 <- paste0("type ") - line2 <- 
paste0(c(rep("-", 10), " "), collapse = "") - if(included[3]){ - head3 <- paste0("row", paste0(rep(" ", times = maxRows), collapse = "")) - line3 <- paste0(c(rep("-", maxRows+2), " "), collapse = "") - } else { - head3 <- line3 <- "" - } - if(included[4]){ - head41 <- paste0("top", paste0(rep(" ", times = maxTops), collapse = "")) - line41 <- paste0(c(rep("-", maxTops+2), " "), collapse = "") - } else { - head41 <- line41 <- "" - } - if(included[5]){ - head4 <- paste0("col", paste0(rep(" ", times = maxCols), collapse = "")) - line4 <- paste0(c(rep("-", maxCols+2), " "), collapse = "") - } else { - head4 <- line4 <- "" - } - if(included[6]){ - head5 <- paste0("key", paste0(rep(" ", times = maxKeys), collapse = "")) - line5 <- paste0(c(rep("-", maxKeys+2), " "), collapse = "") - } else { - head5 <- line5 <- "" - } - if(included[7]){ - head6 <- paste0("value", paste0(rep(" ", times = maxVals-2), collapse = "")) - line6 <- paste0(c(rep("-", maxVals+2), " "), collapse = "") - } else { - head6 <- line6 <- "" - } - if(included[8]){ - head7 <- paste0("rel ") - line7 <- paste0(c(rep("-", 5), " "), collapse = "") - } else { - head7 <- line7 <- "" - } - if(included[9]){ - head8 <-paste0("dist") - line8 <- paste0(c(rep("-", 6), " "), collapse = "") - } else { - head8 <- line8 <- "" - } - - cat(paste0(head1, head2, head3, head41, head4, head5, head6, head7, head8), "\n") - cat(" ", paste0(line1, line2, line3, line41, line4, line5, line6, line7, line8), "\n") - - } else { - - var1 <- paste0(" ", yellow(theNames[[i-1]]), - paste0(rep(" ", times = maxNames+3-nchar(theNames[[i-1]])), collapse = "")) - var2 <- paste0(theTypes[[i-1]], ifelse(theTypes[[i-1]] == "id", " ", " ")) - if(included[3]){ - var3 <- paste0(paste0(theRows[[i-1]], collapse = ", "), - paste0(rep(" ", times = maxRows+3-nRow[[i-1]]), collapse = "")) - } else { - var3 <- "" - } - if(included[4]){ - var41 <- paste0(paste0(theTops[[i-1]], collapse = ", "), - paste0(rep(" ", times = maxCols+3-nTop[[i-1]]), collapse = "")) - } else { - var41 <- "" - } - if(included[5]){ - var4 <- paste0(paste0(theCols[[i-1]], collapse = ", "), - paste0(rep(" ", times = maxCols+3-nCols[[i-1]]), collapse = "")) - } else { - var4 <- "" - } - if(included[6]){ - var5 <- paste0(theKeys[[i-1]], - paste0(rep(" ", times = maxKeys+3-nKeys[[i-1]]), collapse = "")) - } else { - var5 <- "" - } - if(included[7]){ - var6 <- paste0(theValues[[i-1]], - paste0(rep(" ", times = maxVals+3-nVals[[i-1]]), collapse = "")) - } else { - var6 <- "" - } - if(included[8]){ - # var7 <- paste0(theRels[[i-1]], " ") - } else { - var7 <- "" - } - if(included[9]){ - var8 <- paste0(theDist[[i-1]], " ") - } else { - var8 <- "" - } - - cat(paste0(var1, var2, var3, var41, var4, var5, var6, var7, var8, "\n")) - - } - - - } - - }) +#' The \code{schema} class (S4) and its methods +#' +#' A \code{schema} stores the information of where which information is stored +#' in a table of data. +#' @slot cluster [\code{list(1)}]\cr description of +#' \code{\link[=setCluster]{clusters}} in the table. +#' @slot format [\code{list(1)}]\cr description of the table +#' \code{\link[=setFormat]{format}} +#' @slot variables [\code{named list(.)}]\cr description of +#' \code{\link[=setIDVar]{identifying}} and \code{\link[=setObsVar]{observed}} +#' variables. +#' @section Setting up schema descriptions: This section outlines the currently +#' recommended strategy for setting up schema descriptions. For example tables +#' and the respective schemas, see the vignette. 
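As a compact preview of where the steps below lead, here is a rough, hedged sketch of a schema description for a simple table (an editor's illustration, not taken from the package; the variable names and column positions are assumptions). The individual set*() calls are chained with the pipe, as in the package's own examples, and the finished schema is then handed to reorganise() together with the disorganised input table.

library(tabshiftr)

# illustrative only: a table with two identifying columns and two observed columns
sketch <- setFormat(header = 1) %>%                      # step 3: meta-data
  setIDVar(name = "territories", columns = 1) %>%        # step 4: identifying variables
  setIDVar(name = "year", columns = 2) %>%
  setObsVar(name = "harvested", columns = 3) %>%         # step 5: observed variables
  setObsVar(name = "production", columns = 4)

# reorganise(schema = sketch, input = <the disorganised table>)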
+#' +#' \enumerate{ \item \emph{Variables}: Clarify which are the identifying +#' variables and which are the observed variables. Make sure not to mistake a +#' listed observed variable as identifying variable. +#' +#' \item \emph{Clusters}: Determine whether there are clusters and if so, find +#' the origin (top left cell) of each cluster and provide the required +#' information in \code{\link[=setCluster]{setCluster}(top = ..., left = +#' ...)}. It is advised to treat a table that contains meta-data in the top +#' rows as cluster, as this is often the case with implicit variables. All +#' variables need to be specified in each cluster (in case clusters are all +#' organised in the same arrangement), or \code{relative = TRUE} can be used. +#' Data may be organised into clusters a) whenever a set of variables occurs +#' more than once in the same table, nested into another variable, or b) when +#' the data are organised into separate spreadsheets or files according to one +#' of the variables (depending on the context, these issues can also be solved +#' differently). In both cases the variable responsible for clustering (the +#' cluster ID) can be either an identifying variable, or a categorical +#' observed variable: \itemize{ +#' +#' \item in case the cluster ID is an identifying variable, provide its name +#' in \code{\link[=setCluster]{setCluster(id = ...)}} and specify it as an +#' identifying variable (\code{\link{setIDVar}}) +#' +#' \item in case it is a observed variable, provide simply +#' \code{\link[=setCluster]{setCluster}(..., id = "observed")}. } +#' +#' \item \emph{Meta-data}: Provide potentially information about the format +#' (\code{\link{setFormat}}). +#' +#' \item \emph{Identifying variables}: Determine the following: \itemize{ +#' +#' \item is the variable available at all? This is particularly important when +#' the data are split up into tables that are in spreadsheets or files. Often +#' the variable that splits up the data (and thus identifies the clusters) is +#' not explicitly available in the table anymore. In such a case, provide the +#' value in \code{\link[=setIDVar]{setIDVar}(..., value = ...)}. +#' +#' \item all columns in which the variable values sit. +#' +#' \item in case the variable is in several columns, determine additionally +#' the row in which its values sit. In this case, the values will look like +#' they are part of a header. +#' +#' \item in case the variable must be split off of another column, provide a +#' regular expression that results in the target subset via +#' \code{\link[=setIDVar]{setIDVar}(..., split = ...)}. +#' +#' \item in case the variable is distinct from the main table, provide the +#' explicit (non-relative) position and set +#' \code{\link[=setIDVar]{setIDVar}(..., distinct = TRUE)}. } +#' +#' \item \emph{Observed variable}: Determine the following: \itemize{ +#' +#' \item all columns in which the values of the variable sit. +#' +#' \item the conversion factor. +#' +#' \item in case the variable is not tidy, go through the following cases one +#' after the other: \itemize{ +#' +#' \item in case the variable is nested in a wide identifying variable, +#' determine in addition to the columns in which the values sit also the rows +#' in which the \emph{variable name} sits. 
+#' +#' \item in case the names of the variable are given as a value of an +#' identifying variable, give the column name as +#' \code{\link[=setObsVar]{setObsVar}(..., key = ...)}, together with the name +#' of the respective observed variable (as it appears in the table) in +#' \code{values}. +#' +#' \item in case the name of the variable is the ID of clusters, specify +#' \code{\link[=setObsVar]{setObsVar}(..., key = "cluster", value = ...)}, +#' where \code{values} has the cluster number the variable refers to. } } } +#' @importFrom rlang is_integerish +#' @importFrom stringr str_sub + +schema <- setClass(Class = "schema", + slots = c(clusters = "list", + format = "list", + groups = "list", + filter = "list", + variables = "list", + validated = "logical" + ) +) + +setValidity(Class = "schema", function(object){ + + errors <- character() + + if(!.hasSlot(object = object, name = "clusters")){ + errors <- c(errors, "the schema does not have a 'clusters' slot.") + } else { + if(!is.list(object@clusters)){ + errors <- c(errors, "the slot 'clusters' is not a list.") + } + if(!all(names(object@clusters) %in% c("id", "group", "row", "col", "width", "height", "member"))){ + errors <- c(errors, "'names(schema$clusters)' must be a permutation of set {id,group,row,col,width,height,member}") + } + if(!is.null(object@clusters$row)){ + if(!is.numeric(object@clusters$row)){ + errors <- c(errors, "'schema$clusters$row' must have a numeric value.") + } + } + if(!is.null(object@clusters$col)){ + if(!is.numeric(object@clusters$col)){ + errors <- c(errors, "'schema$clusters$col' must have a numeric value.") + } + } + if(!is.null(object@clusters$width)){ + if(!is.numeric(object@clusters$width)){ + errors <- c(errors, "'schema$clusters$width' must have a numeric value.") + } + } + if(!is.null(object@clusters$height)){ + if(!is.numeric(object@clusters$height)){ + errors <- c(errors, "'schema$clusters$height' must have a numeric value.") + } + } + if(!is.null(object@clusters$id)){ + if(!is.character(object@clusters$id)){ + errors <- c(errors, "'schema$clusters$id' must have a character value.") + } + } + if(!is.null(object@clusters$group)){ + if(!is.character(object@clusters$group)){ + errors <- c(errors, "'schema$clusters$group' must have a character value.") + } + } + if(!is.null(object@clusters$member)){ + if(!is.numeric(object@clusters$member)){ + errors <- c(errors, "'schema$clusters$member' must have a numeric value.") + } + } + } + + if(!.hasSlot(object = object, name = "format")){ + errors <- c(errors, "the schema does not have a 'format' slot.") + } else { + if(!is.list(object@format)){ + errors <- c(errors, "the slot 'format' is not a list.") + } + if(length(object@format) == 0){ + errors <- c(errors, "the slot 'format' does not contain any entries.") + } + if(!all(names(object@format) %in% c("header", "del", "dec", "na", "flags"))){ + errors <- c(errors, "'names(schema$format)' must be a permutation of set {header,del,dec,na,flags}") + } + if(!is.null(object@format$header)){ + if(!is_integerish(object@format$header)){ + errors <- c(errors, "'schema$format$header' must must have a integer value.") + } + } + if(!is.null(object@format$del)){ + if(!is.character(object@format$del)){ + errors <- c(errors, "'schema$format$del' must have a character value.") + } + } + if(!is.null(object@format$dec)){ + if(!is.character(object@format$dec)){ + errors <- c(errors, "'schema$format$dec' must have a character value.") + } + } + if(!is.null(object@format$na)){ + if(!is.character(object@format$na)){ + errors <- 
c(errors, "'schema$format$na' must have a character value.") + } + } + if(!is.null(object@format$flags)){ + if(!is.data.frame(object@format$flags)){ + errors <- c(errors, "'schema$format$flags' must be a data.frame with column names 'flag' and 'value'.") + } + } + } + + if(!.hasSlot(object = object, name = "groups")){ + errors <- c(errors, "the schema does not have a 'groups' slot.") + } else { + if(!is.list(object@groups)){ + errors <- c(errors, "the slot 'groups' is not a list.") + } + if(!all(names(object@groups) %in% c("rows", "cols", "clusters"))){ + errors <- c(errors, "'names(schema$groups)' must be a permutation of set {rows,cols,clusters}") + } + if(!is.null(object@groups$rows)){ + if(!is.list(object@groups$rows)){ + errors <- c(errors, "'object@groups$rows' must be a list.") + } + } + if(!is.null(object@groups$cols)){ + if(!is.list(object@groups$cols)){ + errors <- c(errors, "'object@groups$cols' must be a list.") + } + } + if(!is.null(object@groups$clusters)){ + if(!is.list(object@groups$clusters)){ + errors <- c(errors, "'object@groups$clusters' must be a list.") + } + } + } + + if(!.hasSlot(object = object, name = "filter")){ + errors <- c(errors, "the schema does not have a 'filter' slot.") + } else { + if(!is.list(object@filter)){ + errors <- c(errors, "the slot 'filter' is not a list.") + } + if(length(object@filter) == 0){ + errors <- c(errors, "the slot 'filter' does not contain any entries.") + } + if(!all(names(object@filter) %in% c("row", "col"))){ + errors <- c(errors, "'names(schema$filter)' must be a permutation of set {row,col}") + } + if(!is.null(object@filter$row)){ + if(!is.numeric(object@filter$row)){ + errors <- c(errors, "'schema$filter$row' must have a numeric value.") + } + } + if(!is.null(object@filter$col)){ + if(!is.numeric(object@filter$col)){ + errors <- c(errors, "'schema$filter$col' must have a numeric value.") + } + } + } + + if(!.hasSlot(object = object, name = "variables")){ + errors <- c(errors, "the schema does not have a 'variables' slot.") + } else { + if(!is.list(object@variables)){ + errors <- c(errors, "the slot 'variables' is not a list.") + } + if(length(object@variables) == 0){ + errors <- c(errors, "the slot 'variables' does not contain any entries.") + } + for(i in seq_along(object@variables)){ + theVariable <- object@variables[[i]] + theName <- names(object@variables)[i] + + if(!theVariable$vartype %in% c("id", "observed")){ + errors <- c(errors, paste0("the variables '", theName, "' does must be of type 'id' or 'observed'.")) + return(paste0("\n", errors)) + } + + if(theVariable$vartype == "id"){ + if(!all(names(theVariable) %in% c("vartype", "datype", "value", "row", "col", "split", "dist", "merge"))){ + errors <- c(errors, paste0("'names(", theName, ")' must be a permutation of set {vartype,datype,value,row,col,split,merge,dist}")) + } + if(!is.null(theVariable$datype)){ + if(!is.character(theVariable$datype)){ + errors <- c(errors, paste0("'", theName, "$datype' must have a character value.")) + } + } + # if(!is.null(theVariable$value)){ + # if(!is.character(theVariable$value)){ + # errors <- c(errors, paste0("'", theName, "$value' must have a character value.")) + # } + # } + if(!is.null(theVariable$split)){ + if(!is.character(theVariable$split)){ + errors <- c(errors, paste0("'", theName, "$split' must have a character value.")) + } + } + if(!is.null(theVariable$row)){ + if(!(is.numeric(theVariable$row) | testClass(x = theVariable$row, classes = "quosure"))){ + errors <- c(errors, paste0("'", theName, "$row' must have a numeric 
value.")) + } + } + if(!is.null(theVariable$col)){ + if(!(is.numeric(theVariable$col) | testClass(x = theVariable$col, classes = "quosure"))){ + errors <- c(errors, paste0("'", theName, "$col' must have a numeric value.")) + } + } + if(!is.logical(theVariable$dist)){ + errors <- c(errors, paste0("'", theName, "$dist' must either be 'TRUE' or 'FALSE'.")) + } + + } else { + if(!all(names(theVariable) %in% c("vartype", "datype", "factor", "row", "col", "dist", "key", "value"))){ + errors <- c(errors, paste0("'names(", theName, ")' must be a permutation of set {vartype,datype,factor,row,col,dist,key,value}")) + } + if(!is.null(theVariable$datype)){ + if(!is.character(theVariable$datype)){ + errors <- c(errors, paste0("'", theName, "$datype' must have a character value.")) + } + } + if(!is.null(theVariable$factor)){ + if(!is.numeric(theVariable$factor)){ + errors <- c(errors, paste0("'", theName, "$factor' must have a numeric value.")) + } + } + if(!is.null(theVariable$row)){ + if(!(is.numeric(theVariable$row) | testClass(x = theVariable$row, classes = "quosure"))){ + errors <- c(errors, paste0("'", theName, "$row' must have a numeric value.")) + } + } + if(!is.null(theVariable$col)){ + if(!(is.numeric(theVariable$col) | testClass(x = theVariable$col, classes = "quosure"))){ + errors <- c(errors, paste0("'", theName, "$col' must have a numeric value.")) + } + } + if(!is.logical(theVariable$dist)){ + errors <- c(errors, paste0("'", theName, "$dist' must either be 'TRUE' or 'FALSE'.")) + } + if(!is.null(theVariable$value)){ + if(theVariable$key == "cluster"){ + if(!rlang::is_integerish(theVariable$value)){ + errors <- c(errors, paste0("'", theName, "$value' must have an integer value.")) + } + } else { + if(!is.character(theVariable$value)){ + errors <- c(errors, paste0("'", theName, "$value' must have a character value.")) + } + } + } + + } + + } + } + + if(!.hasSlot(object = object, name = "validated")){ + errors <- c(errors, "the schema does not have a 'validated' slot.") + } else { + if(!is.logical(object@validated)){ + errors <- c(errors, "the slot 'validated' is not a logical value.") + } + } + + + if(length(errors) == 0){ + return(TRUE) + } else { + return(paste0("\n", errors)) + } +}) + +#' Print the \code{schema} +#' +#' @param object [\code{schema}]\cr the schema to print. 
+#' @importFrom crayon yellow +#' @importFrom rlang is_primitive +#' @importFrom stringr str_split +#' @importFrom rlang eval_tidy is_quosure prim_name + +setMethod(f = "show", + signature = "schema", + definition = function(object){ + clusters <- object@clusters + filter <- object@filter + variables <- object@variables + + nClusters <- ifelse(length(clusters$row) == 0, 1, length(clusters$row)) + nvars <- length(variables) + theNames <- names(variables) + nClustName <- ifelse(nClusters > 1, "clusters", "cluster") + + # make and print cluster info ---- + if(is.null(clusters$row) & is.null(clusters$col) & is.null(clusters$width) & is.null(clusters$height)){ + clusterSpecs <- paste0(" (whole spreadsheet)") + } else { + if(is.null(clusters$col)){ + left <- 1 + } else { + left <- clusters$col + } + if(is.null(clusters$row)){ + top <- 1 + } else { + top <- clusters$row + } + clusterSpecs <- paste0("\n origin : ", paste(top, left, collapse = ", ", sep = "|"), " (row|col)", + ifelse(!is.null(clusters$group), paste0("\n groups : ", clusters$group), ""), + ifelse(!is.null(clusters$id), paste0("\n id : ", clusters$id), "")) + } + cat(paste0(" ", nClusters, " ", nClustName, clusterSpecs, "\n\n")) + + # make and print filter info ---- + if(is.null(filter$col) & is.null(filter$row)){ + filterSpecs <- paste0("") + } else { + + filterSpecs <- paste0(" filter ", + # ifelse(!is.null(filter$col), paste0("\n col: [", ifelse(length(filter$col) > 10, paste0(c(filter$col[1:10], "..."), collapse = ", "), paste0(filter$col, collapse = ", ")), "]"), ""), + paste0(" [", + ifelse(is.list(filter$row), + paste0("by '", as.character(filter$row$by[2]), "' in column ", filter$row$col), + paste0("rows ", ifelse(length(filter$row) > 10, paste0(c(filter$row[1:10], "..."), collapse = ", "), paste0(filter$row, collapse = ", ")))), + "]"), "\n\n") + } + cat(filterSpecs) + + # make and print variable info ---- + included <- c(TRUE, TRUE) + theNames <- sapply(seq_along(variables), function(x){ + names(variables)[x] + }) + nNames <- sapply(seq_along(theNames), function(x){ + ifelse(test = is.null(theNames[[x]]) , yes = 0, no = nchar(theNames[x])) + }) + maxNames <- ifelse(any(nNames > 8), max(nNames), 8) + theTypes <- sapply(seq_along(variables), function(x){ + variables[[x]]$vartype + }) + + # rows + theRows <- sapply(seq_along(variables), function(x){ + if(variables[[x]]$vartype == "id"){ + if(is.null(variables[[x]]$row)){ + "" + } else if(is.list(variables[[x]]$row)){ + if(names(variables[[x]]$row) == "find"){ + eval_tidy(variables[[x]]$row$find$by) + } + } else { + temp <- unique(variables[[x]]$row) + # make a short sequence of 'theRows' + dists <- temp - c(temp[1]-1, temp)[-(length(temp)+1)] + if(all(dists == 1) & length(temp) > 1){ + paste0(min(temp), ":", max(temp)) + } else { + temp + } + } + } else { + "" + } + + }) + nRow <- sapply(seq_along(theRows), function(x){ + ifelse(test = is.null(theRows[[x]]) , yes = 0, no = nchar(paste0(theRows[[x]], collapse = ", "))) + }) + maxRows <- ifelse(any(nRow > 3), max(nRow), 3) + if(any(nRow != 0)){ + included <- c(included, TRUE) + } else { + included <- c(included, FALSE) + } + + theTops <- sapply(seq_along(variables), function(x){ + if(variables[[x]]$vartype == "observed"){ + if(is.null(variables[[x]]$row)){ + "" + } else if(is.list(variables[[x]]$row)){ + if(names(variables[[x]]$row) == "find"){ + eval_tidy(variables[[x]]$row$find$by) + } + } else { + if(all(variables[[x]]$value != "{all_rows}")){ + temp <- unique(variables[[x]]$row) + # make a short sequence of 'theRows' + dists 
<- temp - c(temp[1]-1, temp)[-(length(temp)+1)] + if(all(dists == 1) & length(temp) > 1){ + paste0(min(temp), ":", max(temp)) + } else { + temp + } + } else { + "" + } + } + } else { + "" + } + + }) + nTop <- sapply(seq_along(theTops), function(x){ + ifelse(test = is.null(theTops[[x]]) , yes = 0, no = nchar(paste0(theTops[[x]], collapse = ", "))) + }) + maxTops <- ifelse(any(nTop > 3), max(nTop), 3) + if(any(nTop != 0)){ + included <- c(included, TRUE) + } else { + included <- c(included, FALSE) + } + + # columns + theCols <- sapply(seq_along(variables), function(x){ + if(is.null(variables[[x]]$col)){ + "" + } else if(is.list(variables[[x]]$col)){ + if(names(variables[[x]]$col) == "find"){ + temp <- eval_tidy(variables[[x]]$col$find$by) + if(is_primitive(temp)){ + prim_name(temp) + } else { + temp + } + } + } else { + temp <- unique(variables[[x]]$col) + # make a short sequence of 'theCols' + if(is.numeric(temp)){ + dists <- temp - c(temp[1]-1, temp)[-(length(temp)+1)] + } else { + dists <- 0 + } + if(all(dists == 1) & length(temp) > 1){ + paste0(min(temp), ":", max(temp)) + } else { + temp + } + } + }) + nCols <- sapply(seq_along(theCols), function(x){ + ifelse(test = is.null(theCols[[x]]) | is.function(theCols[[x]]), yes = 0, no = nchar(paste0(theCols[[x]], collapse = ", "))) + }) + maxCols <- ifelse(any(nCols > 3), max(nCols), 3) + if(any(nCols != 0)){ + included <- c(included, TRUE) + } else { + included <- c(included, FALSE) + } + + # keys + theKeys <- sapply(seq_along(variables), function(x){ + if(variables[[x]]$vartype == "id"){ + NULL + } else { + if(!is.null(variables[[x]]$key)){ + if(grepl(pattern = "\n", variables[[x]]$key)){ + paste0(str_split(string = variables[[x]]$key, pattern = "\n", simplify = TRUE)[1], " ...") + } else { + variables[[x]]$key + } + } else { + NULL + } + + } + }) + nKeys <- sapply(seq_along(theKeys), function(x){ + ifelse(test = is.null(theKeys[[x]]) , yes = 0, no = nchar(theKeys[x])) + }) + maxKeys <- ifelse(any(nKeys > 3), max(nKeys), 3) + if(any(nKeys != 0)){ + included <- c(included, TRUE) + } else { + included <- c(included, FALSE) + } + + # values + theValues <- sapply(seq_along(variables), function(x){ + if(variables[[x]]$vartype == "id"){ + NULL + } else { + if(!is.null(variables[[x]]$value)){ + if(grepl(pattern = "\n", variables[[x]]$value)){ + paste0(str_split(string = variables[[x]]$value, pattern = "\n", simplify = TRUE)[1], " ...") + } else { + variables[[x]]$value + } + } else { + NULL + } + } + }) + nVals <- sapply(seq_along(theValues), function(x){ + ifelse(test = is.null(theValues[[x]]) , yes = 0, no = nchar(theValues[x])) + }) + maxVals <- ifelse(any(nVals > 5), max(nVals), 5) + if(any(nVals != 0)){ + included <- c(included, TRUE) + } else { + included <- c(included, FALSE) + } + + # whether variables are relative + # theRels <- sapply(seq_along(variables), function(x){ + # str_sub(as.character(variables[[x]]$rel), 1, 1) + # }) + # if(all(theRels == "F")){ + included <- c(included, FALSE) + # } else { + # included <- c(included, TRUE) + # } + + # whether variables are distinct + theDist <- sapply(seq_along(variables), function(x){ + str_sub(as.character(variables[[x]]$dist), 1, 1) + }) + if(all(theDist == "F")){ + included <- c(included, FALSE) + } else { + included <- c(included, TRUE) + } + + for(i in 1:(length(variables)+1)){ + + if(i == 1){ + + head1 <- paste0(" ", "variable", paste0(rep(" ", times = maxNames-5), collapse = "")) + line1 <- paste0(c(rep("-", maxNames+2), " "), collapse = "") + head2 <- paste0("type ") + line2 <- 
paste0(c(rep("-", 10), " "), collapse = "") + if(included[3]){ + head3 <- paste0("row", paste0(rep(" ", times = maxRows), collapse = "")) + line3 <- paste0(c(rep("-", maxRows+2), " "), collapse = "") + } else { + head3 <- line3 <- "" + } + if(included[4]){ + head41 <- paste0("top", paste0(rep(" ", times = maxTops), collapse = "")) + line41 <- paste0(c(rep("-", maxTops+2), " "), collapse = "") + } else { + head41 <- line41 <- "" + } + if(included[5]){ + head4 <- paste0("col", paste0(rep(" ", times = maxCols), collapse = "")) + line4 <- paste0(c(rep("-", maxCols+2), " "), collapse = "") + } else { + head4 <- line4 <- "" + } + if(included[6]){ + head5 <- paste0("key", paste0(rep(" ", times = maxKeys), collapse = "")) + line5 <- paste0(c(rep("-", maxKeys+2), " "), collapse = "") + } else { + head5 <- line5 <- "" + } + if(included[7]){ + head6 <- paste0("value", paste0(rep(" ", times = maxVals-2), collapse = "")) + line6 <- paste0(c(rep("-", maxVals+2), " "), collapse = "") + } else { + head6 <- line6 <- "" + } + if(included[8]){ + head7 <- paste0("rel ") + line7 <- paste0(c(rep("-", 5), " "), collapse = "") + } else { + head7 <- line7 <- "" + } + if(included[9]){ + head8 <-paste0("dist") + line8 <- paste0(c(rep("-", 6), " "), collapse = "") + } else { + head8 <- line8 <- "" + } + + cat(paste0(head1, head2, head3, head41, head4, head5, head6, head7, head8), "\n") + cat(" ", paste0(line1, line2, line3, line41, line4, line5, line6, line7, line8), "\n") + + } else { + + var1 <- paste0(" ", yellow(theNames[[i-1]]), + paste0(rep(" ", times = maxNames+3-nchar(theNames[[i-1]])), collapse = "")) + var2 <- paste0(theTypes[[i-1]], ifelse(theTypes[[i-1]] == "id", " ", " ")) + if(included[3]){ + var3 <- paste0(paste0(theRows[[i-1]], collapse = ", "), + paste0(rep(" ", times = maxRows+3-nRow[[i-1]]), collapse = "")) + } else { + var3 <- "" + } + if(included[4]){ + var41 <- paste0(paste0(theTops[[i-1]], collapse = ", "), + paste0(rep(" ", times = maxCols+3-nTop[[i-1]]), collapse = "")) + } else { + var41 <- "" + } + if(included[5]){ + var4 <- paste0(paste0(theCols[[i-1]], collapse = ", "), + paste0(rep(" ", times = maxCols+3-nCols[[i-1]]), collapse = "")) + } else { + var4 <- "" + } + if(included[6]){ + var5 <- paste0(theKeys[[i-1]], + paste0(rep(" ", times = maxKeys+3-nKeys[[i-1]]), collapse = "")) + } else { + var5 <- "" + } + if(included[7]){ + var6 <- paste0(theValues[[i-1]], + paste0(rep(" ", times = maxVals+3-nVals[[i-1]]), collapse = "")) + } else { + var6 <- "" + } + if(included[8]){ + # var7 <- paste0(theRels[[i-1]], " ") + } else { + var7 <- "" + } + if(included[9]){ + var8 <- paste0(theDist[[i-1]], " ") + } else { + var8 <- "" + } + + cat(paste0(var1, var2, var3, var41, var4, var5, var6, var7, var8, "\n")) + + } + + + } + + }) diff --git a/R/setClusters.R b/R/setClusters.R index 35f1279..8ec6636 100644 --- a/R/setClusters.R +++ b/R/setClusters.R @@ -1,99 +1,99 @@ -#' Set where the clusters are -#' -#' There is hardly any limit to how data can be arranged in a spreadsheet, apart -#' from the apparent organisation into a lattice of cells. However, it is often -#' the case that data are gathered into topologically coherent chunks. Those -#' chunks are what is called 'cluster' in \code{tabshiftr}. -#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). 
-#' @param id [\code{character(1)}]\cr When data are clustered, it is typically -#' the case that the data are segregated according to a categorical variables -#' of interest. In such cases, this variable needs to be registered as cluster -#' ID. -#' @param group [\code{character(1)}]\cr When clusters themselves are -#' clustered, they are typically nested into another categorical variable, -#' which needs to be registered as group ID. -#' @param left [\code{integerish(.)}]\cr The horizontal cell value of the -#' top-left cell of each cluster. This can also be a vector of values in case -#' there are several clusters. -#' @param top [\code{integerish(.)}]\cr The vertical cell values of the top-left -#' cell of each cluster. This can also be a vector of values in case there are -#' several clusters. -#' @param width [\code{integerish(.)}]\cr The width of each cluster in cells. -#' This can also be a vector of values in case there are several clusters. -#' @param height [\code{integerish(.)}]\cr The height of each cluster in cells. -#' This can also be a vector of values in case there are several clusters. -#' @param member [\code{integerish(.)}]\cr For each cluster, specify here to -#' which group it belongs. Clusters are enumerated from left to right and -#' from top to bottom. -#' @details Please also take a look at the currently suggested strategy to set -#' up a \link[=schema]{schema description}. -#' @return An object of class \code{\link{schema}}. -#' @examples -#' # please check the vignette for examples -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass assertCharacter assertIntegerish -#' @export - -setCluster <- function(schema = NULL, id = NULL, group = NULL, member = NULL, - left = NULL, top = NULL, width = NULL, height = NULL){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - assertCharacter(x = id, len = 1, any.missing = FALSE) - assertCharacter(x = group, len = 1, any.missing = FALSE, null.ok = TRUE) - colInt <- testIntegerish(x = left, lower = 1, min.len = 1, null.ok = TRUE) - colList <- testList(x = left, len = 1) - assert(colInt, colList) - if(colList) assertSubset(x = names(left), choices = c("find")) - rowInt <- testIntegerish(x = top, lower = 1, min.len = 1, null.ok = TRUE) - rowList <- testList(x = top, len = 1) - assert(rowInt, rowList) - if(rowList) assertSubset(x = names(top), choices = c("find")) - assertIntegerish(x = width, null.ok = TRUE) - assertIntegerish(x = height, null.ok = TRUE) - assertIntegerish(x = member, null.ok = TRUE) - - # update schema ---- - if(is.null(schema)){ - schema <- schema_default - } - - if(!is.null(left)){ - schema@clusters$col <- left - } else { - schema@clusters$col <- 1 - } - - if(!is.null(top)){ - schema@clusters$row <- top - } else { - schema@clusters$row <- 1 - } - - if(!is.null(width)){ - schema@clusters$width <- width - } - - if(!is.null(height)){ - schema@clusters$height <- height - } - - if(!is.null(id)){ - schema@clusters$id <- id - } - - if(!is.null(group)){ - schema@clusters$group <- group - } - - if(!is.null(member)){ - schema@clusters$member <- member - } - - # test for problems ---- - reportProblems(schema = schema) - - return(schema) +#' Set where the clusters are +#' +#' There is hardly any limit to how data can be arranged in a spreadsheet, apart +#' from the apparent organisation into a lattice of cells. However, it is often +#' the case that data are gathered into topologically coherent chunks. 
Those +#' chunks are what is called 'cluster' in \code{tabshiftr}. +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param id [\code{character(1)}]\cr When data are clustered, it is typically +#' the case that the data are segregated according to a categorical variables +#' of interest. In such cases, this variable needs to be registered as cluster +#' ID. +#' @param group [\code{character(1)}]\cr When clusters themselves are +#' clustered, they are typically nested into another categorical variable, +#' which needs to be registered as group ID. +#' @param left [\code{integerish(.)}]\cr The horizontal cell value of the +#' top-left cell of each cluster. This can also be a vector of values in case +#' there are several clusters. +#' @param top [\code{integerish(.)}]\cr The vertical cell values of the top-left +#' cell of each cluster. This can also be a vector of values in case there are +#' several clusters. +#' @param width [\code{integerish(.)}]\cr The width of each cluster in cells. +#' This can also be a vector of values in case there are several clusters. +#' @param height [\code{integerish(.)}]\cr The height of each cluster in cells. +#' This can also be a vector of values in case there are several clusters. +#' @param member [\code{integerish(.)}]\cr For each cluster, specify here to +#' which group it belongs. Clusters are enumerated from left to right and +#' from top to bottom. +#' @details Please also take a look at the currently suggested strategy to set +#' up a \link[=schema]{schema description}. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' # please check the vignette for examples +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass assertCharacter assertIntegerish +#' @export + +setCluster <- function(schema = NULL, id = NULL, group = NULL, member = NULL, + left = NULL, top = NULL, width = NULL, height = NULL){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + assertCharacter(x = id, len = 1, any.missing = FALSE) + assertCharacter(x = group, len = 1, any.missing = FALSE, null.ok = TRUE) + colInt <- testIntegerish(x = left, lower = 1, min.len = 1, null.ok = TRUE) + colList <- testList(x = left, len = 1) + assert(colInt, colList) + if(colList) assertSubset(x = names(left), choices = c("find")) + rowInt <- testIntegerish(x = top, lower = 1, min.len = 1, null.ok = TRUE) + rowList <- testList(x = top, len = 1) + assert(rowInt, rowList) + if(rowList) assertSubset(x = names(top), choices = c("find")) + assertIntegerish(x = width, null.ok = TRUE) + assertIntegerish(x = height, null.ok = TRUE) + assertIntegerish(x = member, null.ok = TRUE) + + # update schema ---- + if(is.null(schema)){ + schema <- schema_default + } + + if(!is.null(left)){ + schema@clusters$col <- left + } else { + schema@clusters$col <- 1 + } + + if(!is.null(top)){ + schema@clusters$row <- top + } else { + schema@clusters$row <- 1 + } + + if(!is.null(width)){ + schema@clusters$width <- width + } + + if(!is.null(height)){ + schema@clusters$height <- height + } + + if(!is.null(id)){ + schema@clusters$id <- id + } + + if(!is.null(group)){ + schema@clusters$group <- group + } + + if(!is.null(member)){ + schema@clusters$member <- member + } + + # test for problems ---- + reportProblems(schema = schema) + + return(schema) } \ No newline at end of file diff --git a/R/setFilter.R b/R/setFilter.R 
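A hedged sketch of how the arguments of setCluster() interact (cell positions, variable names and the number of clusters are assumptions for illustration): two clusters of identical layout sit next to each other in one spreadsheet, and the variable that separates them is registered as cluster ID and additionally described as an identifying variable, as recommended in the schema documentation.

library(tabshiftr)

# two clusters with top-left cells at (row 1, col 1) and (row 1, col 6)
clustered <- setCluster(id = "commodities", top = 1, left = c(1, 6)) %>%
  setIDVar(name = "territories", columns = c(1, 6)) %>%
  setIDVar(name = "commodities", columns = c(1, 6), rows = 1) %>%  # where the cluster labels sit (assumed)
  setObsVar(name = "production", columns = c(3, 8))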
index 626d3b4..9f56873 100644 --- a/R/setFilter.R +++ b/R/setFilter.R @@ -1,84 +1,84 @@ -#' Set filters -#' -#' This function allows to specify additional rules to filter certain rows -#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). -#' @param rows [\code{integerish(.)}]\cr rows that are mentioned here are kept. -#' @param columns [\code{integerish(.)}]\cr columns that are mentioned here are -#' kept. -#' @param invert [\code{logical(1)}]\cr whether or not to invert the specified -#' columns or rows. -#' @param operator [\code{function(1)}]\cr \code{\link[base]{Logic}} operators -#' by which the current filter should be combined with the directly preceeding -#' filter; hence this argument is not used in case no other filter was defined -#' before it. -#' @return An object of class \code{\link{schema}}. -#' @examples -#' (input <- tabs2shift$messy_rows) -#' -#' # select rows where there is 'unit 2' in column 1 or 'year 2' in column 2 -#' schema <- -#' setFilter(rows = .find(pattern = "unit 2", col = 1)) %>% -#' setFilter(rows = .find(pattern = "year 2", col = 2), operator = `|`) %>% -#' setIDVar(name = "territories", columns = 1) %>% -#' setIDVar(name = "year", columns = 2) %>% -#' setIDVar(name = "commodities", columns = 3) %>% -#' setObsVar(name = "harvested", columns = 5) %>% -#' setObsVar(name = "production", columns = 6) -#' -#' reorganise(schema = schema, input = input) -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass testIntegerish testClass assertLogical -#' @export - -setFilter <- function(schema = NULL, rows = NULL, columns = NULL, invert = FALSE, - operator = NULL){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) - rowList <- testList(x = rows, len = 1) - assert(rowInt, rowList) - colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) - colList <- testList(x = columns, len = 1) - assert(colInt, colList) - if(rowList) assertSubset(x = names(rows), choices = c("find")) - if(colList) assertSubset(x = names(columns), choices = c("find")) - assertLogical(x = invert, any.missing = FALSE) - - # update schema ---- - if(is.null(schema)){ - schema <- schema_default - } - - if(is.null(operator)){ - operator <- `&` - } - - if(!is.null(rows)){ - if(!is.list(rows)){ - rows <- list(position = rows, invert = invert) - } - if(!is.null(schema@filter$row)){ - rows <- c(operator = operator, rows) - } - schema@filter$row <- c(schema@filter$row, rows) - } - - if(!is.null(columns)){ - if(!is.list(columns)){ - columns <- list(position = columns) - } - if(!is.null(schema@filter$row)){ - columns <- c(operator = operator, columns) - } - schema@filter$col <- c(schema@filter$col, columns) - } - - # test for problems ---- - # reportProblems(schema = schema) - - return(schema) - +#' Set filters +#' +#' This function allows to specify additional rules to filter certain rows +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param rows [\code{integerish(.)}]\cr rows that are mentioned here are kept. +#' @param columns [\code{integerish(.)}]\cr columns that are mentioned here are +#' kept. +#' @param invert [\code{logical(1)}]\cr whether or not to invert the specified +#' columns or rows. 
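As a complement to the example further down, a hedged sketch of the invert argument (the pattern, column and variable names are illustrative): rows matching an assumed summary label are dropped rather than kept.

library(tabshiftr)

# drop rows whose first column contains the (assumed) summary label "total"
noTotals <- setFilter(rows = .find(pattern = "total", col = 1), invert = TRUE) %>%
  setIDVar(name = "territories", columns = 1) %>%
  setObsVar(name = "production", columns = 2)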
+#' @param operator [\code{function(1)}]\cr \code{\link[base]{Logic}} operators +#' by which the current filter should be combined with the directly preceeding +#' filter; hence this argument is not used in case no other filter was defined +#' before it. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' (input <- tabs2shift$messy_rows) +#' +#' # select rows where there is 'unit 2' in column 1 or 'year 2' in column 2 +#' schema <- +#' setFilter(rows = .find(pattern = "unit 2", col = 1)) %>% +#' setFilter(rows = .find(pattern = "year 2", col = 2), operator = `|`) %>% +#' setIDVar(name = "territories", columns = 1) %>% +#' setIDVar(name = "year", columns = 2) %>% +#' setIDVar(name = "commodities", columns = 3) %>% +#' setObsVar(name = "harvested", columns = 5) %>% +#' setObsVar(name = "production", columns = 6) +#' +#' reorganise(schema = schema, input = input) +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass testIntegerish testClass assertLogical +#' @export + +setFilter <- function(schema = NULL, rows = NULL, columns = NULL, invert = FALSE, + operator = NULL){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) + rowList <- testList(x = rows, len = 1) + assert(rowInt, rowList) + colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) + colList <- testList(x = columns, len = 1) + assert(colInt, colList) + if(rowList) assertSubset(x = names(rows), choices = c("find")) + if(colList) assertSubset(x = names(columns), choices = c("find")) + assertLogical(x = invert, any.missing = FALSE) + + # update schema ---- + if(is.null(schema)){ + schema <- schema_default + } + + if(is.null(operator)){ + operator <- `&` + } + + if(!is.null(rows)){ + if(!is.list(rows)){ + rows <- list(position = rows, invert = invert) + } + if(!is.null(schema@filter$row)){ + rows <- c(operator = operator, rows) + } + schema@filter$row <- c(schema@filter$row, rows) + } + + if(!is.null(columns)){ + if(!is.list(columns)){ + columns <- list(position = columns) + } + if(!is.null(schema@filter$row)){ + columns <- c(operator = operator, columns) + } + schema@filter$col <- c(schema@filter$col, columns) + } + + # test for problems ---- + # reportProblems(schema = schema) + + return(schema) + } \ No newline at end of file diff --git a/R/setFormat.R b/R/setFormat.R index 3d0eee6..e012098 100644 --- a/R/setFormat.R +++ b/R/setFormat.R @@ -1,79 +1,79 @@ -#' Set the specific format of a table -#' -#' Any table makes some assumptions about the data, but they are mostly not -#' explicitly recorded in the commonly available table format. This concerns, -#' for example, the symbol(s) that signal "not available" values or the symbol -#' that is used as decimal sign. -#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). -#' @param header [\code{integerish(1)}]\cr The number of header rows. Optimally, -#' a table is read so that column names are ignored (for example -#' \code{readr::read_csv(file = ..., col_names = FALSE)}). If relatively well -#' defined tables are processed, where the header is always only one row, the -#' table can be read in with the default and the header can be spliced into -#' the table by specifying the number of rows here. 
-#' @param decimal [\code{character(1)}]\cr The symbols that should be -#' interpreted as decimal separator. -#' @param thousand [\code{character(1)}]\cr The symbols that should be -#' interpreted as thousand separator. -#' @param na_values [\code{character(.)}]\cr The symbols that should be -#' interpreted as \code{NA}. -#' @param flags [\code{data.frame(2)}]\cr The typically character based flags -#' that should be shaved off of observed variables to make them identifiable -#' as numeric values. This must be a data.frame with two columns with names -#' \code{flag} and \code{value}. -#' @details Please also take a look at the currently suggested strategy to set -#' up a \link[=schema]{schema description}. -#' @return An object of class \code{\link{schema}}. -#' @examples -#' # please check the vignette for examples -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass assertCharacter -#' @importFrom dplyr bind_rows -#' @export - -setFormat <- function(schema = NULL, header = 0, decimal = NULL, - thousand = NULL, na_values = NULL, flags = NULL){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - assertIntegerish(x = header, len = 1, lower = 0, any.missing = FALSE) - assertCharacter(x = decimal, len = 1, any.missing = FALSE, null.ok = TRUE) - assertCharacter(x = thousand, len = 1, any.missing = FALSE, null.ok = TRUE) - assertCharacter(x = na_values, any.missing = FALSE, null.ok = TRUE) - assertDataFrame(x = flags, any.missing = FALSE, ncols = 2, null.ok = TRUE) - if(!is.null(flags)){ - assertNames(x = names(flags), must.include = c("flag", "value")) - } - - # update schema ---- - if(is.null(schema)){ - schema <- schema_default - } - - if(!is.null(header)){ - schema@format$header <- header - } - - if(!is.null(decimal)){ - schema@format$dec <- decimal - } - - if(!is.null(thousand)){ - schema@format$del <- thousand - } - - if(!is.null(na_values)){ - schema@format$na <- na_values - } - - if(!is.null(flags)){ - schema@format$flags <- bind_rows(schema@format$flags, flags) - } - - # test for problems ---- - # reportProblems(schema = schema) - - return(schema) +#' Set the specific format of a table +#' +#' Any table makes some assumptions about the data, but they are mostly not +#' explicitly recorded in the commonly available table format. This concerns, +#' for example, the symbol(s) that signal "not available" values or the symbol +#' that is used as decimal sign. +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param header [\code{integerish(1)}]\cr The number of header rows. Optimally, +#' a table is read so that column names are ignored (for example +#' \code{readr::read_csv(file = ..., col_names = FALSE)}). If relatively well +#' defined tables are processed, where the header is always only one row, the +#' table can be read in with the default and the header can be spliced into +#' the table by specifying the number of rows here. +#' @param decimal [\code{character(1)}]\cr The symbols that should be +#' interpreted as decimal separator. +#' @param thousand [\code{character(1)}]\cr The symbols that should be +#' interpreted as thousand separator. +#' @param na_values [\code{character(.)}]\cr The symbols that should be +#' interpreted as \code{NA}. 
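A hedged sketch combining the format arguments described here (the concrete symbols are assumptions about a hypothetical input, not package defaults):

library(tabshiftr)

# one header row, comma as decimal sign, dot as thousand separator,
# and two symbols that shall be read as NA
fmt <- setFormat(header = 1, decimal = ",", thousand = ".",
                 na_values = c("-", "n.a."))
# flags would be appended as a data.frame with the columns 'flag' and 'value'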
+#' @param flags [\code{data.frame(2)}]\cr The typically character based flags +#' that should be shaved off of observed variables to make them identifiable +#' as numeric values. This must be a data.frame with two columns with names +#' \code{flag} and \code{value}. +#' @details Please also take a look at the currently suggested strategy to set +#' up a \link[=schema]{schema description}. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' # please check the vignette for examples +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass assertCharacter +#' @importFrom dplyr bind_rows +#' @export + +setFormat <- function(schema = NULL, header = 0, decimal = NULL, + thousand = NULL, na_values = NULL, flags = NULL){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + assertIntegerish(x = header, len = 1, lower = 0, any.missing = FALSE) + assertCharacter(x = decimal, len = 1, any.missing = FALSE, null.ok = TRUE) + assertCharacter(x = thousand, len = 1, any.missing = FALSE, null.ok = TRUE) + assertCharacter(x = na_values, any.missing = FALSE, null.ok = TRUE) + assertDataFrame(x = flags, any.missing = FALSE, ncols = 2, null.ok = TRUE) + if(!is.null(flags)){ + assertNames(x = names(flags), must.include = c("flag", "value")) + } + + # update schema ---- + if(is.null(schema)){ + schema <- schema_default + } + + if(!is.null(header)){ + schema@format$header <- header + } + + if(!is.null(decimal)){ + schema@format$dec <- decimal + } + + if(!is.null(thousand)){ + schema@format$del <- thousand + } + + if(!is.null(na_values)){ + schema@format$na <- na_values + } + + if(!is.null(flags)){ + schema@format$flags <- bind_rows(schema@format$flags, flags) + } + + # test for problems ---- + # reportProblems(schema = schema) + + return(schema) } \ No newline at end of file diff --git a/R/setGroups.R b/R/setGroups.R index 47f2a2a..d250dd4 100644 --- a/R/setGroups.R +++ b/R/setGroups.R @@ -1,54 +1,54 @@ -#' Set Groups -#' -#' This function allows to set groups for rows, columns or clusters that shall -#' be summarised. -#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). -#' @param rows [\code{list(3)}]\cr the output of \code{\link{.sum}} indicating -#' the rows and a function according to which those rows should be summarised. -#' @param columns [\code{list(3)}]\cr the output of \code{\link{.sum}} -#' indicating the columns and a function according to which those columns -#' should be summarised. -#' @return An object of class \code{\link{schema}}. 
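# Editor's sketch (not part of the patch): a typical setFormat() call combining
# the arguments documented above. The separators, NA symbols and the flag table
# are hypothetical placeholders chosen only for illustration.
library(tabshiftr)
library(magrittr)

schema_fmt <-
  setFormat(header = 1,                        # splice one header row into the table
            decimal = ",",                     # e.g. continental European decimal sign
            thousand = ".",
            na_values = c("", "--", "n.a."),   # symbols to be read as NA
            flags = data.frame(flag = c("*", "p"),
                               value = c("estimated", "provisional"))) %>%
  setIDVar(name = "territories", columns = 1) %>%
  setObsVar(name = "production", columns = 2)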
-#' @examples -#' # please check the vignette for examples -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass assertList -#' @export - -setGroups <- function(schema = NULL, rows = NULL, columns = NULL){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - # rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) - # rowList <- testList(x = rows) - assertList(x = rows, len = 1, null.ok = TRUE) - # assert(rowInt, rowList) - # colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) - # colList <- testList(x = columns) - assertList(x = columns, len = 1, null.ok = TRUE) - # assert(colInt, colList) - # clustInt <- testIntegerish(x = clusters, lower = 1, min.len = 1, null.ok = TRUE) - # clustList <- testList(x = clusters) - # assert(clustInt, clustList) - - # update schema ---- - if(is.null(schema)){ - schema <- schema_default - } - - if(!is.null(rows)){ - schema@groups$rows <- c(schema@groups$rows, rows) - } - - if(!is.null(columns)){ - schema@groups$cols <- c(schema@groups$cols, columns) - } - - # test for problems ---- - # reportProblems(schema = schema) - - return(schema) - +#' Set Groups +#' +#' This function allows to set groups for rows, columns or clusters that shall +#' be summarised. +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param rows [\code{list(3)}]\cr the output of \code{\link{.sum}} indicating +#' the rows and a function according to which those rows should be summarised. +#' @param columns [\code{list(3)}]\cr the output of \code{\link{.sum}} +#' indicating the columns and a function according to which those columns +#' should be summarised. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' # please check the vignette for examples +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass assertList +#' @export + +setGroups <- function(schema = NULL, rows = NULL, columns = NULL){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + # rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) + # rowList <- testList(x = rows) + assertList(x = rows, len = 1, null.ok = TRUE) + # assert(rowInt, rowList) + # colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) + # colList <- testList(x = columns) + assertList(x = columns, len = 1, null.ok = TRUE) + # assert(colInt, colList) + # clustInt <- testIntegerish(x = clusters, lower = 1, min.len = 1, null.ok = TRUE) + # clustList <- testList(x = clusters) + # assert(clustInt, clustList) + + # update schema ---- + if(is.null(schema)){ + schema <- schema_default + } + + if(!is.null(rows)){ + schema@groups$rows <- c(schema@groups$rows, rows) + } + + if(!is.null(columns)){ + schema@groups$cols <- c(schema@groups$cols, columns) + } + + # test for problems ---- + # reportProblems(schema = schema) + + return(schema) + } \ No newline at end of file diff --git a/R/setIDVar.R b/R/setIDVar.R index 2f84754..4b7708f 100644 --- a/R/setIDVar.R +++ b/R/setIDVar.R @@ -1,96 +1,96 @@ -#' Set an identifying variable -#' -#' Identifying variables are those variables that describe the (qualitative) -#' properties that make each observation (as described by the -#' \code{\link[=setObsVar]{observed variables}}) unique. 
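# Editor's sketch (not part of the patch): summarise two physical rows into one
# logical row before reorganising, via setGroups() and .sum() (defined further
# down in this patch). The row numbers, functions and column positions are
# hypothetical.
library(tabshiftr)
library(magrittr)

schema_grp <-
  setGroups(rows = .sum(2, 3,
                        character = function(x) paste0(na.omit(x), collapse = " "),
                        numeric = function(x) sum(x, na.rm = TRUE))) %>%
  setIDVar(name = "territories", columns = 1) %>%
  setObsVar(name = "production", columns = 2)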
-#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). -#' @param name [\code{character(1)}]\cr Name of the new identifying variable. -#' @param type [\code{character(1)}]\cr data type of the new identifying -#' variable. Possible values are \code{"c/character"}, \code{"i/integer"}, -#' \code{"n/numeric"}, \code{"l/logical"}, \code{"D/Date"} or \code{"_/skip"}. -#' For \code{"D/Date"}, the value has to follow the form \code{YYYY-MM-DD}, -#' where dates that don't match that are replaced by NA. -#' @param value [\code{character(1)}]\cr In case the variable is an implicit -#' variable (i.e., which is not in the origin table), specify it here. -#' @param columns [\code{integerish(.)}]\cr The column(s) in which the -#' \emph{values} of the new variable are recorded. -#' @param rows [\code{integerish(.)}]\cr In case the variable is in several -#' columns, specify here additionally the row in which the \emph{names} are -#' recorded. -#' @param split [\code{character(1)}]\cr In case the variable is part of a -#' compound value, this should be a regular expression that splits the -#' respective value off of that compound value. See -#' \code{\link[tidyr]{extract}} on how to set up the regular expression. -#' @param merge [\code{character(1)}]\cr In case a variable is made up of -#' several columns, this should be the character string that would connect the -#' two columns (e.g., an empty space \code{" "}). -#' @param distinct [\code{logical(1)}]\cr whether or not the variable is -#' distinct from a cluster. This is the case when the variable is not -#' systematically available for all clusters and thus needs to be registered -#' separately from clusters. -#' @details Please also take a look at the currently suggested strategy to set -#' up a \link[=schema]{schema description}. -#' @return An object of class \code{\link{schema}}. 
-#' @examples -#' # please check the vignette for examples -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass assertCharacter assertLogical -#' testIntegerish testList -#' @importFrom dplyr case_when -#' @export - -setIDVar <- function(schema = NULL, name = NULL, type = "character", - value = NULL, columns = NULL, rows = NULL, split = NULL, - merge = NULL, distinct = FALSE){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - assertCharacter(x = name, len = 1, any.missing = FALSE) - colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) - colList <- testList(x = columns, len = 1) - assert(colInt, colList) - if(colList) assertSubset(x = names(columns), choices = c("find")) - rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) - rowList <- testList(x = rows, len = 1) - assert(rowInt, rowList) - if(rowList) assertSubset(x = names(rows), choices = c("find")) - assertCharacter(x = split, len = 1, any.missing = FALSE, null.ok = TRUE) - assertCharacter(x = merge, len = 1, any.missing = FALSE, null.ok = TRUE) - assertLogical(x = distinct, any.missing = FALSE, len = 1) - - data_type <- case_when( - type %in% c("i", "integer") ~ "integer", - type %in% c("n", "numeric") ~ "numeric", - type %in% c("l", "logical") ~ "logical", - type %in% c("D", "Date") ~ "Date", - type %in% c("_", "skip") ~ "skip", - .default = "character" - ) - - # if type-check should be skipped, don't assert class - if(data_type != "skip" & !is.null(value)){ - assertClass(x = value, classes = data_type) - } - - if(is.null(schema)){ - schema <- schema_default - } - - # update schema ---- - temp <- list(vartype = "id", - datype = data_type, - value = value, - col = columns, - row = rows, - split = split, - merge = merge, - dist = distinct) - schema@variables[[name]] <- temp - - # test for problems ---- - # reportProblems(schema = schema) - - return(schema) +#' Set an identifying variable +#' +#' Identifying variables are those variables that describe the (qualitative) +#' properties that make each observation (as described by the +#' \code{\link[=setObsVar]{observed variables}}) unique. +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param name [\code{character(1)}]\cr Name of the new identifying variable. +#' @param type [\code{character(1)}]\cr data type of the new identifying +#' variable. Possible values are \code{"c/character"}, \code{"i/integer"}, +#' \code{"n/numeric"}, \code{"l/logical"}, \code{"D/Date"} or \code{"_/skip"}. +#' For \code{"D/Date"}, the value has to follow the form \code{YYYY-MM-DD}, +#' where dates that don't match that are replaced by NA. +#' @param value [\code{character(1)}]\cr In case the variable is an implicit +#' variable (i.e., which is not in the origin table), specify it here. +#' @param columns [\code{integerish(.)}]\cr The column(s) in which the +#' \emph{values} of the new variable are recorded. +#' @param rows [\code{integerish(.)}]\cr In case the variable is in several +#' columns, specify here additionally the row in which the \emph{names} are +#' recorded. +#' @param split [\code{character(1)}]\cr In case the variable is part of a +#' compound value, this should be a regular expression that splits the +#' respective value off of that compound value. See +#' \code{\link[tidyr]{extract}} on how to set up the regular expression. 
+#' @param merge [\code{character(1)}]\cr In case a variable is made up of +#' several columns, this should be the character string that would connect the +#' two columns (e.g., an empty space \code{" "}). +#' @param distinct [\code{logical(1)}]\cr whether or not the variable is +#' distinct from a cluster. This is the case when the variable is not +#' systematically available for all clusters and thus needs to be registered +#' separately from clusters. +#' @details Please also take a look at the currently suggested strategy to set +#' up a \link[=schema]{schema description}. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' # please check the vignette for examples +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass assertCharacter assertLogical +#' testIntegerish testList +#' @importFrom dplyr case_when +#' @export + +setIDVar <- function(schema = NULL, name = NULL, type = "character", + value = NULL, columns = NULL, rows = NULL, split = NULL, + merge = NULL, distinct = FALSE){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + assertCharacter(x = name, len = 1, any.missing = FALSE) + colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) + colList <- testList(x = columns, len = 1) + assert(colInt, colList) + if(colList) assertSubset(x = names(columns), choices = c("find")) + rowInt <- testIntegerish(x = rows, lower = 1, min.len = 1, null.ok = TRUE) + rowList <- testList(x = rows, len = 1) + assert(rowInt, rowList) + if(rowList) assertSubset(x = names(rows), choices = c("find")) + assertCharacter(x = split, len = 1, any.missing = FALSE, null.ok = TRUE) + assertCharacter(x = merge, len = 1, any.missing = FALSE, null.ok = TRUE) + assertLogical(x = distinct, any.missing = FALSE, len = 1) + + data_type <- case_when( + type %in% c("i", "integer") ~ "integer", + type %in% c("n", "numeric") ~ "numeric", + type %in% c("l", "logical") ~ "logical", + type %in% c("D", "Date") ~ "Date", + type %in% c("_", "skip") ~ "skip", + .default = "character" + ) + + # if type-check should be skipped, don't assert class + if(data_type != "skip" & !is.null(value)){ + assertClass(x = value, classes = data_type) + } + + if(is.null(schema)){ + schema <- schema_default + } + + # update schema ---- + temp <- list(vartype = "id", + datype = data_type, + value = value, + col = columns, + row = rows, + split = split, + merge = merge, + dist = distinct) + schema@variables[[name]] <- temp + + # test for problems ---- + # reportProblems(schema = schema) + + return(schema) } \ No newline at end of file diff --git a/R/setObsVar.R b/R/setObsVar.R index f339929..92a7395 100644 --- a/R/setObsVar.R +++ b/R/setObsVar.R @@ -1,95 +1,95 @@ -#' Set an observed variable -#' -#' Observed variables are those variables that contain the (quantitative) -#' observed/measured values of each unique unit (as described by the -#' \code{\link[=setIDVar]{identifying variables}}). There may be several of them -#' and in a tidy table they'd be recorded as separate columns. -#' @param schema [\code{schema(1)}]\cr In case this information is added to an -#' already existing schema, provide that schema here (overwrites previous -#' information). -#' @param name [\code{character(1)}]\cr Name of the new observed variable. -#' @param type [\code{character(1)}]\cr data type of the new observed -#' variable. 
Possible values are \code{"c/character"}, \code{"i/integer"}, -#' \code{"n/numeric"}, \code{"l/logical"}, \code{"D/date"} or \code{"_/skip"}. -#' @param columns [\code{integerish(.)}]\cr The column(s) in which the -#' \emph{values} of the new variable are recorded. -#' @param top [\code{integerish(.)}]\cr In case the variable is nested in a wide -#' identifying variable, specify here additionally the topmost row in which -#' the variable \emph{name} sits. -#' @param factor [\code{numeric(1)}]\cr the factor that needs to be multiplied -#' with the values to convert to the target unit, defaults to 1. For instance, -#' if values are recorded in acres, but shall be recorded in hectare, the -#' factor would be 0.40468. -#' @param key [\code{integerish(1)}]\cr If the variable is recorded (together -#' with other variables) so that the variable names are listed in one column -#' and the respective values are listed in another column, give here the -#' number of the column that contains the variable names. Can alternatively be -#' "cluster", in case observed variables are the cluster ID. -#' @param value [\code{character(1)}]\cr If the variable is recorded (together -#' with other variables) so that the variable names are listed in one column -#' and the respective values are listed in another column, give here the level -#' in the names column that refer to the values of this variable. -#' @param distinct [\code{logical(1)}]\cr Whether or not the variable is -#' distinct from a cluster. This is the case when the variable is recorded -#' somewhere 'on the side' and thus not explicitly included in all clusters. -#' @details Please also take a look at the currently suggested strategy to set -#' up a \link[=schema]{schema description}. -#' @return An object of class \code{\link{schema}}. 
-#' @examples -#' # please check the vignette for examples -#' @family functions to describe table arrangement -#' @importFrom checkmate assertClass assertIntegerish assertLogical assertSubset -#' assertCharacter assertNumeric testIntegerish testCharacter assert -#' @importFrom dplyr case_when -#' @export - -setObsVar <- function(schema = NULL, name = NULL, type = "numeric", - columns = NULL, top = NULL, distinct = FALSE, factor = 1, - key = NULL, value = NULL){ - - # assertions ---- - assertClass(x = schema, classes = "schema", null.ok = TRUE) - assertCharacter(x = name, len = 1, any.missing = FALSE) - colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) - colList <- testList(x = columns, len = 1) - assert(colInt, colList) - if(colList) assertSubset(x = names(columns), choices = c("find")) - rowInt <- testIntegerish(x = top, lower = 1, min.len = 1, null.ok = TRUE) - rowList <- testList(x = top, len = 1) - assert(rowInt, rowList) - if(rowList) assertSubset(x = names(top), choices = c("find")) - assertLogical(x = distinct, any.missing = FALSE, len = 1) - assertNumeric(x = factor, len = 1, any.missing = FALSE) - if(is.character(key)){ - assertSubset(x = key, choices = "cluster", empty.ok = FALSE) - } - - data_type <- case_when( - type %in% c("i", "integer") ~ "integer", - type %in% c("n", "numeric") ~ "numeric", - type %in% c("l", "logical") ~ "logical", - type %in% c("D", "Date") ~ "Date", - type %in% c("_", "skip") ~ "skip", - .default = "character" - ) - - if(is.null(schema)){ - schema <- schema_default - } - - # update schema ---- - temp <- list(vartype = "observed", - datype = data_type, - col = columns, - row = top, - dist = distinct, - factor = factor, - key = key, - value = value) - schema@variables[[name]] <- temp - - # test for problems ---- - # reportProblems(schema = schema) - - return(schema) +#' Set an observed variable +#' +#' Observed variables are those variables that contain the (quantitative) +#' observed/measured values of each unique unit (as described by the +#' \code{\link[=setIDVar]{identifying variables}}). There may be several of them +#' and in a tidy table they'd be recorded as separate columns. +#' @param schema [\code{schema(1)}]\cr In case this information is added to an +#' already existing schema, provide that schema here (overwrites previous +#' information). +#' @param name [\code{character(1)}]\cr Name of the new observed variable. +#' @param type [\code{character(1)}]\cr data type of the new observed +#' variable. Possible values are \code{"c/character"}, \code{"i/integer"}, +#' \code{"n/numeric"}, \code{"l/logical"}, \code{"D/date"} or \code{"_/skip"}. +#' @param columns [\code{integerish(.)}]\cr The column(s) in which the +#' \emph{values} of the new variable are recorded. +#' @param top [\code{integerish(.)}]\cr In case the variable is nested in a wide +#' identifying variable, specify here additionally the topmost row in which +#' the variable \emph{name} sits. +#' @param factor [\code{numeric(1)}]\cr the factor that needs to be multiplied +#' with the values to convert to the target unit, defaults to 1. For instance, +#' if values are recorded in acres, but shall be recorded in hectare, the +#' factor would be 0.40468. +#' @param key [\code{integerish(1)}]\cr If the variable is recorded (together +#' with other variables) so that the variable names are listed in one column +#' and the respective values are listed in another column, give here the +#' number of the column that contains the variable names. 
Can alternatively be +#' "cluster", in case observed variables are the cluster ID. +#' @param value [\code{character(1)}]\cr If the variable is recorded (together +#' with other variables) so that the variable names are listed in one column +#' and the respective values are listed in another column, give here the level +#' in the names column that refer to the values of this variable. +#' @param distinct [\code{logical(1)}]\cr Whether or not the variable is +#' distinct from a cluster. This is the case when the variable is recorded +#' somewhere 'on the side' and thus not explicitly included in all clusters. +#' @details Please also take a look at the currently suggested strategy to set +#' up a \link[=schema]{schema description}. +#' @return An object of class \code{\link{schema}}. +#' @examples +#' # please check the vignette for examples +#' @family functions to describe table arrangement +#' @importFrom checkmate assertClass assertIntegerish assertLogical assertSubset +#' assertCharacter assertNumeric testIntegerish testCharacter assert +#' @importFrom dplyr case_when +#' @export + +setObsVar <- function(schema = NULL, name = NULL, type = "numeric", + columns = NULL, top = NULL, distinct = FALSE, factor = 1, + key = NULL, value = NULL){ + + # assertions ---- + assertClass(x = schema, classes = "schema", null.ok = TRUE) + assertCharacter(x = name, len = 1, any.missing = FALSE) + colInt <- testIntegerish(x = columns, lower = 1, min.len = 1, null.ok = TRUE) + colList <- testList(x = columns, len = 1) + assert(colInt, colList) + if(colList) assertSubset(x = names(columns), choices = c("find")) + rowInt <- testIntegerish(x = top, lower = 1, min.len = 1, null.ok = TRUE) + rowList <- testList(x = top, len = 1) + assert(rowInt, rowList) + if(rowList) assertSubset(x = names(top), choices = c("find")) + assertLogical(x = distinct, any.missing = FALSE, len = 1) + assertNumeric(x = factor, len = 1, any.missing = FALSE) + if(is.character(key)){ + assertSubset(x = key, choices = "cluster", empty.ok = FALSE) + } + + data_type <- case_when( + type %in% c("i", "integer") ~ "integer", + type %in% c("n", "numeric") ~ "numeric", + type %in% c("l", "logical") ~ "logical", + type %in% c("D", "Date") ~ "Date", + type %in% c("_", "skip") ~ "skip", + .default = "character" + ) + + if(is.null(schema)){ + schema <- schema_default + } + + # update schema ---- + temp <- list(vartype = "observed", + datype = data_type, + col = columns, + row = top, + dist = distinct, + factor = factor, + key = key, + value = value) + schema@variables[[name]] <- temp + + # test for problems ---- + # reportProblems(schema = schema) + + return(schema) } \ No newline at end of file diff --git a/R/sum.R b/R/sum.R index 41d42a0..d940c44 100644 --- a/R/sum.R +++ b/R/sum.R @@ -1,57 +1,57 @@ -#' Summarise groups of rows or columns -#' -#' @param ... [\code{integerish(1)}]\cr columns or rows that shall be combined. -#' If there are several items provided, they will be summarised into one group -#' that is combined according to its type and the respective function provided -#' in \code{character} or \code{numeric}. -#' @param character [\code{function(1)}]\cr function by which character columns -#' or rows shall be combined. -#' @param numeric [\code{function(1)}]\cr function by which numeric columns or -#' rows shall be combined. -#' @param fill [\code{character(3)}]\cr direction in which to fill missing values, -#' possible values are "down", "up" and "right"; if several directions are -#' required, provide them in the order required. 
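# Editor's sketch (not part of the patch): an identifying variable that is spread
# over several columns ('rows' gives the row holding its names), together with
# observed variables nested below it ('top' gives the row holding their names).
# All positions are hypothetical and only illustrate how the two arguments
# relate to each other.
library(tabshiftr)
library(magrittr)

schema_wide <-
  setIDVar(name = "territories", columns = 1) %>%
  setIDVar(name = "year", columns = c(2, 4), rows = 1) %>%       # year labels sit in row 1
  setObsVar(name = "harvested", columns = c(2, 4), top = 2) %>%  # variable names sit in row 2
  setObsVar(name = "production", columns = c(3, 5), top = 2)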
-#' @details By default \code{character} values are summarised with the function
-#'   \code{paste0(na.omit(x), collapse = "-/-")} and \code{numeric} values with
-#'   the function \code{sum(x, na.rm = TRUE)}. To avoid un-intuitive behavior,
-#'   it is wisest to explicitly specify how all exceptions, such as NA-values,
-#'   shall be handled and thus to provide a new function.
-#' @return the index values where the target was found.
-#' @importFrom checkmate assertFunction
-#' @importFrom rlang enquo enquos eval_tidy
-#' @importFrom purrr map
-#' @importFrom stats na.omit
-#' @export
-
-.sum <- function(..., character = NULL, numeric = NULL, fill = NULL){
-
-  charFun <- testFunction(x = character)
-  numFun <- testFunction(x = numeric)
-  assertSubset(x = fill, choices = c("down", "up", "right"), empty.ok = TRUE)
-
-  # if(!charFun){
-  #   character <- function(x) paste0(na.omit(x), collapse = "-/-")
-  # }
-  # if(!numFun){
-  #   numeric <- function(x) sum(x, na.rm = TRUE)
-  # }
-
-  temp <- list(char = enquo(character), num = enquo(numeric))
-  grps <- unlist(enquos(...))
-
-  # return(grps)
-
-  if(length(grps) > 1){
-    grps <- map(seq_along(grps), function(ix){
-      eval_tidy(grps[[ix]])
-    }) %>%
-      unlist() %>%
-      unique() %>%
-      sort()
-  }
-
-  out <- list(group = list(by = temp, groups = grps, fill = fill))
-
-  return(out)
-
+#' Summarise groups of rows or columns
+#'
+#' @param ... [\code{integerish(1)}]\cr columns or rows that shall be combined.
+#'   If there are several items provided, they will be summarised into one group
+#'   that is combined according to its type and the respective function provided
+#'   in \code{character} or \code{numeric}.
+#' @param character [\code{function(1)}]\cr function by which character columns
+#'   or rows shall be combined.
+#' @param numeric [\code{function(1)}]\cr function by which numeric columns or
+#'   rows shall be combined.
+#' @param fill [\code{character(3)}]\cr direction in which to fill missing values,
+#'   possible values are "down", "up" and "right"; if several directions are
+#'   required, provide them in the order required.
+#' @details By default, \code{character} values are summarised with
+#'   \code{paste0(na.omit(x), collapse = "-/-")} and \code{numeric} values with
+#'   \code{sum(x, na.rm = TRUE)}. To avoid unintuitive behaviour, it is wise to
+#'   provide functions that state explicitly how exceptions, such as NA values,
+#'   shall be handled.
+#' @return A list that records the grouping: the rows or columns to combine
+#'   (\code{groups}), the functions by which to combine them (\code{by}) and the
+#'   fill direction (\code{fill}); this list is used as input to
+#'   \code{\link{setGroups}}.
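# Editor's sketch (not part of the patch): .sum() only records how rows or
# columns shall be grouped; the summarising itself happens later (see
# validateInput() below). The indices and the character function are
# hypothetical.
library(tabshiftr)

grp <- .sum(2, 3, character = function(x) paste0(na.omit(x), collapse = " "))
str(grp, max.level = 3)
# a list with one element 'group' that holds 'by' (the summarising functions as
# quosures), 'groups' (the indices 2 and 3) and 'fill' (here NULL)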
+#' @importFrom checkmate assertFunction +#' @importFrom rlang enquo enquos eval_tidy +#' @importFrom purrr map +#' @importFrom stats na.omit +#' @export + +.sum <- function(..., character = NULL, numeric = NULL, fill = NULL){ + + charFun <- testFunction(x = character) + numFun <- testFunction(x = numeric) + assertSubset(x = fill, choices = c("down", "up", "right"), empty.ok = TRUE) + + # if(!charFun){ + # character <- function(x) paste0(na.omit(x), collapse = "-/-") + # } + # if(!numFun){ + # numeric <- function(x) sum(x, na.rm = TRUE) + # } + + temp <- list(char = enquo(character), num = enquo(numeric)) + grps <- unlist(enquos(...)) + + # return(grps) + + if(length(grps) > 1){ + grps <- map(seq_along(grps), function(ix){ + eval_tidy(grps[[ix]]) + }) %>% + unlist() %>% + unique() %>% + sort() + } + + out <- list(group = list(by = temp, groups = grps, fill = fill)) + + return(out) + } \ No newline at end of file diff --git a/R/validateInput.R b/R/validateInput.R index d709702..32b1b8a 100644 --- a/R/validateInput.R +++ b/R/validateInput.R @@ -1,139 +1,139 @@ -#' Pre-process input table -#' -#' This function groups rows, splices the header into the table and fills -#' missing values where they should not exist. -#' @param schema [\code{character(1)}]\cr the validated schema description of -#' \code{input}. -#' @param input [\code{character(1)}]\cr table to reorganise. -#' @details -#' -#' @return a table where columns and rows are grouped and headers are spliced -#' into the table. -#' @examples -#' @importFrom checkmate assertTRUE -#' @importFrom dplyr row_number group_by summarise na_if across select mutate -#' if_else arrange add_row slice -#' @importFrom tibble as_tibble as_tibble_row -#' @importFrom tidyselect everything where -#' @importFrom lubridate is.Date -#' @importFrom rlang eval_tidy -#' @export - -validateInput <- function(schema = NULL, input = NULL){ - - assertDataFrame(x = input) - assertClass(x = schema, classes = "schema") - assertTRUE(x = schema@validated) - - header <- schema@format$header - groups <- schema@groups - - assertIntegerish(x = header, len = 1, lower = 0, upper = dim(input)[1], any.missing = FALSE, null.ok = TRUE) - - # first splice the header into the table, if it hasn't been read without column names - if(header != 0L){ - - input <- input %>% - mutate(across(where(is.double) | where(is.integer) | where(is.logical) | where(is.Date), as.character)) - non_char <- .getColTypes(input = input, collapse = FALSE) != "c" - - if(header != 1L){ - stop("! 
implement case where more than one rows need to be shifted !") - } else { - vec <- colnames(input) - names(vec) <- paste0("X", seq_along(vec)) - vec <- as_tibble_row(vec) - vec[, non_char] <- NA - - colnames(input) <- paste0("X", seq_along(vec)) - - input <- bind_rows(vec, input) - } - - } - - if(!is.null(groups$rows)){ - - tempTab <- input |> - mutate(rn = as.double(row_number()), .before = 1) - - isNumeric <- suppressWarnings( - input %>% - mutate(across(everything(), ~if_else(!is.na(as.numeric(.x)), TRUE, FALSE))) - ) - - for(i in seq_along(groups$rows)){ - - temp <- groups$rows[[i]] - - charBy <- eval_tidy(temp$by$char) - if(is.null(charBy)) charBy <- ~ paste0(na.omit(.x), collapse = " ") - numBy <- eval_tidy(temp$by$num) - if(is.null(numBy)) numBy <- ~ sum(.x, na.rm = TRUE) - targetRows <- eval_tidy(temp$groups[[1]]) - - tempRows <- input %>% - slice(targetRows) - typeRows <- isNumeric |> - slice(targetRows) - typeCols <- typeRows |> - summarise(across(everything(), ~ any(.x))) - - # fill NA-values in grouped rows - if(!is.null(temp$fill) & anyNA(tempRows)){ - for(j in seq_along(temp$fill)){ - tempRows <- .fill(x = tempRows, direction = temp$fill[j]) - } - } - - if(any(typeRows)){ - - charRows <- tempRows[!unlist(typeCols)] |> - summarise(across(everything(), charBy)) |> - mutate(across(where(is.character), ~na_if(x = ., y = ""))) - - numRows <- suppressWarnings( - tempRows[unlist(typeCols)] |> - mutate(across(everything(), as.numeric)) |> - summarise(across(everything(), numBy)) - ) - - tempRows <- bind_cols(charRows, numRows) - tempRows <- select(tempRows, sort(names(tempRows))) |> - mutate(across(everything(), as.character)) - - - } else { - - tempRows <- tempRows |> - summarise(across(everything(), charBy)) |> - mutate(across(where(is.character), ~na_if(x = ., y = ""))) - - } - - tempRows$rn <- min(targetRows) - - tempTab <- tempTab |> - filter(!rn %in% targetRows) |> - add_row(tempRows) - - } - - input <- tempTab |> - arrange(rn) |> - select(-rn) - } - - # if(!is.null(groups$cols)){ - # - # for(i in seq_along(groups$cols)){ - # - # temp <- groups$cols[[1]] - # - # } - # - # } - - - return(input) +#' Pre-process input table +#' +#' This function groups rows, splices the header into the table and fills +#' missing values where they should not exist. +#' @param schema [\code{character(1)}]\cr the validated schema description of +#' \code{input}. +#' @param input [\code{character(1)}]\cr table to reorganise. +#' @details +#' +#' @return a table where columns and rows are grouped and headers are spliced +#' into the table. 
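# Editor's sketch (not part of the patch): validateInput() expects a schema that
# has already passed validateSchema() (its 'validated' slot must be TRUE). A
# minimal debugging sequence might look as follows, reusing the tidy example
# table and mirroring the schema from the validateSchema() example further down.
library(tabshiftr)
library(magrittr)

input <- tabs2shift$tidy

schema <-
  setIDVar(name = "territories", columns = 1) %>%
  setIDVar(name = "year", columns = .find(pattern = "period")) %>%
  setIDVar(name = "commodities", columns = 3) %>%
  setObsVar(name = "harvested", columns = 5) %>%
  setObsVar(name = "production", columns = 6)

validated <- validateSchema(schema = schema, input = input)
validateInput(schema = validated, input = input)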
+#' @examples +#' @importFrom checkmate assertTRUE +#' @importFrom dplyr row_number group_by summarise na_if across select mutate +#' if_else arrange add_row slice +#' @importFrom tibble as_tibble as_tibble_row +#' @importFrom tidyselect everything where +#' @importFrom lubridate is.Date +#' @importFrom rlang eval_tidy +#' @export + +validateInput <- function(schema = NULL, input = NULL){ + + assertDataFrame(x = input) + assertClass(x = schema, classes = "schema") + assertTRUE(x = schema@validated) + + header <- schema@format$header + groups <- schema@groups + + assertIntegerish(x = header, len = 1, lower = 0, upper = dim(input)[1], any.missing = FALSE, null.ok = TRUE) + + # first splice the header into the table, if it hasn't been read without column names + if(header != 0L){ + + input <- input %>% + mutate(across(where(is.double) | where(is.integer) | where(is.logical) | where(is.Date), as.character)) + non_char <- .getColTypes(input = input, collapse = FALSE) != "c" + + if(header != 1L){ + stop("! implement case where more than one rows need to be shifted !") + } else { + vec <- colnames(input) + names(vec) <- paste0("X", seq_along(vec)) + vec <- as_tibble_row(vec) + vec[, non_char] <- NA + + colnames(input) <- paste0("X", seq_along(vec)) + + input <- bind_rows(vec, input) + } + + } + + if(!is.null(groups$rows)){ + + tempTab <- input |> + mutate(rn = as.double(row_number()), .before = 1) + + isNumeric <- suppressWarnings( + input %>% + mutate(across(everything(), ~if_else(!is.na(as.numeric(.x)), TRUE, FALSE))) + ) + + for(i in seq_along(groups$rows)){ + + temp <- groups$rows[[i]] + + charBy <- eval_tidy(temp$by$char) + if(is.null(charBy)) charBy <- ~ paste0(na.omit(.x), collapse = " ") + numBy <- eval_tidy(temp$by$num) + if(is.null(numBy)) numBy <- ~ sum(.x, na.rm = TRUE) + targetRows <- eval_tidy(temp$groups[[1]]) + + tempRows <- input %>% + slice(targetRows) + typeRows <- isNumeric |> + slice(targetRows) + typeCols <- typeRows |> + summarise(across(everything(), ~ any(.x))) + + # fill NA-values in grouped rows + if(!is.null(temp$fill) & anyNA(tempRows)){ + for(j in seq_along(temp$fill)){ + tempRows <- .fill(x = tempRows, direction = temp$fill[j]) + } + } + + if(any(typeRows)){ + + charRows <- tempRows[!unlist(typeCols)] |> + summarise(across(everything(), charBy)) |> + mutate(across(where(is.character), ~na_if(x = ., y = ""))) + + numRows <- suppressWarnings( + tempRows[unlist(typeCols)] |> + mutate(across(everything(), as.numeric)) |> + summarise(across(everything(), numBy)) + ) + + tempRows <- bind_cols(charRows, numRows) + tempRows <- select(tempRows, sort(names(tempRows))) |> + mutate(across(everything(), as.character)) + + + } else { + + tempRows <- tempRows |> + summarise(across(everything(), charBy)) |> + mutate(across(where(is.character), ~na_if(x = ., y = ""))) + + } + + tempRows$rn <- min(targetRows) + + tempTab <- tempTab |> + filter(!rn %in% targetRows) |> + add_row(tempRows) + + } + + input <- tempTab |> + arrange(rn) |> + select(-rn) + } + + # if(!is.null(groups$cols)){ + # + # for(i in seq_along(groups$cols)){ + # + # temp <- groups$cols[[1]] + # + # } + # + # } + + + return(input) } \ No newline at end of file diff --git a/R/validateSchema.R b/R/validateSchema.R index 0f82357..11bf4b9 100644 --- a/R/validateSchema.R +++ b/R/validateSchema.R @@ -1,271 +1,271 @@ -#' Check and update schema descriptions -#' -#' This function takes a raw schema description and updates values that were -#' only given as wildcard or implied values. 
It is automatically called by -#' \code{reorganise}, but can also be used in concert with the getters to debug -#' a schema. -#' @param input [\code{data.frame(1)}]\cr an input for which to check a schema -#' description. -#' @param schema [\code{symbol(1)}]\cr the schema description. -#' @details The core idea of a schema description is that it can be written in a -#' very generic way, as long as it describes sufficiently where in a table -#' what variable can be found. A very generic way can be via using the -#' function \code{\link{.find}} to identify the initially unknown -#' cell-locations of a variable on-the-fly, for example when it is merely -#' known that a variable must be in the table, but not where it is. -#' -#' \code{validateSchema} matches a schema with an input table and inserts the -#' accordingly evaluated positions (of clusters, filters and variables), -#' adapts some of the meta-data and ensures formal consistency of the schema. -#' @return An updated schema description -#' @examples -#' # build a schema for an already tidy table -#' (tidyTab <- tabs2shift$tidy) -#' -#' schema <- -#' setIDVar(name = "territories", col = 1) %>% -#' setIDVar(name = "year", col = .find(pattern = "period")) %>% -#' setIDVar(name = "commodities", col = 3) %>% -#' setObsVar(name = "harvested", col = 5) %>% -#' setObsVar(name = "production", col = 6) -#' -#' # before ... -#' schema -#' -#' # ... after -#' validateSchema(schema = schema, input = tidyTab) -#' -#' @importFrom checkmate assertNames assertClass assertNumeric -#' @importFrom rlang is_quosure -#' @importFrom dplyr mutate across ungroup n right_join -#' @importFrom tidyr replace_na everything -#' @importFrom purrr map_int map_lgl map -#' @importFrom methods new -#' @export - -validateSchema <- function(schema = NULL, input = NULL){ - - assertDataFrame(x = input) - assertClass(x = schema, classes = "schema") - - filter <- schema@filter - clusters <- schema@clusters - groups <- schema@groups - variables <- schema@variables - tabDim <- dim(input) - - # 1. evaluate filter ---- - allRows <- 1:dim(input)[1] - if(!is.null(filter$row)){ - filter$row <- .eval_find(input = input, row = filter$row) - } - if(!is.null(filter$col)){ - filter$col <- .eval_find(input = input, col = filter$col) - } - - # 2. 
complete cluster information ---- - # set cluster start if it is NULL or a qousure - if(is.null(clusters$row)){ - clusters$row <- 1 - } else if(is.list(clusters$row)){ - clusters$row <- .eval_find(input = input, row = clusters$row, clusters = clusters) - - # ignore filter rows - if(!is.null(filter$row)){ - clusters$row <- clusters$row[clusters$row %in% filter$row] - } - clusters$row <- .eval_sum(input = input, groups = groups, data = clusters$row) - } - - if(is.null(clusters$col)){ - clusters$col <- 1 - } else if(is.list(clusters$col)){ - clusters$col <- .eval_find(input = input, col = clusters$col, clusters = clusters) - } - - if(is.null(clusters$width)){ - nPos <- table(clusters$col) - dist <- diff(c(unique(clusters$col), tabDim[2]+1)) - clusters$width <- rep(dist, times = nPos) - } - - if(is.null(clusters$height)){ - if(length(clusters$row) > 1){ - nPos <- table(clusters$row) - dist <- diff(c(unique(clusters$row), tabDim[1]+1)) - clusters$height <- rep(dist, times = nPos) - } else { - clusters$height <- tabDim[1]+1 - min(clusters$row) - } - } - - nClusters <- max(lengths(clusters)) - if(nClusters == 0) nClusters <- 1 - - # make sure that all elements occur the same number of times - clusters$row <- rep(x = clusters$row, length.out = nClusters) - clusters$col <- rep(x = clusters$col, length.out = nClusters) - clusters$width <- rep(x = clusters$width, length.out = nClusters) - clusters$height <- rep(x = clusters$height, length.out = nClusters) - - - # 3. adjust variables ---- - outsideCluster <- filterOut <- isAbs <- NULL - selectRows <- selectCols <- idCols <- NULL - clusterID <- clusters$id - groupID <- clusters$group - - # first, evaluate whether any variable other than clusterID or groupID has a 'row' set - headerRows <- map(.x = seq_along(variables), .f = function(ix){ - tempName <- names(variables)[ix] - if(!tempName %in% c(groupID, clusterID)){ - temp <- variables[[ix]] - if(temp$vartype == "observed"){ - temp$row - } else { - NULL - } - } - }) - headerRows <- unlist(headerRows, use.names = FALSE) - - for(i in seq_along(variables)){ - - varProp <- variables[[i]] - varName <- names(variables)[i] - - # resolve quosures from grep-ing unknown col/rows ---- - if(!is.null(varProp$row)){ - if(is.list(varProp$row)){ - varProp$row <- .eval_find(input = input, row = varProp$row, clusters = clusters) - - # ignore filter rows - if(!is.null(filter$row)){ - varProp$row <- varProp$row[varProp$row %in% filter$row] - } - - # ignore header rows - varProp$row <- varProp$row[!varProp$row %in% headerRows] - } - } - - if(!is.null(varProp$col)){ - if(is.list(varProp$col)){ - varProp$col <- .eval_find(input = input, col = varProp$col, clusters = clusters) - } - } - - # check whether the variable is wide ---- - if(varProp$vartype == "observed"){ - isWide <- map_lgl(.x = seq_along(idCols), function(ix){ - if(length(varProp$col) == length(idCols[[ix]])){ - all(varProp$col == idCols[[ix]]) - } else { - FALSE - } - }) - if(any(isWide) & is.null(varProp$key)){ - varProp$key <- 0 - varProp$value <- "{all_rows}" - } - } - - # figure out which rows to filter out - if(!varProp$dist & !varName %in% c(groupID, clusterID)){ - if(varProp$vartype == "observed"){ - if(is.null(varProp$row)){ - if(is.null(varProp$key)){ - varProp$row <- clusters$row - } else { - varProp$row <- 1 - } - } - } - - if(!is.null(varProp$row)){ - if(is.null(names(filter$row[[1]]))){ - filterOut <- sort(unique(c(filterOut, varProp$row))) - } - } - } - - if(varProp$vartype == "id"){ - if(!is.null(varProp$val)){ - varProp$dist <- TRUE - } - idCols 
<- c(idCols, list(varProp$col)) - } - - # identify all selected columns ---- - selectCols <- unique(c(selectCols, varProp$col)) - - # make sure that all elements occur the same number of times ---- - if(!is.null(varProp$row)){ - - if(length(varProp$row) == 1){ - varProp$row <- rep(x = varProp$row, length.out = nClusters) - } - if(any(varName == groupID)){ - varProp$row <- varProp$row[clusters$member] - } - } - if(any(varName == groupID)){ - if(!is.null(varProp$col)){ - varProp$col <- rep(x = varProp$col, length.out = length(varProp$row)) - } - } else { - if(!is.null(varProp$col)){ - if(length(varProp$col) == 1){ - varProp$col <- rep(x = varProp$col, length.out = nClusters) - } - } - } - - # make sure that cluster or group IDs are set to NA ---- - # that their rows can be recognised as removable, in case there is nothing - # else in that row - if(any(varName %in% c(clusterID, groupID))){ - for(j in seq_along(varProp$col)){ - input[varProp$row[j], varProp$col[j]] <- NA - } - } - - # adapt rows and columns if there are groups ---- - varProp$row <- .eval_sum(input = input, groups = groups, data = varProp$row) - - variables[[i]] <- varProp - names(variables)[i] <- varName - } - - - # 4. remove empty rows ---- - testRows <- input[,selectCols] - emptyRows <- which(rowSums(is.na(testRows)) == ncol(testRows)) - - - # 5. adapt filter and cluster position to groups ---- - filterOut <- .eval_sum(input = input, groups = groups, data = filterOut) - allRows <- .eval_sum(input = input, groups = groups, data = allRows) - emptyRows <- .eval_sum(input = input, groups = groups, data = emptyRows) - groupRows <- eval_tidy(groups$rows$group$groups[[1]]) - - if(!is.null(filter$row)){ - filter$row <- unique(.eval_sum(input = input, groups = groups, data = filter$row)) - filter$row <- filter$row[filter$row %in% sort(unique(allRows[!allRows %in% c(filterOut, emptyRows, groupRows)]))] - } else { - filter$row <- sort(unique(allRows[!allRows %in% c(filterOut, emptyRows)])) - } - - - # 6. write it all ---- - out <- new(Class = "schema", - clusters = clusters, - format = schema@format, - groups = schema@groups, - filter = filter, - variables = variables, - validated = TRUE) - - return(out) - -} +#' Check and update schema descriptions +#' +#' This function takes a raw schema description and updates values that were +#' only given as wildcard or implied values. It is automatically called by +#' \code{reorganise}, but can also be used in concert with the getters to debug +#' a schema. +#' @param input [\code{data.frame(1)}]\cr an input for which to check a schema +#' description. +#' @param schema [\code{symbol(1)}]\cr the schema description. +#' @details The core idea of a schema description is that it can be written in a +#' very generic way, as long as it describes sufficiently where in a table +#' what variable can be found. A very generic way can be via using the +#' function \code{\link{.find}} to identify the initially unknown +#' cell-locations of a variable on-the-fly, for example when it is merely +#' known that a variable must be in the table, but not where it is. +#' +#' \code{validateSchema} matches a schema with an input table and inserts the +#' accordingly evaluated positions (of clusters, filters and variables), +#' adapts some of the meta-data and ensures formal consistency of the schema. 
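# Editor's sketch (not part of the patch): after validateSchema(), positions that
# were given via .find() should be resolved to concrete indices and can be
# inspected through the 'variables' slot (see setIDVar() earlier in this patch).
# 'schema' and 'tidyTab' refer to the objects built in the @examples block below.
validated <- validateSchema(schema = schema, input = tidyTab)
validated@variables$year$col  # a resolved column position rather than a .find() instruction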
+#' @return An updated schema description +#' @examples +#' # build a schema for an already tidy table +#' (tidyTab <- tabs2shift$tidy) +#' +#' schema <- +#' setIDVar(name = "territories", col = 1) %>% +#' setIDVar(name = "year", col = .find(pattern = "period")) %>% +#' setIDVar(name = "commodities", col = 3) %>% +#' setObsVar(name = "harvested", col = 5) %>% +#' setObsVar(name = "production", col = 6) +#' +#' # before ... +#' schema +#' +#' # ... after +#' validateSchema(schema = schema, input = tidyTab) +#' +#' @importFrom checkmate assertNames assertClass assertNumeric +#' @importFrom rlang is_quosure +#' @importFrom dplyr mutate across ungroup n right_join +#' @importFrom tidyr replace_na everything +#' @importFrom purrr map_int map_lgl map +#' @importFrom methods new +#' @export + +validateSchema <- function(schema = NULL, input = NULL){ + + assertDataFrame(x = input) + assertClass(x = schema, classes = "schema") + + filter <- schema@filter + clusters <- schema@clusters + groups <- schema@groups + variables <- schema@variables + tabDim <- dim(input) + + # 1. evaluate filter ---- + allRows <- 1:dim(input)[1] + if(!is.null(filter$row)){ + filter$row <- .eval_find(input = input, row = filter$row) + } + if(!is.null(filter$col)){ + filter$col <- .eval_find(input = input, col = filter$col) + } + + # 2. complete cluster information ---- + # set cluster start if it is NULL or a qousure + if(is.null(clusters$row)){ + clusters$row <- 1 + } else if(is.list(clusters$row)){ + clusters$row <- .eval_find(input = input, row = clusters$row, clusters = clusters) + + # ignore filter rows + if(!is.null(filter$row)){ + clusters$row <- clusters$row[clusters$row %in% filter$row] + } + clusters$row <- .eval_sum(input = input, groups = groups, data = clusters$row) + } + + if(is.null(clusters$col)){ + clusters$col <- 1 + } else if(is.list(clusters$col)){ + clusters$col <- .eval_find(input = input, col = clusters$col, clusters = clusters) + } + + if(is.null(clusters$width)){ + nPos <- table(clusters$col) + dist <- diff(c(unique(clusters$col), tabDim[2]+1)) + clusters$width <- rep(dist, times = nPos) + } + + if(is.null(clusters$height)){ + if(length(clusters$row) > 1){ + nPos <- table(clusters$row) + dist <- diff(c(unique(clusters$row), tabDim[1]+1)) + clusters$height <- rep(dist, times = nPos) + } else { + clusters$height <- tabDim[1]+1 - min(clusters$row) + } + } + + nClusters <- max(lengths(clusters)) + if(nClusters == 0) nClusters <- 1 + + # make sure that all elements occur the same number of times + clusters$row <- rep(x = clusters$row, length.out = nClusters) + clusters$col <- rep(x = clusters$col, length.out = nClusters) + clusters$width <- rep(x = clusters$width, length.out = nClusters) + clusters$height <- rep(x = clusters$height, length.out = nClusters) + + + # 3. 
adjust variables ---- + outsideCluster <- filterOut <- isAbs <- NULL + selectRows <- selectCols <- idCols <- NULL + clusterID <- clusters$id + groupID <- clusters$group + + # first, evaluate whether any variable other than clusterID or groupID has a 'row' set + headerRows <- map(.x = seq_along(variables), .f = function(ix){ + tempName <- names(variables)[ix] + if(!tempName %in% c(groupID, clusterID)){ + temp <- variables[[ix]] + if(temp$vartype == "observed"){ + temp$row + } else { + NULL + } + } + }) + headerRows <- unlist(headerRows, use.names = FALSE) + + for(i in seq_along(variables)){ + + varProp <- variables[[i]] + varName <- names(variables)[i] + + # resolve quosures from grep-ing unknown col/rows ---- + if(!is.null(varProp$row)){ + if(is.list(varProp$row)){ + varProp$row <- .eval_find(input = input, row = varProp$row, clusters = clusters) + + # ignore filter rows + if(!is.null(filter$row)){ + varProp$row <- varProp$row[varProp$row %in% filter$row] + } + + # ignore header rows + varProp$row <- varProp$row[!varProp$row %in% headerRows] + } + } + + if(!is.null(varProp$col)){ + if(is.list(varProp$col)){ + varProp$col <- .eval_find(input = input, col = varProp$col, clusters = clusters) + } + } + + # check whether the variable is wide ---- + if(varProp$vartype == "observed"){ + isWide <- map_lgl(.x = seq_along(idCols), function(ix){ + if(length(varProp$col) == length(idCols[[ix]])){ + all(varProp$col == idCols[[ix]]) + } else { + FALSE + } + }) + if(any(isWide) & is.null(varProp$key)){ + varProp$key <- 0 + varProp$value <- "{all_rows}" + } + } + + # figure out which rows to filter out + if(!varProp$dist & !varName %in% c(groupID, clusterID)){ + if(varProp$vartype == "observed"){ + if(is.null(varProp$row)){ + if(is.null(varProp$key)){ + varProp$row <- clusters$row + } else { + varProp$row <- 1 + } + } + } + + if(!is.null(varProp$row)){ + if(is.null(names(filter$row[[1]]))){ + filterOut <- sort(unique(c(filterOut, varProp$row))) + } + } + } + + if(varProp$vartype == "id"){ + if(!is.null(varProp$val)){ + varProp$dist <- TRUE + } + idCols <- c(idCols, list(varProp$col)) + } + + # identify all selected columns ---- + selectCols <- unique(c(selectCols, varProp$col)) + + # make sure that all elements occur the same number of times ---- + if(!is.null(varProp$row)){ + + if(length(varProp$row) == 1){ + varProp$row <- rep(x = varProp$row, length.out = nClusters) + } + if(any(varName == groupID)){ + varProp$row <- varProp$row[clusters$member] + } + } + if(any(varName == groupID)){ + if(!is.null(varProp$col)){ + varProp$col <- rep(x = varProp$col, length.out = length(varProp$row)) + } + } else { + if(!is.null(varProp$col)){ + if(length(varProp$col) == 1){ + varProp$col <- rep(x = varProp$col, length.out = nClusters) + } + } + } + + # make sure that cluster or group IDs are set to NA ---- + # that their rows can be recognised as removable, in case there is nothing + # else in that row + if(any(varName %in% c(clusterID, groupID))){ + for(j in seq_along(varProp$col)){ + input[varProp$row[j], varProp$col[j]] <- NA + } + } + + # adapt rows and columns if there are groups ---- + varProp$row <- .eval_sum(input = input, groups = groups, data = varProp$row) + + variables[[i]] <- varProp + names(variables)[i] <- varName + } + + + # 4. remove empty rows ---- + testRows <- input[,selectCols] + emptyRows <- which(rowSums(is.na(testRows)) == ncol(testRows)) + + + # 5. 
adapt filter and cluster position to groups ---- + filterOut <- .eval_sum(input = input, groups = groups, data = filterOut) + allRows <- .eval_sum(input = input, groups = groups, data = allRows) + emptyRows <- .eval_sum(input = input, groups = groups, data = emptyRows) + groupRows <- eval_tidy(groups$rows$group$groups[[1]]) + + if(!is.null(filter$row)){ + filter$row <- unique(.eval_sum(input = input, groups = groups, data = filter$row)) + filter$row <- filter$row[filter$row %in% sort(unique(allRows[!allRows %in% c(filterOut, emptyRows, groupRows)]))] + } else { + filter$row <- sort(unique(allRows[!allRows %in% c(filterOut, emptyRows)])) + } + + + # 6. write it all ---- + out <- new(Class = "schema", + clusters = clusters, + format = schema@format, + groups = schema@groups, + filter = filter, + variables = variables, + validated = TRUE) + + return(out) + +} diff --git a/R/zzz.R b/R/zzz.R index eb8cd4b..67b7605 100755 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,4 +1,4 @@ -globalVariables(c("key", "values", "meta_default", "schema_default", "rn", "val", - ".", "name", "count", "value", "x", "where", "rowname", - "variable", "grps", "targetCols", "it", "ind", "rn_new", "is.Date" - )) +globalVariables(c("key", "values", "meta_default", "schema_default", "rn", "val", + ".", "name", "count", "value", "x", "where", "rowname", + "variable", "grps", "targetCols", "it", "ind", "rn_new", "is.Date" + )) diff --git a/man/dot-sum.Rd b/man/dot-sum.Rd index 33c1826..38b470f 100644 --- a/man/dot-sum.Rd +++ b/man/dot-sum.Rd @@ -1,37 +1,37 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sum.R -\name{.sum} -\alias{.sum} -\title{Summarise groups of rows or columns} -\usage{ -.sum(..., character = NULL, numeric = NULL, fill = NULL) -} -\arguments{ -\item{...}{[\code{integerish(1)}]\cr columns or rows that shall be combined. -If there are several items provided, they will be summarised into one group -that is combined according to its type and the respective function provided -in \code{character} or \code{numeric}.} - -\item{character}{[\code{function(1)}]\cr function by which character columns -or rows shall be combined.} - -\item{numeric}{[\code{function(1)}]\cr function by which numeric columns or -rows shall be combined.} - -\item{fill}{[\code{character(3)}]\cr direction in which to fill missing values, -possible values are "down", "up" and "right"; if several directions are -required, provide them in the order required.} -} -\value{ -the index values where the target was found. -} -\description{ -Summarise groups of rows or columns -} -\details{ -By default \code{character} values are summarised with the function - \code{paste0(na.omit(x), collapse = "-/-")} and \code{numeric} values with - the function \code{sum(x, na.rm = TRUE)}. To avoid un-intuitive behavior, - it is wisest to explicitly specify how all exceptions, such as NA-values, - shall be handled and thus to provide a new function. -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sum.R +\name{.sum} +\alias{.sum} +\title{Summarise groups of rows or columns} +\usage{ +.sum(..., character = NULL, numeric = NULL, fill = NULL) +} +\arguments{ +\item{...}{[\code{integerish(1)}]\cr columns or rows that shall be combined. 
+If there are several items provided, they will be summarised into one group +that is combined according to its type and the respective function provided +in \code{character} or \code{numeric}.} + +\item{character}{[\code{function(1)}]\cr function by which character columns +or rows shall be combined.} + +\item{numeric}{[\code{function(1)}]\cr function by which numeric columns or +rows shall be combined.} + +\item{fill}{[\code{character(3)}]\cr direction in which to fill missing values, +possible values are "down", "up" and "right"; if several directions are +required, provide them in the order required.} +} +\value{ +the index values where the target was found. +} +\description{ +Summarise groups of rows or columns +} +\details{ +By default \code{character} values are summarised with the function + \code{paste0(na.omit(x), collapse = "-/-")} and \code{numeric} values with + the function \code{sum(x, na.rm = TRUE)}. To avoid un-intuitive behavior, + it is wisest to explicitly specify how all exceptions, such as NA-values, + shall be handled and thus to provide a new function. +} diff --git a/man/setGroups.Rd b/man/setGroups.Rd index aaa731f..700ab9c 100644 --- a/man/setGroups.Rd +++ b/man/setGroups.Rd @@ -1,39 +1,39 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/setGroups.R -\name{setGroups} -\alias{setGroups} -\title{Set Groups} -\usage{ -setGroups(schema = NULL, rows = NULL, columns = NULL) -} -\arguments{ -\item{schema}{[\code{schema(1)}]\cr In case this information is added to an -already existing schema, provide that schema here (overwrites previous -information).} - -\item{rows}{[\code{list(3)}]\cr the output of \code{\link{.sum}} indicating -the rows and a function according to which those rows should be summarised.} - -\item{columns}{[\code{list(3)}]\cr the output of \code{\link{.sum}} -indicating the columns and a function according to which those columns -should be summarised.} -} -\value{ -An object of class \code{\link{schema}}. -} -\description{ -This function allows to set groups for rows, columns or clusters that shall -be summarised. -} -\examples{ -# please check the vignette for examples -} -\seealso{ -Other functions to describe table arrangement: -\code{\link{setCluster}()}, -\code{\link{setFilter}()}, -\code{\link{setFormat}()}, -\code{\link{setIDVar}()}, -\code{\link{setObsVar}()} -} -\concept{functions to describe table arrangement} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/setGroups.R +\name{setGroups} +\alias{setGroups} +\title{Set Groups} +\usage{ +setGroups(schema = NULL, rows = NULL, columns = NULL) +} +\arguments{ +\item{schema}{[\code{schema(1)}]\cr In case this information is added to an +already existing schema, provide that schema here (overwrites previous +information).} + +\item{rows}{[\code{list(3)}]\cr the output of \code{\link{.sum}} indicating +the rows and a function according to which those rows should be summarised.} + +\item{columns}{[\code{list(3)}]\cr the output of \code{\link{.sum}} +indicating the columns and a function according to which those columns +should be summarised.} +} +\value{ +An object of class \code{\link{schema}}. +} +\description{ +This function allows to set groups for rows, columns or clusters that shall +be summarised. 
+} +\examples{ +# please check the vignette for examples +} +\seealso{ +Other functions to describe table arrangement: +\code{\link{setCluster}()}, +\code{\link{setFilter}()}, +\code{\link{setFormat}()}, +\code{\link{setIDVar}()}, +\code{\link{setObsVar}()} +} +\concept{functions to describe table arrangement} diff --git a/man/setIDVar.Rd b/man/setIDVar.Rd index 5ea79fd..97b09bb 100644 --- a/man/setIDVar.Rd +++ b/man/setIDVar.Rd @@ -1,79 +1,79 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/setIDVar.R -\name{setIDVar} -\alias{setIDVar} -\title{Set an identifying variable} -\usage{ -setIDVar( - schema = NULL, - name = NULL, - type = "character", - value = NULL, - columns = NULL, - rows = NULL, - split = NULL, - merge = NULL, - distinct = FALSE -) -} -\arguments{ -\item{schema}{[\code{schema(1)}]\cr In case this information is added to an -already existing schema, provide that schema here (overwrites previous -information).} - -\item{name}{[\code{character(1)}]\cr Name of the new identifying variable.} - -\item{type}{[\code{character(1)}]\cr data type of the new identifying -variable. Possible values are \code{"c/character"}, \code{"i/integer"}, -\code{"n/numeric"}, \code{"l/logical"}, \code{"D/Date"} or \code{"_/skip"}. -For \code{"D/Date"}, the value has to follow the form \code{YYYY-MM-DD}, -where dates that don't match that are replaced by NA.} - -\item{value}{[\code{character(1)}]\cr In case the variable is an implicit -variable (i.e., which is not in the origin table), specify it here.} - -\item{columns}{[\code{integerish(.)}]\cr The column(s) in which the -\emph{values} of the new variable are recorded.} - -\item{rows}{[\code{integerish(.)}]\cr In case the variable is in several -columns, specify here additionally the row in which the \emph{names} are -recorded.} - -\item{split}{[\code{character(1)}]\cr In case the variable is part of a -compound value, this should be a regular expression that splits the -respective value off of that compound value. See -\code{\link[tidyr]{extract}} on how to set up the regular expression.} - -\item{merge}{[\code{character(1)}]\cr In case a variable is made up of -several columns, this should be the character string that would connect the -two columns (e.g., an empty space \code{" "}).} - -\item{distinct}{[\code{logical(1)}]\cr whether or not the variable is -distinct from a cluster. This is the case when the variable is not -systematically available for all clusters and thus needs to be registered -separately from clusters.} -} -\value{ -An object of class \code{\link{schema}}. -} -\description{ -Identifying variables are those variables that describe the (qualitative) -properties that make each observation (as described by the -\code{\link[=setObsVar]{observed variables}}) unique. -} -\details{ -Please also take a look at the currently suggested strategy to set - up a \link[=schema]{schema description}. 
-} -\examples{ -# please check the vignette for examples -} -\seealso{ -Other functions to describe table arrangement: -\code{\link{setCluster}()}, -\code{\link{setFilter}()}, -\code{\link{setFormat}()}, -\code{\link{setGroups}()}, -\code{\link{setObsVar}()} -} -\concept{functions to describe table arrangement} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/setIDVar.R +\name{setIDVar} +\alias{setIDVar} +\title{Set an identifying variable} +\usage{ +setIDVar( + schema = NULL, + name = NULL, + type = "character", + value = NULL, + columns = NULL, + rows = NULL, + split = NULL, + merge = NULL, + distinct = FALSE +) +} +\arguments{ +\item{schema}{[\code{schema(1)}]\cr In case this information is added to an +already existing schema, provide that schema here (overwrites previous +information).} + +\item{name}{[\code{character(1)}]\cr Name of the new identifying variable.} + +\item{type}{[\code{character(1)}]\cr data type of the new identifying +variable. Possible values are \code{"c/character"}, \code{"i/integer"}, +\code{"n/numeric"}, \code{"l/logical"}, \code{"D/Date"} or \code{"_/skip"}. +For \code{"D/Date"}, the value has to follow the form \code{YYYY-MM-DD}, +where dates that don't match that are replaced by NA.} + +\item{value}{[\code{character(1)}]\cr In case the variable is an implicit +variable (i.e., which is not in the origin table), specify it here.} + +\item{columns}{[\code{integerish(.)}]\cr The column(s) in which the +\emph{values} of the new variable are recorded.} + +\item{rows}{[\code{integerish(.)}]\cr In case the variable is in several +columns, specify here additionally the row in which the \emph{names} are +recorded.} + +\item{split}{[\code{character(1)}]\cr In case the variable is part of a +compound value, this should be a regular expression that splits the +respective value off of that compound value. See +\code{\link[tidyr]{extract}} on how to set up the regular expression.} + +\item{merge}{[\code{character(1)}]\cr In case a variable is made up of +several columns, this should be the character string that would connect the +two columns (e.g., an empty space \code{" "}).} + +\item{distinct}{[\code{logical(1)}]\cr whether or not the variable is +distinct from a cluster. This is the case when the variable is not +systematically available for all clusters and thus needs to be registered +separately from clusters.} +} +\value{ +An object of class \code{\link{schema}}. +} +\description{ +Identifying variables are those variables that describe the (qualitative) +properties that make each observation (as described by the +\code{\link[=setObsVar]{observed variables}}) unique. +} +\details{ +Please also take a look at the currently suggested strategy to set + up a \link[=schema]{schema description}. 
+} +\examples{ +# please check the vignette for examples +} +\seealso{ +Other functions to describe table arrangement: +\code{\link{setCluster}()}, +\code{\link{setFilter}()}, +\code{\link{setFormat}()}, +\code{\link{setGroups}()}, +\code{\link{setObsVar}()} +} +\concept{functions to describe table arrangement} diff --git a/tests/testthat/test-02_column_mismatch.R b/tests/testthat/test-02_column_mismatch.R index 0643b0a..47e2b3a 100644 --- a/tests/testthat/test-02_column_mismatch.R +++ b/tests/testthat/test-02_column_mismatch.R @@ -1,60 +1,60 @@ -library(tabshiftr) -library(testthat) -library(checkmate) -library(dplyr) -context("mismatch") - - -test_that("split a column that contains several identifying variables in one column", { - - input <- tabs2shift$merged_column - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2, split = "(.+?(?=_))") %>% - setIDVar(name = "commodities", columns = 2, split = "((?<=\\_).*)") %>% - setObsVar(name = "harvested", columns = 4) %>% - setObsVar(name = "production", columns = 5) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("recognise an identifying variable that is actually a merge of two columns", { - - input <- tabs2shift$split_column - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = c(2, 4), merge = " ") %>% - setIDVar(name = "commodities", columns = 5) %>% - setObsVar(name = "harvested", columns = 6) %>% - setObsVar(name = "production", columns = 7) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - -test_that("recognise a distinct variable that is not available for every cluster", { - - input <- tabs2shift$clusters_messy - - schema <- setCluster(id = "territories", - left = c(1, 1, 4), top = c(1, 8, 8)) %>% - setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% - setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% - setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% - setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% - setObsVar(name = "production", columns = c(3, 3, 6)) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 3) - -}) - +library(tabshiftr) +library(testthat) +library(checkmate) +library(dplyr) +context("mismatch") + + +test_that("split a column that contains several identifying variables in one column", { + + input <- tabs2shift$merged_column + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2, split = "(.+?(?=_))") %>% + setIDVar(name = "commodities", columns = 2, split = "((?<=\\_).*)") %>% + setObsVar(name = "harvested", columns = 4) %>% + setObsVar(name = "production", columns = 5) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("recognise an identifying variable that is actually a merge of two columns", { + + input <- tabs2shift$split_column + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = c(2, 4), merge = " ") %>% + setIDVar(name = "commodities", columns = 5) %>% + setObsVar(name = "harvested", columns = 6) %>% + setObsVar(name = "production", columns = 7) + + reorganise(input = input, schema = schema) %>% + 
arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + +test_that("recognise a distinct variable that is not available for every cluster", { + + input <- tabs2shift$clusters_messy + + schema <- setCluster(id = "territories", + left = c(1, 1, 4), top = c(1, 8, 8)) %>% + setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% + setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% + setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% + setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% + setObsVar(name = "production", columns = c(3, 3, 6)) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 3) + +}) + diff --git a/tests/testthat/test-03_wide_id.R b/tests/testthat/test-03_wide_id.R index 55fdab6..b2f71fd 100644 --- a/tests/testthat/test-03_wide_id.R +++ b/tests/testthat/test-03_wide_id.R @@ -1,132 +1,132 @@ -library(tabshiftr) -library(testthat) -library(checkmate) -context("wide_id") - - -test_that("one wide identifying variable into long form", { - - input <- tabs2shift$one_wide_id_alt - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = c(4:7), rows = 1) %>% - setObsVar(name = "harvested", columns = c(4, 5), top = 2) %>% - setObsVar(name = "production", columns = c(6, 7), top = 2) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("wide variable in first row of header", { - - input <- tabs2shift$one_wide_id - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = c(4, 6), rows = 1) %>% - setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>% - setObsVar(name = "production", columns = c(5, 7), top = 2) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("wide variable (that needs to be split) in first row of header", { - - input <- tabs2shift$one_wide_id - input$X4[1] <- "soybean_something" - input$X6[1] <- "maize_something" - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = c(4, 6), rows = 1, split = "(.+?(?=_))") %>% - setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>% - setObsVar(name = "production", columns = c(5, 7), top = 2) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("wide variable in second row of header", { - - input <- tabs2shift$wide_obs - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>% - setObsVar(name = "harvested", columns = c(3, 4)) %>% - setObsVar(name = "production", columns = c(5, 6)) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("wide variable in second rows of header, values spearated", { - - input <- tabs2shift$wide_obs_alt - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(3, 5), rows = 2) %>% - 
setObsVar(name = "harvested", columns = c(3, 5)) %>% - setObsVar(name = "production", columns = c(4, 6)) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("several wide identifying variables", { - - input <- tabs2shift$two_wide_id - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = c(2, 6), rows = 1) %>% - setIDVar(name = "commodities", columns = c(2, 4, 6, 8), rows = 2) %>% - setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>% - setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - -test_that("when the 'wider' identifying variable is registered first", { - - input <- tabs2shift$two_wide_id2 - - schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = c(2, 4, 6, 8), rows = 2) %>% - setIDVar(name = "commodities", columns = c(2, 6), rows = 1) %>% - setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>% - setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) +library(tabshiftr) +library(testthat) +library(checkmate) +context("wide_id") + + +test_that("one wide identifying variable into long form", { + + input <- tabs2shift$one_wide_id_alt + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 3) %>% + setIDVar(name = "commodities", columns = c(4:7), rows = 1) %>% + setObsVar(name = "harvested", columns = c(4, 5), top = 2) %>% + setObsVar(name = "production", columns = c(6, 7), top = 2) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("wide variable in first row of header", { + + input <- tabs2shift$one_wide_id + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 3) %>% + setIDVar(name = "commodities", columns = c(4, 6), rows = 1) %>% + setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>% + setObsVar(name = "production", columns = c(5, 7), top = 2) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("wide variable (that needs to be split) in first row of header", { + + input <- tabs2shift$one_wide_id + input$X4[1] <- "soybean_something" + input$X6[1] <- "maize_something" + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 3) %>% + setIDVar(name = "commodities", columns = c(4, 6), rows = 1, split = "(.+?(?=_))") %>% + setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>% + setObsVar(name = "production", columns = c(5, 7), top = 2) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("wide variable in second row of header", { + + input <- tabs2shift$wide_obs + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>% + setObsVar(name = "harvested", columns = c(3, 4)) %>% + setObsVar(name = "production", columns = c(5, 6)) + + 
reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("wide variable in second rows of header, values spearated", { + + input <- tabs2shift$wide_obs_alt + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = c(3, 5), rows = 2) %>% + setObsVar(name = "harvested", columns = c(3, 5)) %>% + setObsVar(name = "production", columns = c(4, 6)) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("several wide identifying variables", { + + input <- tabs2shift$two_wide_id + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = c(2, 6), rows = 1) %>% + setIDVar(name = "commodities", columns = c(2, 4, 6, 8), rows = 2) %>% + setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>% + setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + +test_that("when the 'wider' identifying variable is registered first", { + + input <- tabs2shift$two_wide_id2 + + schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = c(2, 4, 6, 8), rows = 2) %>% + setIDVar(name = "commodities", columns = c(2, 6), rows = 1) %>% + setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>% + setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) diff --git a/tests/testthat/test-08_groups.R b/tests/testthat/test-08_groups.R index ec3295f..fb66753 100644 --- a/tests/testthat/test-08_groups.R +++ b/tests/testthat/test-08_groups.R @@ -1,68 +1,68 @@ -library(tabshiftr) -library(testthat) -library(checkmate) -context("setGroups") - -test_that("groups of rows", { - - input <- tabs2shift$group_simple - - schema <- - setGroups(rows = .sum(c(1, 2), fill = c("down", "up", "right"), character = function(x) paste0(unique(na.omit(x)), collapse = "--"))) %>% - setGroups(rows = .sum(c(4, 5))) %>% - setGroups(rows = .sum(c(7, 8))) %>% - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(3:6), rows = 3) %>% - setObsVar(name = "harvested", columns = c(3, 4)) %>% - setObsVar(name = "production", columns = c(5, 6)) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - -# test_that("groups of columns", { -# -# -# }) - -test_that("apply function to summarise merged rows", { - - input <- tabs2shift$group_sum - - schema <- - setGroups(rows = .sum(c(3, 4))) %>% - setGroups(rows = .sum(c(6, 7))) %>% - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>% - setObsVar(name = "harvested", columns = c(3, 4)) %>% - setObsVar(name = "production", columns = c(5, 6)) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - -test_that("apply function to merge wide variables", { - - input <- tabs2shift$one_wide_id_sum - - schema <- - setGroups(rows = 
.sum(c(1, 2), character = function(x) paste0(na.omit(x), collapse = ""))) %>% - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = c(4, 6), rows = 2) %>% - setObsVar(name = "harvested", columns = c(4, 6), top = 3) %>% - setObsVar(name = "production", columns = c(5, 7), top = 3) - - reorganise(input = input, schema = schema) %>% - arrange(territories, year, commodities) %>% - .expect_valid_table(units = 2) - -}) - - +library(tabshiftr) +library(testthat) +library(checkmate) +context("setGroups") + +test_that("groups of rows", { + + input <- tabs2shift$group_simple + + schema <- + setGroups(rows = .sum(c(1, 2), fill = c("down", "up", "right"), character = function(x) paste0(unique(na.omit(x)), collapse = "--"))) %>% + setGroups(rows = .sum(c(4, 5))) %>% + setGroups(rows = .sum(c(7, 8))) %>% + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = c(3:6), rows = 3) %>% + setObsVar(name = "harvested", columns = c(3, 4)) %>% + setObsVar(name = "production", columns = c(5, 6)) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + +# test_that("groups of columns", { +# +# +# }) + +test_that("apply function to summarise merged rows", { + + input <- tabs2shift$group_sum + + schema <- + setGroups(rows = .sum(c(3, 4))) %>% + setGroups(rows = .sum(c(6, 7))) %>% + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>% + setObsVar(name = "harvested", columns = c(3, 4)) %>% + setObsVar(name = "production", columns = c(5, 6)) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + +test_that("apply function to merge wide variables", { + + input <- tabs2shift$one_wide_id_sum + + schema <- + setGroups(rows = .sum(c(1, 2), character = function(x) paste0(na.omit(x), collapse = ""))) %>% + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 3) %>% + setIDVar(name = "commodities", columns = c(4, 6), rows = 2) %>% + setObsVar(name = "harvested", columns = c(4, 6), top = 3) %>% + setObsVar(name = "production", columns = c(5, 7), top = 3) + + reorganise(input = input, schema = schema) %>% + arrange(territories, year, commodities) %>% + .expect_valid_table(units = 2) + +}) + + diff --git a/tests/testthat/test-sum.R b/tests/testthat/test-sum.R index 2d22f3c..2d1542d 100644 --- a/tests/testthat/test-sum.R +++ b/tests/testthat/test-sum.R @@ -1,22 +1,22 @@ -library(tabshiftr) -library(testthat) -library(checkmate) -context(".group") - - -test_that(".group and summarise numeric values", { - - -}) - -test_that(".group and summarise character values", { - - -}) - -test_that(".group several collections of rows", { - - -}) - - +library(tabshiftr) +library(testthat) +library(checkmate) +context(".group") + + +test_that(".group and summarise numeric values", { + + +}) + +test_that(".group and summarise character values", { + + +}) + +test_that(".group several collections of rows", { + + +}) + + diff --git a/vignettes/tabshiftr.Rmd b/vignettes/tabshiftr.Rmd index 251fbf1..a2f8585 100755 --- a/vignettes/tabshiftr.Rmd +++ b/vignettes/tabshiftr.Rmd @@ -1,522 +1,522 @@ ---- -title: "tabshiftr - Reshape disorganized messy data" -output: - bookdown::html_document2: - fig_caption: 
yes - number_sections: yes - theme: spacelab - highlight: pygments - toc: yes - toc_float: yes -bibliography: references.bib -urlcolor: blue -fontsize: 12pt -linestretch: 1.15 -link-citations: yes -vignette: > - %\VignetteIndexEntry{reorganising messy data} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.path = "man/figures/README-", - out.width = "100%" -) -options(knitr.kable.NA = '.') -library(tabshiftr) -library(knitr) -``` - -# Rationale - -Tables as places where data are recorded can be pretty messy. The 'tidy' paradigm in R proposes that data are organised so that variables are recorded in columns, observations in rows and that there is only one value per cell [@Wickham2014]. This, however, is only one interpretation of how data should be organised [@Broman] and especially when scraping data off the internet, one frequently encounters spreadsheets that don't follow this paradigm. - -The `tidyr` package is one of the most popular tools to bring data into a tidy format. However, up until today it is limited to tables that are already organised into topologically coherent (rectangular) chunks and any messiness beyond that requires dedicated scripts for reorganization. In `tabshiftr` we try to describe and work with a further dimension of messiness, where data are available as so-called *disorganized (messy) data*, data that are not yet arranged into rectangular form. - -The approach of `tabshiftr` is based on describing the arrangement of such tables in a so-called [schema](https://en.wikipedia.org/wiki/Database_schema) description, which is then the basis for automatic reorganization via the function `reorganise()`. Typically there is an input and an output schema, describing the arrangement of the input and output tables, respectively. The advantage of this procedure is that input and output tables exist explicitly and the schema maps the transformation of the data. As we want to end up with tidy tables, the output schema is pre-determined by a tidy table of the included variables and the input [schema needs to be put together](#makeSchema) by you, the user. - - -# The basics - -Data can be disorganised according to multiple dimensions. To understand those dimensions, we first need to understand the nature of data. Data \"of the same kind\" are collected in a **variable**, which is always a combination of a **name** and the **values**. In a table, names are typically in the topmost row and values are in the column below that name ([Tab. 1](#tabExample1)). Conceptually, there are two types of variables in any table: - -1. Variables that contain categorical levels that identify the units for which values have been observed (they are called *identifying variables* here). -2. Variables that have been measured or observed and that consequently represent the target values of that measurement, be they continuous or categorical (they are called *observed variables* here). - -Moreover, a table is part of a **series** when other tables of that series contain the same variables with slightly different values for the identifying variables, irrespective of how the distinct tables of that series are arranged. 
- -| identifying variable | observed variable (categorical) | observed variable (continuous) | -| :- | :- | :- | -| sample 1 | blue | 10 | -| sample 2 | green | 20 | -| sample 3 | red | 30 | - -Table: Table 1: An example table containing one identifying and two observed variables, with the variable names in the topmost row and the values in all consecutive rows. - -Here, we do use the term **spreadsheet** to denote what would typically be seen as *table*, as the data we deal with here are typically \"disorganised\" and thus not what would be recognised as a table. Even though data in many spreadsheets are disorganised, they are mostly not non-systematic. Especially in complex spreadsheets, one often encounters a situation where a set of variables occurs more than once with the same or very similar arrangement, which we call **cluster** here. Data that are part of clusters are split along one of the, typically categorical, variables (the **cluster ID**), with the aim to increase the visual accessibility or direct the focus for human readers ([Tab. 2](#tabExample2)). This may also be the case where data are split up into several files or spreadsheets of a file, where the cluster ID can be found in the file or spreasheet name or the meta-data. In many cases, the cluster ID is an **implicit variable**, a variable that is not immediately recogniseable as such. - -| sample | colour | intensity | sample | colour | intensity | -| :- | :- | :- | :- | :- | :- | -| sample 1 | | | sample 2 | | | -| | blue | 10 | | blue | 11 | -| | green | 20 | | green | 24 | -| | red | 30 | | red | 13 | -| | | | | | | -| sample 3 | | | sample 4 | | | -| | blue | 20 | | blue | 10 | -| | green | 15 | | green | 16 | -| | red | 33 | | red | 21 | - -Table: Table 2: An example of a table with several clusters of comparable variables. - -# How to make a schema description - - -## Decision tree - -To set up a schema description, go through the following questions step by step and provide the respective answer in the respective function. [Linked tables](#tablesTypes) can serve as examples to compare against and right after the decision tree there is a step-by-step hands-on example of how to build a schema description. - -1. **Variables**: Clarify which are the identifying variables and which are the observed variables. Make sure not to mistake a listed observed variable ([Tab. 10](#long)) as an identifying variable. - -2. **Format**: Provide potentially information about the table format in `setFormat()`. - -3. **Clusters**: In case there are clusters, provide in `setCluster()` - - - are data clustered into several files or spreadsheets and is the information used to separate the data (i.e., the spreadsheet name) a variable of interest ([Tab. 6](#implicitVar)) or are the data clustered within one spreadsheet ([Tab. 12](#clusHor), [Tab. 13](#clusId), [Tab. 14](#clusMeas) & [Tab. 15](#clustNest))? - - are data clustered according to an identifying variable ([Tab. 12](#clusHor), [Tab. 13](#clusId)), or are observed variables grouped into clusters ([Tab. 14](#clusMeas))? - - are clusters nested into a grouping variable of interest ([Tab. 15](#clustNest))? -

- -4. **Identifying variables**: provide in `setIDVar()` - - - in which column(s) is the variable? - - is the variable a result of merging several columns ([Tab. 4](#mergeCol)) or must it be split off of another column ([Tab. 5](#splitCol))? - - is the variable wide (i.e., its values are in several columns) ([Tab. 7](#one_wide_id), [Tab. 8](#wide_obs) & [Tab. 9](#two_wide_id))? In this case, the values will look like they are part of the header. - - is the variable distinct from the main table ([Tab. 16](#distVar))? -

- -5. **Observed variable**: provide in `setObsVar()` - - - in which column(s) is the variable? - - is the variable wide and nested in a wide identifying variable (i.e., the name of the observed variable is below the name of the identifying variable) ([Tab. 7](#one_wide_id) & [Tab. 9](#two_wide_id))? - - is the variable listed (i.e., the names of the variable are given as a value of an identifying variable) ([Tab. 10](#long))? -

- - -## Additional remarks - -- To work with `tabshiftr`, tables need to be read in while treating any header rows *as data*, i.e., by not setting the first row as header -- To spell this out explicitly, there are two almost inverse cases where the type of a variable is confused and the **name** and **values** of a variable end up in the wrong place: - 1. Tables are wide when the header contains the **values** of one or more (identifying) variables as if they were names. - 2. When tables are long, it can occur that the **name** of several observed variables are gathered as if they were values into a single column. -- Each column should be treated as a character data type, because some columns might contain data with both numeric and character cells. - -```{r, eval=F} -input <- read_csv(file = ..., - col_names = FALSE, - col_types = cols(.default = "c")) -``` - -- `reorganise()` takes care of reformatting the data-types into the most permissive data type that does not introduce `NA`s where there should be data, i.e, if a variable can be numeric, it is formatted as numeric column, otherwise it will be stored as a character column. -- When setting up a schema description, some generalisations are hard to wrap your mind around. Try imagining that once you have set a variable, it's values are "cut out" from the input table and pasted into the reshaped table. For instance, once clusters are set for [Tab. 6](#implicitVar), imagine that the meta-data header isn't part of the table anymore, resulting in a much simpler table. Or for [Tab. 16](#distVar), once territories are set, imagine that the respective `unit1/2/3` values disappear, which makes it easier to see that the respective column is actually a tidy column of commodities. - -## An example - -As an example, we show here how to build a schema description for a table that has a wide identifying variable and a listed observed variable. This table contains additionally some dummy information one would typically encounter in tables, such as `empty_col`s and rows and data that are not immediately of interest (`other_observed`). - -```{r} -kable(input <- tabs2shift$listed_column_wide) -``` - -In this case we don't need to set clusters and can start immediately with setting the first id variable `territories`, which is in the first column and otherwise tidy. The order by which we set the variables determines where they ocurr in the output table. Any of the setters start by default with an empty schema, in case none is provided to them from a previous setter, thus none needs to be provided at the beginning of a schema. - -```{r} -schema <- setIDVar(name = "territories", columns = 1) -``` - -Since version 0.3.0, `tabshiftr` comes with getters that allow to debug the current schema description. To do this, however, the schema first needs to be validated. This is in order to make sure that all the generic information are evaluated with the respective input. After that, a getter can be used to extract the respective information, for example the reorganised id variables with `getIDVars()`. - -```{r} -validateSchema(schema = schema, input = input) %>% - getIDVars(input = input) -``` - -After seeing that our specification results in a meaningful output, we can continue setting the other id variables `years` (tidy and in column 2) and `commodities` (spread over two columns and the values are in the first row). Note, how we pipe the previous schema into the next setter. This results in the next variable being added to that schema. 
- -```{r} -schema <- schema %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(6, 7), rows = 1) -``` - -Validating and checking for id variables again results in the following - -```{r} -validateSchema(schema = schema, input = input) %>% - getIDVars(input = input) -``` - -The id variable `commodities` is clearly wide (more than one column) and its' values are not repeated four times, as it should be, judging by the combination of the other variables. However, this is an expected tentative output that will be handled in a later step and the id variables have been specified correctly. - -Next, we set the listed observed variables. *Listed* means that the column names of the observed variables are treated as if they were the values of an identifying variable (in column 4), while the values are in the columns 6 and 7. In this case, the values need to be filtered by `value` (i.e., the values of that variable are found in columns 6 and 7, where column 4 contains `value`). - -```{r} -schema <- schema %>% - setObsVar(name = "harvested", columns = c(6, 7), key = 4, value = "harvested") %>% - setObsVar(name = "production", columns = c(6, 7), key = 4, value = "production") -``` - -We then get the following observed variables, which is also an expected tentative output. - -```{r} -validateSchema(schema = schema, input = input) %>% - getObsVars(input = input) -``` - -From both, the output of `getIDVars` and `getObsVars` we can calculate how many and which combinations of data exist (e.g., the two columns in the observed variables correspond to the two values of the identifying variable `commodities`) and that they still need to be pivoted to be in a tidy arrangement. - -The `reorganise()` function carries out the steps of validating, extracting the variables, pivoting the tentative output and putting the final table together automatically, so it merely requires the finalized (non-validated) `schema` and the `input` table. - -```{r} -schema # has a pretty print function - -reorganise(input = input, schema = schema) -``` - -# Table types - - -In this section we look at some examples of disorganized data, discuss the dimension along which they are disorganized and show which schema description should be used to reorganize them. - -All of the following examples contain an `other_observed`, an `empty_col` column and an empty row, which serve the purpose of dummy information or formatting that could be found in any table and should not disturb the process of reorganizing. You can run all the examples by simply loading the schema and calling `reorganise(input = tabs2shift$..., schema = schema)` with the respective table that is plotted for this example. - -## Table contains one cluster - -### Tidy table - -In case the observed variables are arranged into individual columns ([Tab. 3](#tabTidy)), we have tidy data [@Wickham2014], which are largely already in the target arrangement. -The tidy table may however, still contain unneeded data, need different names, or transformation factors for the values. - -```{r} -kable(tabs2shift$tidy) -``` - -Table: Table 3: A largely tidy table. 
- - -```{r} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = 3) %>% - setObsVar(name = "harvested", columns = 5) %>% - setObsVar(name = "production", columns = 6, factor = 0.1) - -reorganise(input = tabs2shift$tidy, schema = schema) -``` - - -### Mismatch of columns and variables - -Sometimes it may be the case that the number of variables is not the same as there are columns because either one variable is spread over several column, or one column contains several variables. - -In the former case, columns need to be merged ([Tab. 4](#splitCol)) and in the latter case, columns need to be split via [regular expressions](https://edrub.in/CheatSheets/cheatSheetStringr.pdf) ([Tab. 5](#mergeCol)). For example, `.+?(?=_)` gives everything up until the first `_` and `(?<=\\_).*` everything after the `_ `. - -```{r} -kable(input <- tabs2shift$split_column) -``` - -Table: Table 4: The variables `year` is split up into two columns. - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = c(2, 4), merge = " ") %>% - setIDVar(name = "commodities", columns = 5) %>% - setObsVar(name = "harvested", columns = 6) %>% - setObsVar(name = "production", columns = 7) -``` - -```{r} -kable(tabs2shift$merged_column) -``` - -Table: Table 5: The variables `year` and `commodities` are stored in the same column. - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2, split = ".+?(?=_)") %>% - setIDVar(name = "commodities", columns = 2, split = "(?<=\\_).*") %>% - setObsVar(name = "harvested", columns = 4) %>% - setObsVar(name = "production", columns = 5) -``` - - -### Implicit variables - -When data are split up into clusters that are stored in separate files or spreadsheets, the cluster ID is often recorded not in the table as an explicit variable, but is only provided in the file or table name. In those cases, we have to register this cluster ID as an identifying variable nevertheless, to output a consistent table. - -```{r} -kable(input <- tabs2shift$implicit_variable) -``` - -Table: Table 6: The information about which territory we are dealing with is missing or implied by some meta-data. - -```{r, eval=FALSE} -schema <- setCluster(id = "territories", - left = 1, top = 4) %>% - setIDVar(name = "territories", value = "unit 1") %>% - setIDVar(name = "year", columns = 4) %>% - setIDVar(name = "commodities", columns = 1) %>% - setObsVar(name = "harvested", columns = 2) %>% - setObsVar(name = "production", columns = 3) -``` - - -### Wide variables - -In case identifying variables are factors with a small number of levels, those levels may be falsely used as names of other variables, where they would be next to each other and thus \"wide\" ([Tab. 7](#one_wide_id)). Those other variables (both identifying and observed variables) would then be \"nested\" in the wide identifying variables. In those cases we have to record for the identifying variable(s) the columns and the row in which the **values of the identifying variable** are found (they will look like they are part of the header). For the observed variable(s) we need to record the columns and the row where the **name of that variable** is found. - -```{r} -kable(input <- tabs2shift$one_wide_id) -``` - -Table: Table 7: The observed variables are nested within the identifying variable `commodities`. 
- -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = c(4, 6), rows = 1) %>% - setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>% - setObsVar(name = "production", columns = c(5, 7), top = 2) -``` - -The same is also true in case the identifying variables are actually nested within the observed variables, i.e., the names of the observed variables are on top of the names of the id variables ([Tab. 8](#wide_obs)). However, if an observed variables is in the topmost row (and there are no clusters), `top = 1` can be omitted. - -```{r} -kable(input <- tabs2shift$wide_obs) -``` - -Table: Table 8: The identifying variable `commodities` is nested within the observed variables. - - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>% - setObsVar(name = "harvested", columns = c(3, 4)) %>% - setObsVar(name = "production", columns = c(5, 6)) -``` - - -In case several variables are nested within other variables, we have to specify for all nested or nesting variables in which respective rows their values sit. - -```{r} -kable(input <- tabs2shift$two_wide_id) -``` - -Table: Table 9: The observed variables are nested within the identifying variable `year` and `commodities`. - - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = c(2, 6), rows = 1) %>% - setIDVar(name = "commodities", columns = c(2, 4, 6, 8), rows = 2) %>% - setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>% - setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3) -``` - - -### Listed *observed variables* - -Some tables contain a column where the names of observed variables (`harvested` and `production`) are treated as if they were the values of an identifying variable (`dimension`), while the values are presented in only one column (`values`) ([Tab. 10](#long)). To end up with tidy data in those cases, we need to extract the values associated with the observed variables. Thus, we define the observed variables and specify the `key = ` in which the variable names sit, and the `value = ` the variable name has, to extract that variable. - -```{r} -kable(input <- tabs2shift$listed_column) -``` - -Table: Table 10: The variable names of the observed variable are treated as if they were the values of the identifying variable `dimension`. - - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = 3) %>% - setObsVar(name = "harvested", columns = 7, key = 6, value = "harvested") %>% - setObsVar(name = "production", columns = 7, key = 6, value = "production") -``` - -Moreover, (several) identifying variables may be wide additionally and we have to proceed as mentioned above, by providing the columns and the rows of the variable values (which appear to be names). - -```{r} -kable(input <- tabs2shift$listed_column_wide) -``` - -Table: Table 11: The identifying variable `commodities` is treated as if it were the observed variables while the variable names of the observed variable are treated as if they were the values of the identifying variable `dimension`. 
- - -```{r, eval=FALSE} -schema <- - setIDVar(name = "territories", columns = 1) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = c(6, 7), rows = 1) %>% - setObsVar(name = "harvested", columns = c(6, 7), key = 4, value = "harvested") %>% - setObsVar(name = "production", columns = c(6, 7), key = 4, value = "production") -``` - - -### Misplaced columns or rows - -WIP - - -## Table contains several clusters - -Clusters are typically of the same arrangement within one table, they can be repeated along rows (horizontally) or along columns (vertically), but also a non-systematic distribution is possible. Moreover, clusters may be nested into some grouping variable in a similar way that data are nested into clusters. A table could also be treated like a cluster when the table is not only composed of the table, but perhaps also text in the form of some table description (that may be scattered in the document and) that does not allow the table to start at the table origin in the topmost left cell. - -### Horizontal clusters - -In case clusters are sitting right next to each other in the same origin row ([Tab. 12](#clusHor)), it is sufficient to provide the topmost row and all leftmost columns at which a new cluster starts. - -```{r} -kable(input <- tabs2shift$clusters_horizontal) -``` - -Table: Table 12: Horizontal clusters of the identifying variable `period`. - - -```{r, eval=FALSE} -schema <- setCluster(id = "territories", - left = c(1, 6), top = 2) %>% - setIDVar(name = "territories", columns = c(1, 6), rows = 2) %>% - setIDVar(name = "year", columns = c(2, 7)) %>% - setIDVar(name = "commodities", columns = c(1, 6)) %>% - setObsVar(name = "harvested", columns = c(3, 8)) %>% - setObsVar(name = "production", columns = c(4, 9)) -``` - - -### Vertical clusters - -For vertically arranged clusters ([Tab. 13](#clusId)), just like for the horizontal case, the respective rows and columns need to be provided. - -```{r} -kable(input <- tabs2shift$clusters_vertical) -``` - -Table: Table 13: Vertical clusters of the identifying variable `period`. - - -```{r, eval=FALSE} -schema <- setCluster(id = "territories", - left = 1, top = c(3, 9)) %>% - setIDVar(name = "territories", columns = 1, rows = c(3, 9)) %>% - setIDVar(name = "year", columns = 2) %>% - setIDVar(name = "commodities", columns = 5) %>% - setObsVar(name = "harvested", columns = 6) %>% - setObsVar(name = "production", columns = 7) -``` - - -### Clusters of observed variables - -The previous two types of clusters are clusters of identifying variables, but it may also be the case that the observed variables are split up into distinct clusters. Here, we need to specify first of all `setClusters(..., id = "observed")` to indicate that clusters are observed variables. Next, we need to set up the observed variables so that they contain `"key = "cluster"` and in `value` the number of the cluster this variable can be found in. - -```{r} -kable(input <- tabs2shift$clusters_observed) -``` - -Table: Table 14: Vertical clusters of the observed variables. 
- - -```{r, eval=FALSE} -schema <- setCluster(id = "observed", - left = 1, top = c(2, 12)) %>% - setIDVar(name = "territories", columns = 2) %>% - setIDVar(name = "year", columns = 3) %>% - setIDVar(name = "commodities", columns = 5) %>% - setObsVar(name = "harvested", columns = 7, key = "cluster", value = 1) %>% - setObsVar(name = "production", columns = 7, key = "cluster", value = 2) -``` - - -### Clusters that are nested into another variable - -When (some) clusters are nested into a (grouping) variable of interest, not only the cluster positions need to be specified, but also their relation to the grouping variable. Similar to the cluster ID, this group ID also needs to be specified as an identifying variable and needs to be provided as `group = ...` and the membership of each cluster to a group needs to be specified in `member = ...`. The cluster position needs to be specified just as it would be without groups. - -```{r} -kable(input <- tabs2shift$clusters_nested) -``` - -Table: Table 15: Clusters are grouped according to a variable of interest that should also be captured. - - -```{r, eval=FALSE} -schema <- setCluster(id = "sublevel", - group = "territories", member = c(1, 1, 2), - left = 1, top = c(3, 8, 15)) %>% - setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% - setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% - setIDVar(name = "year", columns = 7) %>% - setIDVar(name = "commodities", columns = 2) %>% - setObsVar(name = "harvested", columns = 5) %>% - setObsVar(name = "production", columns = 6) -``` - - -### Variables that are distinct from a cluster - -When not all identifying variables can be provided relative to the cluster origin, for example because they are missing for some clusters, it makes more sense to define such a variable as a distinct variable. This is done by providing `row` and `col` as absolute values and setting `distinct = TRUE`. Other variables that are all arranged in the same way in each cluster can be specified so that their row and column indices are given relative to the cluster position (`relative = TRUE`), as shown in the alternative shema below. - -```{r} -kable(input <- tabs2shift$clusters_messy) -``` - -Table: Table 16: Several clusters where one variable is not available for each cluster, but distinct of them. - - -```{r, eval=FALSE} -schema <- setCluster(id = "territories", - left = c(1, 1, 4), top = c(1, 8, 8)) %>% - setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% - setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% - setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% - setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% - setObsVar(name = "production", columns = c(3, 3, 6)) - -schema_alt <- setCluster(id = "territories", - left = c(1, 1, 4), top = c(1, 8, 8)) %>% - setIDVar(name = "territories", columns = 1, rows = .find(row = 2, relative = TRUE)) %>% - setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% - setIDVar(name = "commodities", columns = .find(col = 1, relative = TRUE)) %>% - setObsVar(name = "harvested", columns = .find(col = 2, relative = TRUE)) %>% - setObsVar(name = "production", columns = .find(col = 3, relative = TRUE)) -``` - - -### Messy clusters - -In case several clusters are neither aligned along a row nor a column, and are all of differing size, the respective information need to be provided at the same index of the respective property. 
For example, three clusters, where the first cluster starts at (1,1) and is 3 by 4 cells in size, where the second clusters starts at (5,2) and is 5 by 5 cells in size, and so on, needs to be specified as below. - -```{r, eval=FALSE} -schema <- setCluster(id = ..., - left = c(1, 2, 5), top = c(1, 5, 1), - width = c(3, 5, 2), height = c(4, 5, 3), - ...) %>% - setIDVar(name = "territories", columns = .find(col = 1, relative = TRUE)) %>% - ... -``` - -Additionally, given that at least the tables within each cluster are all arranged in the same way, the contained variables can be specified so that their row and column indices are given relative to the cluster position (`relative = TRUE`). If also that is not the case, the row and column values for each cluster need to be provided for the respective variables in the same way as for cluster positions. - - +--- +title: "tabshiftr - Reshape disorganized messy data" +output: + bookdown::html_document2: + fig_caption: yes + number_sections: yes + theme: spacelab + highlight: pygments + toc: yes + toc_float: yes +bibliography: references.bib +urlcolor: blue +fontsize: 12pt +linestretch: 1.15 +link-citations: yes +vignette: > + %\VignetteIndexEntry{reorganising messy data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%" +) +options(knitr.kable.NA = '.') +library(tabshiftr) +library(knitr) +``` + +# Rationale + +Tables as places where data are recorded can be pretty messy. The 'tidy' paradigm in R proposes that data are organised so that variables are recorded in columns, observations in rows and that there is only one value per cell [@Wickham2014]. This, however, is only one interpretation of how data should be organised [@Broman] and especially when scraping data off the internet, one frequently encounters spreadsheets that don't follow this paradigm. + +The `tidyr` package is one of the most popular tools to bring data into a tidy format. However, up until today it is limited to tables that are already organised into topologically coherent (rectangular) chunks and any messiness beyond that requires dedicated scripts for reorganization. In `tabshiftr` we try to describe and work with a further dimension of messiness, where data are available as so-called *disorganized (messy) data*, data that are not yet arranged into rectangular form. + +The approach of `tabshiftr` is based on describing the arrangement of such tables in a so-called [schema](https://en.wikipedia.org/wiki/Database_schema) description, which is then the basis for automatic reorganization via the function `reorganise()`. Typically there is an input and an output schema, describing the arrangement of the input and output tables, respectively. The advantage of this procedure is that input and output tables exist explicitly and the schema maps the transformation of the data. As we want to end up with tidy tables, the output schema is pre-determined by a tidy table of the included variables and the input [schema needs to be put together](#makeSchema) by you, the user. + + +# The basics + +Data can be disorganised according to multiple dimensions. To understand those dimensions, we first need to understand the nature of data. Data \"of the same kind\" are collected in a **variable**, which is always a combination of a **name** and the **values**. 
In a table, names are typically in the topmost row and values are in the column below that name ([Tab. 1](#tabExample1)). Conceptually, there are two types of variables in any table:
+
+1. Variables that contain categorical levels that identify the units for which values have been observed (they are called *identifying variables* here).
+2. Variables that have been measured or observed and that consequently represent the target values of that measurement, be they continuous or categorical (they are called *observed variables* here).
+
+Moreover, a table is part of a **series** when other tables of that series contain the same variables with slightly different values for the identifying variables, irrespective of how the distinct tables of that series are arranged.
+
+| identifying variable | observed variable (categorical) | observed variable (continuous) |
+| :- | :- | :- |
+| sample 1 | blue | 10 |
+| sample 2 | green | 20 |
+| sample 3 | red | 30 |
+
+Table: Table 1: An example table containing one identifying and two observed variables, with the variable names in the topmost row and the values in all consecutive rows.
+
+Here, we use the term **spreadsheet** to denote what would typically be seen as a *table*, as the data we deal with here are typically \"disorganised\" and thus not what would be recognised as a table. Even though data in many spreadsheets are disorganised, they are mostly still systematic. Especially in complex spreadsheets, one often encounters a situation where a set of variables occurs more than once with the same or very similar arrangement, which we call a **cluster** here. Data that are part of clusters are split along one of the, typically categorical, variables (the **cluster ID**), with the aim of increasing visual accessibility or directing the focus of human readers ([Tab. 2](#tabExample2)). This may also be the case where data are split up into several files or spreadsheets of a file, where the cluster ID can be found in the file or spreadsheet name or the meta-data. In many cases, the cluster ID is an **implicit variable**, a variable that is not immediately recognisable as such.
+
+| sample | colour | intensity | sample | colour | intensity |
+| :- | :- | :- | :- | :- | :- |
+| sample 1 | | | sample 2 | | |
+| | blue | 10 | | blue | 11 |
+| | green | 20 | | green | 24 |
+| | red | 30 | | red | 13 |
+| | | | | | |
+| sample 3 | | | sample 4 | | |
+| | blue | 20 | | blue | 10 |
+| | green | 15 | | green | 16 |
+| | red | 33 | | red | 21 |
+
+Table: Table 2: An example of a table with several clusters of comparable variables.
+
+# How to make a schema description
+
+
+## Decision tree
+
+To set up a schema description, go through the following questions step by step and provide the respective answer in the respective function. [Linked tables](#tablesTypes) can serve as examples to compare against, and right after the decision tree there is a step-by-step hands-on example of how to build a schema description.
+
+1. **Variables**: Clarify which are the identifying variables and which are the observed variables. Make sure not to mistake a listed observed variable ([Tab. 10](#long)) for an identifying variable.
+
+2. **Format**: Potentially provide information about the table format in `setFormat()`.
+
+3. **Clusters**: In case there are clusters, provide in `setCluster()`
+
+   - are data clustered into several files or spreadsheets and is the information used to separate the data (i.e., the spreadsheet name) a variable of interest ([Tab. 
6](#implicitVar)) or are the data clustered within one spreadsheet ([Tab. 12](#clusHor), [Tab. 13](#clusId), [Tab. 14](#clusMeas) & [Tab. 15](#clustNest))? + - are data clustered according to an identifying variable ([Tab. 12](#clusHor), [Tab. 13](#clusId)), or are observed variables grouped into clusters ([Tab. 14](#clusMeas))? + - are clusters nested into a grouping variable of interest ([Tab. 15](#clustNest))? +

+ +4. **Identifying variables**: provide in `setIDVar()` + + - in which column(s) is the variable? + - is the variable a result of merging several columns ([Tab. 4](#mergeCol)) or must it be split off of another column ([Tab. 5](#splitCol))? + - is the variable wide (i.e., its values are in several columns) ([Tab. 7](#one_wide_id), [Tab. 8](#wide_obs) & [Tab. 9](#two_wide_id))? In this case, the values will look like they are part of the header. + - is the variable distinct from the main table ([Tab. 16](#distVar))? +

+
+5. **Observed variables**: provide in `setObsVar()`
+
+    - in which column(s) is the variable?
+    - is the variable wide and nested in a wide identifying variable (i.e., the name of the observed variable sits below the values of the identifying variable) ([Tab. 7](#one_wide_id) & [Tab. 9](#two_wide_id))?
+    - is the variable listed (i.e., the name of the variable is given as a value of an identifying variable) ([Tab. 10](#long))?
+
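+
+Putting the answers to these questions together, a schema description is simply a pipe of the respective setter functions, in which each call documents one of the aspects above. The following skeleton is only a sketch; the `...` placeholders stand for the table-specific details collected in the questions above, and calls that do not apply (e.g., `setCluster()` for a table without clusters) are simply left out.
+
+```{r, eval=FALSE}
+# a sketch only; replace the ... placeholders with the values describing your table
+schema <- setFormat(...) %>%                        # 2. information about the table format
+  setCluster(id = ..., left = ..., top = ...) %>%   # 3. only if the table contains clusters
+  setIDVar(name = ..., columns = ...) %>%           # 4. one call per identifying variable
+  setObsVar(name = ..., columns = ...)              # 5. one call per observed variable
+
+reorganise(input = ..., schema = schema)
+```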

+
+## Additional remarks
+
+- To work with `tabshiftr`, tables need to be read in while treating any header rows *as data*, i.e., by not setting the first row as header.
+- To spell this out explicitly, there are two almost inverse cases where the type of a variable is confused and the **name** and **values** of a variable end up in the wrong place:
+    1. Tables are wide when the header contains the **values** of one or more (identifying) variables as if they were names.
+    2. When tables are long, it can occur that the **names** of several observed variables are gathered into a single column as if they were values.
+- Each column should be treated as a character data type, because some columns might contain data with both numeric and character cells.
+
+```{r, eval=FALSE}
+input <- read_csv(file = ...,
+                  col_names = FALSE,
+                  col_types = cols(.default = "c"))
+```
+
+- `reorganise()` takes care of reformatting the data types into the most permissive data type that does not introduce `NA`s where there should be data, i.e., if a variable can be numeric, it is formatted as a numeric column, otherwise it is stored as a character column.
+- When setting up a schema description, some generalisations are hard to wrap your mind around. Try imagining that once you have set a variable, its values are \"cut out\" from the input table and pasted into the reshaped table. For instance, once clusters are set for [Tab. 6](#implicitVar), imagine that the meta-data header isn't part of the table anymore, resulting in a much simpler table. Or for [Tab. 16](#distVar), once territories are set, imagine that the respective `unit1/2/3` values disappear, which makes it easier to see that the respective column is actually a tidy column of commodities.
+
+## An example
+
+As an example, we show here how to build a schema description for a table that has a wide identifying variable and a listed observed variable. This table additionally contains some dummy information one would typically encounter in tables, such as `empty_col` columns, empty rows and data that are not immediately of interest (`other_observed`).
+
+```{r}
+kable(input <- tabs2shift$listed_column_wide)
+```
+
+In this case we don't need to set clusters and can start immediately with setting the first id variable `territories`, which is in the first column and otherwise tidy. The order in which we set the variables determines where they occur in the output table. Each setter starts from an empty schema by default, in case none is handed over from a previous setter, so no schema needs to be provided at the beginning.
+
+```{r}
+schema <- setIDVar(name = "territories", columns = 1)
+```
+
+Since version 0.3.0, `tabshiftr` comes with getters that allow debugging the current schema description. To do this, however, the schema first needs to be validated, to make sure that all the generic information is evaluated against the respective input. After that, a getter can be used to extract the respective information, for example the reorganised id variables with `getIDVars()`.
+
+```{r}
+validateSchema(schema = schema, input = input) %>%
+  getIDVars(input = input)
+```
+
+After seeing that our specification results in a meaningful output, we can continue setting the other id variables `year` (tidy and in column 2) and `commodities` (spread over two columns, with the values in the first row). Note how we pipe the previous schema into the next setter, which adds the next variable to that schema.
+
+```{r}
+schema <- schema %>%
+  setIDVar(name = "year", columns = 2) %>%
+  setIDVar(name = "commodities", columns = c(6, 7), rows = 1)
+```
+
+Validating and checking the id variables again results in the following:
+
+```{r}
+validateSchema(schema = schema, input = input) %>%
+  getIDVars(input = input)
+```
+
+The id variable `commodities` is clearly wide (more than one column) and its values are not repeated four times, as they should be, judging by the combination of the other variables. However, this is an expected tentative output that will be handled in a later step and the id variables have been specified correctly.
+
+Next, we set the listed observed variables. *Listed* means that the column names of the observed variables are treated as if they were the values of an identifying variable (in column 4), while the values are in columns 6 and 7. In this case, the values need to be filtered via the `key`/`value` pair, i.e., the values of, for example, `harvested` are found in columns 6 and 7 in those rows where column 4 contains the term \"harvested\".
+
+```{r}
+schema <- schema %>%
+  setObsVar(name = "harvested", columns = c(6, 7), key = 4, value = "harvested") %>%
+  setObsVar(name = "production", columns = c(6, 7), key = 4, value = "production")
+```
+
+We then get the following observed variables, which is also an expected tentative output.
+
+```{r}
+validateSchema(schema = schema, input = input) %>%
+  getObsVars(input = input)
+```
+
+From both the output of `getIDVars()` and `getObsVars()` we can tell how many and which combinations of data exist (e.g., the two columns in the observed variables correspond to the two values of the identifying variable `commodities`) and that they still need to be pivoted to be in a tidy arrangement.
+
+The `reorganise()` function carries out the steps of validating, extracting the variables, pivoting the tentative output and putting the final table together automatically, so it merely requires the finalised (non-validated) `schema` and the `input` table.
+
+```{r}
+schema # has a pretty print function
+
+reorganise(input = input, schema = schema)
+```
+
+# Table types
+
+
+In this section we look at some examples of disorganised data, discuss the dimension along which they are disorganised and show which schema description should be used to reorganise them.
+
+All of the following examples contain an `other_observed` and an `empty_col` column and an empty row, which serve as dummy information or formatting that could be found in any table and should not disturb the process of reorganising. You can run all the examples by simply loading the schema and calling `reorganise(input = tabs2shift$..., schema = schema)` with the respective table that is shown for this example.
+
+## Table contains one cluster
+
+### Tidy table
+
+In case the observed variables are arranged into individual columns ([Tab. 3](#tabTidy)), we have tidy data [@Wickham2014], which are largely already in the target arrangement.
+The tidy table may, however, still contain unneeded data, need different names, or transformation factors for the values.
+
+```{r}
+kable(tabs2shift$tidy)
+```
+
+Table: Table 3: A largely tidy table.
+
+```{r}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = 2) %>%
+  setIDVar(name = "commodities", columns = 3) %>%
+  setObsVar(name = "harvested", columns = 5) %>%
+  setObsVar(name = "production", columns = 6, factor = 0.1)
+
+reorganise(input = tabs2shift$tidy, schema = schema)
+```
+
+
+### Mismatch of columns and variables
+
+Sometimes the number of variables does not match the number of columns, because either one variable is spread over several columns, or one column contains several variables.
+
+In the former case, columns need to be merged ([Tab. 4](#splitCol)) and in the latter case, columns need to be split via [regular expressions](https://edrub.in/CheatSheets/cheatSheetStringr.pdf) ([Tab. 5](#mergeCol)). For example, `.+?(?=_)` gives everything up until the first `_` and `(?<=\\_).*` everything after the `_`.
+
+```{r}
+kable(input <- tabs2shift$split_column)
+```
+
+Table: Table 4: The variable `year` is split up into two columns.
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = c(2, 4), merge = " ") %>%
+  setIDVar(name = "commodities", columns = 5) %>%
+  setObsVar(name = "harvested", columns = 6) %>%
+  setObsVar(name = "production", columns = 7)
+```
+
+```{r}
+kable(tabs2shift$merged_column)
+```
+
+Table: Table 5: The variables `year` and `commodities` are stored in the same column.
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = 2, split = ".+?(?=_)") %>%
+  setIDVar(name = "commodities", columns = 2, split = "(?<=\\_).*") %>%
+  setObsVar(name = "harvested", columns = 4) %>%
+  setObsVar(name = "production", columns = 5)
+```
+
+
+### Implicit variables
+
+When data are split up into clusters that are stored in separate files or spreadsheets, the cluster ID is often not recorded in the table as an explicit variable, but is only provided in the file or table name. In those cases, we nevertheless have to register this cluster ID as an identifying variable, to output a consistent table.
+
+```{r}
+kable(input <- tabs2shift$implicit_variable)
+```
+
+Table: Table 6: The information about which territory we are dealing with is missing or implied by some meta-data.
+
+```{r, eval=FALSE}
+schema <- setCluster(id = "territories",
+                     left = 1, top = 4) %>%
+  setIDVar(name = "territories", value = "unit 1") %>%
+  setIDVar(name = "year", columns = 4) %>%
+  setIDVar(name = "commodities", columns = 1) %>%
+  setObsVar(name = "harvested", columns = 2) %>%
+  setObsVar(name = "production", columns = 3)
+```
+
+
+### Wide variables
+
+In case identifying variables are factors with a small number of levels, those levels may be falsely used as names of other variables, where they would be next to each other and thus \"wide\" ([Tab. 7](#one_wide_id)). Those other variables (both identifying and observed variables) would then be \"nested\" in the wide identifying variables. In those cases we have to record for the identifying variable(s) the columns and the row in which the **values of the identifying variable** are found (they will look like they are part of the header; see the quick check after Tab. 7). For the observed variable(s) we need to record the columns and the row where the **name of that variable** is found.
+
+```{r}
+kable(input <- tabs2shift$one_wide_id)
+```
+
+Table: Table 7: The observed variables are nested within the identifying variable `commodities`.
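+
+Before writing the schema down, it can help to peek at the cells in question directly, to confirm which rows and columns hold the header-like entries. This is only a quick sanity check, assuming the table has been read in header-less as recommended above; the exact cell contents depend on the example table.
+
+```{r, eval=FALSE}
+# row 1 holds the values of the wide identifying variable 'commodities' (columns 4 and 6)
+unlist(input[1, c(4, 6)])
+
+# row 2 holds the names of the observed variables (columns 4 to 7)
+unlist(input[2, c(4, 5, 6, 7)])
+```
+
+These are exactly the `rows`, `columns` and `top` indices that the schema below refers to.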
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = 3) %>%
+  setIDVar(name = "commodities", columns = c(4, 6), rows = 1) %>%
+  setObsVar(name = "harvested", columns = c(4, 6), top = 2) %>%
+  setObsVar(name = "production", columns = c(5, 7), top = 2)
+```
+
+The same is also true in case the identifying variables are actually nested within the observed variables, i.e., the names of the observed variables are on top of the values of the id variables ([Tab. 8](#wide_obs)). However, if an observed variable is in the topmost row (and there are no clusters), `top = 1` can be omitted.
+
+```{r}
+kable(input <- tabs2shift$wide_obs)
+```
+
+Table: Table 8: The identifying variable `commodities` is nested within the observed variables.
+
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = 2) %>%
+  setIDVar(name = "commodities", columns = c(3:6), rows = 2) %>%
+  setObsVar(name = "harvested", columns = c(3, 4)) %>%
+  setObsVar(name = "production", columns = c(5, 6))
+```
+
+
+In case several variables are nested within other variables, we have to specify for all nested or nesting variables in which respective rows their values sit.
+
+```{r}
+kable(input <- tabs2shift$two_wide_id)
+```
+
+Table: Table 9: The observed variables are nested within the identifying variables `year` and `commodities`.
+
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = c(2, 6), rows = 1) %>%
+  setIDVar(name = "commodities", columns = c(2, 4, 6, 8), rows = 2) %>%
+  setObsVar(name = "harvested", columns = c(2, 4, 6, 8), top = 3) %>%
+  setObsVar(name = "production", columns = c(3, 5, 7, 9), top = 3)
+```
+
+
+### Listed *observed variables*
+
+Some tables contain a column where the names of observed variables (`harvested` and `production`) are treated as if they were the values of an identifying variable (`dimension`), while the values are presented in only one column (`values`) ([Tab. 10](#long)). To end up with tidy data in those cases, we need to extract the values associated with the observed variables. Thus, we define the observed variables and specify in `key =` the column in which the variable names sit and in `value =` the name the respective variable has, to extract that variable.
+
+```{r}
+kable(input <- tabs2shift$listed_column)
+```
+
+Table: Table 10: The variable names of the observed variables are treated as if they were the values of the identifying variable `dimension`.
+
+
+```{r, eval=FALSE}
+schema <-
+  setIDVar(name = "territories", columns = 1) %>%
+  setIDVar(name = "year", columns = 2) %>%
+  setIDVar(name = "commodities", columns = 3) %>%
+  setObsVar(name = "harvested", columns = 7, key = 6, value = "harvested") %>%
+  setObsVar(name = "production", columns = 7, key = 6, value = "production")
+```
+
+Moreover, (several) identifying variables may additionally be wide, and we then have to proceed as mentioned above, by providing the columns and the rows of the variable values (which appear to be names).
+
+```{r}
+kable(input <- tabs2shift$listed_column_wide)
+```
+
+Table: Table 11: The identifying variable `commodities` is treated as if it were an observed variable, while the variable names of the observed variables are treated as if they were the values of the identifying variable `dimension`.
+ + +```{r, eval=FALSE} +schema <- + setIDVar(name = "territories", columns = 1) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = c(6, 7), rows = 1) %>% + setObsVar(name = "harvested", columns = c(6, 7), key = 4, value = "harvested") %>% + setObsVar(name = "production", columns = c(6, 7), key = 4, value = "production") +``` + + +### Misplaced columns or rows + +WIP + + +## Table contains several clusters + +Clusters are typically of the same arrangement within one table, they can be repeated along rows (horizontally) or along columns (vertically), but also a non-systematic distribution is possible. Moreover, clusters may be nested into some grouping variable in a similar way that data are nested into clusters. A table could also be treated like a cluster when the table is not only composed of the table, but perhaps also text in the form of some table description (that may be scattered in the document and) that does not allow the table to start at the table origin in the topmost left cell. + +### Horizontal clusters + +In case clusters are sitting right next to each other in the same origin row ([Tab. 12](#clusHor)), it is sufficient to provide the topmost row and all leftmost columns at which a new cluster starts. + +```{r} +kable(input <- tabs2shift$clusters_horizontal) +``` + +Table: Table 12: Horizontal clusters of the identifying variable `period`. + + +```{r, eval=FALSE} +schema <- setCluster(id = "territories", + left = c(1, 6), top = 2) %>% + setIDVar(name = "territories", columns = c(1, 6), rows = 2) %>% + setIDVar(name = "year", columns = c(2, 7)) %>% + setIDVar(name = "commodities", columns = c(1, 6)) %>% + setObsVar(name = "harvested", columns = c(3, 8)) %>% + setObsVar(name = "production", columns = c(4, 9)) +``` + + +### Vertical clusters + +For vertically arranged clusters ([Tab. 13](#clusId)), just like for the horizontal case, the respective rows and columns need to be provided. + +```{r} +kable(input <- tabs2shift$clusters_vertical) +``` + +Table: Table 13: Vertical clusters of the identifying variable `period`. + + +```{r, eval=FALSE} +schema <- setCluster(id = "territories", + left = 1, top = c(3, 9)) %>% + setIDVar(name = "territories", columns = 1, rows = c(3, 9)) %>% + setIDVar(name = "year", columns = 2) %>% + setIDVar(name = "commodities", columns = 5) %>% + setObsVar(name = "harvested", columns = 6) %>% + setObsVar(name = "production", columns = 7) +``` + + +### Clusters of observed variables + +The previous two types of clusters are clusters of identifying variables, but it may also be the case that the observed variables are split up into distinct clusters. Here, we need to specify first of all `setClusters(..., id = "observed")` to indicate that clusters are observed variables. Next, we need to set up the observed variables so that they contain `"key = "cluster"` and in `value` the number of the cluster this variable can be found in. + +```{r} +kable(input <- tabs2shift$clusters_observed) +``` + +Table: Table 14: Vertical clusters of the observed variables. 
+ + +```{r, eval=FALSE} +schema <- setCluster(id = "observed", + left = 1, top = c(2, 12)) %>% + setIDVar(name = "territories", columns = 2) %>% + setIDVar(name = "year", columns = 3) %>% + setIDVar(name = "commodities", columns = 5) %>% + setObsVar(name = "harvested", columns = 7, key = "cluster", value = 1) %>% + setObsVar(name = "production", columns = 7, key = "cluster", value = 2) +``` + + +### Clusters that are nested into another variable + +When (some) clusters are nested into a (grouping) variable of interest, not only the cluster positions need to be specified, but also their relation to the grouping variable. Similar to the cluster ID, this group ID also needs to be specified as an identifying variable and needs to be provided as `group = ...` and the membership of each cluster to a group needs to be specified in `member = ...`. The cluster position needs to be specified just as it would be without groups. + +```{r} +kable(input <- tabs2shift$clusters_nested) +``` + +Table: Table 15: Clusters are grouped according to a variable of interest that should also be captured. + + +```{r, eval=FALSE} +schema <- setCluster(id = "sublevel", + group = "territories", member = c(1, 1, 2), + left = 1, top = c(3, 8, 15)) %>% + setIDVar(name = "territories", columns = 1, rows = c(2, 14)) %>% + setIDVar(name = "sublevel", columns = 1, rows = c(3, 8, 15)) %>% + setIDVar(name = "year", columns = 7) %>% + setIDVar(name = "commodities", columns = 2) %>% + setObsVar(name = "harvested", columns = 5) %>% + setObsVar(name = "production", columns = 6) +``` + + +### Variables that are distinct from a cluster + +When not all identifying variables can be provided relative to the cluster origin, for example because they are missing for some clusters, it makes more sense to define such a variable as a distinct variable. This is done by providing `row` and `col` as absolute values and setting `distinct = TRUE`. Other variables that are all arranged in the same way in each cluster can be specified so that their row and column indices are given relative to the cluster position (`relative = TRUE`), as shown in the alternative shema below. + +```{r} +kable(input <- tabs2shift$clusters_messy) +``` + +Table: Table 16: Several clusters where one variable is not available for each cluster, but distinct of them. + + +```{r, eval=FALSE} +schema <- setCluster(id = "territories", + left = c(1, 1, 4), top = c(1, 8, 8)) %>% + setIDVar(name = "territories", columns = c(1, 1, 4), rows = c(2, 9, 9)) %>% + setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% + setIDVar(name = "commodities", columns = c(1, 1, 4)) %>% + setObsVar(name = "harvested", columns = c(2, 2, 5)) %>% + setObsVar(name = "production", columns = c(3, 3, 6)) + +schema_alt <- setCluster(id = "territories", + left = c(1, 1, 4), top = c(1, 8, 8)) %>% + setIDVar(name = "territories", columns = 1, rows = .find(row = 2, relative = TRUE)) %>% + setIDVar(name = "year", columns = 4, rows = c(3:6), distinct = TRUE) %>% + setIDVar(name = "commodities", columns = .find(col = 1, relative = TRUE)) %>% + setObsVar(name = "harvested", columns = .find(col = 2, relative = TRUE)) %>% + setObsVar(name = "production", columns = .find(col = 3, relative = TRUE)) +``` + + +### Messy clusters + +In case several clusters are neither aligned along a row nor a column, and are all of differing size, the respective information need to be provided at the same index of the respective property. 
For example, three clusters, where the first cluster starts at (1,1) and is 3 by 4 cells in size, where the second clusters starts at (5,2) and is 5 by 5 cells in size, and so on, needs to be specified as below. + +```{r, eval=FALSE} +schema <- setCluster(id = ..., + left = c(1, 2, 5), top = c(1, 5, 1), + width = c(3, 5, 2), height = c(4, 5, 3), + ...) %>% + setIDVar(name = "territories", columns = .find(col = 1, relative = TRUE)) %>% + ... +``` + +Additionally, given that at least the tables within each cluster are all arranged in the same way, the contained variables can be specified so that their row and column indices are given relative to the cluster position (`relative = TRUE`). If also that is not the case, the row and column values for each cluster need to be provided for the respective variables in the same way as for cluster positions. + + # References \ No newline at end of file