From feb054b7356f362fa84fe8b73d8b1f323b5aa936 Mon Sep 17 00:00:00 2001 From: EhrmannS Date: Fri, 16 Feb 2024 01:54:15 +0100 Subject: [PATCH 1/4] update docs --- man/dot-eval_find.Rd | 2 +- man/dot-eval_sum.Rd | 2 +- man/dot-getColTypes.Rd | 18 ++++++++++++++++++ man/dot-spliceHeader.Rd | 17 +++++++++++++++++ man/setFormat.Rd | 8 ++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 man/dot-getColTypes.Rd create mode 100644 man/dot-spliceHeader.Rd diff --git a/man/dot-eval_find.Rd b/man/dot-eval_find.Rd index 54dda85..b90cf3b 100644 --- a/man/dot-eval_find.Rd +++ b/man/dot-eval_find.Rd @@ -7,7 +7,7 @@ .eval_find(input = NULL, col = NULL, row = NULL, clusters = NULL) } \arguments{ -\item{input}{[\code{character(1)}]\cr table to reorganise.} +\item{input}{[\code{data.frame(1)}]\cr table to reorganise.} \item{col}{[\code{list(2)}]\cr the output of the respective .find construct used to match in columns.} diff --git a/man/dot-eval_sum.Rd b/man/dot-eval_sum.Rd index ffb226d..d634980 100644 --- a/man/dot-eval_sum.Rd +++ b/man/dot-eval_sum.Rd @@ -7,7 +7,7 @@ .eval_sum(input = NULL, groups = NULL, data = NULL) } \arguments{ -\item{input}{[\code{character(1)}]\cr table to reorganise.} +\item{input}{[\code{data.frame(1)}]\cr table to reorganise.} \item{groups}{[\code{list(3)}]\cr the groups-slot from a schema.} diff --git a/man/dot-getColTypes.Rd b/man/dot-getColTypes.Rd new file mode 100644 index 0000000..fb6d738 --- /dev/null +++ b/man/dot-getColTypes.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers.R +\name{.getColTypes} +\alias{.getColTypes} +\title{Get the column types of a tibble} +\usage{ +.getColTypes(input = NULL, collapse = TRUE) +} +\arguments{ +\item{input}{[\code{data.frame(1)}]\cr table of which to get the column +types.} + +\item{collapse}{[\code{logical(1)}]\cr whether or not to paste all column +types into one string.} +} +\description{ +Get the column types of a tibble +} diff 
--git a/man/dot-spliceHeader.Rd b/man/dot-spliceHeader.Rd new file mode 100644 index 0000000..55661e0 --- /dev/null +++ b/man/dot-spliceHeader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers.R +\name{.spliceHeader} +\alias{.spliceHeader} +\title{Splice the header into the table} +\usage{ +.spliceHeader(input, rows = NULL) +} +\arguments{ +\item{input}{[\code{data.frame(1)}]\cr table of which the header should be +shifted into the table.} + +\item{rows}{[\code{integerish(1)}]\cr the number of rows to shift into the table.} +} +\description{ +Splice the header into the table +} diff --git a/man/setFormat.Rd b/man/setFormat.Rd index 0f58479..f8ba549 100644 --- a/man/setFormat.Rd +++ b/man/setFormat.Rd @@ -6,6 +6,7 @@ \usage{ setFormat( schema = NULL, + header = 0L, decimal = NULL, thousand = NULL, na_values = NULL, @@ -17,6 +18,13 @@ setFormat( already existing schema, provide that schema here (overwrites previous information).} +\item{header}{[\code{integerish(1)}]\cr The number of header rows. Optimally, +a table is read so that column names are ignored (for example +\code{readr::read_csv(file = ..., col_names = FALSE)}). 
If relatively well +defined tables are processed, where the header is always only one row, the +table can be read in with the default and the header can be spliced into +the table by specifying the number of rows here.} + \item{decimal}{[\code{character(1)}]\cr The symbols that should be interpreted as decimal separator.} From 4e4ca601edc48528e7c4c271b9d9edde275f876f Mon Sep 17 00:00:00 2001 From: EhrmannS Date: Fri, 16 Feb 2024 01:54:28 +0100 Subject: [PATCH 2/4] update package --- DESCRIPTION | 3 ++- NAMESPACE | 3 +++ _pkgdown.yml | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fdd452f..50e7e10 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -59,7 +59,8 @@ Imports: crayon, methods, purrr, - stringr + stringr, + lubridate RoxygenNote: 7.2.3 Suggests: knitr, diff --git a/NAMESPACE b/NAMESPACE index 050d428..f77b6d2 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) +importFrom(lubridate,is.Date) importFrom(magrittr,"%>%") importFrom(methods,new) importFrom(purrr,map) @@ -80,6 +81,7 @@ importFrom(rlang,is_quosure) importFrom(rlang,prim_name) importFrom(stats,na.omit) importFrom(stringr,coll) +importFrom(stringr,str_c) importFrom(stringr,str_count) importFrom(stringr,str_detect) importFrom(stringr,str_extract_all) @@ -88,6 +90,7 @@ importFrom(stringr,str_split) importFrom(stringr,str_sub) importFrom(testthat,expect_identical) importFrom(tibble,as_tibble) +importFrom(tibble,as_tibble_row) importFrom(tibble,rownames_to_column) importFrom(tibble,tibble) importFrom(tidyr,everything) diff --git a/_pkgdown.yml b/_pkgdown.yml index 9c16e9d..1cd6422 100755 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,11 +32,13 @@ reference: - title: reorganise tables contents: - reorganise - - title: other helpers + - title: other helper functions contents: - .eval_find - .eval_sum - .expect_valid_table + - .getColTypes + - 
.spliceHeader
       - .tidyVars
       - .updateFormat
       - show,schema-method

From 8a6a0670264cd0494ddc82c406840e51b12006db Mon Sep 17 00:00:00 2001
From: EhrmannS
Date: Fri, 16 Feb 2024 01:55:33 +0100
Subject: [PATCH 3/4] incl. header row number in schema and a new function to splice the header into the table, in case the table has colnames in the header

---
 R/helpers.R             |  85 +++++++++++++++++++++++++++++++++++++++-
 R/reorganise.R          |   3 +-
 R/schema.R              |   9 ++++-
 R/setFormat.R           |  15 ++++++-
 data/schema_default.rda | Bin 335 -> 342 bytes
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/R/helpers.R b/R/helpers.R
index 3a2596c..b4bb8a6 100644
--- a/R/helpers.R
+++ b/R/helpers.R
@@ -135,6 +135,87 @@
 }
 
 
+
+#' Get the column types of a tibble
+#'
+#' @param input [\code{data.frame(1)}]\cr table of which to get the column
+#'   types.
+#' @param collapse [\code{logical(1)}]\cr whether or not to paste all column
+#'   types into one string.
+#' @importFrom checkmate assertDataFrame assertLogical
+#' @importFrom tibble tibble
+#' @importFrom purrr map
+#' @importFrom dplyr left_join pull
+#' @importFrom stringr str_c
+
+.getColTypes <- function(input = NULL, collapse = TRUE){
+
+  assertDataFrame(x = input)
+  assertLogical(x = collapse, len = 1)
+
+  types <- tibble(col_type = c("character", "integer", "numeric", "double", "logical", "Date", "units", "sfc_POLYGON", "arrow_binary"),
+                  code = c("c", "i", "n", "d", "l", "D", "u", "g", "a"))
+
+  out <- map(seq_len(ncol(input)), function(ix){ # seq_len() yields no iterations for zero columns, 1:0 would yield two
+    class(input[[ix]])[1] # only the first class of each column is looked up
+  }) %>%
+    as.character() %>% # unlike unlist(), this gives character(0) instead of NULL for an empty list
+    tibble(col_type = .) %>%
+    left_join(y = types, by = "col_type") %>% # classes that are not listed in 'types' end up with the code NA
+    pull("code")
+
+  if(collapse){
+    out <- out %>%
+      str_c(collapse = "")
+  }
+
+  return(out)
+
+}
+
+
+#' Splice the header into the table
+#'
+#' @param input [\code{data.frame(1)}]\cr table of which the header should be
+#'   shifted into the table.
+#' @param rows [\code{integerish(1)}]\cr the number of rows to shift into the table.
+#' @importFrom checkmate assertDataFrame assertIntegerish
+#' @importFrom dplyr mutate across bind_rows
+#' @importFrom tidyselect where
+#' @importFrom lubridate is.Date
+#' @importFrom tibble as_tibble_row
+
+.spliceHeader <- function(input, rows = NULL){
+
+  assertDataFrame(x = input)
+  assertIntegerish(x = rows, len = 1, lower = 0, upper = dim(input)[1], any.missing = FALSE, null.ok = TRUE)
+
+  input <- input %>%
+    mutate(across(where(is.double) | where(is.integer) | where(is.logical) | where(is.Date), as.character))
+
+  if(!is.null(rows) && rows != 0L){ # NULL (the default, or a schema without a 'header' entry) means nothing to splice
+
+    non_char <- !(.getColTypes(input = input, collapse = FALSE) %in% "c") # %in% never returns NA, so unknown classes count as non-character
+
+    if(rows != 1){
+      stop("! implement case where more than one rows need to be shifted !")
+    } else {
+      vec <- colnames(input)
+      names(vec) <- paste0("X", seq_along(vec))
+      vec <- as_tibble_row(vec)
+      vec[, non_char] <- NA # a column name cannot be stored in a non-character column
+
+      colnames(input) <- paste0("X", seq_along(vec))
+
+      input <- bind_rows(vec, input)
+    }
+
+  }
+
+  return(input)
+}
+
+
 #' Match variables
 #'
 #' This function matches id and observed variables and reshapes them accordingly
@@ -454,7 +535,7 @@
 
 #' Evaluate .sum constructs
 #'
-#' @param input [\code{character(1)}]\cr table to reorganise.
+#' @param input [\code{data.frame(1)}]\cr table to reorganise.
 #' @param groups [\code{list(3)}]\cr the groups-slot from a schema.
 #' @param data [\code{integerish(.)}]\cr the cell column or row that should be
 #' adapted to groupings.
@@ -509,7 +590,7 @@
 
 #' Evaluate .find constructs
 #'
-#' @param input [\code{character(1)}]\cr table to reorganise.
+#' @param input [\code{data.frame(1)}]\cr table to reorganise.
 #' @param col [\code{list(2)}]\cr the output of the respective .find construct
 #' used to match in columns.
#' @param row [\code{list(2)}]\cr the output of the respective .find construct diff --git a/R/reorganise.R b/R/reorganise.R index 62c69ef..46b8094 100755 --- a/R/reorganise.R +++ b/R/reorganise.R @@ -45,8 +45,7 @@ reorganise <- function(input = NULL, schema = NULL){ # check validity of arguments assertDataFrame(x = input) - input <- input %>% - mutate_all(as.character) + input <- .spliceHeader(input = input, rows = schema@format$header) # 1. add missing information in schema ---- schema <- validateSchema(input = input, schema = schema) diff --git a/R/schema.R b/R/schema.R index 74fc2a5..aad046c 100755 --- a/R/schema.R +++ b/R/schema.R @@ -158,8 +158,13 @@ setValidity(Class = "schema", function(object){ if(length(object@format) == 0){ errors <- c(errors, "the slot 'format' does not contain any entries.") } - if(!all(names(object@format) %in% c("del", "dec", "na", "flags"))){ - errors <- c(errors, "'names(schema$format)' must be a permutation of set {del,dec,na,flags}") + if(!all(names(object@format) %in% c("header", "del", "dec", "na", "flags"))){ + errors <- c(errors, "'names(schema$format)' must be a permutation of set {header,del,dec,na,flags}") + } + if(!is.null(object@format$header)){ + if(!is.integer(object@format$header)){ + errors <- c(errors, "'schema$format$header' must must have a integer value.") + } } if(!is.null(object@format$del)){ if(!is.character(object@format$del)){ diff --git a/R/setFormat.R b/R/setFormat.R index 31d2e2c..23af1f2 100644 --- a/R/setFormat.R +++ b/R/setFormat.R @@ -7,6 +7,12 @@ #' @param schema [\code{schema(1)}]\cr In case this information is added to an #' already existing schema, provide that schema here (overwrites previous #' information). +#' @param header [\code{integerish(1)}]\cr The number of header rows. Optimally, +#' a table is read so that column names are ignored (for example +#' \code{readr::read_csv(file = ..., col_names = FALSE)}). 
If relatively well +#' defined tables are processed, where the header is always only one row, the +#' table can be read in with the default and the header can be spliced into +#' the table by specifying the number of rows here. #' @param decimal [\code{character(1)}]\cr The symbols that should be #' interpreted as decimal separator. #' @param thousand [\code{character(1)}]\cr The symbols that should be @@ -27,10 +33,11 @@ #' @importFrom dplyr bind_rows #' @export -setFormat <- function(schema = NULL, decimal = NULL, thousand = NULL, - na_values = NULL, flags = NULL){ +setFormat <- function(schema = NULL, header = 0L, decimal = NULL, + thousand = NULL, na_values = NULL, flags = NULL){ assertClass(x = schema, classes = "schema", null.ok = TRUE) + assertIntegerish(x = header, len = 1, lower = 0L, any.missing = FALSE) assertCharacter(x = decimal, len = 1, any.missing = FALSE, null.ok = TRUE) assertCharacter(x = thousand, len = 1, any.missing = FALSE, null.ok = TRUE) assertCharacter(x = na_values, any.missing = FALSE, null.ok = TRUE) @@ -43,6 +50,10 @@ setFormat <- function(schema = NULL, decimal = NULL, thousand = NULL, schema <- schema_default } + if(!is.null(header)){ + schema@format$header <- header + } + if(!is.null(decimal)){ schema@format$dec <- decimal } diff --git a/data/schema_default.rda b/data/schema_default.rda index d5e3b61fd9dfb5ce505666622b13a20ddcac5c48..a5579926451b17a1fe6de033598ab1788f5fda74 100755 GIT binary patch delta 331 zcmV-R0krLXi;`e^VGkRQ(YKsq9nAYJR7wJd-2T z9#d)zNur*Sw15K;02&CCQK_e-!$L3uU`-85>}L?9h;&jh3?U;x?jZtDn-_KjvoSl@ zn%b*yF-T^>42Ve=_7`{ws|g~hOX6^1h!YA24PNyx#wmU%2eeHP7Z)${9xNZdGFI$4 zbW^Rge|~#iToTxpOOeA6cBl~qvI|8a5(`+;7oY=Jq=bS2)9!&}y#S>EFrB*7wzj`* zoWL~z7As7(I_26oGtX6E!?ejneIN)i{#7($7w_l%Rv;tLqAL8!}> zd_%W4_C|F_70jVxwwZWe9FP%{pbrNJtjdAsf}O d9cHYUL469?hp<})MIcwuyOJrwgoVO_kZGeqwK5oJ zWHia37>1f?(9wcKM4q6@sp@HjGz>s`m+ZnReaM!ZR3Mv`-3F&xg3z;?tRuY&Q?Yi=Zw5#_t9d3PcU*%V?7s!y=?bO9nQr=X3PX8hq^6z;N! 
zK!)_=YS Date: Fri, 16 Feb 2024 01:58:09 +0100 Subject: [PATCH 4/4] increase version --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 50e7e10..3aa6d25 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tabshiftr Title: Reshape Disorganised Messy Data -Version: 0.4.2 +Version: 0.5.0 Authors@R: c(person(given = "Steffen", family = "Ehrmann", diff --git a/NEWS.md b/NEWS.md index d926078..3dd46af 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# tabshiftr 0.5.0 - variable types + +- include the possibility to specify variable data type, which will result in a column of that type +- include header into `setFormat()` again, which enables providing tables where the column names are in the header, where they will be spliced into the table. + # tabshiftr 0.4.2 - include split and merge functionality for cluster ID.