aim-rsf · RayStick · Sep 13, 2024 · Sep 13, 2024 · Sep 13, 2024 · Sep 13, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,9 +1,17 @@
-^.*\.Rproj$
-^\.Rproj\.user$
+^\.git$
+^\.github$
+^\.gitignore$
 ^LICENSE\.md$
-^doc$
-^Meta$
+^\.all-contributorsrc$
+^CITATION\.cff$
+^CONTRIBUTING\.md$
 ^_pkgdown\.yml$
-^docs$
 ^pkgdown$
-^\.github$
+^docs$
+^doc$
+^\.Rhistory$
+^.*\.Rproj$
+^\.Rproj\.user$
+
+
+
diff --git a/.github/.gitignore b/.github/.gitignore
diff --git a/.gitignore b/.gitignore
@@ -1,36 +1,54 @@
-# Distribution / packaging
-.Python
-build/
-downloads/
-.eggs/
-lib/
-lib64/
-var/
-*.pyc
-*~
 .DS_Store
-.*history
+
+# History files
 .Rhistory
+.Rapp.history
+.*history
+
+# Session Data files
+.RData
+.RDataTmp
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj/
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+doc/
+
+# Ignore generated HTML and R files in vignettes
+vignettes/*.html
+vignettes/*.R
+
+
+
+
 
-# pyenv
-.python-version
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-.Rproj.user
-
-# Test/Input files 
-input_files/*
-output_files/*
-test_code/*
-inst/doc
-/doc/
-/Meta/
-docs
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,27 +1,37 @@
-Package: browseMetadata
 Type: Package
-Title: Browses available metadata, to catergorise or label each variable in a dataset
+Package: browseMetadata
+Title: Browses available health metadata, and aids in categorising variables
 Version: 1.2.1
 Authors@R: 
-    person("Rachael", "Stickland", email = "rstickland@turing.ac.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3398-4272"))
+    person("Rachael", "Stickland", , "rstickland@turing.ac.uk", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0003-3398-4272"))
 Maintainer: Rachael Stickland <rstickland@turing.ac.uk>
-Description: See https://github.com/aim-rsf/browseMetadata/main/README.md
+Description: This R package helps a researcher browse datasets in the SAIL Databank.
+ It has scope to be applied to other health datasets.
+ It is useful in the earlier stages of a project: prior to data access,
+ researchers can use the metadata to browse and categorise variables.
 License: GPL (>= 3)
-Encoding: UTF-8
-LazyData: true
-RoxygenNote: 7.3.2
+URL: https://github.com/aim-rsf/browseMetadata
 Depends: 
     R (>= 2.10)
 Imports: 
     cli,
-    devtools,
     dplyr,
     ggplot2,
-    grid,
     gridExtra,
-    rjson
+    htmlwidgets,
+    plotly,
+    rjson,
+    tidyr
 Suggests: 
     knitr,
-    rmarkdown
-VignetteBuilder: knitr
-URL: https://aim-rsf.github.io/browseMetadata/
+    rmarkdown,
+    devtools,
+    testthat (>= 3.0.0),
+    mockery
+VignetteBuilder: 
+    knitr
+Config/testthat/edition: 3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,23 +1,39 @@
 # Generated by roxygen2: do not edit by hand
 
-export(compare_sessions)
-export(convert_output)
-export(domain_mapping)
-export(user_categorisation)
-import(cli)
-import(devtools)
+export(browseMetadata)
+export(mapMetadata)
+export(mapMetadata_compare_outputs)
+export(mapMetadata_convert_outputs)
 import(ggplot2)
-import(grid)
-import(gridExtra)
-import(rjson)
+importFrom(cli,cli_alert_danger)
+importFrom(cli,cli_alert_info)
+importFrom(cli,cli_alert_success)
+importFrom(cli,cli_alert_warning)
+importFrom(cli,cli_h1)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,add_row)
 importFrom(dplyr,arrange)
+importFrom(dplyr,contains)
 importFrom(dplyr,count)
 importFrom(dplyr,distinct)
 importFrom(dplyr,group_by)
 importFrom(dplyr,join_by)
 importFrom(dplyr,left_join)
+importFrom(dplyr,n)
 importFrom(dplyr,select)
+importFrom(dplyr,summarize)
+importFrom(ggplot2,ggsave)
 importFrom(graphics,plot.new)
+importFrom(gridExtra,grid.arrange)
+importFrom(gridExtra,tableGrob)
+importFrom(htmlwidgets,saveWidget)
+importFrom(plotly,layout)
+importFrom(plotly,plot_ly)
+importFrom(rjson,fromJSON)
+importFrom(stats,reorder)
+importFrom(tidyr,complete)
+importFrom(tidyr,pivot_longer)
+importFrom(tools,file_path_sans_ext)
+importFrom(utils,packageVersion)
 importFrom(utils,read.csv)
 importFrom(utils,write.csv)
diff --git a/R/browseMetadata-package.R b/R/browseMetadata-package.R
diff --git a/R/browseMetadata.R b/R/browseMetadata.R
@@ -0,0 +1,159 @@
+#' browseMetadata
+#'
+#' Run this function before mapMetadata. \cr \cr
+#' This function will read in the metadata file for a chosen dataset and save
+#' three summary outputs. The first is a table output, storing the name and
+#' description of the dataset, and each table within it. The second is a bar
+#' chart, summarising how many variables there are for each table, and whether
+#' these variables have a missing description. The third is a csv file holding
+#' the counts that the bar chart displays. \cr \cr
+#' @param json_file The path to the metadata file. This should be a json
+#' download from the metadata catalogue. This argument has no default and must
+#' be supplied. Run '?json_metadata' to see how the example metadata was
+#' created.
+#' @param output_dir The path to the directory where the output files
+#' will be saved. By default, the current working directory is used.
+#' @return A list with two plotly objects: 'table_fig' (the table summarising
+#' the dataset) and 'empty_fig' (the bar chart of missing descriptions). As a
+#' side effect, three files are written to 'output_dir': 'BROWSE_table_*.html',
+#' 'BROWSE_bar_*.html' and 'BROWSE_bar_*.csv'. They give summary information
+#' for this dataset and can be used as reference when running the mapMetadata
+#' function. Open the html outputs in a browser.
+#' @export
+#' @importFrom dplyr %>% add_row
+#' @importFrom rjson fromJSON
+#' @importFrom cli cli_alert_info
+#' @importFrom plotly plot_ly layout
+#' @importFrom htmlwidgets saveWidget
+#' @importFrom tidyr pivot_longer
+
+browseMetadata <- function(json_file, output_dir = NULL) {
+
+  # DEFINE INPUTS AND OUTPUTS ----
+
+  ## Read in the json file containing the meta data
+  meta_json <- fromJSON(file = json_file)
+
+  ## Set output_dir to current wd if user has not provided it
+  if (is.null(output_dir)) {
+    output_dir <- getwd()
+  }
+
+  ## Extract Dataset from json_file
+  Dataset <- meta_json$dataModel
+  Dataset_Name <- Dataset$label
+  dataset_version <- Dataset[["documentationVersion"]]
+
+  ## gsub() on a missing (NULL) description returns character(0), which
+  ## cannot form a one-row entry for add_row, so fall back to an empty string
+  clean_desc <- function(x) {
+    if (is.null(x) || length(x) == 0) {
+      return("")
+    }
+    gsub("\n\n", "", x)
+  }
+
+  # PREPARE 2 OUTPUT DATAFRAMES FOR LATER PLOTTING ----
+
+  ## 1. Information about dataset and each table
+  dataset_desc <- data.frame(N = character(0), Name = character(0),
+                             Description = character(0))
+  ### add information about the dataset at the top
+  dataset_desc <- dataset_desc %>%
+    add_row(N = "", Name = Dataset$label,
+            Description = clean_desc(Dataset$description))
+  ### second row acts as the sub-header for the per-table rows that follow
+  dataset_desc <- dataset_desc %>%
+    add_row(N = "N", Name = "Table", Description = "")
+
+  ## 2. Counts of empty description fields for each table
+  count_empty <- data.frame(Empty = c("No", "Yes"))
+
+  # LOOP THROUGH EACH TABLE IN DATASET ----
+
+  ntables <- length(Dataset$childDataClasses)
+  ## Fail early with a clear message; 1:ntables would iterate over c(1, 0)
+  if (ntables == 0) {
+    stop("No tables (childDataClasses) were found in the metadata file.",
+         call. = FALSE)
+  }
+  ntables_digits <- nchar(ntables)
+
+  for (dc in seq_len(ntables)) {
+    cat("\n")
+    Table_name <- Dataset$childDataClasses[[dc]]$label
+    ## Interpolate the name via cli so that any braces in it are not parsed
+    ## as part of the message template
+    cli_alert_info("Processing Table {dc} of {ntables} ({Table_name})")
+
+    ## Add to the dataset_desc data frame
+    dataset_desc <- dataset_desc %>% add_row(
+      N = as.character(dc),
+      Name = Table_name,
+      Description = clean_desc(Dataset$childDataClasses[[dc]]$description))
+
+    ## Use 'json_table_to_df.R' to extract table from meta_json into a df
+    Table_df <- json_table_to_df(Dataset = Dataset, n = dc)
+
+    ## Use 'count_empty_desc.R' to count number of empty descriptions
+    Table_colname <- paste0(Table_name, "(", dc, ")")
+    count_empty_table <- count_empty_desc(Table_df, Table_colname)
+
+    ## Add to group dataframe for later plotting
+    count_empty[[Table_colname]] <- count_empty_table[[Table_colname]]
+
+  } # end of loop through each table
+
+  # 1. TABLE SUMMARISING THE DATASET ----
+
+  ## Create a matrix for cell colors
+  cell_colors <- matrix("white", nrow = nrow(dataset_desc) + 1,
+                        ncol = ncol(dataset_desc))
+  cell_colors[2, ] <- "lightgrey"  # Change the color of the second row
+
+  table_fig <- plot_ly(
+    type = "table",
+    columnorder = c(0, 1, 2),
+    columnwidth = c(ntables_digits, max(nchar(dataset_desc$Name)), 100),
+    header = list(
+      values = c("", "Dataset", "Description"),
+      align = c("center", "center", "center"),
+      line = list(width = 1, color = "black"),
+      fill = list(color = c("grey", "grey")),
+      font = list(family = "Arial", size = 14, color = "white")
+    ),
+    cells = list(
+      values = rbind(t(as.matrix(unname(dataset_desc)))),
+      align = c("center", "center", "left"),
+      line = list(color = "black", width = 1),
+      fill = list(color = apply(cell_colors, 2, as.list)),  # Apply cell colors
+      font = list(family = "Arial", size = 12, color = c("black"))
+    )
+  )
+
+  # 2. BAR CHART DISPLAYING COUNTS OF MISSING DESCRIPTIONS FOR ALL TABLES ----
+  count_empty_long <- count_empty %>%
+    pivot_longer(cols = -Empty, names_to = "Table", values_to = "N_Variables")
+
+  empty_fig <- plot_ly(count_empty_long,
+                       x = ~Table,
+                       y = ~N_Variables,
+                       color = ~Empty,
+                       colors = c("grey", "darkturquoise"),
+                       type = "bar",
+                       text = ~N_Variables,
+                       textposition = "inside", # Position text inside the bars
+                       texttemplate = "%{text}", # Ensure text displayed as is
+                       textfont = list(color = "black", size = 10)) %>%
+    layout(barmode = "stack",
+           title = paste0("\n", Dataset_Name, " contains ", ntables,
+                          " table(s)"),
+           xaxis = list(title = "Table"),
+           yaxis = list(title = "N_Variables"),
+           legend = list(title = list(text = "Empty Description")))
+
+  # SAVE PLOTS ----
+
+  ## saveWidget has a bug with paths & saving, so switch into output_dir and
+  ## save with bare file names. on.exit() restores the caller's working
+  ## directory even if one of the save steps fails.
+  original_wd <- setwd(output_dir)
+  on.exit(setwd(original_wd), add = TRUE)
+  base_fname <- paste0(gsub(" ", "", Dataset_Name), "_V", dataset_version)
+
+  ## Save the table plot to a HTML file
+  table_fname <- paste0("BROWSE_table_", base_fname, ".html")
+  saveWidget(widget = table_fig, file = table_fname, selfcontained = TRUE)
+
+  ## Save the bar plot to a HTML file
+  bar_fname <- paste0("BROWSE_bar_", base_fname, ".html")
+  saveWidget(widget = empty_fig, file = bar_fname, selfcontained = TRUE)
+
+  ## Save the data that made the bar plot to a csv file
+  bar_csv_fname <- paste0("BROWSE_bar_", base_fname, ".csv")
+  write.csv(count_empty_long, bar_csv_fname, row.names = FALSE)
+
+  # OUTPUTS ----
+  cat("\n")
+  cli_alert_info("Three outputs have been saved to your output directory.")
+  cli_alert_info("Open the two html files in your browser for full screen viewing.")
+  cat("\n")
+
+  list(table_fig = table_fig, empty_fig = empty_fig)
+
+} # end of function