
Commit 79c8944

Merge pull request #52 from OHDSI/develop
V2.0 release candidate
fdefalco authored Apr 23, 2024
2 parents 0f95ce9 + eb374fb commit 79c8944
Showing 56 changed files with 2,095 additions and 549 deletions.
Binary file added .DS_Store
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -9,3 +9,5 @@ docs
_pkgdown.yml
cran-comments.md
^CRAN-RELEASE$
work/*
^CRAN-SUBMISSION$
1 change: 1 addition & 0 deletions .gitignore
@@ -11,4 +11,5 @@
R/TC
statement_*.sql
errorReport.txt
work/*

3 changes: 3 additions & 0 deletions CRAN-SUBMISSION
@@ -0,0 +1,3 @@
Version: 2.0.0
Date: 2024-04-16 20:03:15 UTC
SHA: 53f57cba7a55b79e60ec84baea9399f4ae19743c
28 changes: 18 additions & 10 deletions DESCRIPTION
@@ -1,26 +1,34 @@
Package: Eunomia
Type: Package
Title: A Standard Dataset in the OMOP Common Data Model
Version: 1.0.3
Date: 2024-03-27
Title: Standard Dataset Manager for Observational Medical Outcomes Partnership Common Data Model Sample Datasets
Version: 2.0.0
Date: 2024-04-22
Authors@R: c(
person("Frank", "DeFalco", , "fdefalco@ohdsi.org", role = c("aut", "cre")),
person("Martijn", "Schuemie", , "schuemie@ohdsi.org", role = c("aut")),
person("Anthony", "Sena", , "sena@ohdsi.org", role=c("aut")),
person("Natthawut", "Adulyanukosol", , "na339@unc.edu", role=c("aut")),
person("Star", "Liu", , "sliu197@jhmi.edu", role=c("aut")),
person("Adam", "Black", , "black@ohdsi.org", role = c("aut")),
person("Observational Health Data Science and Informatics", role = c("cph"))
)
Maintainer: Frank DeFalco <fdefalco@ohdsi.org>
Description: A sample dataset in the OMOP (Observational Medical Outcomes Partnership) Common Data Model (CDM) format. The CDM enables uniform storage of observational health care data, and is widely used for health care analytics. 'Eunomia' contains simulated data as well as a subset of the OMOP Vocabulary, and enables testing of additional packages and is used for educational and demonstration purposes.
Description: Facilitates access to sample datasets from the 'EunomiaDatasets' repository (<https://github.com/ohdsi/EunomiaDatasets>).
License: Apache License 2.0
URL: https://github.com/OHDSI/Eunomia
BugReports: https://github.com/OHDSI/Eunomia/issues
Depends:
DatabaseConnector (>= 2.2.0)
Imports:
SqlRender,
RSQLite (> 2.1.1),
readr
readr,
rlang,
RSQLite,
DBI,
arrow,
CommonDataModel
Suggests:
testthat
testthat,
withr,
duckdb,
DatabaseConnector
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
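The revised DESCRIPTION reflects the new design: the package no longer ships a dataset but fetches archives from the EunomiaDatasets repository, with DBI/RSQLite replacing DatabaseConnector in Imports. An illustrative sketch of the resulting workflow (not part of this diff; assumes Eunomia 2.0, DBI, and RSQLite are installed and network access is available):

```r
library(Eunomia)

# Download the GiBleed archive if needed and materialize it as a SQLite database file.
# Without EUNOMIA_DATA_FOLDER set, getDatabaseFile() falls back to tempdir() for its cache.
databaseFile <- getDatabaseFile(datasetName = "GiBleed", cdmVersion = "5.3")

# Query the copy with plain DBI/RSQLite -- no Java or DatabaseConnector required
conn <- DBI::dbConnect(RSQLite::SQLite(), databaseFile)
DBI::dbGetQuery(conn, "SELECT COUNT(*) AS n_persons FROM person")
DBI::dbDisconnect(conn)
```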
11 changes: 8 additions & 3 deletions NAMESPACE
@@ -1,11 +1,16 @@
# Generated by roxygen2: do not edit by hand

export(createCohorts)
export(exportToCsv)
export(downloadEunomiaData)
export(exportDataFiles)
export(extractLoadData)
export(getDatabaseFile)
export(getEunomiaConnectionDetails)
import(DatabaseConnector)
export(loadDataFiles)
import(RSQLite)
importFrom(readr,write_csv)
importFrom(readr,read_csv)
importFrom(tools,file_ext)
importFrom(utils,download.file)
importFrom(utils,read.csv)
importFrom(utils,untar)
importFrom(utils,unzip)
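The NAMESPACE now exports the dataset helpers (downloadEunomiaData, getDatabaseFile, extractLoadData, exportDataFiles, loadDataFiles) alongside the backwards-compatible getEunomiaConnectionDetails. A hedged sketch of driving the download step directly, with argument names taken from the getDatabaseFile() implementation later in this commit and a hypothetical cache location:

```r
library(Eunomia)

# Pre-fetch the GiBleed 5.3 archive into a persistent cache folder (hypothetical location)
cacheDir <- file.path(Sys.getenv("HOME"), "eunomia_data")
dir.create(cacheDir, showWarnings = FALSE, recursive = TRUE)
downloadEunomiaData(datasetName = "GiBleed", cdmVersion = "5.3", pathToData = cacheDir)

# Later calls reuse the cached archive instead of downloading it again
databaseFile <- getDatabaseFile("GiBleed", pathToData = cacheDir, overwrite = FALSE)
```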
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,3 +1,13 @@
Eunomia 2.0
=============
Changes
- Updated package to no longer contain a dataset but rather to facilitate access to sample datasets
  stored in the https://github.com/OHDSI/EunomiaDatasets repository
- Backward compatibility maintained with the getEunomiaConnectionDetails function
- New getDatabaseFile function added
- Embedded sample dataset removed
- Removed dependency on DatabaseConnector and Java

Eunomia 1.0.3
=============

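To make the backward-compatibility note above concrete, here is an illustrative comparison (not part of this diff) of the old DatabaseConnector-based entry point, which still works, and the new DBI-only path:

```r
library(Eunomia)

# Old path, unchanged API: a ConnectionDetails object for DatabaseConnector
connectionDetails <- getEunomiaConnectionDetails()
connection <- DatabaseConnector::connect(connectionDetails)
DatabaseConnector::querySql(connection, "SELECT COUNT(*) FROM person;")
DatabaseConnector::disconnect(connection)

# New path in 2.0: a plain database file path, usable with any SQLite client
databaseFile <- getDatabaseFile("GiBleed")
conn <- DBI::dbConnect(RSQLite::SQLite(), databaseFile)
DBI::dbDisconnect(conn)
```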
94 changes: 42 additions & 52 deletions R/Cohorts.R
@@ -1,4 +1,4 @@
# Copyright 2020 Observational Health Data Sciences and Informatics
# Copyright 2023 Observational Health Data Sciences and Informatics
#
# This file is part of Eunomia
#
@@ -14,75 +14,65 @@
# See the License for the specific language governing permissions and
# limitations under the License.


#' Construct cohorts
#'
#' @description
#' Creates a set of predefined cohorts in a cohort table.
#' WARNING: this will delete all existing cohorts in the table!
#' Creates a set of predefined cohorts in a cohort table. WARNING: this will delete all existing
#' cohorts in the table!
#'
#' @param connectionDetails The connection details to connect to the (Eunomia) database.
#' @param cdmDatabaseSchema The name of the database schema holding the CDM data.
#' @param cohortDatabaseSchema The name of the database schema where the cohorts will be written.
#' @param cohortTable The name of the table in the cohortDatabaseSchema where the cohorts
#' will be written.
#' @param cdmDatabaseSchema Deprecated. The cdm must be created in the main schema.
#' @param cohortDatabaseSchema Deprecated. The cohort table will be created in the main schema.
#' @param cohortTable Deprecated. Cohort table will be named "cohort".
#'
#' @return
#' A data frame listing all created cohorts.
#'
#' @examples
#' connectionDetails <- getEunomiaConnectionDetails()
#' createCohorts(connectionDetails)
#'
#' connection <- connect(connectionDetails)
#'
#' sql <- "SELECT COUNT(*)
#' FROM main.cohort
#' WHERE cohort_definition_id = 1;"
#'
#' renderTranslateQuerySql(connection, sql)
#'
#' disconnect(connection)
#'
#' @export
createCohorts <- function(connectionDetails,
cdmDatabaseSchema = "main",
cohortDatabaseSchema = "main",
cohortTable = "cohort") {
connection <- DatabaseConnector::connect(connectionDetails)
on.exit(DatabaseConnector::disconnect(connection))

# Create study cohort table structure:
sql <- SqlRender::loadRenderTranslateSql(sqlFilename = "CreateCohortTable.sql",
packageName = "Eunomia",
dbms = connectionDetails$dbms,
cohort_database_schema = cohortDatabaseSchema,
cohort_table = cohortTable)
DatabaseConnector::executeSql(connection, sql, progressBar = FALSE, reportOverallTime = FALSE)
if (!("ConnectionDetails" %in% class(connectionDetails))) {
stop("connectionDetails is not valid.")
}

if (connectionDetails$dbms != "sqlite") {
stop("createCohorts only supports sqlite")
}

if (cdmDatabaseSchema != "main" || cohortDatabaseSchema != "main") {
stop("sqlite only supports the main schema")
}

if (cohortTable != "cohort") {
warning("The cohortTable argument to createCohorts was deprecated in Eunomia v2.1.0")
}

connection <- DBI::dbConnect(RSQLite::SQLite(), connectionDetails$server())
on.exit(DBI::dbDisconnect(connection))

# Instantiate cohorts:
pathToCsv <- system.file("settings", "CohortsToCreate.csv", package = "Eunomia")
cohortsToCreate <- read.csv(pathToCsv)
for (i in 1:nrow(cohortsToCreate)) {
writeLines(paste("Creating cohort:", cohortsToCreate$name[i]))
sql <- SqlRender::loadRenderTranslateSql(sqlFilename = paste0(cohortsToCreate$name[i], ".sql"),
packageName = "Eunomia",
dbms = connectionDetails$dbms,
cdm_database_schema = cdmDatabaseSchema,
cohort_database_schema = cohortDatabaseSchema,
cohort_table = cohortTable,
cohort_definition_id = cohortsToCreate$cohortId[i])
DatabaseConnector::executeSql(connection, sql)
# Create example cohort table
pathToSql <- system.file("sql", "CreateCohortTable.sql",package = "Eunomia", mustWork = TRUE)
sql <- readChar(pathToSql, file.info(pathToSql)$size)
sql <- gsub("--[a-zA-Z0-9 ]*", "", sql) # remove comments in sql
sql <- strsplit(gsub("\n", " ", sql), ";")[[1]] # remove newlines, split on semicolon
sql <- trimws(sql) # trim white space
sql <- sql[-which(sql == "")] # remove empty lines

for (i in seq_along(sql)) {
DBI::dbExecute(connection, sql[i])
}

# Fetch cohort counts:
sql <- "SELECT cohort_definition_id, COUNT(*) AS count FROM @cohort_database_schema.@cohort_table GROUP BY cohort_definition_id"
counts <- DatabaseConnector::renderTranslateQuerySql(connection,
sql,
cohort_database_schema = cohortDatabaseSchema,
cohort_table = cohortTable,
snakeCaseToCamelCase = TRUE)
counts <- merge(cohortsToCreate, counts, by.x = "cohortId", by.y = "cohortDefinitionId")
writeLines(sprintf("Cohorts created in table %s.%s", cohortDatabaseSchema, cohortTable))
sql <- "SELECT cohort_definition_id, COUNT(*) AS count
FROM main.cohort
GROUP BY cohort_definition_id"
counts <- DBI::dbGetQuery(connection, sql)

cohortsToCreate <- read.csv(system.file("settings", "CohortsToCreate.csv", package = "Eunomia", mustWork = T))
counts <- merge(cohortsToCreate, counts, by.x = "cohortId", by.y = "cohort_definition_id")
writeLines("Cohorts created in table main.cohort")
return(counts)
}
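For reference, a short sketch of the rewritten createCohorts() in use, following the roxygen example above; the returned data frame merges the cohort definitions with their row counts:

```r
library(Eunomia)

connectionDetails <- getEunomiaConnectionDetails()
cohortCounts <- createCohorts(connectionDetails)
print(cohortCounts)

# Inspect one cohort directly with DBI, mirroring the new implementation
conn <- DBI::dbConnect(RSQLite::SQLite(), connectionDetails$server())
DBI::dbGetQuery(conn, "SELECT COUNT(*) FROM main.cohort WHERE cohort_definition_id = 1;")
DBI::dbDisconnect(conn)
```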
156 changes: 138 additions & 18 deletions R/Connection.R
@@ -1,4 +1,4 @@
# Copyright 2020 Observational Health Data Sciences and Informatics
# Copyright 2023 Observational Health Data Sciences and Informatics
#
# This file is part of Eunomia
#
@@ -15,31 +15,151 @@
# limitations under the License.


#' Get Eunomia Connection Details
#' Get Default Eunomia Connection Details
#'
#' @description
#' Creates a copy of the Eunomia database, and provides details for connecting to that copy.
#' Creates a copy of the default (GiBleed) Eunomia database, and provides details for connecting to
#' that copy. This function provides backwards compatibility with prior releases of Eunomia, which
#' shipped the default (GiBleed) dataset.
#'
#' @param databaseFile The path where the database file will be copied to. By default, the database will
#' be copied to a temporary folder, and will be deleted at the end of the R session.
#' @param dbms The target dialect, by default "sqlite".
#'
#' @return
#' A ConnectionDetails object, to be used with the \code{DatabaseConnector} package.
#'
#' @export
getEunomiaConnectionDetails <- function(databaseFile = tempfile(fileext = ".sqlite"), dbms = "sqlite") {

if (interactive() & !("DatabaseConnector" %in% rownames(utils::installed.packages()))) {
message("The DatabaseConnector package is required but not installed.")
if (!isTRUE(utils::askYesNo("Would you like to install DatabaseConnector?"))) {
return(invisible(NULL))
} else {
utils::install.packages("DatabaseConnector")
}
}

datasetLocation <- getDatabaseFile(datasetName = "GiBleed", dbms = dbms, databaseFile = databaseFile)
DatabaseConnector::createConnectionDetails(dbms = dbms, server = datasetLocation)
}

#' Create a copy of a Eunomia dataset
#'
#' @description
#' Creates a copy of a Eunomia database, and returns the path to the new database file.
#' If the dataset does not yet exist on the user's computer it will attempt to download the source data
#' to the path defined by the EUNOMIA_DATA_FOLDER environment variable.
#'
#' @param datasetName The data set name as found on https://github.com/OHDSI/EunomiaDatasets. The
#' data set name corresponds to the folder with the data set ZIP files.
#' @param cdmVersion The OMOP CDM version. This version will appear in the suffix of the data file,
#' for example: <datasetName>_<cdmVersion>.zip. Default: '5.3'
#' @param pathToData The path where the Eunomia data is stored on the file system. By default the
#' value of the environment variable "EUNOMIA_DATA_FOLDER" is used.
#' @param dbms The database system to use. "sqlite" (default) or "duckdb"
#' @param databaseFile The path where the database file will be copied to. By default, the database
#' will be copied to a temporary folder, and will be deleted at the end of the R
#' session.
#' @param inputFormat The format of the files expected in the archive. (csv or parquet)
#' @param verbose Provide additional logging details during execution
#' @param overwrite Remove and replace an existing data set.
#'
#' @return
#' A ConnectionDetails object, to be used with the \code{DatabaseConnector} package.
#' @return The file path to the new Eunomia dataset copy
#' @export
#'
#' @examples
#' connectionDetails <- getEunomiaConnectionDetails()
#' connection <- connect(connectionDetails)
#' querySql(connection, "SELECT COUNT(*) FROM person;")
#' disconnect(connection)
#' \dontrun{
#' conn <- DBI::dbConnect(RSQLite::SQLite(), getDatabaseFile("GiBleed"))
#' DBI::dbDisconnect(conn)
#'
#' @export
getEunomiaConnectionDetails <- function(databaseFile = tempfile(fileext = ".sqlite")) {
extractFolder <- tempdir()
file <- xzfile(system.file("sqlite", "cdm.tar.xz", package = "Eunomia"), open = "rb")
untar(file, exdir = extractFolder)
close(file)
file.rename(from = file.path(extractFolder, "cdm.sqlite"), to = databaseFile)
details <- DatabaseConnector::createConnectionDetails(dbms = "sqlite", server = databaseFile)
return(details)
#' conn <- DBI::dbConnect(duckdb::duckdb(), getDatabaseFile("GiBleed", dbms = "duckdb"))
#' DBI::dbDisconnect(conn, shutdown = TRUE)
#'
#' conn <- DatabaseConnector::connect(dbms = "sqlite", server = getDatabaseFile("GiBleed"))
#' DatabaseConnector::disconnect(conn)
#' }
#'
getDatabaseFile <- function(datasetName,
cdmVersion = "5.3",
pathToData = Sys.getenv("EUNOMIA_DATA_FOLDER"),
dbms = "sqlite",
databaseFile = tempfile(fileext = paste0(".", dbms)),
inputFormat = "csv",
verbose = FALSE,
overwrite = TRUE) {

if (is.null(pathToData) || is.na(pathToData) || pathToData == "") {
pathToData <- tempdir()
}

stopifnot(is.character(dbms), length(dbms) == 1, dbms %in% c("sqlite", "duckdb"))
stopifnot(is.character(cdmVersion), length(cdmVersion) == 1, cdmVersion %in% c("5.3", "5.4"))

if (dbms == "duckdb") {
rlang::check_installed("duckdb")
# duckdb database are tied to a specific version of duckdb until it reaches v1.0
duckdbVersion <- substr(utils::packageVersion("duckdb"), 1, 3)
datasetFileName <- paste0(datasetName, "_", cdmVersion, "_", duckdbVersion, ".", dbms)
} else {
datasetFileName <- paste0(datasetName, "_", cdmVersion, ".", dbms)
}

# cached sqlite or duckdb file to be copied
datasetLocation <- file.path(pathToData, datasetFileName)
datasetAvailable <- file.exists(datasetLocation)
if (datasetAvailable && overwrite) {
if (verbose) {
message("overwrite specified, deleting existing dataset: ", datasetLocation, appendLF = TRUE)
}
unlink(datasetLocation)
datasetAvailable <- FALSE
}

if (verbose) {
message("dataset: ",datasetLocation, " available: ",datasetAvailable, appendLF = TRUE)
}

# zip archive of csv source files
archiveName <- paste0(datasetName, "_", cdmVersion, ".zip")
archiveLocation <- file.path(pathToData, archiveName)
archiveAvailable <- file.exists(archiveLocation)

if (archiveAvailable && overwrite) {
if (verbose) {
message("overwrite specified, deleting existing archive: ", archiveLocation, appendLF = TRUE)
}
unlink(archiveLocation)
archiveAvailable <- FALSE
}

if (verbose) {
message("archive: ",archiveLocation," available:",archiveAvailable,appendLF = TRUE)
}

if (!datasetAvailable && !archiveAvailable) {
message(paste("attempting to download", datasetName))
downloadedData <- downloadEunomiaData(datasetName = datasetName, cdmVersion = cdmVersion, pathToData = pathToData, verbose=verbose)
if (verbose) {
message("downloaded: ",downloadedData,appendLF = TRUE)
}
archiveAvailable <- TRUE
}

if (!datasetAvailable && archiveAvailable) {
message("attempting to extract and load: ", archiveLocation," to: ",datasetLocation,appendLF = TRUE)
extractLoadData(from = archiveLocation, to = datasetLocation, dbms = dbms, cdmVersion = cdmVersion, inputFormat=inputFormat, verbose=verbose)
datasetAvailable <- TRUE
}

if (verbose) {
message("copying: ",datasetLocation," to: ", databaseFile, appendLF = TRUE)
}

copySuccess <- file.copy(from = datasetLocation, to = databaseFile, overwrite = overwrite)
if (isFALSE(copySuccess)) {
stop(paste("File copy from", datasetLocation, "to", databaseFile, "failed!"))
}
invisible(databaseFile)
}
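A sketch exercising the duckdb branch and the caching logic above (not part of this diff; assumes the suggested duckdb package is installed, the cached file name gains a duckdb version suffix, and overwrite = FALSE reuses it on later calls):

```r
library(Eunomia)

dbFile <- getDatabaseFile(
  datasetName = "GiBleed",
  cdmVersion  = "5.3",
  dbms        = "duckdb",
  overwrite   = FALSE,  # keep and reuse a previously built .duckdb file
  verbose     = TRUE    # print the cache / download / extract decisions
)

conn <- DBI::dbConnect(duckdb::duckdb(), dbFile)
DBI::dbGetQuery(conn, "SELECT COUNT(*) FROM person")
DBI::dbDisconnect(conn, shutdown = TRUE)
```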