Included alpha code for export.socrata

Chicago · May 5, 2017 · 4efc592 · 4efc592
1 parent f6dbf80
commit 4efc592
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to Socrata portals directly
     from R.
-Version: 1.7.2-12
-Date: 2017-03-16
+Version: 1.8.0-1
+Date: 2017-05-05
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr." <developers@cityofchicago.org>
 Depends:

diff --git a/R/RSocrata.R b/R/RSocrata.R
@@ -458,3 +458,38 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
   return(response)
 
 }
+
+#' Export all CSV datasets from a Socrata data portal
+#'
+#' Given the URL of a data portal (e.g., "data.cityofchicago.org"), downloads
+#' every dataset listed by \code{ls.socrata} (CSV only; no other formats are
+#' supported) and saves each one, GZip-compressed, in a single directory named
+#' after the root URL (e.g., "data.cityofchicago.org/"). File names carry the
+#' four-by-four and the time the download began. No data is kept in the workspace.
+#' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
+#' @return NULL, invisibly; called for its side effect of writing Gzipped CSV files
+#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
+#' @export
+export.socrata <- function(url) {
+  default_format <- "csv" # File extension; this name was referenced below but never defined
+  out_dir <- basename(url)
+  dir.create(out_dir, showWarnings = FALSE) # Create directory based on URL
+  datasets <- ls.socrata(url = url) # Named 'datasets' so base::ls() is not masked
+  for (i in seq_len(nrow(datasets))) { # seq_len() runs zero times on an empty listing
+    # Track timestamp before download
+    downloadTime <- Sys.time()
+
+    # Download data
+    downloadUrl <- datasets$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
+    d <- read.socrata(downloadUrl)
+
+    # Construct the filename output: <four-by-four>_<YYYY-MM-DD_HHMMSS>.csv.gz
+    downloadTimeChr <- format(downloadTime, "%Y-%m-%d_%H%M%S") # No spaces or colons, so valid in a filename
+    filename <- httr::parse_url(datasets$identifier[i])
+    fourByFour <- substr(filename$path, 11, 19) # Presumably skips "api/views/" to keep the four-by-four - confirm
+    filename <- file.path(out_dir, paste0(fourByFour, "_", downloadTimeChr, ".", default_format, ".gz"))
+
+    # Write file into the directory created above
+    write.csv(d, file = gzfile(filename))
+  }
+}