From afd8d644a780533a967693f54af3911a931526f7 Mon Sep 17 00:00:00 2001 From: Nick Lucius Date: Fri, 5 May 2017 22:34:31 -0500 Subject: [PATCH] download non-tabular datasets with export.socrata #126 --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/RSocrata.R | 43 ++++++++++++++++++++++++++++++++----------- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 32fdfc4..0c38881 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ Description: Provides easier interaction with format and manages throttling by 'Socrata'. Users can upload data to Socrata portals directly from R. -Version: 1.8.0-1 +Version: 1.8.0-2 Date: 2017-05-05 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc Maintainer: "Tom Schenk Jr." diff --git a/NAMESPACE b/NAMESPACE index a9900d0..82595fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(export.socrata) export(fieldName) export(isFourByFour) export(ls.socrata) @@ -17,3 +18,4 @@ importFrom(jsonlite,fromJSON) importFrom(mime,guess_type) importFrom(plyr,rbind.fill) importFrom(utils,read.csv) +importFrom(utils,write.csv) diff --git a/R/RSocrata.R b/R/RSocrata.R index 5b35358..2ec8840 100644 --- a/R/RSocrata.R +++ b/R/RSocrata.R @@ -469,6 +469,8 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email, #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org") #' @return a Gzipped file with the four-by-four and timestamp of when the download began in filename #' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} +#' @importFrom httr GET +#' @importFrom utils write.csv #' @export export.socrata <- function(url) { dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL @@ -480,16 +482,35 @@ export.socrata <- function(url) { # Download data downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element - d <- read.socrata(downloadUrl) 
- - # Construct the filename output - downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore - downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename - filename <- httr::parse_url(ls$identifier[i]) - filename$path <- substr(filename$path, 11, 19) - filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz") - - # Write file - write.csv(d, file = gzfile(filename)) + if (grepl("\\.csv", downloadUrl)) { + d <- read.socrata(downloadUrl) + + # Construct the filename output + default_format <- "csv" + downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore + downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename + filename <- httr::parse_url(ls$identifier[i]) + filename$path <- substr(filename$path, 11, 19) + filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz") + + # Write file + write.csv(d, file = gzfile(filename)) + + } else { + response <- GET(downloadUrl) + + # Construct the filename output + default_format <- response$headers$`content-disposition` + default_format <- strsplit(default_format, "filename=")[[1]][2] + downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore + downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename + filename <- httr::parse_url(ls$identifier[i]) + filename$path <- substr(filename$path, 11, 19) + filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format) + + # Write file + writeBin(response$content, filename) + } + } } \ No newline at end of file