From d167e8e1d8e6192132d1b532a7cfcd30b3cb5f5f Mon Sep 17 00:00:00 2001 From: dmpe Date: Sat, 25 Jul 2015 23:28:45 +0200 Subject: [PATCH 1/2] add geojson support. not finished --- DESCRIPTION | 9 +++--- NAMESPACE | 1 + R/returnData.R | 75 +++++++++++++++++++++++++++++---------------- man/read.socrata.Rd | 33 +++++++++++++++----- 4 files changed, 80 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 28ee7ec..eafc22b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,22 +9,23 @@ Description: Provides easier interaction with returns an R data frame. Converts dates to 'POSIX' format. Manages throttling by 'Socrata'. + Supports geospacial data (GeoJSON). Version: 1.6.3 Date: 2015-07-23 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc Maintainer: "Tom Schenk Jr." Depends: - R (>= 3.0.0) + R (>= 3.2.0) Imports: httr (>= 1.0.0), jsonlite (>= 0.9.16), - mime (>= 0.3) + mime (>= 0.3), + geojsonio (>= 0.1.0) Suggests: testthat (>= 0.10.0), roxygen2 (>= 4.1.0), knitr (>= 1.10.5), - leaflet (>= 1.0.0), - geojsonio (>= 0.1.0) + leaflet (>= 1.0.0) License: MIT + file LICENSE URL: https://github.com/Chicago/RSocrata BugReports: https://github.com/Chicago/RSocrata/issues diff --git a/NAMESPACE b/NAMESPACE index f12ba39..813c6b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(ls.socrata) export(posixify) export(read.socrata) export(validateUrl) +importFrom(geojsonio,geojson_read) importFrom(httr,GET) importFrom(httr,build_url) importFrom(httr,content) diff --git a/R/returnData.R b/R/returnData.R index 198f3ea..778ea51 100644 --- a/R/returnData.R +++ b/R/returnData.R @@ -4,9 +4,10 @@ # Author: Hugh J. Devlin, Ph. D. 
2013-08-28 ############################################################################### -# library('httr') # for access to the HTTP header -# library('jsonlite') # for parsing data types from Socrata -# library('mime') # for guessing mime type +# library("httr") # for access to the HTTP header +# library("jsonlite") # for parsing data types from Socrata +# library("mime") # for guessing mime type +# library("geojsonio") # for geospatial json #' Wrap httr GET in some diagnostics #' @@ -32,6 +33,7 @@ checkResponse <- function(url = "") { #' #' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} #' @importFrom httr content +#' @importFrom geojsonio geojson_read #' @param response - an httr response object #' @return data frame, possibly empty #' @noRd @@ -47,14 +49,16 @@ getContentAsDataFrame <- function(response) { } switch(mimeType, - 'text/csv' = + "text/csv" = httr::content(response), # automatic parsing - 'application/json' = - if(httr::content(response, as = 'text') == "[ ]") { # empty json? + "application/json" = + if(httr::content(response, as = "text") == "[ ]") { # empty json? data.frame() # empty data frame } else { data.frame(t(sapply(httr::content(response), unlist)), stringsAsFactors = FALSE) - } + }, + "application/vnd.geo+json" = # use geojson_read directly through its response link + geojsonio::geojson_read(response$url, method = "local", parse = FALSE, what = "list") ) } @@ -76,49 +80,66 @@ getSodaTypes <- function(response) { #' Get a full Socrata data set as an R data frame #' -#' Manages throttling and POSIX date-time conversions +#' @description Manages throttling and POSIX date-time conversions. 
#' -#' @param url - A Socrata resource URL, -#' or a Socrata "human-friendly" URL, +#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL, #' or Socrata Open Data Application Program Interface (SODA) query #' requesting a comma-separated download format (.csv suffix), -#' May include SoQL parameters, -#' but is assumed to not include a SODA offset parameter -#' @param app_token - a (non-required) string; SODA API token is used to query the data +#' May include SoQL parameters, and it is now assumed to include SODA \code{limit} +#' & \code{offset} parameters. +#' Either use a compelete URL, e.g. \code{} or use parameters below to construct your URL. +#' But don't combine them. +#' @param app_token - a (non-required) string; SODA API token can be used to query the data #' portal \url{http://dev.socrata.com/consumers/getting-started.html} -## @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org} -## @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}} -## @param query - Based on query language called the "Socrata Query Language" ("SoQL"), see -## \url{http://dev.socrata.com/docs/queries.html}. -## domain = NULL, fourByFour = NULL, query = NULL, limit = 50000, offset = 0 -## read.socrata(domain = "http://data.cityofchicago.org", fourByFour = "ydr8-5enu", query = "") -## @section TODO: \url{https://github.com/Chicago/RSocrata/issues/14} +#' @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org} +#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}} +#' @param query - Based on query language called the "Socrata Query Language" ("SoQL"), see +#' \url{http://dev.socrata.com/docs/queries.html}. +#' @param limit - defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}. +#' @param offset - defaults to the max of 0. See \url{http://dev.socrata.com/docs/paging.html}. 
+#' @param output - in case of building URL manually, one of \code{c("csv", "json", "geojson")} +#' +#' @section TODO: \url{https://github.com/Chicago/RSocrata/issues/14} +#' #' @return a data frame with POSIX dates #' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +#' #' @examples #' df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") +#' dfgjs <- read.socrata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +#' df2 <- read.socrata(domain = "http://data.cityofchicago.org", fourByFour = "ydr8-5enu") +#' #' @importFrom httr parse_url build_url #' @importFrom mime guess_type #' #' @export -read.socrata <- function(url, app_token = NULL) { +read.socrata <- function(url = NULL, app_token = NULL, domain = NULL, fourByFour = NULL, + query = NULL, limit = 50000, offset = 0, output = NULL) { + # check url syntax, allow human-readable Socrata url validUrl <- validateUrl(url, app_token) parsedUrl <- httr::parse_url(validUrl) - mimeType <- mime::guess_type(parsedUrl$path) + mimeType <- mime::guess_type(parsedUrl$path, unknown = "application/vnd.geo+json") + + # match args + output_args <- match.arg(output) - if(!(mimeType %in% c('text/csv','application/json'))) { - stop("Error in read.socrata: ", mimeType, " not a supported data format. Try JSON or CSV.") + if(!(mimeType %in% c("text/csv","application/json", "application/vnd.geo+json"))) { + stop(mimeType, " not a supported data format. Try JSON, CSV or GeoJSON.") } response <- checkResponse(validUrl) page <- getContentAsDataFrame(response) result <- page - dataTypes <- getSodaTypes(response) + + if(mimeType %in% c("text/csv","application/json")) { + dataTypes <- getSodaTypes(response) + } ## More to come? 
Loop over pages implicitly + ## TODO: start here while (nrow(page) > 0) { - query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), '?', "&"), '$offset=', nrow(result)) + query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"), "$offset=", nrow(result)) response <- checkResponse(query_url) page <- getContentAsDataFrame(response) result <- rbind(result, page) # accumulate @@ -127,7 +148,7 @@ read.socrata <- function(url, app_token = NULL) { # Convert Socrata calendar dates to POSIX format # Check for column names that are not NA and which dataType is a "calendar_date". If there are some, # then convert them to POSIX format - for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == 'calendar_date']) { + for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == "calendar_date"]) { result[[columnName]] <- posixify(result[[columnName]]) } diff --git a/man/read.socrata.Rd b/man/read.socrata.Rd index 3623a1b..3e8f4a9 100644 --- a/man/read.socrata.Rd +++ b/man/read.socrata.Rd @@ -4,27 +4,46 @@ \alias{read.socrata} \title{Get a full Socrata data set as an R data frame} \usage{ -read.socrata(url, app_token = NULL) +read.socrata(url = NULL, app_token = NULL, domain = NULL, + fourByFour = NULL, query = NULL, limit = 50000, offset = 0, + output = c("csv", "json", "geojson")) } \arguments{ -\item{url}{- A Socrata resource URL, -or a Socrata "human-friendly" URL, +\item{url}{- A Socrata resource URL, or a Socrata "human-friendly" URL, or Socrata Open Data Application Program Interface (SODA) query requesting a comma-separated download format (.csv suffix), -May include SoQL parameters, -but is assumed to not include a SODA offset parameter} +May include SoQL parameters, and it is now assumed to include SODA \code{limit} +& \code{offset} parameters. +Either use a compelete URL, e.g. 
\code{} or use parameters below to construct your URL. +But don't combine them.} -\item{app_token}{- a (non-required) string; SODA API token is used to query the data +\item{app_token}{- a (non-required) string; SODA API token can be used to query the data portal \url{http://dev.socrata.com/consumers/getting-started.html}} + +\item{domain}{- A Socrata domain, e.g \url{http://data.cityofchicago.org}} + +\item{fourByFour}{- a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}}} + +\item{query}{- Based on query language called the "Socrata Query Language" ("SoQL"), see +\url{http://dev.socrata.com/docs/queries.html}.} + +\item{limit}{- defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}.} + +\item{offset}{- defaults to the max of 0. See \url{http://dev.socrata.com/docs/paging.html}.} } \value{ a data frame with POSIX dates } \description{ -Manages throttling and POSIX date-time conversions +Manages throttling and POSIX date-time conversions. +} +\section{TODO}{ + \url{https://github.com/Chicago/RSocrata/issues/14} } \examples{ df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") +dfgjs <- read.socrata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +df2 <- read.socrata(domain = "http://data.cityofchicago.org", fourByFour = "ydr8-5enu") } \author{ Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@cityofchicago.org} From 3bb2c201953e1630571f976e014eed06c368ac6b Mon Sep 17 00:00:00 2001 From: dmpe Date: Wed, 7 Oct 2015 00:39:18 +0200 Subject: [PATCH 2/2] fix leaflet example, not SP object but the list and add a new contributor fix test [skip ci] --- .Rbuildignore | 5 +- .gitignore | 1 + .travis.yml | 1 + DESCRIPTION | 30 ++--- NAMESPACE | 6 +- NEWS.md => NEWS | 17 +-- R/errorHandling.R | 43 +++---- R/metadata.R | 97 +++++++++++++++ R/returnData.R | 204 +++++++++++++++++++------------ R/utils.R | 97 +++++++++------ R/validateURL.R | 96 +++++++++------ README.md | 19 +-- man/fieldName.Rd | 27 ---- man/getMetadata.Rd | 33 +++++ man/isFourByFour.Rd | 2 +- man/posixify.Rd | 30 ----- man/read.socrata.Rd | 33 ++--- man/read.socrataGEO.Rd | 25 ++++ man/validateUrl.Rd | 22 ++-- tests/testthat.R | 1 + tests/testthat/test-all.R | 113 ++++++----------- tests/testthat/test-dateTime.R | 84 +++++++------ tests/testthat/test-fourByFour.R | 8 +- tests/testthat/test-geo.R | 5 +- tests/testthat/test-metadata.R | 17 +++ tests/testthat/test-token.R | 59 +++++++++ tests/testthat/test-validURL.R | 36 ++++-- vignettes/Examples.Rmd | 24 ++-- vignettes/bench.Rmd | 73 +++++++++++ 29 files changed, 767 insertions(+), 441 deletions(-) rename NEWS.md => NEWS (66%) create mode 100644 R/metadata.R delete mode 100644 man/fieldName.Rd create mode 100644 man/getMetadata.Rd delete mode 100644 man/posixify.Rd create mode 100644 man/read.socrataGEO.Rd create mode 100644 tests/testthat/test-metadata.R create mode 100644 tests/testthat/test-token.R create mode 100644 vignettes/bench.Rmd diff --git a/.Rbuildignore b/.Rbuildignore index df4cd6c..90b06bc 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,4 +10,7 @@ RSocrata.Rcheck .DS_Store ^\.travis\.yml$ appveyor.yml -CONTRIBUTING.md \ No newline at end of file +CONTRIBUTING.md +vignettes/rsconnect +vignettes/bench.rmd +^.*\.o$ diff --git a/.gitignore b/.gitignore index 74f359c..70a3405 100644 --- a/.gitignore +++ b/.gitignore 
@@ -14,3 +14,4 @@ *.Rhistory .Rproj.user inst/doc +/vignettes/rsconnect diff --git a/.travis.yml b/.travis.yml index 6f6a67c..44c111b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ r_github_packages: - jeroenooms/curl - klutometis/roxygen - jimhester/covr + - yihui/mime - ropensci/geojsonio after_success: diff --git a/DESCRIPTION b/DESCRIPTION index eafc22b..c6f872a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,28 +5,28 @@ Description: Provides easier interaction with Socrata open data portals http://dev.socrata.com. Users can provide a 'Socrata' data set resource URL, or a 'Socrata' Open Data API (SoDA) web query, - or a 'Socrata' "human-friendly" URL, - returns an R data frame. - Converts dates to 'POSIX' format. - Manages throttling by 'Socrata'. - Supports geospacial data (GeoJSON). -Version: 1.6.3 -Date: 2015-07-23 -Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc + or a 'Socrata' "human-friendly" URL, all of which + return an R data frame. + Additionally, it converts dates to 'POSIX' format, + manages throttling by 'Socrata' and supports geospatial data. +Version: 1.7.5 +Date: 2015-10-10 +Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., David A Springate (@DASpringate) and John Malc (@dmpe) Maintainer: "Tom Schenk Jr." 
Depends: - R (>= 3.2.0) + R (>= 3.2.2) Imports: httr (>= 1.0.0), - jsonlite (>= 0.9.16), - mime (>= 0.3), - geojsonio (>= 0.1.0) + jsonlite (>= 0.9.17), + mime (>= 0.4), + geojsonio (>= 0.1.4), + plyr (>= 1.8.3) Suggests: testthat (>= 0.10.0), - roxygen2 (>= 4.1.0), - knitr (>= 1.10.5), + roxygen2 (>= 4.1.1), + knitr (>= 1.11), leaflet (>= 1.0.0) License: MIT + file LICENSE URL: https://github.com/Chicago/RSocrata BugReports: https://github.com/Chicago/RSocrata/issues -VignetteBuilder: knitr +VignetteBuilder: knitr \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index 813c6b8..1acd5bf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,18 @@ # Generated by roxygen2 (4.1.1): do not edit by hand -export(fieldName) +export(getMetadata) export(isFourByFour) export(ls.socrata) -export(posixify) export(read.socrata) +export(read.socrataGEO) export(validateUrl) importFrom(geojsonio,geojson_read) importFrom(httr,GET) +importFrom(httr,add_headers) importFrom(httr,build_url) importFrom(httr,content) importFrom(httr,parse_url) importFrom(httr,stop_for_status) importFrom(jsonlite,fromJSON) importFrom(mime,guess_type) +importFrom(plyr,rbind.fill) diff --git a/NEWS.md b/NEWS similarity index 66% rename from NEWS.md rename to NEWS index 762e290..576faae 100644 --- a/NEWS.md +++ b/NEWS @@ -38,10 +38,13 @@ Deprecated ```httr::guess_media()``` and implemented ```mime::guess_type()``` * Migrate Travis-CI to "proper" R YAML ([#46](https://github.com/Chicago/RSocrata/issues/46)) -### 1.6.3 / 1.6.2 (see roadmap) - -* Add a small vignette with existing examples -* Mostly internal changes which should not influence the current behaviour ([#53](https://github.com/Chicago/RSocrata/pull/53)) - * Add support of a `floating timestamp` - * New error handling function - +### 1.7 Several changes, bug fixes and new features + +* Create a small vignette with existing examples and add new ones with the ```leaflet``` map package. 
+* Some internal changes ([#53](https://github.com/Chicago/RSocrata/pull/53)) +* Add support of a `floating timestamp` +* New error handling function +* New functions returning metadata and GeoJSON data (similar to the read.socrata) +* Should be fixing [#27](https://github.com/Chicago/RSocrata/issues/27) + [#24](https://github.com/Chicago/RSocrata/pull/25) +* Many thanks to the [David A Springate](https://github.com/DASpringate), who wrote a function in 2014 to allow conversion of files with some missing columns to dataframe. This should be now fixing the long standing issue with rbind [https://github.com/Chicago/RSocrata/issues/19](https://github.com/Chicago/RSocrata/issues/19). +* ```rbind``` from base system has been replaced with `plyr's` `rbind.fill`, which can be faster [in some cases](https://github.com/Chicago/RSocrata/pull/56). diff --git a/R/errorHandling.R b/R/errorHandling.R index c16459a..0d50253 100644 --- a/R/errorHandling.R +++ b/R/errorHandling.R @@ -1,44 +1,45 @@ -#' Provides error handling functionality -#' -#' @description Based on \url{http://dev.socrata.com/docs/response-codes.html} -#' -#' @section TODO: Add messages that alert the user on the URL being valid, -#' but one that is not compatible with RSocrata. -#' See \url{https://github.com/Chicago/RSocrata/issues/16} -#' -#' @param rsp - \code{\link{httr::response}} response -#' @importFrom httr stop_for_status -#' -#' @noRd -errorHandling <- function(rsp = NULL) { +# Provides error handling functionality +# +# @description Based on \url{http://dev.socrata.com/docs/response-codes.html} +# +# @section TODO: Add messages that alert the user on the URL being valid, +# but one that is not compatible with RSocrata. 
+# See \url{https://github.com/Chicago/RSocrata/issues/16} +# +# @param url - SODA URL +#' @importFrom httr stop_for_status GET add_headers +errorHandling <- function(url = "", app_token = NULL) { + rsp <- httr::GET(url, httr::add_headers("X-App-Token" = app_token)) if (rsp$status_code == 200) { invisible("OK. Your request was successful.") - } else if(rsp$status_code == 202) { + } else if (rsp$status_code == 202) { warning("202 Request processing. You can retry your request, and when it's complete, you'll get a 200 instead.") - } else if(rsp$status_code == 400) { + } else if (rsp$status_code == 400) { stop("400 Bad request. Most probably was your request malformed (e.g URL with ?)") - } else if(rsp$status_code == 401) { + } else if (rsp$status_code == 401) { # only necessary when accessing datasets that have been marked as private or when making write requests (PUT, POST, and DELETE) - stop("Unauthorized. You attempted to authenticate but something went wrong.") + stop("Unauthorized. You attempted to authenticate but something went wrong.") - } else if(rsp$status_code == 403) { + } else if (rsp$status_code == 403) { stop("Forbidden. You're not authorized to access this resource. Make sure you authenticate to access private datasets.") - } else if(rsp$status_code == 404) { + } else if (rsp$status_code == 404) { stop("Not found. The resource requested doesn't exist.") - } else if(rsp$status_code == 429) { + } else if (rsp$status_code == 429) { stop("Too Many Requests. Your client is currently being rate limited. Make sure you're using an app token.") - } else if(rsp$status_code == 500) { + } else if (rsp$status_code == 500) { stop("Server error. 
Try later.") } else { httr::stop_for_status(rsp) } + return(rsp) + } \ No newline at end of file diff --git a/R/metadata.R b/R/metadata.R new file mode 100644 index 0000000..2567fe5 --- /dev/null +++ b/R/metadata.R @@ -0,0 +1,97 @@ +#' Return metadata about a Socrata dataset +#' +#' This function returns metadata about a dataset. Generally, such metadata can be accessed +#' with browser at \code{http://DOMAIN/api/views/FOUR-FOUR/rows.json} or +#' \code{http://DOMAIN/api/views/FOUR-FOUR/columns.json}, which is used here. +#' +#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL! +#' +#' @source \url{http://stackoverflow.com/a/29782941} +#' +#' @examples +#' \dontrun{ +#' gM1 <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") +#' gM3 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.json") +#' gM2 <- getMetadata(url = "https://data.cityofboston.gov/resource/awu8-dc52") +#' } +#' +#' @return a list (!) containing a number of rows & columns and a data frame of metadata +#' +#' @importFrom jsonlite fromJSON +#' @importFrom httr parse_url build_url +#' @importFrom mime guess_type +#' +#' @author John Malc \email{cincenko@@outlook.com} +#' +#' @export +getMetadata <- function(url = "") { + + urlParsedBase <- httr::parse_url(url) + mimeType <- mime::guess_type(urlParsedBase$path) + + # use function below to get them using =COUNT(*) SODA query + gQRC <- getQueryRowCount(urlParsedBase, mimeType) + + # create URL for metadata data frame + fourByFour <- substr(basename(urlParsedBase$path), 1, 9) + urlParsed <- urlParsedBase + urlParsed$path <- paste0("api/views/", fourByFour, "/columns.json") + + # execute it + URL <- httr::build_url(urlParsed) + df <- jsonlite::fromJSON(URL) + + # number of rows can be sometimes "cached". If yes, then below we calculate the maximum number of + # rows from all non-null and null fields. + # If not, then it uses "getQueryRowCount" fnct with SODA =COUNT(*) SODA query. 
+ rows <- if (suppressWarnings(max(df$cachedContents$non_null + df$cachedContents$null)) > 0) { + suppressWarnings(max(df$cachedContents$non_null + df$cachedContents$null)) + } else { + # as.numeric(ifelse(is.null(gQRC$count), gQRC$COUNT, gQRC$count)) # the reason + as.numeric(tolower(gQRC$COUNT)) + } + + columns <- as.numeric(nrow(df)) + + return(list(rows = rows, cols = columns, df)) +} + +# Return (always & only) number of rows as specified in the metadata of the data set +# +# @source Taken from \link{https://github.com/Chicago/RSocrata/blob/sprint7/R/getQueryRowCount.R} +# @author Gene Leynes \email{gleynes@@gmail.com} +# +#' @importFrom httr GET build_url content +getQueryRowCount <- function(urlParsed, mimeType) { + ## Construct the count query based on the URL, + if (is.null(urlParsed[['query']])) { + ## If there is no query at all, create a simple count + + cntQueryText <- "?$SELECT=COUNT(*)" + } else { + ## Otherwise, construct the query text with a COUNT command at the beginning of any other + ## limiting commands. Reconstitute the httr url into a string + cntQueryText <- httr::build_url(structure(list(query = urlParsed[['query']]), class = "url")) + ## Add the COUNT command to the beginning of the query + cntQueryText <- gsub(pattern = ".+\\?", replacement = "?$SELECT=COUNT(*)&", cntQueryText) + } + + ## Combine the count query with the rest of the URL + cntUrl <- paste0(urlParsed[[c('scheme')]], "://", urlParsed[[c('hostname')]], "/", + urlParsed[[c('path')]], cntQueryText) + + ## Execute the query to count the rows + totalRowsResult <- errorHandling(cntUrl, app_token = NULL) + + ## Parsing the result depends on the mime type + if (mimeType == "application/json") { + totalRows <- httr::content(totalRowsResult)[[1]] + } else { + totalRows <- httr::content(totalRowsResult) + } + + ## Limit the row count to $limit (if the $limit existed). 
+ # totalRows <- min(totalRows, as.numeric(rowLimit)) + + return(totalRows) +} \ No newline at end of file diff --git a/R/returnData.R b/R/returnData.R index 778ea51..19561b6 100644 --- a/R/returnData.R +++ b/R/returnData.R @@ -1,39 +1,49 @@ # An interface to data hosted online in Socrata data repositories # This is the main file which uses other functions to download data from a Socrata repositories # -# Author: Hugh J. Devlin, Ph. D. 2013-08-28 +# Author: Hugh J. Devlin, Ph. D. et al. ############################################################################### # library("httr") # for access to the HTTP header # library("jsonlite") # for parsing data types from Socrata # library("mime") # for guessing mime type -# library("geojsonio") # for geospatial json +# library("geojsonio") # for geospatial json +# library("plyr") # for a faster binding of rows -#' Wrap httr GET in some diagnostics + +#' Converts to data frame even with missing columns #' -#' In case of failure, report error details from Socrata. +#' @source https://github.com/DASpringate/RSocrata/blob/master/R/RSocrata.R#L130 +#' @source https://github.com/Chicago/RSocrata/pull/3/files #' -#' @param url - Socrata Open Data Application Program Interface (SODA) query, a URL -#' @return httr a response object -#' @importFrom httr GET -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +#' If all items are of the same length, just goes ahead and converts to df. +#' If the items are of different lengths, assume the longest has all the columns, +#' fill in the gaps with NA in the other columns and return in the original column order. 
#' +#' @param con - a list as output by content(response) +#' @return a dataframe +#' @author David A Springate \email{daspringate@@gmail.com} #' @noRd -checkResponse <- function(url = "") { - response <- httr::GET(url) - - errorHandling(response) - - return(response) +content_to_df <- function(con){ + lengths <- sapply(con, length) + if (all(lengths == length(con[[1]]))) { + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } else { + all_cols <- names(con[[which(sapply(con, length) == max(sapply(con, length)))[1]]]) + con <- lapply(con, function(x) { + r <- c(x, sapply(all_cols[!all_cols %in% names(x)], function(xx) NA, simplify = FALSE)) + r[all_cols] + }) + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } } #' Content parsers #' -#' Return a data frame for csv or json +#' Return a data frame for csv or json. GeoJSON is used extra in its own function. #' #' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} #' @importFrom httr content -#' @importFrom geojsonio geojson_read #' @param response - an httr response object #' @return data frame, possibly empty #' @noRd @@ -44,7 +54,7 @@ getContentAsDataFrame <- function(response) { # skip optional parameters sep <- regexpr(';', mimeType)[1] - if(sep != -1) { + if (sep != -1) { mimeType <- substr(mimeType, 0, sep[1] - 1) } @@ -52,105 +62,139 @@ getContentAsDataFrame <- function(response) { "text/csv" = httr::content(response), # automatic parsing "application/json" = - if(httr::content(response, as = "text") == "[ ]") { # empty json? + if (httr::content(response, as = "text") == "[ ]") { # empty json? 
data.frame() # empty data frame } else { - data.frame(t(sapply(httr::content(response), unlist)), stringsAsFactors = FALSE) - }, - "application/vnd.geo+json" = # use geojson_read directly through its response link - geojsonio::geojson_read(response$url, method = "local", parse = FALSE, what = "list") + content_to_df(httr::content(response)) + } ) } -#' Get the SoDA 2 data types -#' -#' Get the Socrata Open Data Application Program Interface data types from the http response header -#' -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -#' @param response - headers attribute from an httr response object -#' @return a named vector mapping field names to data types -#' @importFrom jsonlite fromJSON -#' @noRd -getSodaTypes <- function(response) { - result <- jsonlite::fromJSON(response$headers[['x-soda2-types']]) - names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']]) - return(result) -} #' Get a full Socrata data set as an R data frame #' -#' @description Manages throttling and POSIX date-time conversions. +#' @description Manages throttling and POSIX date-time conversions. We support only .json suffix. #' #' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL, #' or Socrata Open Data Application Program Interface (SODA) query -#' requesting a comma-separated download format (.csv suffix), +#' requesting a comma-separated download format (.json suffix), #' May include SoQL parameters, and it is now assumed to include SODA \code{limit} #' & \code{offset} parameters. -#' Either use a compelete URL, e.g. \code{} or use parameters below to construct your URL. -#' But don't combine them. +#' Either use a complete URL or use parameters below to construct your URL. 
#' @param app_token - a (non-required) string; SODA API token can be used to query the data #' portal \url{http://dev.socrata.com/consumers/getting-started.html} -#' @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org} -#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}} #' @param query - Based on query language called the "Socrata Query Language" ("SoQL"), see #' \url{http://dev.socrata.com/docs/queries.html}. #' @param limit - defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}. -#' @param offset - defaults to the max of 0. See \url{http://dev.socrata.com/docs/paging.html}. -#' @param output - in case of building URL manually, one of \code{c("csv", "json", "geojson")} +#' @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org} +#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}} #' -#' @section TODO: \url{https://github.com/Chicago/RSocrata/issues/14} -#' -#' @return a data frame with POSIX dates #' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} #' #' @examples -#' df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") -#' dfgjs <- read.socrata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") -#' df2 <- read.socrata(domain = "http://data.cityofchicago.org", fourByFour = "ydr8-5enu") +#' \dontrun{ +#' df_csv <- read.socrata(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv") +#' df_manual2 <- read.socrata(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu") +#' ## df_manual3<-read.socrata(url="http://data.cityofchicago.org/resource/ydr8-5enu") not working +#' lp<-read.socrata(url="http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") +#' } #' #' @importFrom httr parse_url build_url -#' @importFrom mime guess_type +#' @importFrom plyr rbind.fill #' #' @export -read.socrata <- function(url = NULL, app_token = NULL, domain = NULL, fourByFour = NULL, - query = NULL, limit = 50000, offset = 0, output = NULL) { +read.socrata <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL, + query = NULL) { - # check url syntax, allow human-readable Socrata url - validUrl <- validateUrl(url, app_token) - parsedUrl <- httr::parse_url(validUrl) - mimeType <- mime::guess_type(parsedUrl$path, unknown = "application/vnd.geo+json") - - # match args - output_args <- match.arg(output) - - if(!(mimeType %in% c("text/csv","application/json", "application/vnd.geo+json"))) { - stop(mimeType, " not a supported data format. 
Try JSON, CSV or GeoJSON.") + if (is.null(url) == TRUE) { + buildUrl <- paste0(domain, "resource/", fourByFour, ".json") + url <- httr::parse_url(buildUrl) } - response <- checkResponse(validUrl) - page <- getContentAsDataFrame(response) - result <- page + # check url syntax, allow human-readable Socrata url + validUrl <- validateUrl(url) + parsedUrl <- httr::parse_url(validUrl) - if(mimeType %in% c("text/csv","application/json")) { - dataTypes <- getSodaTypes(response) - } + response <- errorHandling(validUrl, app_token) + results <- getContentAsDataFrame(response) + dataTypes <- getSodaTypes(response) + rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1]) + ## More to come? Loop over pages implicitly - ## TODO: start here - while (nrow(page) > 0) { - query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"), "$offset=", nrow(result)) - response <- checkResponse(query_url) + while (nrow(results) < rowCount) { + query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"), "$offset=", nrow(results), "&$limit=", limit) + response <- errorHandling(query_url, app_token) page <- getContentAsDataFrame(response) - result <- rbind(result, page) # accumulate + results <- plyr::rbind.fill(results, page) # accumulate data } # Convert Socrata calendar dates to POSIX format - # Check for column names that are not NA and which dataType is a "calendar_date". If there are some, - # then convert them to POSIX format - for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == "calendar_date"]) { - result[[columnName]] <- posixify(result[[columnName]]) + # If sodaTypes are not null, check for column names that are not NA and which dataType + # is a "calendar_date". 
If there are some, then convert them to POSIX format + if (!is.null(dataTypes)) { + for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) + & dataTypes[fieldName(colnames(results))] == "calendar_date"]) { + results[[columnName]] <- posixify(results[[columnName]]) + } } - return(result) -} \ No newline at end of file + return(results) +} + + +#' Download GeoJSON data using geojsonio package +#' +#' @param ... - other arguments from \link{geojsonio} package for geojson_read method +#' @param url - A Socrata resource URL, requiring a .geojson suffix. +#' +#' @importFrom geojsonio geojson_read +#' @importFrom httr build_url parse_url +#' @importFrom mime guess_type +#' +#' @return Returns a list, which is the default option here. +#' +#' @examples +#' \dontrun{ +#' df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +#' } +#' +#' @export +read.socrataGEO <- function(url = "", ...) { + + parseUrl <- httr::parse_url(url) + mimeType <- mime::guess_type(parseUrl$path) + + if (mimeType == "application/vnd.geo+json") { + results <- geojsonio::geojson_read(url, method = "local", what = "list", parse = FALSE, ...) + } + + return(results) +} + +#' Get the SoDA 2 data types +#' +#' Get the Socrata Open Data Application Program Interface data types from the http response header. +#' Used only for CSV and JSON, not GeoJSON +#' +#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} +#' @param response - headers attribute from an httr response object +#' @return a named vector mapping field names to data types +#' @importFrom jsonlite fromJSON +#' @noRd +getSodaTypes <- function(response) { + + # check if types and fields are not null + if (!is.null(response$headers[['x-soda2-types']]) | !is.null(response$headers[['x-soda2-fields']])) { + + result <- jsonlite::fromJSON(response$headers[['x-soda2-types']]) + names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']]) + return(result) + + } else { + NULL + } + +} + diff --git a/R/utils.R b/R/utils.R index f58a1f8..ee3eece 100644 --- a/R/utils.R +++ b/R/utils.R @@ -7,17 +7,16 @@ #' #' @param fourByFour - a string; character vector of length one #' @return TRUE if is valid Socrata unique identifier, FALSE otherwise -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} et al. #' @examples #' isFourByFour(fourByFour = "4334-bgaj") #' isFourByFour("433-bgaj") #' isFourByFour(fourByFour = "4334-!gaj") -#' #' @export isFourByFour <- function(fourByFour = "") { if (nchar(fourByFour) == 9) { - if(identical(grepl("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour), TRUE)) { + if (identical(grepl("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour), TRUE)) { return(TRUE) } else { return(FALSE) @@ -28,39 +27,39 @@ isFourByFour <- function(fourByFour = "") { } -#' Convert Socrata human-readable column name to field name -#' -#' @description Convert Socrata human-readable column name, -#' as it might appear in the first row of data, -#' to field name as it might appear in the HTTP header; -#' that is, lower case, periods replaced with underscores -#' -#' @param humanName - a Socrata human-readable column name -#' @return Socrata field name in lower case -#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' fieldName("Number.of.Stations") # number_of_stations -#' -#' @export +# Convert Socrata human-readable column name to field name +# +# @description Convert Socrata human-readable column name, +# as it might appear in the first row of data, +# to field name as it might appear in the HTTP header; +# that is, lower case, periods replaced with underscores +# +# @param humanName - a Socrata human-readable column name +# @return Socrata field name in lower case +# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +# @examples +# fieldName("Number.of.Stations") # number_of_stations +# @noRd +# @export fieldName <- function(humanName = "") { tolower(gsub('\\.', '_', humanName)) } -#' Convert Socrata calendar_date string to POSIX -#' -#' @description Datasets will either specify what timezone they should be interpreted in, -#' or you can usually assume they are in the timezone of the publisher. See examples below too. -#' -#' @seealso \url{http://dev.socrata.com/docs/datatypes/floating_timestamp.html} -#' @param x - character vector in one of possible Socrata calendar_date formats -#' @return a POSIX date -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' posixify("2014-10-13T23:00:00") -#' posixify("09/14/2012 10:38:01 PM") -#' posixify("09/14/2012") -#' -#' @export +# Convert Socrata calendar_date string to POSIX +# +# @description Datasets will either specify what timezone they should be interpreted in, +# or you can usually assume they are in the timezone of the publisher. See examples below too. +# +# @seealso \url{http://dev.socrata.com/docs/datatypes/floating_timestamp.html} +# @param x - character vector in one of possible Socrata calendar_date formats +# @return a POSIX date +# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} et al. 
+# @examples +# posixify("2014-10-13T23:00:00") +# posixify("09/14/2012 10:38:01 PM") +# posixify("09/14/2012") +# @noRd +# @export posixify <- function(x = "") { # https://github.com/Chicago/RSocrata/issues/24 @@ -78,12 +77,40 @@ posixify <- function(x = "") { } else if (any(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x)[1] == TRUE)) { # short date format - strptime(x, format="%m/%d/%Y") + strptime(x, format = "%m/%d/%Y") } else { # long date-time format - strptime(x, format="%m/%d/%Y %I:%M:%S %p") + strptime(x, format = "%m/%d/%Y %I:%M:%S %p") } -} \ No newline at end of file +} + +# Clean everything after "?", "&" or "." +# +# @source https://stackoverflow.com/questions/5631384/remove-everything-after-a-certain-character +# @source http://rfunction.com/archives/1499 +# +# @examples +# cleanQuest(url = "http://data.cityofchicago.org/resource/y93d-d9e3.csv?%24order=debarment_date&%24limit=50000") +# @returns http://data.cityofchicago.org/resource/y93d-d9e3.csv +# @author John Malc \email{cincenko@@outlook.com} +# @export +cleanQuest <- function(url = "") { + cleanURL <- strsplit(url, "?", fixed = TRUE) + return(cleanURL[[1]][1]) +} + +# @export +cleanAmp <- function(url = "") { + cleanURL <- strsplit(url, "&", fixed = TRUE) + return(cleanURL[[1]][1]) +} + +# @export +cleanDot <- function(url = "") { + cleanURL <- strsplit(url, ".", fixed = TRUE) + return(cleanURL[[1]][1]) +} + diff --git a/R/validateURL.R b/R/validateURL.R index 9e4eb96..8475a63 100644 --- a/R/validateURL.R +++ b/R/validateURL.R @@ -1,4 +1,4 @@ -#' Convert, if necessary, URL to valid REST API URL supported by Socrata. +#' Check if the URL is a valid one and supported by RSocrata (!). #' #' @description Will convert a human-readable URL to a valid REST API call #' supported by Socrata. It will accept a valid API URL if provided @@ -6,57 +6,77 @@ #' URL. Will accept queries with optional API token as a separate #' argument or will also accept API token in the URL query. 
Will #' resolve conflicting API token by deferring to original URL. -#' +#' #' @param url - a string; character vector of length one -#' @param app_token - a string; SODA API token used to query the data -#' portal \url{http://dev.socrata.com/consumers/getting-started.html} -#' @return a - valid Url -#' @importFrom httr parse_url build_url -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} -#' @examples +#' +#' @return a valid URL used for downloading data +#' +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} et al. +#' +#' @examples #' \dontrun{ -#' validateUrl(url = "a.fake.url.being.tested", app_token = "ew2rEMuESuzWPqMkyPfOSGJgE") +#' validateUrl(url = "a.fake.url.being.tested") +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj") +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv") +#' validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.json") +#' validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.xml") +#' validateUrl(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +#' validateUrl(url = "http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") #' } -#' validateUrl(url = "https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj", -#' app_token="ew2rEMuESuzWPqMkyPfOSGJgE") +#' +#' @importFrom httr parse_url build_url +#' @importFrom mime guess_type #' #' @export -validateUrl <- function(url = "", app_token = NULL) { +validateUrl <- function(url = "") { parsedUrl <- httr::parse_url(url) - if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) { + if ( is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) { stop(url, " does not appear to be a valid URL.") } - if(!is.null(app_token)) { # Handles the addition of API token and resolves invalid uses + fourByFour <- 
basename(parsedUrl$path) + if (!isFourByFour(cleanDot(fourByFour))) { + stop(fourByFour, " is not a valid Socrata dataset unique identifier.") + } + + if ( parsedUrl$scheme == "http") { + parsedUrl$scheme <- "https" + } + + # http://stackoverflow.com/a/7964098 + # For TRANSITION: it will be easier for users of CSV format to adapt to the JSON, by warning them. + # + # First, if suffix is CSV/XML, delete it and replace with JSON. + # Later, check if URL doesn't have JSON, i.e. has empty suffix, and if it does append JSON. + mimeType <- mime::guess_type(parsedUrl$path) + + if ( mimeType %in% c("text/csv", "application/xml")) { + parsedUrl$path <- substr(parsedUrl$path, 1, nchar(parsedUrl$path) - 4) # delete + parsedUrl$path <- paste0(parsedUrl$path, ".json") # add + message("BEWARE: Your suffix is no longer supported. Thus, we will automatically replace it with JSON.") - if(is.null(parsedUrl$query["$$app_token"])) { - token_inclusion <- "valid_use" - } else { - token_inclusion <- "already_included" - } + } else if ( mimeType == "application/json") { + # do nothing + } else if ( mimeType == "application/vnd.geo+json") { + message("For GeoJSON, you must use a new method: read.socrataGEO") - switch(token_inclusion, - "already_included" = { # Token already included in url argument - warning(url, " already contains an API token in url. Ignoring user-defined token.") - }, - "valid_use" = { # app_token argument is used, not duplicative. - parsedUrl$query[["app_token"]] <- paste0("%24%24app_token=", app_token) - } - ) + } else if ( mimeType == "text/plain") { + parsedUrl$path <- paste0(parsedUrl$path, ".json") - } - - if(substr(parsedUrl$path, 1, 9) == 'resource/') { - return(httr::build_url(parsedUrl)) # resource url already + } else { + stop(mimeType, " has never been supported. Use JSON instead. 
For GeoJSON use a new method: read.socrataGEO") } - fourByFour <- basename(parsedUrl$path) - if(!isFourByFour(fourByFour)) { - stop(fourByFour, " is not a valid Socrata dataset unique identifier.") + if ( substr(parsedUrl$path, 1, 9) == "resource/") { + return(httr::build_url(parsedUrl)) # resource url already } else { - parsedUrl$path <- paste0('resource/', fourByFour, '.csv') - httr::build_url(parsedUrl) - } + message("BEWARE: RSocrata no longer supports human readable URLs, thus we will convert it to the SODA ones (with resource/xxxx-xxxx.json) instead.") + parsedUrl$path <- paste0("resource/", cleanDot(fourByFour), ".json") + return(httr::build_url(parsedUrl)) # resource url already + } -} \ No newline at end of file +} + + diff --git a/README.md b/README.md index 3f05ff5..09da00b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Supports [SoDA query parameters](http://dev.socrata.com/docs/queries.html) in th Use ```ls.socrata()``` to list all datasets available on a Socrata webserver. -This package uses [testthat](http://cran.r-project.org/package=testthat) test coverage. +This package uses [`testthat`](http://cran.r-project.org/package=testthat) test coverage. ### Installation @@ -36,29 +36,32 @@ devtools::install_github("Chicago/RSocrata") **OR** -on [CRAN](http://cran.r-project.org/package=RSocrata) +from [CRAN](http://cran.r-project.org/package=RSocrata): + +``` +install.packages("RSocrata") +``` **Beware**: -For the support of `GeoJSON` (which is optional), it is necessary to install [geojsonio](https://github.com/ropensci/geojsonio) correctly! +In order to support `GeoJSON` (which is semi-optional), it is necessary to install [geojsonio](https://github.com/ropensci/geojsonio) correctly! 
This depends on packages such as `rgdal` & `rgeos` (both on CRAN), which additionally on Linux you will need to install through `apt-get`: `sudo apt-get install libgdal1-dev libgdal-dev libgeos-c1 libproj-dev` -Then install both of them use: +Then install both CRAN packages using: ``` -install.packages("rgdal") -install.packages("rgeos") +install.packages(c("rgdal", "rgeos")) ``` ### Examples & Chanelog -Look for examples in the `vignette` folder and see `NEWS.MD` in the root of this repository. +Look for examples in the [`vignette` folder](https://github.com/Chicago/RSocrata/blob/dev/vignettes/Examples.Rmd) and see `NEWS` in the root of this repository. ### Issues -Please report issues, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). +**Please report issues**, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). ### Contributing diff --git a/man/fieldName.Rd b/man/fieldName.Rd deleted file mode 100644 index 4f74b5b..0000000 --- a/man/fieldName.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/utils.R -\name{fieldName} -\alias{fieldName} -\title{Convert Socrata human-readable column name to field name} -\usage{ -fieldName(humanName = "") -} -\arguments{ -\item{humanName}{- a Socrata human-readable column name} -} -\value{ -Socrata field name in lower case -} -\description{ -Convert Socrata human-readable column name, -as it might appear in the first row of data, -to field name as it might appear in the HTTP header; -that is, lower case, periods replaced with underscores -} -\examples{ -fieldName("Number.of.Stations") # number_of_stations -} -\author{ -Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@cityofchicago.org} -} - diff --git a/man/getMetadata.Rd b/man/getMetadata.Rd new file mode 100644 index 0000000..e2510d2 --- /dev/null +++ b/man/getMetadata.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/metadata.R +\name{getMetadata} +\alias{getMetadata} +\title{Return metadata about a Socrata dataset} +\source{ +\url{http://stackoverflow.com/a/29782941} +} +\usage{ +getMetadata(url = "") +} +\arguments{ +\item{url}{- A Socrata resource URL, or a Socrata "human-friendly" URL!} +} +\value{ +a list (!) containing a number of rows & columns and a data frame of metadata +} +\description{ +This function returns metadata about a dataset. Generally, such metadata can be accessed +with browser at \code{http://DOMAIN/api/views/FOUR-FOUR/rows.json} or +\code{http://DOMAIN/api/views/FOUR-FOUR/columns.json}, which is used here. +} +\examples{ +\dontrun{ +gM1 <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") +gM3 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.json") +gM2 <- getMetadata(url = "https://data.cityofboston.gov/resource/awu8-dc52") +} +} +\author{ +John Malc \email{cincenko@outlook.com} +} + diff --git a/man/isFourByFour.Rd b/man/isFourByFour.Rd index 68af7d1..c002d37 100644 --- a/man/isFourByFour.Rd +++ b/man/isFourByFour.Rd @@ -24,6 +24,6 @@ isFourByFour("433-bgaj") isFourByFour(fourByFour = "4334-!gaj") } \author{ -Tom Schenk Jr \email{tom.schenk@cityofchicago.org} +Tom Schenk Jr \email{tom.schenk@cityofchicago.org} et al. 
} diff --git a/man/posixify.Rd b/man/posixify.Rd deleted file mode 100644 index 3bb20fd..0000000 --- a/man/posixify.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/utils.R -\name{posixify} -\alias{posixify} -\title{Convert Socrata calendar_date string to POSIX} -\usage{ -posixify(x = "") -} -\arguments{ -\item{x}{- character vector in one of possible Socrata calendar_date formats} -} -\value{ -a POSIX date -} -\description{ -Datasets will either specify what timezone they should be interpreted in, -or you can usually assume they are in the timezone of the publisher. See examples below too. -} -\examples{ -posixify("2014-10-13T23:00:00") -posixify("09/14/2012 10:38:01 PM") -posixify("09/14/2012") -} -\author{ -Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@cityofchicago.org} -} -\seealso{ -\url{http://dev.socrata.com/docs/datatypes/floating_timestamp.html} -} - diff --git a/man/read.socrata.Rd b/man/read.socrata.Rd index 3e8f4a9..968b895 100644 --- a/man/read.socrata.Rd +++ b/man/read.socrata.Rd @@ -4,46 +4,39 @@ \alias{read.socrata} \title{Get a full Socrata data set as an R data frame} \usage{ -read.socrata(url = NULL, app_token = NULL, domain = NULL, - fourByFour = NULL, query = NULL, limit = 50000, offset = 0, - output = c("csv", "json", "geojson")) +read.socrata(url = NULL, app_token = NULL, limit = 50000, domain = NULL, + fourByFour = NULL, query = NULL) } \arguments{ \item{url}{- A Socrata resource URL, or a Socrata "human-friendly" URL, or Socrata Open Data Application Program Interface (SODA) query -requesting a comma-separated download format (.csv suffix), +requesting a comma-separated download format (.json suffix), May include SoQL parameters, and it is now assumed to include SODA \code{limit} & \code{offset} parameters. -Either use a compelete URL, e.g. \code{} or use parameters below to construct your URL. 
-But don't combine them.} +Either use a compelete URL or use parameters below to construct your URL.} \item{app_token}{- a (non-required) string; SODA API token can be used to query the data portal \url{http://dev.socrata.com/consumers/getting-started.html}} +\item{limit}{- defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}.} + \item{domain}{- A Socrata domain, e.g \url{http://data.cityofchicago.org}} \item{fourByFour}{- a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}}} \item{query}{- Based on query language called the "Socrata Query Language" ("SoQL"), see \url{http://dev.socrata.com/docs/queries.html}.} - -\item{limit}{- defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}.} - -\item{offset}{- defaults to the max of 0. See \url{http://dev.socrata.com/docs/paging.html}.} -} -\value{ -a data frame with POSIX dates } \description{ -Manages throttling and POSIX date-time conversions. -} -\section{TODO}{ - \url{https://github.com/Chicago/RSocrata/issues/14} +Manages throttling and POSIX date-time conversions. We support only .json suffix. } \examples{ -df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") -dfgjs <- read.socrata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") -df2 <- read.socrata(domain = "http://data.cityofchicago.org", fourByFour = "ydr8-5enu") +\dontrun{ +df_csv <- read.socrata(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv") +df_manual2 <- read.socrata(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu") +## df_manual3<-read.socrata(url="http://data.cityofchicago.org/resource/ydr8-5enu") not working +lp<-read.socrata(url="http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") +} } \author{ Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@cityofchicago.org} diff --git a/man/read.socrataGEO.Rd b/man/read.socrataGEO.Rd new file mode 100644 index 0000000..40ae5bb --- /dev/null +++ b/man/read.socrataGEO.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/returnData.R +\name{read.socrataGEO} +\alias{read.socrataGEO} +\title{Download GeoJSON data using geojsonio package} +\usage{ +read.socrataGEO(url = "", ...) +} +\arguments{ +\item{url}{- A Socrata resource URL, requiring a .geojson suffix.} + +\item{...}{- other arguments from \link{geojsonio} package for geojson_read method} +} +\value{ +Returns a list, which is the default option here. +} +\description{ +Download GeoJSON data using geojsonio package +} +\examples{ +\dontrun{ +df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +} +} + diff --git a/man/validateUrl.Rd b/man/validateUrl.Rd index b8edaa7..be2cac9 100644 --- a/man/validateUrl.Rd +++ b/man/validateUrl.Rd @@ -2,18 +2,15 @@ % Please edit documentation in R/validateURL.R \name{validateUrl} \alias{validateUrl} -\title{Convert, if necessary, URL to valid REST API URL supported by Socrata.} +\title{Check if the URL is a valid one and supported by RSocrata (!).} \usage{ -validateUrl(url = "", app_token = NULL) +validateUrl(url = "") } \arguments{ \item{url}{- a string; character vector of length one} - -\item{app_token}{- a string; SODA API token used to query the data -portal \url{http://dev.socrata.com/consumers/getting-started.html}} } \value{ -a - valid Url +a valid URL used for downloading data } \description{ Will convert a human-readable URL to a valid REST API call @@ -25,12 +22,17 @@ resolve conflicting API token by deferring to original URL. 
} \examples{ \dontrun{ -validateUrl(url = "a.fake.url.being.tested", app_token = "ew2rEMuESuzWPqMkyPfOSGJgE") +validateUrl(url = "a.fake.url.being.tested") +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj") +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv") +validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.json") +validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.xml") +validateUrl(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +validateUrl(url = "http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") } -validateUrl(url = "https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj", -app_token="ew2rEMuESuzWPqMkyPfOSGJgE") } \author{ -Tom Schenk Jr \email{tom.schenk@cityofchicago.org} +Tom Schenk Jr \email{tom.schenk@cityofchicago.org} et al. } diff --git a/tests/testthat.R b/tests/testthat.R index 8604f6c..8196bc4 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,5 @@ library(testthat) library(RSocrata) +library(geojsonio) test_check("RSocrata") diff --git a/tests/testthat/test-all.R b/tests/testthat/test-all.R index 407c4db..99b2d7c 100644 --- a/tests/testthat/test-all.R +++ b/tests/testthat/test-all.R @@ -3,126 +3,83 @@ library(RSocrata) library(httr) library(jsonlite) library(mime) +library(geojsonio) context("read Socrata") test_that("read Socrata CSV", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv') + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv") expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") + expect_equal(11, ncol(df), label="columns") }) -test_that("read Socrata JSON", { - df <- read.socrata(url = 'https://soda.demo.socrata.com/resource/4334-bgaj.json') +test_that("read Socrata JSON with HTTP", { + df <- read.socrata(url = 
"https://soda.demo.socrata.com/resource/4334-bgaj.json") expect_equal(1007, nrow(df), label="rows") expect_equal(11, ncol(df), label="columns") }) test_that("read Socrata No Scheme", { - expect_error(read.socrata('soda.demo.socrata.com/resource/4334-bgaj.csv')) + expect_error(read.socrata("soda.demo.socrata.com/resource/4334-bgaj.csv")) }) test_that("read SoQL", { - df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=region') + skip("because of query") + skip_on_cran() + skip_on_travis() + df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.json?$select=region") expect_equal(1007, nrow(df), label="rows") expect_equal(1, ncol(df), label="columns") }) test_that("read SoQL Column Not Found (will fail)", { + skip("because of query") + skip_on_cran() + skip_on_travis() # SoQL API uses field names, not human names - expect_error(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region')) + expect_error(read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region")) }) test_that("URL is private (Unauthorized) (will fail)", { - expect_error(read.socrata('http://data.cityofchicago.org/resource/j8vp-2qpg.json')) + expect_error(read.socrata("http://data.cityofchicago.org/resource/j8vp-2qpg.json")) }) -test_that("read Socrata Human Readable", { - df <- read.socrata(url="https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") +test_that("it will not read Socrata Human Readable URL", { + expect_warning(read.socrata(url="https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj")) + # expect_equal(1007, nrow(df), label="rows") + # expect_equal(9, ncol(df), label="columns") }) test_that("format is not supported", { # Unsupported data formats - expect_error(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.xml')) -}) - -context("Test Socrata with Token") - 
-test_that("CSV with Token", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv', app_token="ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") -}) - - -test_that("readSocrataHumanReadableToken", { - df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj', app_token="ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") -}) - -test_that("API Conflict", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") -}) - -test_that("readAPIConflictHumanReadable", { - df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") -}) - -test_that("incorrect API Query", { - # The query below is missing a $ before app_token. - expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) - # Check that it was only because of missing $ - df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") + expect_message(read.socrata(url="http://soda.demo.socrata.com/resource/4334-bgaj.xml"), + "BEWARE: Your suffix is no longer supported. Thus, we will automatically replace it with JSON.") }) - -test_that("incorrect API Query Human Readable", { - # The query below is missing a $ before app_token. 
- expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) - # Check that it was only because of missing $ - df <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), label="rows") - expect_equal(9, ncol(df), label="columns") -}) - -# TODO # https://github.com/Chicago/RSocrata/issues/19 test_that("A JSON test with uneven row lengths", { - skip_on_cran() - skip_on_travis() - skip("Not done") # working with bare jsonlite::fromJSON - # Both should be OK data <- read.socrata(url = "https://data.cityofchicago.org/resource/kn9c-c2s2.json") - awqe <- read.socrata("http://data.ny.gov/resource/eda3-in2f.json") - - expect_that(ncol(data) > 10) + awqe <- read.socrata(url = "http://data.ny.gov/resource/eda3-in2f.json") + # df_manual3 <- read.socrata(url="http://data.cityofchicago.org/resource/ydr8-5enu.json") + expect_more_than(ncol(awqe), 26) + expect_more_than(ncol(data), 8) }) -# TODO # https://github.com/Chicago/RSocrata/issues/14 test_that("RSocrata hangs when passing along SoDA queries with small number of results ", { skip_on_cran() skip_on_travis() - skip("Not done") + skip("Test works, but is just to large & long to run it") - df500 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=500") # Hangs - df250 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=250") # Hangs - df100 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=100") # Hangs - df50 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=50") # Hangs - df25 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=25") # Hangs - df10 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=10") # Hangs - df5 <- 
read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=5") # Hangs - df1 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json?$limit=1") # Hangs - df <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json") # Success + df500 <- read.socrata(url = "https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =500) + df250 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =250) + df100 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =100) + df50 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =50) + df25 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =25) + df10 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =10) + df5 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =5) + df1 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =1) + df <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json") }) diff --git a/tests/testthat/test-dateTime.R b/tests/testthat/test-dateTime.R index 7ca5769..dff7f7a 100644 --- a/tests/testthat/test-dateTime.R +++ b/tests/testthat/test-dateTime.R @@ -10,72 +10,70 @@ context("Test posixify function") test_that("posixify returns Long format", { dt <- posixify("09/14/2012 10:38:01 PM") - expect_equal("POSIXlt", class(dt)[1], label="first data type of a date") - expect_equal(2012, dt$year + 1900, label="year") - expect_equal(9, dt$mon + 1, label="month") - expect_equal(14, dt$mday, label="day") - expect_equal(22, dt$hour, label="hours") - expect_equal(38, dt$min, label="minutes") - expect_equal(1, dt$sec, label="seconds") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + 
expect_equal(22, dt$hour, label = "hours") + expect_equal(38, dt$min, label = "minutes") + expect_equal(1, dt$sec, label = "seconds") }) test_that("posixify returns Short format", { dt <- posixify("09/14/2012") - expect_equal("POSIXlt", class(dt)[1], label="first data type of a date") - expect_equal(2012, dt$year + 1900, label="year") - expect_equal(9, dt$mon + 1, label="month") - expect_equal(14, dt$mday, label="day") - expect_equal(0, dt$hour, label="hours") - expect_equal(0, dt$min, label="minutes") - expect_equal(0, dt$sec, label="seconds") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + expect_equal(0, dt$hour, label = "hours") + expect_equal(0, dt$min, label = "minutes") + expect_equal(0, dt$sec, label = "seconds") }) test_that("posixify new Floating Timestamp format", { dt <- posixify("2014-10-13T23:25:47") - expect_equal("POSIXlt", class(dt)[1], label="first data type of a date") - expect_equal(2014, dt$year + 1900, label="year") - expect_equal(25, dt$min, label="minutes") - expect_equal(47, dt$sec, label="seconds") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2014, dt$year + 1900, label = "year") + expect_equal(25, dt$min, label = "minutes") + expect_equal(47, dt$sec, label = "seconds") }) -# TODO test_that("NA datetime in source (JSON)", { # https://github.com/Chicago/RSocrata/issues/24 # https://github.com/Chicago/RSocrata/issues/27 - skip("Test finished") # working with bare jsonlite::fromJson; not implemented in RSocrata + skip("Dataframe is just to big, over 600k. 
rows") skip_on_cran() skip_on_travis() df <- read.socrata(url = "https://data.cityofboston.gov/resource/awu8-dc52.json") - expect_equal(sum(is.na(df$target_dt)), 194) - expect_that(ncol(df) > 10) + df_met <- getMetadata("https://data.cityofboston.gov/City-Services/311-Service-Requests/awu8-dc52") + # expect_equal(sum(is.na(df$target_dt)), 194) + expect_more_than(ncol(df_met[[2]]), 10) }) context("Socrata Calendar") test_that("Calendar Date Long", { - df <- read.socrata(url = 'http://soda.demo.socrata.com/resource/4334-bgaj.csv') - dt <- df$Datetime[1] # "2012-09-14 22:38:01" - expect_equal("POSIXlt", class(dt)[1], label="data type of a date") - expect_equal(2012, dt$year + 1900, label="year") - expect_equal(9, dt$mon + 1, label="month") - expect_equal(14, dt$mday, label="day") - expect_equal(22, dt$hour, label="hours") - expect_equal(38, dt$min, label="minutes") - expect_equal(1, dt$sec, label="seconds") + df <- read.socrata(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") + dt <- df$datetime[1] # "2012-09-14 22:38:01" + expect_equal("POSIXlt", class(dt)[1], label = "data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + expect_equal(22, dt$hour, label = "hours") + expect_equal(38, dt$min, label = "minutes") + expect_equal(1, dt$sec, label = "seconds") }) test_that("Calendar Date Short", { - df <- read.socrata('http://data.cityofchicago.org/resource/y93d-d9e3.csv?$order=debarment_date') - dt <- df$DEBARMENT.DATE[1] # "05/21/1981" - expect_equal("POSIXlt", class(dt)[1], label="data type of a date") - expect_equal(81, dt$year, label="year") - expect_equal(5, dt$mon + 1, label="month") - expect_equal(21, dt$mday, label="day") - expect_equal(0, dt$hour, label="hours") - expect_equal(0, dt$min, label="minutes") - expect_equal(0, dt$sec, label="seconds") + df <- read.socrata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") + df <- 
df[with(df, order(debarment_date)), ] + dt <- df$debarment_date[1] # "05/21/1981" + expect_equal("POSIXlt", class(dt)[1], label = "data type of a date") + expect_equal(81, dt$year, label = "year") + expect_equal(5, dt$mon + 1, label = "month") + expect_equal(21, dt$mday, label = "day") + expect_equal(0, dt$hour, label = "hours") + expect_equal(0, dt$min, label = "minutes") + expect_equal(0, dt$sec, label = "seconds") }) - - - diff --git a/tests/testthat/test-fourByFour.R b/tests/testthat/test-fourByFour.R index b8ac1b1..ac3aa53 100644 --- a/tests/testthat/test-fourByFour.R +++ b/tests/testthat/test-fourByFour.R @@ -16,8 +16,8 @@ test_that("is 4x4", { test_that("URLs contain 4x4 format", { - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc"), "4334c-bgajc is not a valid Socrata dataset unique identifier", label="11 characters instead of 9") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bga"), "433-bga is not a valid Socrata dataset unique identifier", label="7 characters instead of 9") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj"), "433-bgaj is not a valid Socrata dataset unique identifier", label="3 characters before dash instead of 4") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj"), "4334-!gaj is not a valid Socrata dataset unique identifier", label="non-alphanumeric character") + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bga")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj")) }) diff --git a/tests/testthat/test-geo.R b/tests/testthat/test-geo.R index 59140e2..1568a8c 100644 --- a/tests/testthat/test-geo.R +++ b/tests/testthat/test-geo.R @@ -8,6 +8,7 @@ library(geojsonio) context("Geospatial JSON") test_that("fetches 
GeoJSON data", { - geodf <- geojson_read("https://data.cityofchicago.org/resource/6zsd-86xi.geojson", method = "local", parse = FALSE, what = "list") + geodf <- read.socrataGEO("https://data.cityofchicago.org/resource/6zsd-86xi.geojson", + method = "local", parse = FALSE, what = "list") expect_equal(geodf$type, "FeatureCollection") -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-metadata.R b/tests/testthat/test-metadata.R new file mode 100644 index 0000000..b50c864 --- /dev/null +++ b/tests/testthat/test-metadata.R @@ -0,0 +1,17 @@ +library(testthat) +library(RSocrata) +library(httr) +library(jsonlite) +library(mime) + +context("Checks metadata") + +test_that("it returns some number of rows", { + nr <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") + expect_more_than(nr[[1]], 141) + nr2 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.json") + expect_more_than(nr2[[1]], 5878398) +}) + + + diff --git a/tests/testthat/test-token.R b/tests/testthat/test-token.R new file mode 100644 index 0000000..2d31305 --- /dev/null +++ b/tests/testthat/test-token.R @@ -0,0 +1,59 @@ +library(testthat) +library(RSocrata) +library(httr) +library(jsonlite) +library(mime) + +context("Test Socrata with Token") + +test_that("CSV with Token", { + df <- read.socrata(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv", + app_token="ew2rEMuESuzWPqMkyPfOSGJgE") + + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + + +test_that("it will not read Socrata Human Readable URL with Token", { + df <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj", + app_token="ew2rEMuESuzWPqMkyPfOSGJgE") + + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + +test_that("API Conflict", { + expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE", + 
app_token="ew2rEMuESuzWPqMkyPfOSUSER")) + + # expect_equal(1007, nrow(df), label="rows") + # expect_equal(9, ncol(df), label="columns") +}) + +test_that("read API Conflict HumanReadable", { + expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE", + app_token="ew2rEMuESuzWPqMkyPfOSUSER")) + + # expect_equal(1007, nrow(df), label="rows") + # expect_equal(11, ncol(df), label="columns") +}) + +test_that("incorrect API Query", { + # The query below is missing a $ before app_token. + expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) + + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv", app_token= "ew2rEMuESuzWPqMkyPfOSGJgE") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + + +test_that("incorrect API Query Human Readable", { + # The query below is missing a $ before app_token. 
+ expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) + + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj", app_token= "ew2rEMuESuzWPqMkyPfOSGJgE") + expect_equal(1007, nrow(df), label = "rows") + expect_equal(11, ncol(df), label = "columns") +}) \ No newline at end of file diff --git a/tests/testthat/test-validURL.R b/tests/testthat/test-validURL.R index bbe63d5..59542a4 100644 --- a/tests/testthat/test-validURL.R +++ b/tests/testthat/test-validURL.R @@ -7,14 +7,32 @@ library(mime) context("Validate URL") test_that("Invalid URL", { - expect_error(read.socrata("a.fake.url.being.tested"), "a.fake.url.being.tested does not appear to be a valid URL", label="invalid url") + expect_error(read.socrata("a.fake.url.being.tested")) }) -test_that("function is calling the API token specified in url", { - expect_true(substr(validateUrl('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', - app_token="ew2rEMuESuzWPqMkyPfOSUSER"), 70, 94) == "ew2rEMuESuzWPqMkyPfOSGJgE") - - expect_true(substr(validateUrl('https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', - app_token="ew2rEMuESuzWPqMkyPfOSUSER"), 70, 94) == "ew2rEMuESuzWPqMkyPfOSGJgE") - -}) \ No newline at end of file +test_that("human readable URLs are not supported", { + expect_output(validateUrl("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("http will get replaced with HTTPS and JSON", { + expect_output(validateUrl("http://soda.demo.socrata.com/resource/4334-bgaj.csv"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("URL with no suffix will get JSON one", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj"), + 
"https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("nothing happens with URL", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("CSV will get replaced with JSON", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + + diff --git a/vignettes/Examples.Rmd b/vignettes/Examples.Rmd index a288250..1e1e720 100644 --- a/vignettes/Examples.Rmd +++ b/vignettes/Examples.Rmd @@ -11,10 +11,6 @@ vignette: > ```{r} library(RSocrata) - -# for geo support -library(leaflet) -library(geojsonio) ``` ### Example: Reading SoDA valid URLs @@ -26,7 +22,7 @@ class(earthquakesDataFrame$Datetime[1]) # POSIXlt ### Example: Reading "human-readable" URLs ```{r} -earthquakesDataFrame <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API-School-Demo/4334-bgaj") +earthquakesDataFrame <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj") nrow(earthquakesDataFrame) # 1007 (two "pages") class(earthquakesDataFrame$Datetime[1]) # POSIXlt ``` @@ -45,17 +41,25 @@ nrow(allSitesDataFrame) # Number of datasets allSitesDataFrame$title # Names of each dataset ``` -### Geo & Leaflet +### GEO & Leaflet ```{r} +# for geo support +library(leaflet) + +geo_df <- read.socrataGEO("https://data.cityofchicago.org/resource/6zsd-86xi.geojson") -geo_df <- geojson_read("https://data.cityofchicago.org/resource/6zsd-86xi.geojson", method = "local", parse = FALSE, what = "list") +# OR classically +# library(geojsonio) +# geo_df <- geojsonio::geojson_read("https://data.cityofchicago.org/resource/6zsd-86xi.geojson", method = "local", parse = TRUE, what = "list") m <- leaflet() %>% addGeoJSON(geo_df) %>% - setView(-87.6, 41.8, zoom = 10) %>% - addTiles() + setView(-87.6, 41.8, zoom = 15) %>% + addTiles(attribution = paste( + '© 
OpenStreetMap contributors', + '© CartoDB' + )) m - ``` diff --git a/vignettes/bench.Rmd b/vignettes/bench.Rmd new file mode 100644 index 0000000..f422676 --- /dev/null +++ b/vignettes/bench.Rmd @@ -0,0 +1,73 @@ +--- +title: "Benchmark binding of rows: 3 examples" +author: "John Malc (@dmpe)" +date: "September 7, 2015" +output: html_document +--- + +In order to run this benchmark, you must load this file: https://gist.github.com/dmpe/5aec87f0c7a5ae2115ca + +```{r} +library("microbenchmark") +library("ggplot2") +library("RSocrata") +# source("https://gist.github.com/dmpe/5aec87f0c7a5ae2115ca") +set.seed(5125) +``` + +A smaller dataset. + +```{r, echo=TRUE, eval=FALSE} +# https://github.com/hadley/dplyr/issues/1162#issuecomment-137870577 +small <- microbenchmark( + read.socrataRBIND("https://soda.demo.socrata.com/resource/4334-bgaj.csv"), + read.socrataPLYR("https://soda.demo.socrata.com/resource/4334-bgaj.csv"), + read.socrataDATATABLE("https://soda.demo.socrata.com/resource/4334-bgaj.csv"), + times = 3L, + unit = "s" +) + +small +boxplot(small) + +``` + +A large one. + +```{r, echo=TRUE, eval=FALSE} + +# 5878399 rows +# https://data.cityofchicago.org/resource/6zsd-86xi.csv +# 49142 +# https://data.ny.gov/resource/hrvs-fxs2.csv +# 618029 +# https://data.ny.gov/resource/cwsm-2ns3.json +bigger <- microbenchmark( + read.socrataRBIND("https://data.ny.gov/resource/cwsm-2ns3.csv"), + read.socrataPLYR("https://data.ny.gov/resource/cwsm-2ns3.csv"), + read.socrataDPLYR("https://data.ny.gov/resource/cwsm-2ns3.csv"), + read.socrataDATATABLE("https://data.ny.gov/resource/cwsm-2ns3.csv"), + times = 3L, + unit = "s" +) + +bigger +boxplot(bigger) +``` + +Another one. + +```{r, echo=TRUE, eval=FALSE} +big <- microbenchmark( + read.socrataRBIND("https://data.ny.gov/resource/hrvs-fxs2.csv"), + read.socrataPLYR("https://data.ny.gov/resource/hrvs-fxs2.csv"), + read.socrataDATATABLE("https://data.ny.gov/resource/hrvs-fxs2.csv"), + times = 3L, + unit = "s" +) + +big +boxplot(big) + +``` +