diff --git a/R/construct_api_requests.R b/R/construct_api_requests.R
index 1016f590..3d2279f2 100644
--- a/R/construct_api_requests.R
+++ b/R/construct_api_requests.R
@@ -43,37 +43,11 @@
construct_api_requests <- function(service,
properties = NA_character_,
bbox = NA,
- limit = 10000,
+ limit = NA,
+ max_results = NA,
skipGeometry = FALSE,
...){
- schema <- check_OGC_requests(endpoint = service,
- type = "schema")
- all_properties <- names(schema$properties)
-
- if(!all(is.na(properties))){
- match.arg(properties, choices = all_properties,
- several.ok = TRUE)
- }
-
- use_sf <- all(pkg.env$local_sf)
-
- if(!use_sf){
- skipGeometry <- TRUE
- }
-
- if(all(all_properties[!all_properties %in% c("id", "geometry")] %in% properties)) {
- # Cleans up URL if we're asking for everything
- properties <- NA_character_
- } else {
- if(all(!is.na(properties))){
- properties <- gsub("-", "_", properties)
- properties <- properties[!properties %in% c("id",
- "geometry",
- paste0(gsub("-", "_", service), "_id"))]
- }
- }
-
baseURL <- setup_api(service)
POST <- FALSE
@@ -89,7 +63,19 @@ construct_api_requests <- function(service,
get_list <- full_list[names(full_list) %in% single_params]
get_list[["skipGeometry"]] <- skipGeometry
- get_list[["limit"]] <- limit
+
+ if(is.na(limit)){
+ if(!is.na(max_results)){
+ get_list[["limit"]] <- max_results
+ } else {
+ get_list[["limit"]] <- 10000
+ }
+ } else {
+ if(!is.na(max_results)){
+ if(limit > max_results) stop("limit cannot be greater than max_result")
+ }
+ get_list[["limit"]] <- limit
+ }
post_list <- full_list[!names(full_list) %in% single_params]
@@ -177,6 +163,111 @@ setup_api <- function(service){
}
+#' Switch endpoint id arg
+#'
+#' @noRd
+#' @return list
+#' @examples
+#'
+#' l1 <- list("id" = "1234")
+#' dataRetrieval:::switch_arg_id(l1,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+#'
+#' l2 <- list("monitoring_location_id" = "1234")
+#' dataRetrieval:::switch_arg_id(l2,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+#'
+#' l3 <- list("monitoring_locations_id" = "1234")
+#' dataRetrieval:::switch_arg_id(l3,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+#'
+switch_arg_id <- function(ls, id_name, service){
+  # Endpoint-style spelling of the id argument: the service name with
+  # hyphens turned into underscores, followed by "_id".
+  endpoint_id <- paste0(gsub("-", "_", service), "_id")
+  arg_names <- names(ls)
+
+  # An existing "id" entry is left alone. Otherwise "id" is filled from the
+  # endpoint-style entry when that name is present, else from `id_name`.
+  if(!("id" %in% arg_names)){
+    source_name <- if(endpoint_id %in% arg_names) endpoint_id else id_name
+    ls[["id"]] <- ls[[source_name]]
+  }
+
+  # Remove both aliases so that only "id" is left in the list.
+  ls[[endpoint_id]] <- NULL
+  ls[[id_name]] <- NULL
+  ls
+}
+
+#' Switch properties id
+#'
+#' @noRd
+#' @return character vector of properties to request (`NA_character_` when all are requested)
+#' @examples
+#'
+#' properties <- c("id", "state_name", "country_name")
+#' dataRetrieval:::switch_properties_id(properties,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+#'
+#' properties2 <- c("monitoring_location_id", "state_name", "country_name")
+#' dataRetrieval:::switch_properties_id(properties2,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+#'
+#' properties3 <- c("monitoring_locations_id", "state_name", "country_name")
+#' dataRetrieval:::switch_properties_id(properties3,
+#' id_name = "monitoring_location_id",
+#' service = "monitoring-locations")
+switch_properties_id <- function(properties, id_name, service){
+  # Translate whichever id alias appears in `properties` into "id", then
+  # reduce `properties` to the names that are returned for the request.
+  hyphen_free <- gsub("-", "_", service)
+  service_id <- paste0(hyphen_free, "_id")
+
+  # Service names ending in "s" also get a singular alias, e.g.
+  # "monitoring-locations" -> "monitoring_location_id". "" means no alias.
+  service_id_singular <- ""
+  if(endsWith(service, "s")){
+    service_id_singular <- paste0(sub("s$", "", hyphen_free), "_id")
+  }
+
+  if(!("id" %in% properties)){
+    # First alias found wins: endpoint-style, then singular, then `id_name`.
+    alias <- id_name
+    if(service_id %in% properties){
+      alias <- service_id
+    } else if(service_id_singular %in% properties){
+      alias <- service_id_singular
+    }
+    properties[properties == alias] <- "id"
+  }
+
+  schema <- check_OGC_requests(endpoint = service, type = "schema")
+  all_properties <- names(schema$properties)
+  non_id_properties <- all_properties[!all_properties %in% c("id", "geometry")]
+
+  if(all(non_id_properties %in% properties)){
+    # Cleans up URL if we're asking for everything
+    properties <- NA_character_
+  } else if(!anyNA(properties)){
+    # Drop "id", "geometry" and the endpoint-style id from the requested names
+    properties <- gsub("-", "_", properties)
+    properties <- properties[!properties %in% c("id", "geometry", service_id)]
+  }
+
+  # match.arg() is called for its error on unmatched names; its return
+  # value is discarded.
+  if(!all(is.na(properties))){
+    match.arg(properties, choices = all_properties, several.ok = TRUE)
+  }
+
+  properties
+}
+
+
#' Format the date request
#'
#' Users will want to give either start/end dates or
diff --git a/R/read_USGS_daily.R b/R/read_USGS_daily.R
index d6341f5f..87947a84 100644
--- a/R/read_USGS_daily.R
+++ b/R/read_USGS_daily.R
@@ -23,10 +23,13 @@
#' depth). Coordinates are assumed to be in crs 4326. The expected format is a numeric
#' vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
#' Southern-most latitude, Eastern-most longitude, Northern-most longitude).
-#' @param limit The optional limit parameter limits the number of items that are
-#' presented in the response document. Only items are counted that are on the
-#' first level of the collection in the response document. Nested objects
-#' contained within the explicitly requested items shall not be counted.
+#' @param limit The optional limit parameter is used to control the subset of the
+#' selected features that should be returned in each page. The maximum allowable
+#' limit is 10000. It may be beneficial to set this number lower if your internet
+#' connection is spotty. The default (`NA`) will set the limit to the maximum
+#' allowable limit for the service.
+#' @param max_results The optional maximum number of rows to return. If `limit` is
+#' also supplied, `limit` cannot be greater than this value.
#' @param skipGeometry This option can be used to skip response geometries for
#' each feature. The returning object will be a data frame with no spatial
#' information.
@@ -38,8 +41,8 @@
#' site <- "USGS-02238500"
#' pcode <- "00060"
#' dv_data_sf <- read_USGS_daily(monitoring_location_id = site,
-#' parameter_code = "00060",
-#' time = c("2021-01-01", "2022-01-01"))
+#' parameter_code = "00060",
+#' time = c("2021-01-01", "2022-01-01"))
#'
#' dv_data_trim <- read_USGS_daily(monitoring_location_id = site,
#' parameter_code = "00060",
@@ -49,58 +52,69 @@
#' time = c("2021-01-01", "2022-01-01"))
#'
#' dv_data <- read_USGS_daily(monitoring_location_id = site,
-#' parameter_code = "00060",
-#' skipGeometry = TRUE)
+#' parameter_code = "00060",
+#' skipGeometry = TRUE)
#'
#' dv_data_period <- read_USGS_daily(monitoring_location_id = site,
-#' parameter_code = "00060",
-#' time = "P7D")
+#' parameter_code = "00060",
+#' time = "P7D")
#'
#' multi_site <- read_USGS_daily(monitoring_location_id = c("USGS-01491000",
-#' "USGS-01645000"),
-#' parameter_code = c("00060", "00010"),
-#' limit = 500,
-#' time = c("2023-01-01", "2024-01-01"))
+#' "USGS-01645000"),
+#' parameter_code = c("00060", "00010"),
+#' limit = 500,
+#' time = c("2023-01-01", "2024-01-01"))
#'
#' }
read_USGS_daily <- function(monitoring_location_id = NA_character_,
- parameter_code = NA_character_,
- statistic_id = NA_character_,
- properties = NA_character_,
- time_series_id = NA_character_,
- daily_id = NA_character_,
- approval_status = NA_character_,
- unit_of_measure = NA_character_,
- qualifier = NA_character_,
- value = NA,
- last_modified = NA_character_,
- limit = 10000,
- skipGeometry = NA,
- time = NA_character_,
- bbox = NA,
- convertType = TRUE){
+ parameter_code = NA_character_,
+ statistic_id = NA_character_,
+ properties = NA_character_,
+ time_series_id = NA_character_,
+ daily_id = NA_character_,
+ approval_status = NA_character_,
+ unit_of_measure = NA_character_,
+ qualifier = NA_character_,
+ value = NA,
+ last_modified = NA_character_,
+ skipGeometry = NA,
+ time = NA_character_,
+ bbox = NA,
+ limit = NA,
+ max_results = NA,
+ convertType = TRUE){
message("Function in development, use at your own risk.")
service <- "daily"
+ output_id <- "daily_id"
+
args <- mget(names(formals()))
- args[["id"]] <- args[["daily_id"]]
- args[["daily_id"]] <- NULL
- args[["convertType"]] <- NULL
args[["service"]] <- service
+
+ args <- switch_arg_id(args,
+ id_name = output_id,
+ service = service)
+
+ args[["properties"]] <- switch_properties_id(properties,
+ id_name = output_id,
+ service = service)
+
+ args[["convertType"]] <- NULL
+
dv_req <- do.call(construct_api_requests, args)
- return_list <- walk_pages(dv_req)
+ return_list <- walk_pages(dv_req, max_results)
return_list <- deal_with_empty(return_list, properties, service)
if(convertType) return_list <- cleanup_cols(return_list,
service = "daily")
+
+ return_list <- rejigger_cols(return_list, properties, output_id)
return_list <- return_list[order(return_list$time, return_list$monitoring_location_id), ]
- return_list <- rejigger_cols(return_list, properties, service)
-
return(return_list)
}
diff --git a/R/read_USGS_data.R b/R/read_USGS_data.R
index 58b2c3cc..e8f52c49 100644
--- a/R/read_USGS_data.R
+++ b/R/read_USGS_data.R
@@ -65,7 +65,13 @@ read_USGS_data <- function(service,
httr2::req_headers(`Content-Type` = "application/query-cql-json") |>
httr2::req_body_raw(CQL)
- return_list <- walk_pages(data_req)
+ if("max_results" %in% names(args)){
+ max_results <- args[["max_results"]]
+ } else {
+ max_results <- NA
+ }
+
+ return_list <- walk_pages(data_req, max_results)
return_list <- deal_with_empty(return_list, args[["properties"]], service)
diff --git a/R/read_USGS_monitoring_location.R b/R/read_USGS_monitoring_location.R
index 8c577236..cb0dba35 100644
--- a/R/read_USGS_monitoring_location.R
+++ b/R/read_USGS_monitoring_location.R
@@ -52,10 +52,13 @@
#' depth). Coordinates are assumed to be in crs 4326. The expected format is a numeric
#' vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
#' Southern-most latitude, Eastern-most longitude, Northern-most longitude).
-#' @param limit The optional limit parameter limits the number of items that are
-#' presented in the response document. Only items are counted that are on the
-#' first level of the collection in the response document. Nested objects
-#' contained within the explicitly requested items shall not be counted.
+#' @param limit The optional limit parameter is used to control the subset of the
+#' selected features that should be returned in each page. The maximum allowable
+#' limit is 10000. It may be beneficial to set this number lower if your internet
+#' connection is spotty. The default (`NA`) will set the limit to the maximum
+#' allowable limit for the service.
+#' @param max_results The optional maximum number of rows to return. If `limit` is
+#' also supplied, `limit` cannot be greater than this value.
#' @param skipGeometry This option can be used to skip response geometries for
#' each feature. The returning object will be a data frame with no spatial
#' information.
@@ -66,17 +69,25 @@
#' site_info <- read_USGS_monitoring_location(monitoring_location_id = site)
#'
#' site_slim <- read_USGS_monitoring_location(monitoring_location_id = site,
-#' properties = c("monitoring_locations_id",
+#' properties = c("monitoring_location_id",
#' "state_name",
#' "country_name"))
+#'
+#' site_slim_no_sf_slim <- read_USGS_monitoring_location(monitoring_location_id = site,
+#' properties = c("monitoring_location_id",
+#' "state_name",
+#' "country_name"),
+#' skipGeometry = TRUE)
#'
#' site_info_no_sf <- read_USGS_monitoring_location(monitoring_location_id = site,
#' skipGeometry = TRUE)
#'
-#' multi_site <- read_USGS_monitoring_location(state_name = "Wisconsin")
-#'
#' bbox_vals = c(-94.00, 35.0, -93.5, 35.5)
#' multi_site <- read_USGS_monitoring_location(bbox = bbox_vals)
+#' multi_site_n_100 <- read_USGS_monitoring_location(bbox = bbox_vals,
+#' max_results = 100)
+#' multi_site_limit_100 <- read_USGS_monitoring_location(bbox = bbox_vals,
+#' limit = 100)
#' }
read_USGS_monitoring_location <- function(monitoring_location_id = NA_character_,
agency_code = NA_character_,
@@ -120,24 +131,33 @@ read_USGS_monitoring_location <- function(monitoring_location_id = NA_character_
depth_source_code = NA_character_,
properties = NA_character_,
bbox = NA,
- limit = 10000,
+ limit = NA,
+ max_results = NA,
skipGeometry = NA){
message("Function in development, use at your own risk.")
service <- "monitoring-locations"
+ output_id <- "monitoring_location_id"
args <- mget(names(formals()))
args[["service"]] <- service
- args[["id"]] <- args[["monitoring_location_id"]]
- args[["monitoring_location_id"]] <- NULL
+
+ args <- switch_arg_id(args,
+ id_name = output_id,
+ service = service)
+
+ args[["properties"]] <- switch_properties_id(properties,
+ id_name = output_id,
+ service = service)
+
site_req <- do.call(construct_api_requests, args)
- return_list <- walk_pages(site_req)
+ return_list <- walk_pages(site_req, max_results)
return_list <- deal_with_empty(return_list, properties, service)
- return_list <- rejigger_cols(return_list, properties, service)
-
+ return_list <- rejigger_cols(return_list, properties, output_id)
+
return(return_list)
}
diff --git a/R/read_USGS_ts_meta.R b/R/read_USGS_ts_meta.R
index 887da8ee..e2f7efd2 100644
--- a/R/read_USGS_ts_meta.R
+++ b/R/read_USGS_ts_meta.R
@@ -20,17 +20,20 @@
#' @param properties A vector of requested columns to be returned from the query.
#' Available options are:
#' `r schema <- check_OGC_requests(endpoint = "time-series-metadata", type = "schema"); paste(names(schema$properties), collapse = ", ")`
-#' @param time_series_metadata_id `r get_params("time-series-metadata")$id`
+#' @param time_series_id `r get_params("time-series-metadata")$id`
#' @param bbox Only features that have a geometry that intersects the bounding
#' box are selected.The bounding box is provided as four or six numbers, depending
#' on whether the coordinate reference system includes a vertical axis (height or
#' depth). Coordinates are assumed to be in crs 4326. The expected format is a numeric
#' vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
#' Southern-most latitude, Eastern-most longitude, Northern-most longitude).
-#' @param limit The optional limit parameter limits the number of items that are
-#' presented in the response document. Only items are counted that are on the
-#' first level of the collection in the response document. Nested objects
-#' contained within the explicitly requested items shall not be counted.
+#' @param limit The optional limit parameter is used to control the subset of the
+#' selected features that should be returned in each page. The maximum allowable
+#' limit is 10000. It may be beneficial to set this number lower if your internet
+#' connection is spotty. The default (`NA`) will set the limit to the maximum
+#' allowable limit for the service.
+#' @param max_results The optional maximum number of rows to return. If `limit` is
+#' also supplied, `limit` cannot be greater than this value.
#' @param convertType logical, defaults to `TRUE`. If `TRUE`, the function
#' will convert the data to dates and qualifier to string vector.
#' @param skipGeometry This option can be used to skip response geometries for
@@ -43,20 +46,19 @@
#' meta_1 <- read_USGS_ts_meta(monitoring_location_id = site)
#'
#' meta_multi <- read_USGS_ts_meta(monitoring_location_id = c("USGS-01491000",
-#' "USGS-01645000"),
+#' "USGS-01645000"),
#' parameter_code = c("00060", "00010"),
#' properties = c("monitoring_location_id",
#' "parameter_code",
#' "begin",
-#' "end"),
+#' "end",
+#' "time_series_id"),
#' skipGeometry = TRUE)
#' }
read_USGS_ts_meta <- function(monitoring_location_id = NA_character_,
parameter_code = NA_character_,
parameter_name = NA_character_,
properties = NA_character_,
- limit = 10000,
- bbox = NA,
statistic_id = NA_character_,
last_modified = NA_character_,
begin = NA_character_,
@@ -67,30 +69,39 @@ read_USGS_ts_meta <- function(monitoring_location_id = NA_character_,
thresholds = NA,
sublocation_identifier = NA_character_,
primary = NA_character_,
- time_series_metadata_id = NA_character_,
+ time_series_id = NA_character_,
web_description = NA_character_,
skipGeometry = NA,
+ limit = NA,
+ max_results = NA,
+ bbox = NA,
convertType = FALSE){
message("Function in development, use at your own risk.")
service = "time-series-metadata"
+ output_id <- "time_series_id"
args <- mget(names(formals()))
-
args[["service"]] <- service
- args[["id"]] <- args[["time_series_id"]]
- args[["time_series_metadata_id"]] <- NULL
+
+ args <- switch_arg_id(args, id_name = output_id, service = service)
+
args[["convertType"]] <- NULL
+
+ args[["properties"]] <- switch_properties_id(properties,
+ id_name = output_id,
+ service = service)
+
req_ts_meta <- do.call(construct_api_requests, args)
- return_list <- walk_pages(req_ts_meta)
+ return_list <- walk_pages(req_ts_meta, max_results)
return_list <- deal_with_empty(return_list, properties, service)
if(convertType) return_list <- cleanup_cols(return_list)
-
- return_list <- rejigger_cols(return_list, properties, service)
+
+ return_list <- rejigger_cols(return_list, properties, output_id)
return(return_list)
diff --git a/R/walk_pages.R b/R/walk_pages.R
index a5476b2e..8a2d8ed9 100644
--- a/R/walk_pages.R
+++ b/R/walk_pages.R
@@ -46,19 +46,34 @@ deal_with_empty <- function(return_list, properties, service){
#' @examples
#'
#' df <- dataRetrieval:::deal_with_empty(data.frame(NULL),
-#' properties = c("time", "value", "id"),
-#' service = "daily")
+#' properties = c("state_code", "county_code", "id"),
+#' service = "monitoring-locations")
#' df2 <- dataRetrieval:::rejigger_cols(df,
-#' properties = c("value", "id", "time"),
-#' service = "daily")
-#'
-rejigger_cols <- function(df, properties, service){
- new_id <- paste0(gsub("-", "_", service), "_id")
- names(df)[names(df) == "id"] <- new_id
-
+#' properties = c("state_code", "id", "county_code"),
+#' output_id = "monitoring_location_id")
+#'
+#' df3 <- dataRetrieval:::rejigger_cols(df,
+#' properties = c("state_code", "monitoring_location_id", "county_code"),
+#' output_id = "monitoring_location_id")
+#'
+rejigger_cols <- function(df, properties, output_id){
+
if(!all(is.na(properties))){
- properties[properties == "id"] <- new_id
+ if(!"id" %in% properties){
+ if(output_id %in% properties){
+ names(df)[(names(df) == "id")] <- output_id
+ } else {
+ # just in case users become aware of the singular/plural issue
+ # where the endpoint name is plural, but input to other endpoints are singular
+ plural <- gsub("_id", "s_id", output_id)
+ if(plural %in% properties){
+ names(df)[(names(df) == "id")] <- plural
+ }
+ }
+ }
df <- df[, properties]
+ } else {
+ names(df)[(names(df) == "id")] <- output_id
}
df
}
@@ -180,12 +195,11 @@ get_resp_data <- function(resp) {
}
use_sf <- !grepl("skipGeometry=true", resp$url, ignore.case = TRUE)
+ return_df <- sf::read_sf(httr2::resp_body_string(resp))
- if(use_sf){
- return_df <- sf::read_sf(httr2::resp_body_string(resp))
- } else {
- return_df <- jsonlite::fromJSON(httr2::resp_body_string(resp))[["features"]][["properties"]]
- }
+ if(!use_sf){
+ return_df <- sf::st_drop_geometry(return_df)
+ }
return(return_df)
@@ -197,28 +211,33 @@ get_resp_data <- function(resp) {
#'
#' @noRd
#' @return data.frame with attributes
-walk_pages <- function(req){
-
- resps <- httr2::req_perform_iterative(req,
- next_req = next_req_url,
- max_reqs = Inf)
-
- ######################################
- # So far I haven't tested this because I haven't had
- # individual failures
- failures <- resps |>
- httr2::resps_failures() |>
- httr2::resps_requests()
-
- if(length(failures) > 0){
- message("There were", length(failures), "failed requests.")
- }
- ######################################
-
- return_list <- data.frame()
- for(resp in resps){
- df1 <- get_resp_data(resp)
- return_list <- rbind(return_list, df1)
+walk_pages <- function(req, max_results){
+
+ if(is.na(max_results)){
+ resps <- httr2::req_perform_iterative(req,
+ next_req = next_req_url,
+ max_reqs = Inf)
+ ######################################
+ # So far I haven't tested this because I haven't had
+ # individual failures
+ failures <- resps |>
+ httr2::resps_failures() |>
+ httr2::resps_requests()
+
+ if(length(failures) > 0){
+ message("There were", length(failures), "failed requests.")
+ }
+
+ return_list <- data.frame()
+ for(resp in resps){
+ df1 <- get_resp_data(resp)
+ return_list <- rbind(return_list, df1)
+ }
+
+ ######################################
+ } else {
+ resps <- httr2::req_perform(req)
+ return_list <- get_resp_data(resps)
}
attr(return_list, "request") <- req
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 740f1e12..404040c3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -26,7 +26,6 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \
r-cran-readxl \
r-cran-whisker \
r-cran-ggplot2 \
- && apt-get install -y pandoc \
&& rm -rf /var/lib/apt/lists/*
diff --git a/man/construct_api_requests.Rd b/man/construct_api_requests.Rd
index 3e75315f..0c56fc39 100644
--- a/man/construct_api_requests.Rd
+++ b/man/construct_api_requests.Rd
@@ -8,7 +8,8 @@ construct_api_requests(
service,
properties = NA_character_,
bbox = NA,
- limit = 10000,
+ limit = NA,
+ max_results = NA,
skipGeometry = FALSE,
...
)
diff --git a/man/read_USGS_daily.Rd b/man/read_USGS_daily.Rd
index f30acb71..aa7a7f10 100644
--- a/man/read_USGS_daily.Rd
+++ b/man/read_USGS_daily.Rd
@@ -16,10 +16,11 @@ read_USGS_daily(
qualifier = NA_character_,
value = NA,
last_modified = NA_character_,
- limit = 10000,
skipGeometry = NA,
time = NA_character_,
bbox = NA,
+ limit = NA,
+ max_results = NA,
convertType = TRUE
)
}
@@ -58,11 +59,6 @@ Examples:
Only features that have a \code{last_modified} that intersects the value of datetime are selected. If a feature has multiple temporal properties, it is the decision of the server whether only a single temporal property is used to determine the extent or all relevant temporal properties.}
-\item{limit}{The optional limit parameter limits the number of items that are
-presented in the response document. Only items are counted that are on the
-first level of the collection in the response document. Nested objects
-contained within the explicitly requested items shall not be counted.}
-
\item{skipGeometry}{This option can be used to skip response geometries for
each feature. The returning object will be a data frame with no spatial
information.}
@@ -85,6 +81,15 @@ depth). Coordinates are assumed to be in crs 4326. The expected format is a nume
vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
Southern-most latitude, Eastern-most longitude, Northern-most longitude).}
+\item{limit}{The optional limit parameter is used to control the subset of the
+selected features that should be returned in each page. The maximum allowable
+limit is 10000. It may be beneficial to set this number lower if your internet
+connection is spotty. The default (\code{NA}) will set the limit to the maximum
+allowable limit for the service.}
+
+\item{max_results}{The optional maximum number of rows to return. If \code{limit} is
+also supplied, \code{limit} cannot be greater than this value.}
+
\item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function
will convert the data to dates and qualifier to string vector.}
}
@@ -98,8 +103,8 @@ Description Daily data provide one data value to represent water conditions for
site <- "USGS-02238500"
pcode <- "00060"
dv_data_sf <- read_USGS_daily(monitoring_location_id = site,
- parameter_code = "00060",
- time = c("2021-01-01", "2022-01-01"))
+ parameter_code = "00060",
+ time = c("2021-01-01", "2022-01-01"))
dv_data_trim <- read_USGS_daily(monitoring_location_id = site,
parameter_code = "00060",
@@ -109,18 +114,18 @@ dv_data_trim <- read_USGS_daily(monitoring_location_id = site,
time = c("2021-01-01", "2022-01-01"))
dv_data <- read_USGS_daily(monitoring_location_id = site,
- parameter_code = "00060",
- skipGeometry = TRUE)
+ parameter_code = "00060",
+ skipGeometry = TRUE)
dv_data_period <- read_USGS_daily(monitoring_location_id = site,
- parameter_code = "00060",
- time = "P7D")
+ parameter_code = "00060",
+ time = "P7D")
multi_site <- read_USGS_daily(monitoring_location_id = c("USGS-01491000",
- "USGS-01645000"),
- parameter_code = c("00060", "00010"),
- limit = 500,
- time = c("2023-01-01", "2024-01-01"))
+ "USGS-01645000"),
+ parameter_code = c("00060", "00010"),
+ limit = 500,
+ time = c("2023-01-01", "2024-01-01"))
}
\dontshow{\}) # examplesIf}
diff --git a/man/read_USGS_monitoring_location.Rd b/man/read_USGS_monitoring_location.Rd
index 1da7369d..58d3fa69 100644
--- a/man/read_USGS_monitoring_location.Rd
+++ b/man/read_USGS_monitoring_location.Rd
@@ -47,7 +47,8 @@ read_USGS_monitoring_location(
depth_source_code = NA_character_,
properties = NA_character_,
bbox = NA,
- limit = 10000,
+ limit = NA,
+ max_results = NA,
skipGeometry = NA
)
}
@@ -143,10 +144,14 @@ depth). Coordinates are assumed to be in crs 4326. The expected format is a nume
vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
Southern-most latitude, Eastern-most longitude, Northern-most longitude).}
-\item{limit}{The optional limit parameter limits the number of items that are
-presented in the response document. Only items are counted that are on the
-first level of the collection in the response document. Nested objects
-contained within the explicitly requested items shall not be counted.}
+\item{limit}{The optional limit parameter is used to control the subset of the
+selected features that should be returned in each page. The maximum allowable
+limit is 10000. It may be beneficial to set this number lower if your internet
+connection is spotty. The default (\code{NA}) will set the limit to the maximum
+allowable limit for the service.}
+
+\item{max_results}{The optional maximum number of rows to return. If \code{limit} is
+also supplied, \code{limit} cannot be greater than this value.}
\item{skipGeometry}{This option can be used to skip response geometries for
each feature. The returning object will be a data frame with no spatial
@@ -163,17 +168,25 @@ site <- "USGS-02238500"
site_info <- read_USGS_monitoring_location(monitoring_location_id = site)
site_slim <- read_USGS_monitoring_location(monitoring_location_id = site,
- properties = c("monitoring_locations_id",
+ properties = c("monitoring_location_id",
"state_name",
"country_name"))
+
+site_slim_no_sf_slim <- read_USGS_monitoring_location(monitoring_location_id = site,
+ properties = c("monitoring_location_id",
+ "state_name",
+ "country_name"),
+ skipGeometry = TRUE)
site_info_no_sf <- read_USGS_monitoring_location(monitoring_location_id = site,
skipGeometry = TRUE)
-multi_site <- read_USGS_monitoring_location(state_name = "Wisconsin")
-
bbox_vals = c(-94.00, 35.0, -93.5, 35.5)
multi_site <- read_USGS_monitoring_location(bbox = bbox_vals)
+multi_site_n_100 <- read_USGS_monitoring_location(bbox = bbox_vals,
+ max_results = 100)
+multi_site_limit_100 <- read_USGS_monitoring_location(bbox = bbox_vals,
+ limit = 100)
}
\dontshow{\}) # examplesIf}
}
diff --git a/man/read_USGS_ts_meta.Rd b/man/read_USGS_ts_meta.Rd
index ced421a7..9a4e7387 100644
--- a/man/read_USGS_ts_meta.Rd
+++ b/man/read_USGS_ts_meta.Rd
@@ -9,8 +9,6 @@ read_USGS_ts_meta(
parameter_code = NA_character_,
parameter_name = NA_character_,
properties = NA_character_,
- limit = 10000,
- bbox = NA,
statistic_id = NA_character_,
last_modified = NA_character_,
begin = NA_character_,
@@ -21,9 +19,12 @@ read_USGS_ts_meta(
thresholds = NA,
sublocation_identifier = NA_character_,
primary = NA_character_,
- time_series_metadata_id = NA_character_,
+ time_series_id = NA_character_,
web_description = NA_character_,
skipGeometry = NA,
+ limit = NA,
+ max_results = NA,
+ bbox = NA,
convertType = FALSE
)
}
@@ -38,18 +39,6 @@ read_USGS_ts_meta(
Available options are:
geometry, id, unit_of_measure, parameter_name, parameter_code, statistic_id, last_modified, begin, end, computation_period_identifier, computation_identifier, thresholds, sublocation_identifier, primary, monitoring_location_id, web_description, parameter_description}
-\item{limit}{The optional limit parameter limits the number of items that are
-presented in the response document. Only items are counted that are on the
-first level of the collection in the response document. Nested objects
-contained within the explicitly requested items shall not be counted.}
-
-\item{bbox}{Only features that have a geometry that intersects the bounding
-box are selected.The bounding box is provided as four or six numbers, depending
-on whether the coordinate reference system includes a vertical axis (height or
-depth). Coordinates are assumed to be in crs 4326. The expected format is a numeric
-vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
-Southern-most latitude, Eastern-most longitude, Northern-most longitude).}
-
\item{statistic_id}{A code corresponding to the statistic an observation represents. Example codes include 00001 (max), 00002 (min), and 00003 (mean). A complete list of codes and their descriptions can be found at \url{https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=\%25&fmt=html}.}
\item{last_modified}{The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate anything about the measurement has changed.
@@ -80,7 +69,7 @@ Only features that have a \code{last_modified} that intersects the value of date
\item{primary}{A flag identifying if the time series is a "primary" time series. "Primary" time series (which have this flag) are standard observations which undergo \href{https://www.usgs.gov/survey-manual/5028-fundamental-science-practices-review-and-approval-scientific-data-release}{Bureau review and approval processes}. Non-primary time series, which will have missing values for "primary", are provisional datasets made available to meet the need for timely best science and to assist with daily operations which need real-time information. Non-primary time series data are only retained by this system for 120 days. See the \href{https://waterdata.usgs.gov/provisional-data-statement/}{USGS Provisional Data Statement} for more information.}
-\item{time_series_metadata_id}{A unique identifier representing a single time series. This corresponds to the \code{id} field in the \code{time-series-metadata} endpoint.}
+\item{time_series_id}{A unique identifier representing a single time series. This corresponds to the \code{id} field in the \code{time-series-metadata} endpoint.}
\item{web_description}{A description of what this time series represents, as used by WDFN and other USGS data dissemination products.}
@@ -88,6 +77,22 @@ Only features that have a \code{last_modified} that intersects the value of date
each feature. The returning object will be a data frame with no spatial
information.}
+\item{limit}{The optional limit parameter is used to control the subset of the
+selected features that should be returned in each page. The maximum allowable
+limit is 10000. It may be beneficial to set this number lower if your internet
+connection is spotty. The default (\code{NA}) will set the limit to the maximum
+allowable limit for the service.}
+
+\item{max_results}{The optional maximum number of rows to return. If \code{limit} is
+also supplied, \code{limit} cannot be greater than this value.}
+
+\item{bbox}{Only features that have a geometry that intersects the bounding
+box are selected.The bounding box is provided as four or six numbers, depending
+on whether the coordinate reference system includes a vertical axis (height or
+depth). Coordinates are assumed to be in crs 4326. The expected format is a numeric
+vector structured: c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude,
+Southern-most latitude, Eastern-most longitude, Northern-most longitude).}
+
\item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function
will convert the data to dates and qualifier to string vector.}
}
@@ -102,12 +107,13 @@ site <- "USGS-02238500"
meta_1 <- read_USGS_ts_meta(monitoring_location_id = site)
meta_multi <- read_USGS_ts_meta(monitoring_location_id = c("USGS-01491000",
- "USGS-01645000"),
+ "USGS-01645000"),
parameter_code = c("00060", "00010"),
properties = c("monitoring_location_id",
"parameter_code",
"begin",
- "end"),
+ "end",
+ "time_series_id"),
skipGeometry = TRUE)
}
\dontshow{\}) # examplesIf}
diff --git a/tests/testthat/tests_general.R b/tests/testthat/tests_general.R
index e42d3260..4cd73e48 100644
--- a/tests/testthat/tests_general.R
+++ b/tests/testthat/tests_general.R
@@ -62,7 +62,7 @@ test_that("General USGS retrievals working", {
test_that("General NWIS retrievals working", {
testthat::skip_on_cran()
-
+ skip_on_ci()
multiSite <- readNWISdata(
sites = c("04025500", "040263491"), service = "iv",
parameterCd = "00060",
diff --git a/tests/testthat/tests_userFriendly_fxns.R b/tests/testthat/tests_userFriendly_fxns.R
index 4d2b39ef..ed36dce7 100644
--- a/tests/testthat/tests_userFriendly_fxns.R
+++ b/tests/testthat/tests_userFriendly_fxns.R
@@ -2,7 +2,7 @@ context("Unit values")
test_that("Unit value data returns correct types", {
testthat::skip_on_cran()
-
+ skip_on_ci()
siteNumber <- "05114000"
parameterCd <- "00060"
startDate <- "2014-10-10"
@@ -68,7 +68,7 @@ test_that("Unit value data returns correct types", {
context("Peak, rating, meas, site")
test_that("peak, rating curves, surface-water measurements", {
testthat::skip_on_cran()
-
+ skip_on_ci()
siteNumbers <- c("01594440", "040851325")
data <- readNWISpeak(siteNumbers)
expect_is(data$agency_cd, "character")
@@ -83,11 +83,12 @@ test_that("peak, rating curves, surface-water measurements", {
data <- readNWISmeas(siteNumbers)
expect_is(data$agency_cd, "character")
- siteINFO_USGS <- read_USGS_monitoring_location("USGS-05114000")
+ siteINFO_USGS <- read_USGS_monitoring_location(monitoring_location_id = "USGS-05114000")
expect_is(siteINFO_USGS$agency_code, "character")
- expect_equal(siteINFO_USGS$monitoring_locations_id, "USGS-05114000")
+ expect_equal(siteINFO_USGS$monitoring_location_id, "USGS-05114000")
- siteINFOMulti_USGS <- read_USGS_monitoring_location(c("USGS-05114000", "USGS-09423350"))
+ siteINFOMulti_USGS <- read_USGS_monitoring_location(monitoring_location_id = c("USGS-05114000",
+ "USGS-09423350"))
expect_true(nrow(siteINFOMulti_USGS) == 2)
Meas07227500.ex <- readNWISmeas("07227500", expanded = TRUE)
@@ -157,6 +158,7 @@ test_that("read_USGS_daily", {
test_that("WQP qw tests", {
testthat::skip_on_cran()
+ skip_on_ci()
nameToUse <- "Specific conductance"
pcodeToUse <- "00095"
@@ -184,6 +186,7 @@ test_that("WQP qw tests", {
context("readNWISstat tests")
test_that("readNWISstat tests", {
testthat::skip_on_cran()
+ skip_on_ci()
data <- readNWISstat(
siteNumbers = c("02171500"),
parameterCd = c("00010", "00060"),
@@ -217,6 +220,7 @@ test_that("readNWISstat tests", {
context("readNWISuse tests")
test_that("readNWISuse tests", {
testthat::skip_on_cran()
+ skip_on_ci()
dc <- readNWISuse(
years = c(2000, 2005, 2010),
stateCd = "DC", countyCd = NULL
@@ -381,7 +385,7 @@ test_that("Construct USGS urls", {
expect_equal(url_daily$url,
"https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items?f=json&lang=en-US&time=2024-01-01T00%3A00%3A00Z%2F..&skipGeometry=FALSE&limit=10000")
- url_works <- dataRetrieval:::walk_pages(url_daily)
+ url_works <- dataRetrieval:::walk_pages(url_daily, max_results = 1)
expect_true(nrow(url_works) > 0)
url_ts_meta <- construct_api_requests(monitoring_location_id = siteNumber,
@@ -393,7 +397,7 @@ test_that("Construct USGS urls", {
"https://api.waterdata.usgs.gov/ogcapi/v0/collections/time-series-metadata/items?f=json&lang=en-US&skipGeometry=FALSE&limit=10000"
)
- url_works_ts <- dataRetrieval:::walk_pages(url_ts_meta)
+ url_works_ts <- dataRetrieval:::walk_pages(url_ts_meta, max_results = 1)
expect_true(nrow(url_works_ts) > 0)
url_ml <- construct_api_requests(id = siteNumber,
@@ -401,7 +405,7 @@ test_that("Construct USGS urls", {
expect_equal(url_ml$url, "https://api.waterdata.usgs.gov/ogcapi/v0/collections/monitoring-locations/items?f=json&lang=en-US&skipGeometry=FALSE&limit=10000&id=USGS-01594440")
- url_works_ml <- dataRetrieval:::walk_pages(url_ml)
+ url_works_ml <- dataRetrieval:::walk_pages(url_ml, max_results = 1)
expect_true(nrow(url_works_ml) > 0)
url_use <- constructUseURL(
diff --git a/vignettes/read_USGS_functions.Rmd b/vignettes/read_USGS_functions.Rmd
index eda725e9..d1eb03bc 100644
--- a/vignettes/read_USGS_functions.Rmd
+++ b/vignettes/read_USGS_functions.Rmd
@@ -66,14 +66,124 @@ API_USGS_PAT = "my_super_secret_token"
```
You can use `usethis::edit_r_environ()` to edit find and open your .Renviron file. You will need to restart R for that variable to be recognized. You should not add this file to git projects or generally share your API key. Anyone else using your API key will count against the number of requests available to you!
-## Contextual Query Language 2 Support
+## Contextual Query Language Support
-Supports [Contextual Query Language](https://www.loc.gov/standards/sru/cql/) (CQL2) syntax for flexible queries. We'll show how to use the `read_USGS_data` function to make specific queries using Contextual Query Language (2).
+Supports [Contextual Query Language](https://www.loc.gov/standards/sru/cql/) (CQL2) syntax for flexible queries. We'll show how to use the `read_USGS_data` function to make specific CQL2 queries.
## Simple Features
Provides [Simple Features](https://en.wikipedia.org/wiki/Simple_Features) functionality. The data is returned with a "geometry" column, which is a simple feature object, allowing the data to be integrated with the [`sf`](https://r-spatial.github.io/sf/) package and associated geospatial workflows.
+# Lessons Learned
+
+This section will initially be a random stream of consciousness on lessons learned while developing these functions and playing with the services.
+
+## Query limits
+
+A semi-common way to find a lot of data in the past would have been to use a monitoring location query to get a huge list of sites, and then use that huge list of sites (maybe winnowing it down a little) to get the data. These new services return a 403 error if your request is too big ("web server understands your request but refuses to authorize it"). This is true whether or not the request is a GET or POST request (something that is taken care of under the hood), and seems to be a character limit of the overall request. Roughly, it seems like if you were requesting more than 250 monitoring locations, the response will immediately return with a 403 error.
+
+There are at least 2 ways to deal with this. One is to manually split the data requests and bind the results together later. The other is to use the bounding box of the initial request as an input to the data request. Potentially some sites would need to be filtered out later using this method.
+
+Example:
+
+```{r}
+ohio <- read_USGS_monitoring_location(state_name = "Ohio",
+ site_type_code = "ST")
+
+```
+
+There are `r nrow(ohio)` rows returned that are stream sites in Ohio. If we tried to ask for all the discharge data over the last 7 days from that list of sites:
+
+```
+ohio_discharge <- read_USGS_daily(monitoring_location_id = ohio$monitoring_location_id,
+ parameter_code = "00060",
+ time = "P7D")
+Error in `req_perform()`:
+! HTTP 403 Forbidden.
+• Query request denied. Possible reasons include query exceeding server limits.
+```
+
+We could use the fact that the `ohio` data frame contains geospatial information, create a bounding box, and ask for that data like this:
+
+```{r}
+ohio_discharge <- read_USGS_daily(bbox = sf::st_bbox(ohio),
+ parameter_code = "00060",
+ time = "P7D")
+
+```
+
+A reasonable `r nrow(ohio_discharge)` rows are returned with the bounding box query.
+
+Maybe you have a list of sites that are scattered around the country. The bounding box method might not be ideal. There are several ways to loop through a set of sites, here is one simple example:
+
+```{r}
+big_vector_of_sites <- ohio$monitoring_location_id
+
+site_list <- split(big_vector_of_sites, ceiling(seq_along(big_vector_of_sites)/200))
+
+data_returned <- data.frame()
+for(sites in site_list){
+ df_sites <- read_USGS_daily(monitoring_location_id = sites,
+ parameter_code = "00060",
+ time = "P7D")
+ if(nrow(df_sites) == 0){
+ next
+ }
+ data_returned <- rbind(data_returned, df_sites)
+}
+
+```
+
+Note that fewer rows are returned in `data_returned` because those sites are already filtered down to just "Stream" sites. The bounding box results in `ohio_discharge` also contain other monitoring location types.
+
+## Result limits
+
+There's a hard cap of 50,000 rows returned per request. This means that for 1 `dataRetrieval` request, only 50,000 rows will be returned even if there is more data! If you know you are making a big request, it will be up to you to split up your request into "reasonable" chunks. Note that sometimes you'll notice a big request gets chunked up and you can see that it actually made a bunch of requests - this is done automatically (it's called paging), and the 50,000 limit is still in effect for the total number returned from all the pages.
+
+## limit vs max_results
+
+A user can specify a `limit` or `max_results`.
+
+The `max_results` argument defines how many rows are returned (assuming the data has at least `max_results` rows to return). This can be used as a handy way to make sure you aren't requesting a ton of data, perhaps to do some initial coding or troubleshooting.
+
+The `limit` argument defines how many rows are returned per page of data, but does NOT affect the overall number of rows returned. With a good internet connection, you can probably get away with ignoring this argument. By default it will be set to the highest value that the services allow. The reason you might want to change this argument is that it might be easier on a spotty internet connection to page through smaller sets of data.
+
+## id
+
+Each API endpoint natively returns a column named "id". The results of the "id" column can be used as inputs into other endpoints, **HOWEVER** the inputs to those functions have different names. For example, the "id" column of the monitoring location endpoint is considered the "monitoring_location_id" when used as an input to any of the other functions.
+
+Therefore, `dataRetrieval` functions will rename the "id" column to whatever it is referred to in other functions. Here are the id translations:
+
+```{r echo=FALSE}
+df <- dplyr::tibble(Function = c("read_USGS_monitoring_location",
+ "read_USGS_ts_meta",
+ "read_USGS_daily"),
+ "ID returned" = c("monitoring_location_id",
+ "time_series_id",
+ "daily_id"))
+
+knitr::kable(df)
+```
+
+If a user would prefer the columns to come back as "id", they can specify that using the `properties` argument:
+
+```{r}
+site <- "USGS-02238500"
+
+site_1 <- read_USGS_monitoring_location(monitoring_location_id = site,
+ properties = c("monitoring_location_id",
+ "state_name",
+ "country_name"))
+names(site_1)
+site_2 <- read_USGS_monitoring_location(monitoring_location_id = site,
+ properties = c("id",
+ "state_name",
+ "country_name"))
+names(site_2)
+
+
+```
+
# New Functions
As new API endpoints come online, this section will be updated with any `dataRetrieval` function that is created.
@@ -102,7 +212,7 @@ Maybe that is more information than you need. You can specify which columns get
```{r}
site_info <- read_USGS_monitoring_location(monitoring_location_id = "USGS-01491000",
- properties = c("monitoring_locations_id",
+ properties = c("monitoring_location_id",
"site_type",
"drainage_area",
"monitoring_location_name"))
@@ -300,15 +410,15 @@ That's a lot of new information and changes. There are certainly going to be scr
Check back on the documentation often:
-Peruse the "Additional Articles", when we find common issues people have with converting their old workflows, we will try to add articles to clarrify new workflows.
+Peruse the "Additional Articles". When we find common issues people have with converting their old workflows, we will try to add articles to clarify new workflows.
Currently, you might be interested in:
-* [General Tutorial](articles/Tutorial.html)
+* [General Tutorial](tutorial.html)
-* [Pivot Help](articles/long_to_wide.html)
+* [Pivot Help](long_to_wide.html)
-* [Joining by closest date](articles/join_by_closest.html)
+* [Joining by closest date](join_by_closest.html)
If you have additional questions, email comptools@usgs.gov. General questions and bug reports can be reported here: