From 2865d8bc59904850fee54ca1dbe6d693458a38a5 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Mon, 29 Dec 2025 16:24:59 -0700 Subject: [PATCH 1/6] First pass at statistics functions --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/read_waterdata_stats.R | 200 +++++++++++++++++++++++++++++++++++++++ R/walk_pages.R | 28 +++--- 4 files changed, 218 insertions(+), 14 deletions(-) create mode 100644 R/read_waterdata_stats.R diff --git a/DESCRIPTION b/DESCRIPTION index e8ffb2c1..67a07bc9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,5 +67,5 @@ Encoding: UTF-8 BuildVignettes: true VignetteBuilder: knitr BugReports: https://github.com/DOI-USGS/dataRetrieval/issues -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 8f0f178d..c919f665 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,8 @@ export(read_waterdata_metadata) export(read_waterdata_monitoring_location) export(read_waterdata_parameter_codes) export(read_waterdata_samples) +export(read_waterdata_stats_interval) +export(read_waterdata_stats_normal) export(read_waterdata_ts_meta) export(renameNWISColumns) export(setAccess) diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R new file mode 100644 index 00000000..f097f6fb --- /dev/null +++ b/R/read_waterdata_stats.R @@ -0,0 +1,200 @@ +#' Get summary statistic data from /statistics API +#' +#' @description +#' +#' \code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. +#' +#' \code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. +#' +#' @export +#' +#' @param approval_status asdf +#' @param computation_type asdf +#' @param country_code asdf +#' @param state_code asdf +#' @param county_code asdf +#' @param start_date asdf +#' @param end_date asdf +#' @param mime_type asdf +#' @param monitoring_location_id asfd +#' @param next_token asdf +#' @param parent_time_series_id aasdf +#' @param site_type_code asdf +#' @param site_type_name asdf +#' @param parameter_code asf +#' @param next_token asdf +#' @param page_size asdf +#' +#' @rdname get_waterdata_stats +#' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} +read_waterdata_stats_normal <- function( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA +) { + + args <- mget(names(formals())) + + get_statistics_data(args = args, service = "Normals") +} + +#' @export +#' @rdname get_waterdata_stats +read_waterdata_stats_interval <- function( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA +){ + + args <- mget(names(formals())) + + get_statistics_data(args = args, service = "Intervals") + +} + +#' Retrieve data 
from /statistics API +#' +#' @param args arguments from individual functions. +#' @param service Ednpoint name. +#' +#' @noRd +#' @return data.frame with attributes +get_statistics_data <- function(args, service) { + + base_request <- construct_statistics_request(service = service, version = 0) + + # TODO?: arg type checking here + + full_request <- explode_query(base_request, POST = FALSE, x = args) + + return_list <- walk_pages(full_request, max_results = NA) + + # TODO?: pull the following double-for loop out to other function(s) + return_list_tmp <- list() + for (i in 1:nrow(return_list)) { + x <- jsonlite::parse_json(return_list$data[i]) + z <- data.frame(rbind_fill(x)) + zz <- list() + for (j in 1:nrow(z)) { + zz[[j]] <- cbind( + subset(z[j, ], select = -values), + rbind_fill(z$values[[j]]), + row.names = NULL + ) + } + zz <- do.call(rbind, zz) + + return_list_tmp[[i]] <- cbind_sf_metadata( + subset(return_list[i, ], select = -data), + zz + ) + } + + return_list <- do.call(rbind, return_list_tmp) + + attr(return_list, "request") <- full_request + attr(return_list, "queryTime") <- Sys.time() + + return(return_list) +} + +#' Create a request object for the /statistics service +#' +#' @param service chr; "Normals" or "Intervals" +#' @param version int; /statistics API version number (default: 0) +#' +#' @return a httr2 request object +#' +#' @noRd +construct_statistics_request <- function(service = "Normals", version = 0){ + + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> + httr2::req_url_path_append(paste0("v", version)) |> + httr2::req_url_path_append(paste0("observation", service)) + +} + +#' Bind a list of data frames by row +#' +#' Meant to emulate data.table::rbindlist in how it handles missing data +#' +#' @param dfs a list of data frames +#' +#' @return a data frame +#' +#' @noRd +rbind_fill <- function(dfs) { + + # drop NULL elements + dfs <- Filter(Negate(is.null), dfs) + if (length(dfs) == 0) return(NULL) + + # union of all column names + all_names <- unique(unlist(lapply(dfs, names))) + + # align columns + dfs_aligned <- lapply(dfs, function(x) { + missing <- setdiff(all_names, names(x)) + if (length(missing)) { + for (m in missing) x[[m]] <- NA + } + x[all_names] + }) + + # bind + out <- do.call(rbind, dfs_aligned) + rownames(out) <- NULL + out +} + +#' Combine sf metadata with observations from /statistics +#' +#' @param meta_sf a single-row sf data frame object containing metadata about observations +#' @param obs_df a multi-row data frame of observations +#' +#' @return an sf data frame object +#' +#' @noRd +cbind_sf_metadata <- function(meta_sf, obs_df) { + + stopifnot( + inherits(meta_sf, "sf"), + nrow(meta_sf) == 1 + ) + + n <- nrow(obs_df) + + # replicate metadata rows + meta_rep <- meta_sf[rep(1, n), , drop = FALSE] + + # drop geometry before cbind, then restore + geom <- sf::st_geometry(meta_rep) + meta_nogeo <- sf::st_drop_geometry(meta_rep) + + out <- cbind(meta_nogeo, obs_df) + sf::st_sf(out, geometry = geom) +} diff --git a/R/walk_pages.R b/R/walk_pages.R index 39b85ae2..fe6b7850 100644 --- a/R/walk_pages.R +++ b/R/walk_pages.R @@ -167,36 +167,38 @@ cleanup_cols <- function(df, service = "daily"){ } #' Next request URL -#' +#' #' Custom function to find the "next" URL from the API response. 
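#' Two pagination styles are handled: OGC-style responses carry a "links"
#' array with a rel = "next" entry, while the /statistics endpoints return a
#' bare "next" token that is echoed back via the next_token query parameter.
#' For orientation, hypothetical body shapes (field names taken from the
#' handling code below, not from any API documentation):
#'
#'   # OGC style:   list(numberReturned = 10, links = list(list(rel = "next", href = "...")))
#'   # token style: list(data = list(...), `next` = "<opaque token>")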
#' @seealso [httr2::req_perform_iterative] -#' +#' #' @param resp httr2 response from last request #' @param req httr2 request from last time -#' +#' #' @noRd #' @return the url for the next request -#' +#' next_req_url <- function(resp, req) { - body <- httr2::resp_body_json(resp) - + if(isTRUE(body[["numberReturned"]] == 0)){ return(NULL) } - + header_info <- httr2::resp_headers(resp) if(Sys.getenv("API_USGS_PAT") != ""){ message("Remaining requests this hour:", header_info$`x-ratelimit-remaining`, " ") } - - links <- body$links + if ("links" %in% names(body)) { + links <- body$links if(any(sapply(links, function(x) x$rel) == "next")){ - next_index <- which(sapply(links, function(x) x$rel) == "next") - - next_url <- links[[next_index]][["href"]] + next_index <- which(sapply(links, function(x) x$rel) == "next") + + next_url <- links[[next_index]][["href"]] - return(httr2::req_url(req = req, url = next_url)) + return(httr2::req_url(req = req, url = next_url)) + } + } else if (!is.null(body[["next"]])) { + return(httr2::req_url_query(req, next_token = body[["next"]])) } else { return(NULL) } From 7e331a3d25418414f37267afb709ead6d79f2b20 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Tue, 30 Dec 2025 16:08:46 -0700 Subject: [PATCH 2/6] Add data.table version of get_statistics_data function --- DESCRIPTION | 3 +- NAMESPACE | 10 +++ R/dataRetrieval-package.R | 15 ++++ R/read_waterdata_stats.R | 139 ++++++++++++++++++++--------- R/read_waterdata_stats_datatable.R | 111 +++++++++++++++++++++++ 5 files changed, 234 insertions(+), 44 deletions(-) create mode 100644 R/dataRetrieval-package.R create mode 100644 R/read_waterdata_stats_datatable.R diff --git a/DESCRIPTION b/DESCRIPTION index 67a07bc9..dbff23ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,7 +56,8 @@ Imports: jsonlite, httr2, whisker, - sf + sf, + data.table Suggests: covr, dplyr, diff --git a/NAMESPACE b/NAMESPACE index c919f665..8c9606d5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(create_WQP_bib) export(findNLDI) export(getWebServiceData) export(get_nldi_sources) +export(get_statistics_data_rbindlist) export(importNGWMN) export(importRDB1) export(importWQP) @@ -70,3 +71,12 @@ export(whatWQPsamples) export(whatWQPsites) export(wqp_check_status) export(zeroPad) +importFrom(data.table,":=") +importFrom(data.table,.BY) +importFrom(data.table,.EACHI) +importFrom(data.table,.GRP) +importFrom(data.table,.I) +importFrom(data.table,.N) +importFrom(data.table,.NGRP) +importFrom(data.table,.SD) +importFrom(data.table,data.table) diff --git a/R/dataRetrieval-package.R b/R/dataRetrieval-package.R new file mode 100644 index 00000000..35a16fac --- /dev/null +++ b/R/dataRetrieval-package.R @@ -0,0 +1,15 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom data.table .BY +#' @importFrom data.table .EACHI +#' @importFrom data.table .GRP +#' @importFrom data.table .I +#' @importFrom data.table .N +#' @importFrom data.table .NGRP +#' @importFrom data.table .SD +#' @importFrom data.table := +#' @importFrom data.table data.table +## usethis namespace: end +NULL diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index f097f6fb..c0503612 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -28,52 +28,52 @@ #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( - approval_status = NA, - computation_type = NA_character_, - country_code = NA_character_, - state_code = NA_character_, - 
county_code = NA_character_, - start_date = NA_character_, - end_date = NA_character_, - mime_type = NA_character_, - monitoring_location_id = NA_character_, - parent_time_series_id = NA_character_, - site_type_code = NA_character_, - site_type_name = NA_character_, - parameter_code = NA_character_, - next_token = NA_character_, - page_size = NA + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA ) { args <- mget(names(formals())) - + get_statistics_data(args = args, service = "Normals") } #' @export #' @rdname get_waterdata_stats read_waterdata_stats_interval <- function( - approval_status = NA, - computation_type = NA_character_, - country_code = NA_character_, - state_code = NA_character_, - county_code = NA_character_, - start_date = NA_character_, - end_date = NA_character_, - mime_type = NA_character_, - monitoring_location_id = NA_character_, - next_token = NA_character_, - parent_time_series_id = NA_character_, - site_type_code = NA_character_, - site_type_name = NA_character_, - parameter_code = NA_character_, - page_size = NA + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA ){ - + args <- mget(names(formals())) - + get_statistics_data(args = args, service = "Intervals") - + } #' Retrieve data from /statistics API @@ -84,15 +84,15 @@ read_waterdata_stats_interval <- function( #' @noRd #' @return data.frame with attributes get_statistics_data <- function(args, service) { - + base_request <- construct_statistics_request(service = service, version = 0) - + # TODO?: arg type checking here - + full_request <- explode_query(base_request, POST = FALSE, x = args) - + return_list <- walk_pages(full_request, max_results = NA) - + # TODO?: pull the following double-for loop out to other function(s) return_list_tmp <- list() for (i in 1:nrow(return_list)) { @@ -107,18 +107,25 @@ get_statistics_data <- function(args, service) { ) } zz <- do.call(rbind, zz) - + return_list_tmp[[i]] <- cbind_sf_metadata( subset(return_list[i, ], select = -data), zz ) } - + return_list <- do.call(rbind, return_list_tmp) + + if(all(c("values", "percentiles") %in% names(return_list))){ + return_list <- combine_value_columns(return_list) + } + return_list$value <- as.numeric(unlist(return_list$value)) + return_list$percentile <- as.numeric(unlist(return_list$percentile)) + attr(return_list, "request") <- full_request attr(return_list, "queryTime") <- Sys.time() - + return(return_list) } @@ -131,7 +138,7 @@ get_statistics_data <- function(args, service) { #' #' @noRd construct_statistics_request <- function(service = "Normals", version = 0){ - + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> httr2::req_url_path_append(paste0("v", version)) |> 
httr2::req_url_path_append(paste0("observation", service)) @@ -198,3 +205,49 @@ cbind_sf_metadata <- function(meta_sf, obs_df) { out <- cbind(meta_nogeo, obs_df) sf::st_sf(out, geometry = geom) } + +#' Clean up a /statistics data frame +#' +#' Unpacks "values" and "percentiles" nested columns to be one row per value, +#' renames "percentiles" to "percentile", consolidates "value" and "values" into +#' one "value" column +#' +#' @param df a data frame with "value" and "values" columns +#' +#' @return a data frame with a value column and maybe a +#' @noRd +combine_value_columns <- function(df) { + + split_vals <- lapply(df$values, function(x) as.numeric(unlist(x))) + split_pcts <- lapply(df$percentiles, function(x) as.numeric(unlist(x))) + + df$values <- NULL + df$percentiles <- NULL + + expanded <- lapply(seq_len(nrow(df)), function(i) { + + v <- split_vals[[i]] + p <- split_pcts[[i]] + + v[is.nan(v)] <- NA + v <- as.numeric(v) + p <- as.numeric(p) + + # case: single NA → scalar statistic + if (length(v) == 1 && is.na(v)) { + df$percentile <- NA_real_ + return(df[i, , drop = FALSE]) + } + + out <- df[rep(i, length(v)), , drop = FALSE] + + out$value <- v + out$percentile <- p + out + }) + + out <- do.call(rbind, expanded) + rownames(out) <- NULL + out +} + diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R new file mode 100644 index 00000000..857cf43e --- /dev/null +++ b/R/read_waterdata_stats_datatable.R @@ -0,0 +1,111 @@ +get_statistics_data_data_table <- function(args, service) { + + base_request <- construct_statistics_request(service = service, version = 0) + + # TODO?: arg type checking here + + full_request <- explode_query(base_request, POST = FALSE, x = args) + + return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + return_list[, rid := .I] + + parsed_data <- lapply(return_list$data, jsonlite::parse_json) + return_list[, data := NULL] + + combined_list <- list() + for (i in seq_along(parsed_data)){ + time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + + observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) + observations <- observations[, eval(clean_value_cols()), by = .I][, I := NULL] + time_series_metainfo[, values := NULL] + + time_series_metainfo <- cbind(time_series_metainfo, observations) + time_series_metainfo[, rid := i] + + combined_list[[i]] <- time_series_metainfo + } + + combined <- data.table::rbindlist(combined_list) + combined <- combined[return_list, on = "rid"] + + col_order <- c("parent_time_series_id", "monitoring_location_id", "monitoring_location_name", "parameter_code", + "interval_type", "start_date", "end_date", "computation", "value", "percentile", + "unit_of_measure", "sample_count", + "approval_status", "computation_id", + "site_type", "site_type_code", + "country_code", "state_code", "county_code", + "geometry") + data.table::setcolorder(combined, col_order) + + attr(combined, "request") <- full_request + attr(combined, "queryTime") <- Sys.time() + + return(sf::st_as_sf(combined)) +} + +#' Clean up value, values, and percentiles columns in a data.table +clean_value_cols <- function() { + quote({ + + ## ---- detect column presence ---- + has_value <- exists("value") + has_values <- exists("values") + has_percentiles <- exists("percentiles") + + ## ---- values ---- + vals <- if (has_values && + !is.null(values[[1]]) && + length(values[[1]]) > 0) { + + v <- values[[1]] + if (length(v) == 1L && identical(v, "nan")) { + NA_real_ + } else { + as.numeric(v) 
+ } + + } else if (has_value) { + as.numeric(value) + } else { + NA_real_ + } + + n <- length(vals) + + ## ---- percentiles ---- + percs <- if (has_percentiles && + !is.null(percentiles[[1]]) && + length(percentiles[[1]]) > 0) { + + as.numeric(percentiles[[1]]) + + } else if (data.table::`%chin%`(computation,c("minimum", "medium", "maximum"))) { + + fifelse( + computation == "minimum", 0, + fifelse(computation == "medium", 50, 100) + ) + + } else { + NA_real_ + } + + if (length(percs) == 1L && n > 1L) { + percs <- rep(percs, n) + } + + ## ---- expand scalar columns ---- + .( + value = vals, + percentile = percs, + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n), + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n) + ) + }) +} \ No newline at end of file From 37b9987493b43b1359022f476b18486a24560287 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Tue, 30 Dec 2025 16:54:11 -0700 Subject: [PATCH 3/6] Add some documentation to internal clean_value_cols() function --- NAMESPACE | 1 - R/read_waterdata_stats_datatable.R | 26 +++++++++- man/get_waterdata_stats.Rd | 82 ++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 man/get_waterdata_stats.Rd diff --git a/NAMESPACE b/NAMESPACE index 8c9606d5..6999f2b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,7 +17,6 @@ export(create_WQP_bib) export(findNLDI) export(getWebServiceData) export(get_nldi_sources) -export(get_statistics_data_rbindlist) export(importNGWMN) export(importRDB1) export(importWQP) diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R index 857cf43e..41eb608a 100644 --- a/R/read_waterdata_stats_datatable.R +++ b/R/read_waterdata_stats_datatable.R @@ -1,3 +1,4 @@ +#' get_statistics_data_data_table <- function(args, service) { base_request <- construct_statistics_request(service = service, version = 0) @@ -44,7 +45,30 @@ get_statistics_data_data_table <- function(args, service) { return(sf::st_as_sf(combined)) } -#' Clean up value, values, and percentiles columns in a data.table +#' Clean up "value", "values", and "percentiles" columns in a data.table +#' +#' @description +#' If the input data.table has "values" and "percentiles" columns, then +#' it unnests both, making the data.table longer (one row per value) +#' +#' If the input data.table *also* has a "value" column, then it consolidates with "values", yielding a single "value" column +#' +#' The "percentiles" column is replaced by "percentile" (matching singularization of "value"). +#' The function also checks whether the "computation" column contains "minimum", "median", or "maximum" and +#' sets the corresponding "percentile" to 0, 50, or 100, respectively. Note that the percentile column might only +#' contain NAs if the input data.table only includes arithmetic_mean values +#' +#' Lastly, the value and percentile columns are converted from character to numeric +#' +#' +#' @note +#' This function is intended to evaluate as a data.table j expression, which is why it doesn't accept any arguments. +#' j expressions are typically written inline with the data.table definition (e.g., DT[, { do something }]). +#' However, this expression would have been excessively long. 
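+#' To illustrate (with a hypothetical table name), the inline form would have
+#' read roughly:
+#'
+#'   observations[, {
+#'     vals <- as.numeric(unlist(values))
+#'     .(value = vals, percentile = as.numeric(unlist(percentiles)))
+#'   }, by = .I]
+#'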
Instead, an eval(quote(...)) is used so this could be +#' pulled out into a separate function for readability, but we still gain the computational benefits of using an expression. +#' +#' @noRd +#' @return data.table object with "value" and "percentile" columns clean_value_cols <- function() { quote({ diff --git a/man/get_waterdata_stats.Rd b/man/get_waterdata_stats.Rd new file mode 100644 index 00000000..916396fe --- /dev/null +++ b/man/get_waterdata_stats.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_waterdata_stats.R +\name{read_waterdata_stats_normal} +\alias{read_waterdata_stats_normal} +\alias{read_waterdata_stats_interval} +\title{Get summary statistic data from /statistics API} +\usage{ +read_waterdata_stats_normal( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA +) + +read_waterdata_stats_interval( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA +) +} +\arguments{ +\item{approval_status}{asdf} + +\item{computation_type}{asdf} + +\item{country_code}{asdf} + +\item{state_code}{asdf} + +\item{county_code}{asdf} + +\item{start_date}{asdf} + +\item{end_date}{asdf} + +\item{mime_type}{asdf} + +\item{monitoring_location_id}{asfd} + +\item{parent_time_series_id}{aasdf} + +\item{site_type_code}{asdf} + +\item{site_type_name}{asdf} + +\item{parameter_code}{asf} + +\item{next_token}{asdf} + +\item{page_size}{asdf} +} +\description{ +\code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. + +\code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. +} +\seealso{ +\url{https://api.waterdata.usgs.gov/statistics/v0/docs} +} From a59b1fa5e58984ccb0aef47c5d8a35f581a69046 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Wed, 31 Dec 2025 11:38:45 -0700 Subject: [PATCH 4/6] Move data table implementation to read_statistics_data, update documentation --- R/read_waterdata_stats.R | 334 ++++++++++++++--------------- R/read_waterdata_stats_datatable.R | 135 ------------ 2 files changed, 166 insertions(+), 303 deletions(-) delete mode 100644 R/read_waterdata_stats_datatable.R diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index c0503612..45a45316 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -1,30 +1,66 @@ #' Get summary statistic data from /statistics API -#' -#' @description -#' -#' \code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. -#' -#' \code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. 
-#'
+#' @description
+#'
+#' This service provides endpoints for access to computations on the historical
+#' record regarding water conditions. For more information regarding the
+#' calculation of statistics and other details, please visit the Statistics
+#' documentation page.
+#'
+#' Note: This API is under active beta development and subject to change.
+#' Improved handling of significant figures will be addressed in a future
+#' release.
+#'
+#' \code{read_waterdata_stats_normal} returns day-of-year and month-of-year
+#' statistics matching your query.
+#'
+#' \code{read_waterdata_stats_interval} returns monthly and annual statistics
+#' matching your query.
+#'
 #' @export
-#'
-#' @param approval_status asdf
-#' @param computation_type asdf
-#' @param country_code asdf
-#' @param state_code asdf
-#' @param county_code asdf
-#' @param start_date asdf
-#' @param end_date asdf
-#' @param mime_type asdf
-#' @param monitoring_location_id asfd
-#' @param next_token asdf
-#' @param parent_time_series_id aasdf
-#' @param site_type_code asdf
-#' @param site_type_name asdf
-#' @param parameter_code asf
-#' @param next_token asdf
-#' @param page_size asdf
-#'
+#'
+#' @param approval_status Whether to include approved and/or provisional
+#' observations. At this time, only approved observations are returned.
+#' @param computation_type Desired statistical computation method. Available
+#' values: "arithmetic_mean", "maximum", "median", "minimum", "percentile".
+#' @param country_code Country Query Parameter. Accepts multiple values (see
+#' examples). If one of country, county, or state code is supplied, then the
+#' other two arguments do not need to be specified.
+#' @param state_code State Query Parameter. Accepts multiple values in a
+#' character vector.
+#' @param county_code County Query Parameter. Accepts multiple values in a
+#' character vector.
+#' @param start_date Start Date Query Parameter. The logic is inclusive, i.e., it
+#' will also return records that match the date. If an end date is supplied,
+#' but no start date is supplied, then statistics will be supplied for the
+#' entire period of record ending with the end date. If an end date is not
+#' supplied, but a start date is supplied, then statistics will be supplied for
+#' the period of record following the start date. If no start or end date are
+#' supplied, then statistics will be supplied for the entire period of record.
+#' @param end_date End Date Query Parameter. The logic is inclusive, i.e., it will
+#' also return records that match the date.
+#' @param monitoring_location_id Each monitoring location has been assigned a
+#' unique station number that places them in downstream order. Accepts
+#' multiple values in a character vector.
+#' @param parent_time_series_id The parent_time_series_id returns statistics
+#' tied to a particular database entry. Accepts multiple values in a character
+#' vector. If no parent time series identifier is supplied, then all records
+#' matching the rest of the provided criteria will be returned.
+#' @param site_type_code Site Type Code Query Parameter. Accepts multiple values
+#' in a character vector. If no Site Type code is specified, statistics of all
+#' site types with the matching Monitoring Location Identifier will be
+#' returned.
+#' @param site_type_name Site Type Name Query Parameter. If no Site Type name is
+#' specified, statistics of all site types with the matching Monitoring
+#' Location Identifier will be returned.
+#' @param parameter_code USGS Parameter Code Query Parameter. Accepts multiple +#' values in a character vector. If no USGS parameter code is specified, but a +#' Monitoring Location Identifier is supplied, then all statistics and their +#' parameter codes with a matching monitoring location identifier will be +#' returned. All statistics within the period of record will be returned if no +#' parameter code or monitoring location identifier are specified. +#' @param page_size Return a defined number of results (default: 1000). +#' #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( @@ -35,13 +71,11 @@ read_waterdata_stats_normal <- function( county_code = NA_character_, start_date = NA_character_, end_date = NA_character_, - mime_type = NA_character_, monitoring_location_id = NA_character_, parent_time_series_id = NA_character_, site_type_code = NA_character_, site_type_name = NA_character_, parameter_code = NA_character_, - next_token = NA_character_, page_size = NA ) { @@ -79,7 +113,7 @@ read_waterdata_stats_interval <- function( #' Retrieve data from /statistics API #' #' @param args arguments from individual functions. -#' @param service Ednpoint name. +#' @param service Endpoint name. #' #' @noRd #' @return data.frame with attributes @@ -91,163 +125,127 @@ get_statistics_data <- function(args, service) { full_request <- explode_query(base_request, POST = FALSE, x = args) - return_list <- walk_pages(full_request, max_results = NA) - - # TODO?: pull the following double-for loop out to other function(s) - return_list_tmp <- list() - for (i in 1:nrow(return_list)) { - x <- jsonlite::parse_json(return_list$data[i]) - z <- data.frame(rbind_fill(x)) - zz <- list() - for (j in 1:nrow(z)) { - zz[[j]] <- cbind( - subset(z[j, ], select = -values), - rbind_fill(z$values[[j]]), - row.names = NULL - ) - } - zz <- do.call(rbind, zz) - - return_list_tmp[[i]] <- cbind_sf_metadata( - subset(return_list[i, ], select = -data), - zz - ) - } + return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + return_list[, rid := .I] - return_list <- do.call(rbind, return_list_tmp) + parsed_data <- lapply(return_list$data, jsonlite::parse_json) + return_list[, data := NULL] - if(all(c("values", "percentiles") %in% names(return_list))){ - return_list <- combine_value_columns(return_list) + combined_list <- list() + for (i in seq_along(parsed_data)){ + time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + + observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) + observations <- observations[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] + time_series_metainfo[, values := NULL] + + time_series_metainfo <- cbind(time_series_metainfo, observations) + time_series_metainfo[, rid := i] + + combined_list[[i]] <- time_series_metainfo } - - return_list$value <- as.numeric(unlist(return_list$value)) - return_list$percentile <- as.numeric(unlist(return_list$percentile)) - - attr(return_list, "request") <- full_request - attr(return_list, "queryTime") <- Sys.time() - return(return_list) -} - -#' Create a request object for the /statistics service -#' -#' @param service chr; "Normals" or "Intervals" -#' @param version int; /statistics API version number (default: 0) -#' -#' @return a httr2 request object -#' -#' @noRd -construct_statistics_request <- function(service = "Normals", version = 0){ + combined <- data.table::rbindlist(combined_list) + combined <- 
combined[return_list, on = "rid"][, rid := NULL] - httr2::request("https://api.waterdata.usgs.gov/statistics/") |> - httr2::req_url_path_append(paste0("v", version)) |> - httr2::req_url_path_append(paste0("observation", service)) + attr(combined, "request") <- full_request + attr(combined, "queryTime") <- Sys.time() + return(sf::st_as_sf(as.data.frame(combined))) } -#' Bind a list of data frames by row -#' -#' Meant to emulate data.table::rbindlist in how it handles missing data -#' -#' @param dfs a list of data frames -#' -#' @return a data frame -#' -#' @noRd -rbind_fill <- function(dfs) { - - # drop NULL elements - dfs <- Filter(Negate(is.null), dfs) - if (length(dfs) == 0) return(NULL) - - # union of all column names - all_names <- unique(unlist(lapply(dfs, names))) - - # align columns - dfs_aligned <- lapply(dfs, function(x) { - missing <- setdiff(all_names, names(x)) - if (length(missing)) { - for (m in missing) x[[m]] <- NA - } - x[all_names] - }) - - # bind - out <- do.call(rbind, dfs_aligned) - rownames(out) <- NULL - out -} - -#' Combine sf metadata with observations from /statistics -#' -#' @param meta_sf a single-row sf data frame object containing metadata about observations -#' @param obs_df a multi-row data frame of observations -#' -#' @return an sf data frame object -#' -#' @noRd -cbind_sf_metadata <- function(meta_sf, obs_df) { - - stopifnot( - inherits(meta_sf, "sf"), - nrow(meta_sf) == 1 - ) - - n <- nrow(obs_df) - - # replicate metadata rows - meta_rep <- meta_sf[rep(1, n), , drop = FALSE] - - # drop geometry before cbind, then restore - geom <- sf::st_geometry(meta_rep) - meta_nogeo <- sf::st_drop_geometry(meta_rep) - - out <- cbind(meta_nogeo, obs_df) - sf::st_sf(out, geometry = geom) -} - -#' Clean up a /statistics data frame +#' Clean up "value", "values", and "percentiles" columns in a data.table +#' +#' @description If the input data.table has "values" and "percentiles" columns, +#' then it unnests both, making the data.table longer (one row per value) +#' +#' If the input data.table *also* has a "value" column, then it consolidates +#' with "values", yielding a single "value" column #' -#' Unpacks "values" and "percentiles" nested columns to be one row per value, -#' renames "percentiles" to "percentile", consolidates "value" and "values" into -#' one "value" column +#' The "percentiles" column is replaced by "percentile" (matching +#' singularization of "value"). The function also checks whether the +#' "computation" column contains "minimum", "median", or "maximum" and sets +#' the corresponding "percentile" to 0, 50, or 100, respectively. Note that +#' the percentile column might only contain NAs if the input data.table only +#' includes arithmetic_mean values. #' -#' @param df a data frame with "value" and "values" columns +#' Lastly, the value and percentile columns are converted from character to +#' numeric +#' +#' @note This function is intended to evaluate as a data.table j expression, +#' which is why it doesn't accept any arguments. j expressions are typically +#' written inline with the data.table definition (e.g., \code{DT[, { do +#' something }]}). However, this expression would have been excessively long. +#' Instead, a substitute() is used so this could be pulled out into a separate +#' function for readability, but we still gain the computational benefits of +#' using an expression. 
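+#'
+#' A toy sketch of the env= substitution pattern used at the call site (the
+#' table and j expression here are made up, not package code); data.table
+#' receives a literal expression it can inspect and optimize:
+#'
+#'   j_expr <- substitute(.(doubled = x * 2))
+#'   DT <- data.table::data.table(x = 1:3)
+#'   DT[, .j, env = list(.j = j_expr)]   # equivalent to DT[, .(doubled = x * 2)]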
#' -#' @return a data frame with a value column and maybe a #' @noRd -combine_value_columns <- function(df) { - - split_vals <- lapply(df$values, function(x) as.numeric(unlist(x))) - split_pcts <- lapply(df$percentiles, function(x) as.numeric(unlist(x))) - - df$values <- NULL - df$percentiles <- NULL - - expanded <- lapply(seq_len(nrow(df)), function(i) { +#' @return data.table object with "value" and "percentile" columns +#' +#' @seealso +#' \url{https://stat.ethz.ch/CRAN/web/packages/data.table/vignettes/datatable-programming.html} +clean_value_cols <- function() { + substitute({ + + ## ---- detect column presence ---- + has_value <- exists("value") + has_values <- exists("values") + has_percentiles <- exists("percentiles") - v <- split_vals[[i]] - p <- split_pcts[[i]] + ## ---- values ---- + vals <- if (has_values && + !is.null(values[[1]]) && + length(values[[1]]) > 0) { + + v <- values[[1]] + if (length(v) == 1L && identical(v, "nan")) { + NA_real_ + } else { + as.numeric(v) + } + + } else if (has_value) { + as.numeric(value) + } else { + NA_real_ + } - v[is.nan(v)] <- NA - v <- as.numeric(v) - p <- as.numeric(p) + n <- length(vals) - # case: single NA → scalar statistic - if (length(v) == 1 && is.na(v)) { - df$percentile <- NA_real_ - return(df[i, , drop = FALSE]) + ## ---- percentiles ---- + percs <- if (has_percentiles && + !is.null(percentiles[[1]]) && + length(percentiles[[1]]) > 0) { + + as.numeric(percentiles[[1]]) + + } else if (data.table::`%chin%`(computation,c("minimum", "median", "maximum"))) { + + data.table::fifelse( + computation == "minimum", 0, + data.table::fifelse(computation == "median", 50, 100) + ) + + } else { + NA_real_ } - out <- df[rep(i, length(v)), , drop = FALSE] + if (length(percs) == 1L && n > 1L) { + percs <- rep(percs, n) + } - out$value <- v - out$percentile <- p - out + ## ---- expand scalar columns ---- + .( + value = vals, + percentile = percs, + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n), + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n) + ) }) - - out <- do.call(rbind, expanded) - rownames(out) <- NULL - out } - diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R deleted file mode 100644 index 41eb608a..00000000 --- a/R/read_waterdata_stats_datatable.R +++ /dev/null @@ -1,135 +0,0 @@ -#' -get_statistics_data_data_table <- function(args, service) { - - base_request <- construct_statistics_request(service = service, version = 0) - - # TODO?: arg type checking here - - full_request <- explode_query(base_request, POST = FALSE, x = args) - - return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) - return_list[, rid := .I] - - parsed_data <- lapply(return_list$data, jsonlite::parse_json) - return_list[, data := NULL] - - combined_list <- list() - for (i in seq_along(parsed_data)){ - time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) - - observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) - observations <- observations[, eval(clean_value_cols()), by = .I][, I := NULL] - time_series_metainfo[, values := NULL] - - time_series_metainfo <- cbind(time_series_metainfo, observations) - time_series_metainfo[, rid := i] - - combined_list[[i]] <- time_series_metainfo - } - - combined <- data.table::rbindlist(combined_list) - combined <- combined[return_list, on = "rid"] - - col_order <- 
c("parent_time_series_id", "monitoring_location_id", "monitoring_location_name", "parameter_code", - "interval_type", "start_date", "end_date", "computation", "value", "percentile", - "unit_of_measure", "sample_count", - "approval_status", "computation_id", - "site_type", "site_type_code", - "country_code", "state_code", "county_code", - "geometry") - data.table::setcolorder(combined, col_order) - - attr(combined, "request") <- full_request - attr(combined, "queryTime") <- Sys.time() - - return(sf::st_as_sf(combined)) -} - -#' Clean up "value", "values", and "percentiles" columns in a data.table -#' -#' @description -#' If the input data.table has "values" and "percentiles" columns, then -#' it unnests both, making the data.table longer (one row per value) -#' -#' If the input data.table *also* has a "value" column, then it consolidates with "values", yielding a single "value" column -#' -#' The "percentiles" column is replaced by "percentile" (matching singularization of "value"). -#' The function also checks whether the "computation" column contains "minimum", "median", or "maximum" and -#' sets the corresponding "percentile" to 0, 50, or 100, respectively. Note that the percentile column might only -#' contain NAs if the input data.table only includes arithmetic_mean values -#' -#' Lastly, the value and percentile columns are converted from character to numeric -#' -#' -#' @note -#' This function is intended to evaluate as a data.table j expression, which is why it doesn't accept any arguments. -#' j expressions are typically written inline with the data.table definition (e.g., DT[, { do something }]). -#' However, this expression would have been excessively long. Instead, an eval(quote(...)) is used so this could be -#' pulled out into a separate function for readability, but we still gain the computational benefits of using an expression. -#' -#' @noRd -#' @return data.table object with "value" and "percentile" columns -clean_value_cols <- function() { - quote({ - - ## ---- detect column presence ---- - has_value <- exists("value") - has_values <- exists("values") - has_percentiles <- exists("percentiles") - - ## ---- values ---- - vals <- if (has_values && - !is.null(values[[1]]) && - length(values[[1]]) > 0) { - - v <- values[[1]] - if (length(v) == 1L && identical(v, "nan")) { - NA_real_ - } else { - as.numeric(v) - } - - } else if (has_value) { - as.numeric(value) - } else { - NA_real_ - } - - n <- length(vals) - - ## ---- percentiles ---- - percs <- if (has_percentiles && - !is.null(percentiles[[1]]) && - length(percentiles[[1]]) > 0) { - - as.numeric(percentiles[[1]]) - - } else if (data.table::`%chin%`(computation,c("minimum", "medium", "maximum"))) { - - fifelse( - computation == "minimum", 0, - fifelse(computation == "medium", 50, 100) - ) - - } else { - NA_real_ - } - - if (length(percs) == 1L && n > 1L) { - percs <- rep(percs, n) - } - - ## ---- expand scalar columns ---- - .( - value = vals, - percentile = percs, - start_date = rep(start_date, n), - end_date = rep(end_date, n), - interval_type = rep(interval_type, n), - sample_count = rep(sample_count, n), - approval_status = rep(approval_status, n), - computation_id = rep(computation_id, n), - computation = rep(computation, n) - ) - }) -} \ No newline at end of file From 8875de4a6d92e4fa7a19e6887110e6381971a96d Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Wed, 31 Dec 2025 15:36:50 -0700 Subject: [PATCH 5/6] Generalize to accommodate Normals and Intervals output, handle empty responses, fix data.table joining. 
--- R/read_waterdata_stats.R | 166 ++++++++++++++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 21 deletions(-) diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index 45a45316..77688900 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -61,6 +61,12 @@ #' parameter code or monitoring location identifier are specified. #' @param page_size Return a defined number of results (default: 1000). #' +#' @examplesIf is_dataRetrieval_user() +#' +#' \donttest{ +#' +#' } +#' #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( @@ -110,6 +116,22 @@ read_waterdata_stats_interval <- function( } +#' Create a request object for the /statistics service +#' +#' @param service chr; "Normals" or "Intervals" +#' @param version int; /statistics API version number (default: 0) +#' +#' @return a httr2 request object +#' +#' @noRd +construct_statistics_request <- function(service = "Normals", version = 0){ + + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> + httr2::req_url_path_append(paste0("v", version)) |> + httr2::req_url_path_append(paste0("observation", service)) + +} + #' Retrieve data from /statistics API #' #' @param args arguments from individual functions. @@ -126,6 +148,11 @@ get_statistics_data <- function(args, service) { full_request <- explode_query(base_request, POST = FALSE, x = args) return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + + if(nrow(return_list) == 0) { + return(deal_with_empty_stats(return_list)) + } + return_list[, rid := .I] parsed_data <- lapply(return_list$data, jsonlite::parse_json) @@ -133,25 +160,55 @@ get_statistics_data <- function(args, service) { combined_list <- list() for (i in seq_along(parsed_data)){ - time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + # 1. One row per time series + ts_meta <- data.table::rbindlist(parsed_data[[i]], fill = TRUE) + ts_meta[, ts_id := .I] # local key + + # 2. One row per observation, keyed to ts_id + obs <- data.table::rbindlist(ts_meta$values, fill = TRUE, idcol = "ts_id") + obs <- obs[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] - observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) - observations <- observations[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] - time_series_metainfo[, values := NULL] + # 3. Drop nested list column + ts_meta[, values := NULL] - time_series_metainfo <- cbind(time_series_metainfo, observations) - time_series_metainfo[, rid := i] + # 4. 
Join + out <- ts_meta[obs, on = "ts_id", allow.cartesian = TRUE] - combined_list[[i]] <- time_series_metainfo + out[, `:=`( + ts_id = NULL, + rid = i + )] + + combined_list[[i]] <- out } combined <- data.table::rbindlist(combined_list) - combined <- combined[return_list, on = "rid"][, rid := NULL] + combined <- combined[return_list, on = "rid"] + combined[, rid := NULL] + + combined <- sf::st_as_sf(as.data.frame(combined)) attr(combined, "request") <- full_request attr(combined, "queryTime") <- Sys.time() - return(sf::st_as_sf(as.data.frame(combined))) + return(combined) +} + +#' +normalize_value_cols <- function(x) { + if (!is.null(x$value)) { + list( + value = as.numeric(x$value), + values = NA_real_, + percentiles = NA_real_ + ) + } else { + list( + value = NA_real_, + values = as.numeric(x$values), + percentiles = as.numeric(x$percentiles) + ) + } } #' Clean up "value", "values", and "percentiles" columns in a data.table @@ -193,6 +250,9 @@ clean_value_cols <- function() { has_values <- exists("values") has_percentiles <- exists("percentiles") + has_date_schema <- exists("start_date") && exists("end_date") && exists("interval_type") + has_toy_schema <- exists("time_of_year") && exists("time_of_year_type") + ## ---- values ---- vals <- if (has_values && !is.null(values[[1]]) && @@ -220,7 +280,7 @@ clean_value_cols <- function() { as.numeric(percentiles[[1]]) - } else if (data.table::`%chin%`(computation,c("minimum", "median", "maximum"))) { + } else if (data.table::`%chin%`(computation, c("minimum", "median", "maximum"))) { data.table::fifelse( computation == "minimum", 0, @@ -235,17 +295,81 @@ clean_value_cols <- function() { percs <- rep(percs, n) } - ## ---- expand scalar columns ---- - .( - value = vals, - percentile = percs, - start_date = rep(start_date, n), - end_date = rep(end_date, n), - interval_type = rep(interval_type, n), - sample_count = rep(sample_count, n), - approval_status = rep(approval_status, n), - computation_id = rep(computation_id, n), - computation = rep(computation, n) + ## ---- time columns (schema-dependent) ---- + time_cols <- if (has_date_schema) { + list( + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n) + ) + } else if (has_toy_schema) { + list( + time_of_year = rep(time_of_year, n), + time_of_year_type = rep(time_of_year_type, n) + ) + } else { + list() + } + + ## ---- assemble ---- + c( + list( + value = vals, + percentile = percs, + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n), + ts_id = rep(ts_id, n) + ), + time_cols ) }) } + +#' Handle empty responses from the /statistics service +#' +#' @param return_list data.frame returned from walk_pages +#' @param properties character vector of requested columns (or NA) +#' @param convertType logical, whether to convert numeric columns +#' +#' @return data.frame or sf object with expected columns +#' @noRd +deal_with_empty_stats <- function(return_list, properties = NA, + convertType = TRUE) { + + # Define default columns for stats service + default_columns <- c( + "monitoring_location_id", "monitoring_location_name", + "site_type", "site_type_code", "country_code", "state_code", + "county_code", "parameter_code", "unit_of_measure", "parent_time_series_id", + "value", "percentile", "start_date", "end_date", "interval_type", + "sample_count", "approval_status", "computation_id", "computation" + ) + + if (all(is.na(properties))) { + properties 
<- default_columns
+  }
+
+  # create empty data.frame
+  return_list <- as.data.frame(matrix(nrow = 0, ncol = length(properties)))
+  names(return_list) <- properties
+  return_list <- lapply(return_list, as.character)
+  return_list <- as.data.frame(return_list)
+
+  # convert numeric columns if requested
+  if (convertType) {
+    numeric_cols <- c("value", "percentile", "sample_count")
+    for (col in numeric_cols) {
+      if (col %in% names(return_list)) {
+        return_list[[col]] <- as.numeric(return_list[[col]])
+      }
+    }
+  }
+
+  # ensure an empty geometry column exists
+  return_list <- sf::st_sf(return_list, geometry = sf::st_sfc())
+
+  return(return_list)
+}
+

From e143efbb73d5b7460946120a7509f2b9f9d22156 Mon Sep 17 00:00:00 2001
From: Joseph Zemmels
Date: Wed, 31 Dec 2025 15:49:35 -0700
Subject: [PATCH 6/6] Unit tests

---
 tests/testthat/test_waterdata_stats.R | 290 ++++++++++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 tests/testthat/test_waterdata_stats.R

diff --git a/tests/testthat/test_waterdata_stats.R b/tests/testthat/test_waterdata_stats.R
new file mode 100644
index 00000000..7c4149e3
--- /dev/null
+++ b/tests/testthat/test_waterdata_stats.R
@@ -0,0 +1,290 @@
+# ------------------------------------------------------------------------------
+# clean_value_cols()
+# ------------------------------------------------------------------------------
+
+test_that("values column is unnested and converted to numeric", {
+  DT <- data.table::data.table(
+    values = list(c("1.2", "3.4")),
+    percentiles = list(c("10", "90")),
+    computation = "percentile",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 10,
+    approval_status = "approved",
+    computation_id = "abc"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$value, c(1.2, 3.4))
+  expect_equal(out$percentile, c(10, 90))
+})
+
+test_that("value column is used when values is absent", {
+  DT <- data.table::data.table(
+    value = "2.5",
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    sample_count = 1,
+    ts_id = "1",
+    approval_status = "approved",
+    computation_id = "xyz"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$value, 2.5)
+  expect_true(is.na(out$percentile))
+})
+
+test_that("'nan' values are converted to NA_real_", {
+  DT <- data.table::data.table(
+    values = list("nan"),
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = "nan-test"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_true(is.na(out$value))
+})
+
+test_that("percentile is inferred from computation type", {
+  DT <- data.table::data.table(
+    value = "5",
+    computation = c("minimum", "median", "maximum"),
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = 1:3
+  )
+
+  out <- DT[, .j, by = computation, env = list(.j = clean_value_cols())]
+
+  expect_equal(
+    out[order(computation)]$percentile,
+    c(100, 50, 0) # maximum, median, minimum
+  )
+})
+
+test_that("scalar percentile is recycled to match values length", {
+  DT <- data.table::data.table(
+    values = list(c("1", "2", "3")),
+    percentiles = list("50"),
+    computation = "percentile",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 3,
+    approval_status = "approved",
+    computation_id = "recycle"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$percentile, c(50, 50, 50))
+})
+
+test_that("missing value and values yields NA value", {
+  DT <- data.table::data.table(
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = "missing"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_true(is.na(out$value))
+})
+
+# ------------------------------------------------------------------------------
+# construct_statistics_request()
+# ------------------------------------------------------------------------------
+
+test_that("construct_statistics_request builds correct URL path", {
+  req <- construct_statistics_request(service = "Normals", version = 0)
+
+  expect_true(grepl("statistics/v0/observationNormals", req$url))
+})
+
+test_that("Intervals service path is correct", {
+  req <- construct_statistics_request(service = "Intervals", version = 0)
+
+  expect_true(grepl("statistics/v0/observationIntervals", req$url))
+})
+
+# ------------------------------------------------------------------------------
+# explode_query integration (no HTTP)
+# ------------------------------------------------------------------------------
+
+test_that("arguments are passed as query parameters", {
+  args <- list(
+    state_code = "WI",
+    computation_type = "median",
+    page_size = 10
+  )
+
+  req <- construct_statistics_request("Normals", 0)
+  full <- explode_query(req, POST = FALSE, x = args)
+
+  qs <- full$url
+
+  expect_match(qs, "state_code=WI")
+  expect_match(qs, "computation_type=median")
+  expect_match(qs, "page_size=10")
+})
+
+# ------------------------------------------------------------------------------
+# get_statistics_data() (mocked)
+# ------------------------------------------------------------------------------
+
+test_that("get_statistics_data returns sf object with attributes", {
+  fake_page <-
+    sf::st_sf(
+      monitoring_location_id = c("USGS-01432160", "USGS-01432110"),
+      monitoring_location_name = c(
+        "DELAWARE RIVER AT BARRYVILLE NY",
+        "Lackawaxen River at Rowland, PA"
+      ),
+      site_type = c("Stream", "Stream"),
+      site_type_code = c("ST", "ST"),
+      country_code = c("US", "US"),
+      state_code = c("42", "42"),
+      county_code = c("103", "103"),
+      geometry = sf::st_sfc(
+        sf::st_point(c(0, 0)),
+        sf::st_point(c(1, 1))
+      ),
+      data = c(
+        # First site
+        jsonlite::toJSON(list(
+          list(
+            parameter_code = "00060",
+            unit_of_measure = "ft^3/s",
+            parent_time_series_id = "1692e9a29c8c4276add4497c5da872fa",
+            values = list(
+              list(
+                start_date = "2018-10-01",
+                end_date = "2018-10-31",
+                interval_type = "month",
+                value = "8032.903",
+                sample_count = 31,
+                approval_status = "approved",
+                computation_id = "d98ebe80-d476-4144-b7ce-c6ca32f015de",
+                computation = "arithmetic_mean"
+              ),
+              list(
+                start_date = "2018-11-01",
+                end_date = "2018-11-30",
+                interval_type = "month",
+                value = "12521.667",
+                sample_count = 30,
+                approval_status = "approved",
+                computation_id = "93f45097-93ad-4b48-83d4-b27b5fc137dc",
+                computation = "arithmetic_mean"
+              )
+            )
+          )
+        ), auto_unbox = TRUE),
+        # Second site
+        jsonlite::toJSON(list(
+          list(
+            parameter_code = "00060",
+            unit_of_measure = "ft^3/s",
+            parent_time_series_id = "7dda388ea270420dbe7324e56b6f907f",
+            values = list(
+              list(
+                start_date = "2007-07-27",
+                end_date = "2007-07-31",
+                interval_type = "month",
+                values = list("1", "2"),
+                percentiles = list("50", "100"),
+                sample_count = 5,
+                approval_status = "approved",
+                computation_id = "4579653d-f26f-4e9f-99ea-c3fa3fc68686",
+                computation = "arithmetic_mean"
+              )
+            )
+          )
+        ), auto_unbox = TRUE)
+      ),
+      stringsAsFactors = FALSE
+    )
+
+  with_mocked_bindings(
+    # walk_pages() returns a data.frame of pages, so the mock hands back the
+    # fake page directly rather than wrapping it in a list
+    walk_pages = function(...) fake_page,
+    {
+      out <- get_statistics_data(list(), "Normals")
+
+      expect_s3_class(out, "sf")
+      expect_true("request" %in% names(attributes(out)))
+      expect_true("queryTime" %in% names(attributes(out)))
+      expect_equal(nrow(out), 4)
+    }
+  )
+})
+
+test_that("get_statistics_data handles empty response", {
+  with_mocked_bindings(
+    walk_pages = function(...) data.frame(),
+    {
+      out <- get_statistics_data(list(), "Normals")
+
+      expect_s3_class(out, "sf")
+      expect_equal(nrow(out), 0)
+    }
+  )
+})
+
+# ------------------------------------------------------------------------------
+# High-level API tests (skipped on CRAN)
+# ------------------------------------------------------------------------------
+
+test_that("read_waterdata_stats_normal returns data", {
+  skip_on_cran()
+  skip_if_offline()
+
+  out <- read_waterdata_stats_normal(
+    monitoring_location_id = "USGS-01646500",
+    parameter_code = "00060",
+    computation_type = "median",
+    page_size = 5
+  )
+
+  expect_s3_class(out, "sf")
+  expect_true(nrow(out) > 0)
+})
+
+test_that("read_waterdata_stats_interval returns data", {
+  skip_on_cran()
+  skip_if_offline()
+
+  out <- read_waterdata_stats_interval(
+    monitoring_location_id = "USGS-01646500",
+    parameter_code = "00060",
+    computation_type = "maximum",
+    page_size = 5
+  )
+
+  expect_s3_class(out, "sf")
+  expect_true(nrow(out) > 0)
+})