From 2865d8bc59904850fee54ca1dbe6d693458a38a5 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Mon, 29 Dec 2025 16:24:59 -0700 Subject: [PATCH 1/6] First pass at statistics functions --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/read_waterdata_stats.R | 200 +++++++++++++++++++++++++++++++++++++++ R/walk_pages.R | 28 +++--- 4 files changed, 218 insertions(+), 14 deletions(-) create mode 100644 R/read_waterdata_stats.R diff --git a/DESCRIPTION b/DESCRIPTION index e8ffb2c1..67a07bc9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,5 +67,5 @@ Encoding: UTF-8 BuildVignettes: true VignetteBuilder: knitr BugReports: https://github.com/DOI-USGS/dataRetrieval/issues -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 8f0f178d..c919f665 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,8 @@ export(read_waterdata_metadata) export(read_waterdata_monitoring_location) export(read_waterdata_parameter_codes) export(read_waterdata_samples) +export(read_waterdata_stats_interval) +export(read_waterdata_stats_normal) export(read_waterdata_ts_meta) export(renameNWISColumns) export(setAccess) diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R new file mode 100644 index 00000000..f097f6fb --- /dev/null +++ b/R/read_waterdata_stats.R @@ -0,0 +1,200 @@ +#' Get summary statistic data from /statistics API +#' +#' @description +#' +#' \code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. +#' +#' \code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. +#' +#' @export +#' +#' @param approval_status asdf +#' @param computation_type asdf +#' @param country_code asdf +#' @param state_code asdf +#' @param county_code asdf +#' @param start_date asdf +#' @param end_date asdf +#' @param mime_type asdf +#' @param monitoring_location_id asfd +#' @param next_token asdf +#' @param parent_time_series_id aasdf +#' @param site_type_code asdf +#' @param site_type_name asdf +#' @param parameter_code asf +#' @param next_token asdf +#' @param page_size asdf +#' +#' @rdname get_waterdata_stats +#' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} +read_waterdata_stats_normal <- function( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA +) { + + args <- mget(names(formals())) + + get_statistics_data(args = args, service = "Normals") +} + +#' @export +#' @rdname get_waterdata_stats +read_waterdata_stats_interval <- function( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA +){ + + args <- mget(names(formals())) + + get_statistics_data(args = args, service = "Intervals") + +} + +#' Retrieve data 
from /statistics API +#' +#' @param args arguments from individual functions. +#' @param service Ednpoint name. +#' +#' @noRd +#' @return data.frame with attributes +get_statistics_data <- function(args, service) { + + base_request <- construct_statistics_request(service = service, version = 0) + + # TODO?: arg type checking here + + full_request <- explode_query(base_request, POST = FALSE, x = args) + + return_list <- walk_pages(full_request, max_results = NA) + + # TODO?: pull the following double-for loop out to other function(s) + return_list_tmp <- list() + for (i in 1:nrow(return_list)) { + x <- jsonlite::parse_json(return_list$data[i]) + z <- data.frame(rbind_fill(x)) + zz <- list() + for (j in 1:nrow(z)) { + zz[[j]] <- cbind( + subset(z[j, ], select = -values), + rbind_fill(z$values[[j]]), + row.names = NULL + ) + } + zz <- do.call(rbind, zz) + + return_list_tmp[[i]] <- cbind_sf_metadata( + subset(return_list[i, ], select = -data), + zz + ) + } + + return_list <- do.call(rbind, return_list_tmp) + + attr(return_list, "request") <- full_request + attr(return_list, "queryTime") <- Sys.time() + + return(return_list) +} + +#' Create a request object for the /statistics service +#' +#' @param service chr; "Normals" or "Intervals" +#' @param version int; /statistics API version number (default: 0) +#' +#' @return a httr2 request object +#' +#' @noRd +construct_statistics_request <- function(service = "Normals", version = 0){ + + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> + httr2::req_url_path_append(paste0("v", version)) |> + httr2::req_url_path_append(paste0("observation", service)) + +} + +#' Bind a list of data frames by row +#' +#' Meant to emulate data.table::rbindlist in how it handles missing data +#' +#' @param dfs a list of data frames +#' +#' @return a data frame +#' +#' @noRd +rbind_fill <- function(dfs) { + + # drop NULL elements + dfs <- Filter(Negate(is.null), dfs) + if (length(dfs) == 0) return(NULL) + + # union of all column names + all_names <- unique(unlist(lapply(dfs, names))) + + # align columns + dfs_aligned <- lapply(dfs, function(x) { + missing <- setdiff(all_names, names(x)) + if (length(missing)) { + for (m in missing) x[[m]] <- NA + } + x[all_names] + }) + + # bind + out <- do.call(rbind, dfs_aligned) + rownames(out) <- NULL + out +} + +#' Combine sf metadata with observations from /statistics +#' +#' @param meta_sf a single-row sf data frame object containing metadata about observations +#' @param obs_df a multi-row data frame of observations +#' +#' @return an sf data frame object +#' +#' @noRd +cbind_sf_metadata <- function(meta_sf, obs_df) { + + stopifnot( + inherits(meta_sf, "sf"), + nrow(meta_sf) == 1 + ) + + n <- nrow(obs_df) + + # replicate metadata rows + meta_rep <- meta_sf[rep(1, n), , drop = FALSE] + + # drop geometry before cbind, then restore + geom <- sf::st_geometry(meta_rep) + meta_nogeo <- sf::st_drop_geometry(meta_rep) + + out <- cbind(meta_nogeo, obs_df) + sf::st_sf(out, geometry = geom) +} diff --git a/R/walk_pages.R b/R/walk_pages.R index 39b85ae2..fe6b7850 100644 --- a/R/walk_pages.R +++ b/R/walk_pages.R @@ -167,36 +167,38 @@ cleanup_cols <- function(df, service = "daily"){ } #' Next request URL -#' +#' #' Custom function to find the "next" URL from the API response. 
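#' Two pagination styles are handled: OGC-style responses carry a "links"
#' array with a rel = "next" entry, while the /statistics endpoints return a
#' bare "next" token that is echoed back via the next_token query parameter.
#' For orientation, hypothetical body shapes (field names taken from the
#' handling code below, not from any API documentation):
#'
#'   # OGC style:   list(numberReturned = 10, links = list(list(rel = "next", href = "...")))
#'   # token style: list(data = list(...), `next` = "<opaque token>")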
#' @seealso [httr2::req_perform_iterative] -#' +#' #' @param resp httr2 response from last request #' @param req httr2 request from last time -#' +#' #' @noRd #' @return the url for the next request -#' +#' next_req_url <- function(resp, req) { - body <- httr2::resp_body_json(resp) - + if(isTRUE(body[["numberReturned"]] == 0)){ return(NULL) } - + header_info <- httr2::resp_headers(resp) if(Sys.getenv("API_USGS_PAT") != ""){ message("Remaining requests this hour:", header_info$`x-ratelimit-remaining`, " ") } - - links <- body$links + if ("links" %in% names(body)) { + links <- body$links if(any(sapply(links, function(x) x$rel) == "next")){ - next_index <- which(sapply(links, function(x) x$rel) == "next") - - next_url <- links[[next_index]][["href"]] + next_index <- which(sapply(links, function(x) x$rel) == "next") + + next_url <- links[[next_index]][["href"]] - return(httr2::req_url(req = req, url = next_url)) + return(httr2::req_url(req = req, url = next_url)) + } + } else if (!is.null(body[["next"]])) { + return(httr2::req_url_query(req, next_token = body[["next"]])) } else { return(NULL) } From 7e331a3d25418414f37267afb709ead6d79f2b20 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Tue, 30 Dec 2025 16:08:46 -0700 Subject: [PATCH 2/6] Add data.table version of get_statistics_data function --- DESCRIPTION | 3 +- NAMESPACE | 10 +++ R/dataRetrieval-package.R | 15 ++++ R/read_waterdata_stats.R | 139 ++++++++++++++++++++--------- R/read_waterdata_stats_datatable.R | 111 +++++++++++++++++++++++ 5 files changed, 234 insertions(+), 44 deletions(-) create mode 100644 R/dataRetrieval-package.R create mode 100644 R/read_waterdata_stats_datatable.R diff --git a/DESCRIPTION b/DESCRIPTION index 67a07bc9..dbff23ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,7 +56,8 @@ Imports: jsonlite, httr2, whisker, - sf + sf, + data.table Suggests: covr, dplyr, diff --git a/NAMESPACE b/NAMESPACE index c919f665..8c9606d5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(create_WQP_bib) export(findNLDI) export(getWebServiceData) export(get_nldi_sources) +export(get_statistics_data_rbindlist) export(importNGWMN) export(importRDB1) export(importWQP) @@ -70,3 +71,12 @@ export(whatWQPsamples) export(whatWQPsites) export(wqp_check_status) export(zeroPad) +importFrom(data.table,":=") +importFrom(data.table,.BY) +importFrom(data.table,.EACHI) +importFrom(data.table,.GRP) +importFrom(data.table,.I) +importFrom(data.table,.N) +importFrom(data.table,.NGRP) +importFrom(data.table,.SD) +importFrom(data.table,data.table) diff --git a/R/dataRetrieval-package.R b/R/dataRetrieval-package.R new file mode 100644 index 00000000..35a16fac --- /dev/null +++ b/R/dataRetrieval-package.R @@ -0,0 +1,15 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom data.table .BY +#' @importFrom data.table .EACHI +#' @importFrom data.table .GRP +#' @importFrom data.table .I +#' @importFrom data.table .N +#' @importFrom data.table .NGRP +#' @importFrom data.table .SD +#' @importFrom data.table := +#' @importFrom data.table data.table +## usethis namespace: end +NULL diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index f097f6fb..c0503612 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -28,52 +28,52 @@ #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( - approval_status = NA, - computation_type = NA_character_, - country_code = NA_character_, - state_code = NA_character_, - 
county_code = NA_character_, - start_date = NA_character_, - end_date = NA_character_, - mime_type = NA_character_, - monitoring_location_id = NA_character_, - parent_time_series_id = NA_character_, - site_type_code = NA_character_, - site_type_name = NA_character_, - parameter_code = NA_character_, - next_token = NA_character_, - page_size = NA + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA ) { args <- mget(names(formals())) - + get_statistics_data(args = args, service = "Normals") } #' @export #' @rdname get_waterdata_stats read_waterdata_stats_interval <- function( - approval_status = NA, - computation_type = NA_character_, - country_code = NA_character_, - state_code = NA_character_, - county_code = NA_character_, - start_date = NA_character_, - end_date = NA_character_, - mime_type = NA_character_, - monitoring_location_id = NA_character_, - next_token = NA_character_, - parent_time_series_id = NA_character_, - site_type_code = NA_character_, - site_type_name = NA_character_, - parameter_code = NA_character_, - page_size = NA + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA ){ - + args <- mget(names(formals())) - + get_statistics_data(args = args, service = "Intervals") - + } #' Retrieve data from /statistics API @@ -84,15 +84,15 @@ read_waterdata_stats_interval <- function( #' @noRd #' @return data.frame with attributes get_statistics_data <- function(args, service) { - + base_request <- construct_statistics_request(service = service, version = 0) - + # TODO?: arg type checking here - + full_request <- explode_query(base_request, POST = FALSE, x = args) - + return_list <- walk_pages(full_request, max_results = NA) - + # TODO?: pull the following double-for loop out to other function(s) return_list_tmp <- list() for (i in 1:nrow(return_list)) { @@ -107,18 +107,25 @@ get_statistics_data <- function(args, service) { ) } zz <- do.call(rbind, zz) - + return_list_tmp[[i]] <- cbind_sf_metadata( subset(return_list[i, ], select = -data), zz ) } - + return_list <- do.call(rbind, return_list_tmp) + + if(all(c("values", "percentiles") %in% names(return_list))){ + return_list <- combine_value_columns(return_list) + } + return_list$value <- as.numeric(unlist(return_list$value)) + return_list$percentile <- as.numeric(unlist(return_list$percentile)) + attr(return_list, "request") <- full_request attr(return_list, "queryTime") <- Sys.time() - + return(return_list) } @@ -131,7 +138,7 @@ get_statistics_data <- function(args, service) { #' #' @noRd construct_statistics_request <- function(service = "Normals", version = 0){ - + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> httr2::req_url_path_append(paste0("v", version)) |> 
httr2::req_url_path_append(paste0("observation", service)) @@ -198,3 +205,49 @@ cbind_sf_metadata <- function(meta_sf, obs_df) { out <- cbind(meta_nogeo, obs_df) sf::st_sf(out, geometry = geom) } + +#' Clean up a /statistics data frame +#' +#' Unpacks "values" and "percentiles" nested columns to be one row per value, +#' renames "percentiles" to "percentile", consolidates "value" and "values" into +#' one "value" column +#' +#' @param df a data frame with "value" and "values" columns +#' +#' @return a data frame with a value column and maybe a +#' @noRd +combine_value_columns <- function(df) { + + split_vals <- lapply(df$values, function(x) as.numeric(unlist(x))) + split_pcts <- lapply(df$percentiles, function(x) as.numeric(unlist(x))) + + df$values <- NULL + df$percentiles <- NULL + + expanded <- lapply(seq_len(nrow(df)), function(i) { + + v <- split_vals[[i]] + p <- split_pcts[[i]] + + v[is.nan(v)] <- NA + v <- as.numeric(v) + p <- as.numeric(p) + + # case: single NA → scalar statistic + if (length(v) == 1 && is.na(v)) { + df$percentile <- NA_real_ + return(df[i, , drop = FALSE]) + } + + out <- df[rep(i, length(v)), , drop = FALSE] + + out$value <- v + out$percentile <- p + out + }) + + out <- do.call(rbind, expanded) + rownames(out) <- NULL + out +} + diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R new file mode 100644 index 00000000..857cf43e --- /dev/null +++ b/R/read_waterdata_stats_datatable.R @@ -0,0 +1,111 @@ +get_statistics_data_data_table <- function(args, service) { + + base_request <- construct_statistics_request(service = service, version = 0) + + # TODO?: arg type checking here + + full_request <- explode_query(base_request, POST = FALSE, x = args) + + return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + return_list[, rid := .I] + + parsed_data <- lapply(return_list$data, jsonlite::parse_json) + return_list[, data := NULL] + + combined_list <- list() + for (i in seq_along(parsed_data)){ + time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + + observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) + observations <- observations[, eval(clean_value_cols()), by = .I][, I := NULL] + time_series_metainfo[, values := NULL] + + time_series_metainfo <- cbind(time_series_metainfo, observations) + time_series_metainfo[, rid := i] + + combined_list[[i]] <- time_series_metainfo + } + + combined <- data.table::rbindlist(combined_list) + combined <- combined[return_list, on = "rid"] + + col_order <- c("parent_time_series_id", "monitoring_location_id", "monitoring_location_name", "parameter_code", + "interval_type", "start_date", "end_date", "computation", "value", "percentile", + "unit_of_measure", "sample_count", + "approval_status", "computation_id", + "site_type", "site_type_code", + "country_code", "state_code", "county_code", + "geometry") + data.table::setcolorder(combined, col_order) + + attr(combined, "request") <- full_request + attr(combined, "queryTime") <- Sys.time() + + return(sf::st_as_sf(combined)) +} + +#' Clean up value, values, and percentiles columns in a data.table +clean_value_cols <- function() { + quote({ + + ## ---- detect column presence ---- + has_value <- exists("value") + has_values <- exists("values") + has_percentiles <- exists("percentiles") + + ## ---- values ---- + vals <- if (has_values && + !is.null(values[[1]]) && + length(values[[1]]) > 0) { + + v <- values[[1]] + if (length(v) == 1L && identical(v, "nan")) { + NA_real_ + } else { + as.numeric(v) 
+ } + + } else if (has_value) { + as.numeric(value) + } else { + NA_real_ + } + + n <- length(vals) + + ## ---- percentiles ---- + percs <- if (has_percentiles && + !is.null(percentiles[[1]]) && + length(percentiles[[1]]) > 0) { + + as.numeric(percentiles[[1]]) + + } else if (data.table::`%chin%`(computation,c("minimum", "medium", "maximum"))) { + + fifelse( + computation == "minimum", 0, + fifelse(computation == "medium", 50, 100) + ) + + } else { + NA_real_ + } + + if (length(percs) == 1L && n > 1L) { + percs <- rep(percs, n) + } + + ## ---- expand scalar columns ---- + .( + value = vals, + percentile = percs, + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n), + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n) + ) + }) +} \ No newline at end of file From 37b9987493b43b1359022f476b18486a24560287 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Tue, 30 Dec 2025 16:54:11 -0700 Subject: [PATCH 3/6] Add some documentation to internal clean_value_cols() function --- NAMESPACE | 1 - R/read_waterdata_stats_datatable.R | 26 +++++++++- man/get_waterdata_stats.Rd | 82 ++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 man/get_waterdata_stats.Rd diff --git a/NAMESPACE b/NAMESPACE index 8c9606d5..6999f2b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,7 +17,6 @@ export(create_WQP_bib) export(findNLDI) export(getWebServiceData) export(get_nldi_sources) -export(get_statistics_data_rbindlist) export(importNGWMN) export(importRDB1) export(importWQP) diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R index 857cf43e..41eb608a 100644 --- a/R/read_waterdata_stats_datatable.R +++ b/R/read_waterdata_stats_datatable.R @@ -1,3 +1,4 @@ +#' get_statistics_data_data_table <- function(args, service) { base_request <- construct_statistics_request(service = service, version = 0) @@ -44,7 +45,30 @@ get_statistics_data_data_table <- function(args, service) { return(sf::st_as_sf(combined)) } -#' Clean up value, values, and percentiles columns in a data.table +#' Clean up "value", "values", and "percentiles" columns in a data.table +#' +#' @description +#' If the input data.table has "values" and "percentiles" columns, then +#' it unnests both, making the data.table longer (one row per value) +#' +#' If the input data.table *also* has a "value" column, then it consolidates with "values", yielding a single "value" column +#' +#' The "percentiles" column is replaced by "percentile" (matching singularization of "value"). +#' The function also checks whether the "computation" column contains "minimum", "median", or "maximum" and +#' sets the corresponding "percentile" to 0, 50, or 100, respectively. Note that the percentile column might only +#' contain NAs if the input data.table only includes arithmetic_mean values +#' +#' Lastly, the value and percentile columns are converted from character to numeric +#' +#' +#' @note +#' This function is intended to evaluate as a data.table j expression, which is why it doesn't accept any arguments. +#' j expressions are typically written inline with the data.table definition (e.g., DT[, { do something }]). +#' However, this expression would have been excessively long. 
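+#' To illustrate (with a hypothetical table name), the inline form would have
+#' read roughly:
+#'
+#'   observations[, {
+#'     vals <- as.numeric(unlist(values))
+#'     .(value = vals, percentile = as.numeric(unlist(percentiles)))
+#'   }, by = .I]
+#'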
Instead, an eval(quote(...)) is used so this could be +#' pulled out into a separate function for readability, but we still gain the computational benefits of using an expression. +#' +#' @noRd +#' @return data.table object with "value" and "percentile" columns clean_value_cols <- function() { quote({ diff --git a/man/get_waterdata_stats.Rd b/man/get_waterdata_stats.Rd new file mode 100644 index 00000000..916396fe --- /dev/null +++ b/man/get_waterdata_stats.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_waterdata_stats.R +\name{read_waterdata_stats_normal} +\alias{read_waterdata_stats_normal} +\alias{read_waterdata_stats_interval} +\title{Get summary statistic data from /statistics API} +\usage{ +read_waterdata_stats_normal( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + next_token = NA_character_, + page_size = NA +) + +read_waterdata_stats_interval( + approval_status = NA, + computation_type = NA_character_, + country_code = NA_character_, + state_code = NA_character_, + county_code = NA_character_, + start_date = NA_character_, + end_date = NA_character_, + mime_type = NA_character_, + monitoring_location_id = NA_character_, + next_token = NA_character_, + parent_time_series_id = NA_character_, + site_type_code = NA_character_, + site_type_name = NA_character_, + parameter_code = NA_character_, + page_size = NA +) +} +\arguments{ +\item{approval_status}{asdf} + +\item{computation_type}{asdf} + +\item{country_code}{asdf} + +\item{state_code}{asdf} + +\item{county_code}{asdf} + +\item{start_date}{asdf} + +\item{end_date}{asdf} + +\item{mime_type}{asdf} + +\item{monitoring_location_id}{asfd} + +\item{parent_time_series_id}{aasdf} + +\item{site_type_code}{asdf} + +\item{site_type_name}{asdf} + +\item{parameter_code}{asf} + +\item{next_token}{asdf} + +\item{page_size}{asdf} +} +\description{ +\code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. + +\code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. +} +\seealso{ +\url{https://api.waterdata.usgs.gov/statistics/v0/docs} +} From a59b1fa5e58984ccb0aef47c5d8a35f581a69046 Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Wed, 31 Dec 2025 11:38:45 -0700 Subject: [PATCH 4/6] Move data table implementation to read_statistics_data, update documentation --- R/read_waterdata_stats.R | 334 ++++++++++++++--------------- R/read_waterdata_stats_datatable.R | 135 ------------ 2 files changed, 166 insertions(+), 303 deletions(-) delete mode 100644 R/read_waterdata_stats_datatable.R diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index c0503612..45a45316 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -1,30 +1,66 @@ #' Get summary statistic data from /statistics API -#' -#' @description -#' -#' \code{read_waterdata_stats_normal} fetches day- or month-of-year data from the observationNormals endpoint. -#' -#' \code{read_waterdata_stats} fetches calendar month-year, calendar year, or water year data from the observationIntervals endpoint. 
-#'
+#' @description
+#'
+#' This service provides endpoints for access to computations on the historical
+#' record regarding water conditions. For more information regarding the
+#' calculation of statistics and other details, please visit the Statistics
+#' documentation page.
+#'
+#' Note: This API is under active beta development and subject to change.
+#' Improved handling of significant figures will be addressed in a future
+#' release.
+#'
+#' \code{read_waterdata_stats_normal} returns day-of-year and month-of-year
+#' statistics matching your query.
+#'
+#' \code{read_waterdata_stats_interval} returns monthly and annual statistics
+#' matching your query.
+#'
 #' @export
-#'
-#' @param approval_status asdf
-#' @param computation_type asdf
-#' @param country_code asdf
-#' @param state_code asdf
-#' @param county_code asdf
-#' @param start_date asdf
-#' @param end_date asdf
-#' @param mime_type asdf
-#' @param monitoring_location_id asfd
-#' @param next_token asdf
-#' @param parent_time_series_id aasdf
-#' @param site_type_code asdf
-#' @param site_type_name asdf
-#' @param parameter_code asf
-#' @param next_token asdf
-#' @param page_size asdf
-#'
+#'
+#' @param approval_status Whether to include approved and/or provisional
+#' observations. At this time, only approved observations are returned.
+#' @param computation_type Desired statistical computation method. Available
+#' values: "arithmetic_mean", "maximum", "median", "minimum", "percentile".
+#' @param country_code Country Query Parameter. Accepts multiple values (see
+#' examples). If one of country, county, or state code is supplied, then the
+#' other two arguments do not need to be specified.
+#' @param state_code State Query Parameter. Accepts multiple values in a
+#' character vector.
+#' @param county_code County Query Parameter. Accepts multiple values in a
+#' character vector.
+#' @param start_date Start Date Query Parameter. The logic is inclusive, i.e., it
+#' will also return records that match the date. If an end date is supplied,
+#' but no start date is supplied, then statistics will be supplied for the
+#' entire period of record ending with the end date. If an end date is not
+#' supplied, but a start date is supplied, then statistics will be supplied for
+#' the period of record following the start date. If no start or end date are
+#' supplied, then statistics will be supplied for the entire period of record.
+#' @param end_date End Date Query Parameter. The logic is inclusive, i.e., it will
+#' also return records that match the date.
+#' @param monitoring_location_id Each monitoring location has been assigned a
+#' unique station number that places them in downstream order. Accepts
+#' multiple values in a character vector.
+#' @param parent_time_series_id The parent_time_series_id returns statistics
+#' tied to a particular database entry. Accepts multiple values in a character
+#' vector. If no parent time series identifier is supplied, then all records
+#' matching the rest of the provided criteria will be returned.
+#' @param site_type_code Site Type Code Query Parameter. Accepts multiple values
+#' in a character vector. If no Site Type code is specified, statistics of all
+#' site types with the matching Monitoring Location Identifier will be
+#' returned.
+#' @param site_type_name Site Type Name Query Parameter. If no Site Type name is
+#' specified, statistics of all site types with the matching Monitoring
+#' Location Identifier will be returned.
+#' @param parameter_code USGS Parameter Code Query Parameter. Accepts multiple +#' values in a character vector. If no USGS parameter code is specified, but a +#' Monitoring Location Identifier is supplied, then all statistics and their +#' parameter codes with a matching monitoring location identifier will be +#' returned. All statistics within the period of record will be returned if no +#' parameter code or monitoring location identifier are specified. +#' @param page_size Return a defined number of results (default: 1000). +#' #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( @@ -35,13 +71,11 @@ read_waterdata_stats_normal <- function( county_code = NA_character_, start_date = NA_character_, end_date = NA_character_, - mime_type = NA_character_, monitoring_location_id = NA_character_, parent_time_series_id = NA_character_, site_type_code = NA_character_, site_type_name = NA_character_, parameter_code = NA_character_, - next_token = NA_character_, page_size = NA ) { @@ -79,7 +113,7 @@ read_waterdata_stats_interval <- function( #' Retrieve data from /statistics API #' #' @param args arguments from individual functions. -#' @param service Ednpoint name. +#' @param service Endpoint name. #' #' @noRd #' @return data.frame with attributes @@ -91,163 +125,127 @@ get_statistics_data <- function(args, service) { full_request <- explode_query(base_request, POST = FALSE, x = args) - return_list <- walk_pages(full_request, max_results = NA) - - # TODO?: pull the following double-for loop out to other function(s) - return_list_tmp <- list() - for (i in 1:nrow(return_list)) { - x <- jsonlite::parse_json(return_list$data[i]) - z <- data.frame(rbind_fill(x)) - zz <- list() - for (j in 1:nrow(z)) { - zz[[j]] <- cbind( - subset(z[j, ], select = -values), - rbind_fill(z$values[[j]]), - row.names = NULL - ) - } - zz <- do.call(rbind, zz) - - return_list_tmp[[i]] <- cbind_sf_metadata( - subset(return_list[i, ], select = -data), - zz - ) - } + return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + return_list[, rid := .I] - return_list <- do.call(rbind, return_list_tmp) + parsed_data <- lapply(return_list$data, jsonlite::parse_json) + return_list[, data := NULL] - if(all(c("values", "percentiles") %in% names(return_list))){ - return_list <- combine_value_columns(return_list) + combined_list <- list() + for (i in seq_along(parsed_data)){ + time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + + observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) + observations <- observations[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] + time_series_metainfo[, values := NULL] + + time_series_metainfo <- cbind(time_series_metainfo, observations) + time_series_metainfo[, rid := i] + + combined_list[[i]] <- time_series_metainfo } - - return_list$value <- as.numeric(unlist(return_list$value)) - return_list$percentile <- as.numeric(unlist(return_list$percentile)) - - attr(return_list, "request") <- full_request - attr(return_list, "queryTime") <- Sys.time() - return(return_list) -} - -#' Create a request object for the /statistics service -#' -#' @param service chr; "Normals" or "Intervals" -#' @param version int; /statistics API version number (default: 0) -#' -#' @return a httr2 request object -#' -#' @noRd -construct_statistics_request <- function(service = "Normals", version = 0){ + combined <- data.table::rbindlist(combined_list) + combined <- 
combined[return_list, on = "rid"][, rid := NULL] - httr2::request("https://api.waterdata.usgs.gov/statistics/") |> - httr2::req_url_path_append(paste0("v", version)) |> - httr2::req_url_path_append(paste0("observation", service)) + attr(combined, "request") <- full_request + attr(combined, "queryTime") <- Sys.time() + return(sf::st_as_sf(as.data.frame(combined))) } -#' Bind a list of data frames by row -#' -#' Meant to emulate data.table::rbindlist in how it handles missing data -#' -#' @param dfs a list of data frames -#' -#' @return a data frame -#' -#' @noRd -rbind_fill <- function(dfs) { - - # drop NULL elements - dfs <- Filter(Negate(is.null), dfs) - if (length(dfs) == 0) return(NULL) - - # union of all column names - all_names <- unique(unlist(lapply(dfs, names))) - - # align columns - dfs_aligned <- lapply(dfs, function(x) { - missing <- setdiff(all_names, names(x)) - if (length(missing)) { - for (m in missing) x[[m]] <- NA - } - x[all_names] - }) - - # bind - out <- do.call(rbind, dfs_aligned) - rownames(out) <- NULL - out -} - -#' Combine sf metadata with observations from /statistics -#' -#' @param meta_sf a single-row sf data frame object containing metadata about observations -#' @param obs_df a multi-row data frame of observations -#' -#' @return an sf data frame object -#' -#' @noRd -cbind_sf_metadata <- function(meta_sf, obs_df) { - - stopifnot( - inherits(meta_sf, "sf"), - nrow(meta_sf) == 1 - ) - - n <- nrow(obs_df) - - # replicate metadata rows - meta_rep <- meta_sf[rep(1, n), , drop = FALSE] - - # drop geometry before cbind, then restore - geom <- sf::st_geometry(meta_rep) - meta_nogeo <- sf::st_drop_geometry(meta_rep) - - out <- cbind(meta_nogeo, obs_df) - sf::st_sf(out, geometry = geom) -} - -#' Clean up a /statistics data frame +#' Clean up "value", "values", and "percentiles" columns in a data.table +#' +#' @description If the input data.table has "values" and "percentiles" columns, +#' then it unnests both, making the data.table longer (one row per value) +#' +#' If the input data.table *also* has a "value" column, then it consolidates +#' with "values", yielding a single "value" column #' -#' Unpacks "values" and "percentiles" nested columns to be one row per value, -#' renames "percentiles" to "percentile", consolidates "value" and "values" into -#' one "value" column +#' The "percentiles" column is replaced by "percentile" (matching +#' singularization of "value"). The function also checks whether the +#' "computation" column contains "minimum", "median", or "maximum" and sets +#' the corresponding "percentile" to 0, 50, or 100, respectively. Note that +#' the percentile column might only contain NAs if the input data.table only +#' includes arithmetic_mean values. #' -#' @param df a data frame with "value" and "values" columns +#' Lastly, the value and percentile columns are converted from character to +#' numeric +#' +#' @note This function is intended to evaluate as a data.table j expression, +#' which is why it doesn't accept any arguments. j expressions are typically +#' written inline with the data.table definition (e.g., \code{DT[, { do +#' something }]}). However, this expression would have been excessively long. +#' Instead, a substitute() is used so this could be pulled out into a separate +#' function for readability, but we still gain the computational benefits of +#' using an expression. 
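+#'
+#' A toy sketch of the env= substitution pattern used at the call site (the
+#' table and j expression here are made up, not package code); data.table
+#' receives a literal expression it can inspect and optimize:
+#'
+#'   j_expr <- substitute(.(doubled = x * 2))
+#'   DT <- data.table::data.table(x = 1:3)
+#'   DT[, .j, env = list(.j = j_expr)]   # equivalent to DT[, .(doubled = x * 2)]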
#' -#' @return a data frame with a value column and maybe a #' @noRd -combine_value_columns <- function(df) { - - split_vals <- lapply(df$values, function(x) as.numeric(unlist(x))) - split_pcts <- lapply(df$percentiles, function(x) as.numeric(unlist(x))) - - df$values <- NULL - df$percentiles <- NULL - - expanded <- lapply(seq_len(nrow(df)), function(i) { +#' @return data.table object with "value" and "percentile" columns +#' +#' @seealso +#' \url{https://stat.ethz.ch/CRAN/web/packages/data.table/vignettes/datatable-programming.html} +clean_value_cols <- function() { + substitute({ + + ## ---- detect column presence ---- + has_value <- exists("value") + has_values <- exists("values") + has_percentiles <- exists("percentiles") - v <- split_vals[[i]] - p <- split_pcts[[i]] + ## ---- values ---- + vals <- if (has_values && + !is.null(values[[1]]) && + length(values[[1]]) > 0) { + + v <- values[[1]] + if (length(v) == 1L && identical(v, "nan")) { + NA_real_ + } else { + as.numeric(v) + } + + } else if (has_value) { + as.numeric(value) + } else { + NA_real_ + } - v[is.nan(v)] <- NA - v <- as.numeric(v) - p <- as.numeric(p) + n <- length(vals) - # case: single NA → scalar statistic - if (length(v) == 1 && is.na(v)) { - df$percentile <- NA_real_ - return(df[i, , drop = FALSE]) + ## ---- percentiles ---- + percs <- if (has_percentiles && + !is.null(percentiles[[1]]) && + length(percentiles[[1]]) > 0) { + + as.numeric(percentiles[[1]]) + + } else if (data.table::`%chin%`(computation,c("minimum", "median", "maximum"))) { + + data.table::fifelse( + computation == "minimum", 0, + data.table::fifelse(computation == "median", 50, 100) + ) + + } else { + NA_real_ } - out <- df[rep(i, length(v)), , drop = FALSE] + if (length(percs) == 1L && n > 1L) { + percs <- rep(percs, n) + } - out$value <- v - out$percentile <- p - out + ## ---- expand scalar columns ---- + .( + value = vals, + percentile = percs, + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n), + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n) + ) }) - - out <- do.call(rbind, expanded) - rownames(out) <- NULL - out } - diff --git a/R/read_waterdata_stats_datatable.R b/R/read_waterdata_stats_datatable.R deleted file mode 100644 index 41eb608a..00000000 --- a/R/read_waterdata_stats_datatable.R +++ /dev/null @@ -1,135 +0,0 @@ -#' -get_statistics_data_data_table <- function(args, service) { - - base_request <- construct_statistics_request(service = service, version = 0) - - # TODO?: arg type checking here - - full_request <- explode_query(base_request, POST = FALSE, x = args) - - return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) - return_list[, rid := .I] - - parsed_data <- lapply(return_list$data, jsonlite::parse_json) - return_list[, data := NULL] - - combined_list <- list() - for (i in seq_along(parsed_data)){ - time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) - - observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) - observations <- observations[, eval(clean_value_cols()), by = .I][, I := NULL] - time_series_metainfo[, values := NULL] - - time_series_metainfo <- cbind(time_series_metainfo, observations) - time_series_metainfo[, rid := i] - - combined_list[[i]] <- time_series_metainfo - } - - combined <- data.table::rbindlist(combined_list) - combined <- combined[return_list, on = "rid"] - - col_order <- 
c("parent_time_series_id", "monitoring_location_id", "monitoring_location_name", "parameter_code", - "interval_type", "start_date", "end_date", "computation", "value", "percentile", - "unit_of_measure", "sample_count", - "approval_status", "computation_id", - "site_type", "site_type_code", - "country_code", "state_code", "county_code", - "geometry") - data.table::setcolorder(combined, col_order) - - attr(combined, "request") <- full_request - attr(combined, "queryTime") <- Sys.time() - - return(sf::st_as_sf(combined)) -} - -#' Clean up "value", "values", and "percentiles" columns in a data.table -#' -#' @description -#' If the input data.table has "values" and "percentiles" columns, then -#' it unnests both, making the data.table longer (one row per value) -#' -#' If the input data.table *also* has a "value" column, then it consolidates with "values", yielding a single "value" column -#' -#' The "percentiles" column is replaced by "percentile" (matching singularization of "value"). -#' The function also checks whether the "computation" column contains "minimum", "median", or "maximum" and -#' sets the corresponding "percentile" to 0, 50, or 100, respectively. Note that the percentile column might only -#' contain NAs if the input data.table only includes arithmetic_mean values -#' -#' Lastly, the value and percentile columns are converted from character to numeric -#' -#' -#' @note -#' This function is intended to evaluate as a data.table j expression, which is why it doesn't accept any arguments. -#' j expressions are typically written inline with the data.table definition (e.g., DT[, { do something }]). -#' However, this expression would have been excessively long. Instead, an eval(quote(...)) is used so this could be -#' pulled out into a separate function for readability, but we still gain the computational benefits of using an expression. -#' -#' @noRd -#' @return data.table object with "value" and "percentile" columns -clean_value_cols <- function() { - quote({ - - ## ---- detect column presence ---- - has_value <- exists("value") - has_values <- exists("values") - has_percentiles <- exists("percentiles") - - ## ---- values ---- - vals <- if (has_values && - !is.null(values[[1]]) && - length(values[[1]]) > 0) { - - v <- values[[1]] - if (length(v) == 1L && identical(v, "nan")) { - NA_real_ - } else { - as.numeric(v) - } - - } else if (has_value) { - as.numeric(value) - } else { - NA_real_ - } - - n <- length(vals) - - ## ---- percentiles ---- - percs <- if (has_percentiles && - !is.null(percentiles[[1]]) && - length(percentiles[[1]]) > 0) { - - as.numeric(percentiles[[1]]) - - } else if (data.table::`%chin%`(computation,c("minimum", "medium", "maximum"))) { - - fifelse( - computation == "minimum", 0, - fifelse(computation == "medium", 50, 100) - ) - - } else { - NA_real_ - } - - if (length(percs) == 1L && n > 1L) { - percs <- rep(percs, n) - } - - ## ---- expand scalar columns ---- - .( - value = vals, - percentile = percs, - start_date = rep(start_date, n), - end_date = rep(end_date, n), - interval_type = rep(interval_type, n), - sample_count = rep(sample_count, n), - approval_status = rep(approval_status, n), - computation_id = rep(computation_id, n), - computation = rep(computation, n) - ) - }) -} \ No newline at end of file From 8875de4a6d92e4fa7a19e6887110e6381971a96d Mon Sep 17 00:00:00 2001 From: Joseph Zemmels Date: Wed, 31 Dec 2025 15:36:50 -0700 Subject: [PATCH 5/6] Generalize to accommodate Normals and Intervals output, handle empty responses, fix data.table joining. 
--- R/read_waterdata_stats.R | 166 ++++++++++++++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 21 deletions(-) diff --git a/R/read_waterdata_stats.R b/R/read_waterdata_stats.R index 45a45316..77688900 100644 --- a/R/read_waterdata_stats.R +++ b/R/read_waterdata_stats.R @@ -61,6 +61,12 @@ #' parameter code or monitoring location identifier are specified. #' @param page_size Return a defined number of results (default: 1000). #' +#' @examplesIf is_dataRetrieval_user() +#' +#' \donttest{ +#' +#' } +#' #' @rdname get_waterdata_stats #' @seealso \url{https://api.waterdata.usgs.gov/statistics/v0/docs} read_waterdata_stats_normal <- function( @@ -110,6 +116,22 @@ read_waterdata_stats_interval <- function( } +#' Create a request object for the /statistics service +#' +#' @param service chr; "Normals" or "Intervals" +#' @param version int; /statistics API version number (default: 0) +#' +#' @return a httr2 request object +#' +#' @noRd +construct_statistics_request <- function(service = "Normals", version = 0){ + + httr2::request("https://api.waterdata.usgs.gov/statistics/") |> + httr2::req_url_path_append(paste0("v", version)) |> + httr2::req_url_path_append(paste0("observation", service)) + +} + #' Retrieve data from /statistics API #' #' @param args arguments from individual functions. @@ -126,6 +148,11 @@ get_statistics_data <- function(args, service) { full_request <- explode_query(base_request, POST = FALSE, x = args) return_list <- data.table::as.data.table(walk_pages(full_request, max_results = NA)) + + if(nrow(return_list) == 0) { + return(deal_with_empty_stats(return_list)) + } + return_list[, rid := .I] parsed_data <- lapply(return_list$data, jsonlite::parse_json) @@ -133,25 +160,55 @@ get_statistics_data <- function(args, service) { combined_list <- list() for (i in seq_along(parsed_data)){ - time_series_metainfo <- data.table::rbindlist(parsed_data[[i]]) + # 1. One row per time series + ts_meta <- data.table::rbindlist(parsed_data[[i]], fill = TRUE) + ts_meta[, ts_id := .I] # local key + + # 2. One row per observation, keyed to ts_id + obs <- data.table::rbindlist(ts_meta$values, fill = TRUE, idcol = "ts_id") + obs <- obs[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] - observations <- data.table::rbindlist(time_series_metainfo$values, fill = TRUE) - observations <- observations[, .j, by = .I, env = list(.j = clean_value_cols())][, I := NULL] - time_series_metainfo[, values := NULL] + # 3. Drop nested list column + ts_meta[, values := NULL] - time_series_metainfo <- cbind(time_series_metainfo, observations) - time_series_metainfo[, rid := i] + # 4. 
Join + out <- ts_meta[obs, on = "ts_id", allow.cartesian = TRUE] - combined_list[[i]] <- time_series_metainfo + out[, `:=`( + ts_id = NULL, + rid = i + )] + + combined_list[[i]] <- out } combined <- data.table::rbindlist(combined_list) - combined <- combined[return_list, on = "rid"][, rid := NULL] + combined <- combined[return_list, on = "rid"] + combined[, rid := NULL] + + combined <- sf::st_as_sf(as.data.frame(combined)) attr(combined, "request") <- full_request attr(combined, "queryTime") <- Sys.time() - return(sf::st_as_sf(as.data.frame(combined))) + return(combined) +} + +#' +normalize_value_cols <- function(x) { + if (!is.null(x$value)) { + list( + value = as.numeric(x$value), + values = NA_real_, + percentiles = NA_real_ + ) + } else { + list( + value = NA_real_, + values = as.numeric(x$values), + percentiles = as.numeric(x$percentiles) + ) + } } #' Clean up "value", "values", and "percentiles" columns in a data.table @@ -193,6 +250,9 @@ clean_value_cols <- function() { has_values <- exists("values") has_percentiles <- exists("percentiles") + has_date_schema <- exists("start_date") && exists("end_date") && exists("interval_type") + has_toy_schema <- exists("time_of_year") && exists("time_of_year_type") + ## ---- values ---- vals <- if (has_values && !is.null(values[[1]]) && @@ -220,7 +280,7 @@ clean_value_cols <- function() { as.numeric(percentiles[[1]]) - } else if (data.table::`%chin%`(computation,c("minimum", "median", "maximum"))) { + } else if (data.table::`%chin%`(computation, c("minimum", "median", "maximum"))) { data.table::fifelse( computation == "minimum", 0, @@ -235,17 +295,81 @@ clean_value_cols <- function() { percs <- rep(percs, n) } - ## ---- expand scalar columns ---- - .( - value = vals, - percentile = percs, - start_date = rep(start_date, n), - end_date = rep(end_date, n), - interval_type = rep(interval_type, n), - sample_count = rep(sample_count, n), - approval_status = rep(approval_status, n), - computation_id = rep(computation_id, n), - computation = rep(computation, n) + ## ---- time columns (schema-dependent) ---- + time_cols <- if (has_date_schema) { + list( + start_date = rep(start_date, n), + end_date = rep(end_date, n), + interval_type = rep(interval_type, n) + ) + } else if (has_toy_schema) { + list( + time_of_year = rep(time_of_year, n), + time_of_year_type = rep(time_of_year_type, n) + ) + } else { + list() + } + + ## ---- assemble ---- + c( + list( + value = vals, + percentile = percs, + sample_count = rep(sample_count, n), + approval_status = rep(approval_status, n), + computation_id = rep(computation_id, n), + computation = rep(computation, n), + ts_id = rep(ts_id, n) + ), + time_cols ) }) } + +#' Handle empty responses from the /statistics service +#' +#' @param return_list data.frame returned from walk_pages +#' @param properties character vector of requested columns (or NA) +#' @param convertType logical, whether to convert numeric columns +#' +#' @return data.frame or sf object with expected columns +#' @noRd +deal_with_empty_stats <- function(return_list, properties = NA, + convertType = TRUE) { + + # Define default columns for stats service + default_columns <- c( + "monitoring_location_id", "monitoring_location_name", + "site_type", "site_type_code", "country_code", "state_code", + "county_code", "parameter_code", "unit_of_measure", "parent_time_series_id", + "value", "percentile", "start_date", "end_date", "interval_type", + "sample_count", "approval_status", "computation_id", "computation" + ) + + if (all(is.na(properties))) { + properties 
<- default_columns
+  }
+
+  # create empty data.frame
+  return_list <- as.data.frame(matrix(nrow = 0, ncol = length(properties)))
+  names(return_list) <- properties
+  return_list <- lapply(return_list, as.character)
+  return_list <- as.data.frame(return_list)
+
+  # convert numeric columns if requested
+  if (convertType) {
+    numeric_cols <- c("value", "percentile", "sample_count")
+    for (col in numeric_cols) {
+      if (col %in% names(return_list)) {
+        return_list[[col]] <- as.numeric(return_list[[col]])
+      }
+    }
+  }
+
+  # ensure an empty geometry column exists
+  return_list <- sf::st_sf(return_list, geometry = sf::st_sfc())
+
+  return(return_list)
+}
+

From e143efbb73d5b7460946120a7509f2b9f9d22156 Mon Sep 17 00:00:00 2001
From: Joseph Zemmels
Date: Wed, 31 Dec 2025 15:49:35 -0700
Subject: [PATCH 6/6] Unit tests

---
 tests/testthat/test_waterdata_stats.R | 290 ++++++++++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 tests/testthat/test_waterdata_stats.R

diff --git a/tests/testthat/test_waterdata_stats.R b/tests/testthat/test_waterdata_stats.R
new file mode 100644
index 00000000..7c4149e3
--- /dev/null
+++ b/tests/testthat/test_waterdata_stats.R
@@ -0,0 +1,290 @@
+# ------------------------------------------------------------------------------
+# clean_value_cols()
+# ------------------------------------------------------------------------------
+
+test_that("values column is unnested and converted to numeric", {
+  DT <- data.table::data.table(
+    values = list(c("1.2", "3.4")),
+    percentiles = list(c("10", "90")),
+    computation = "percentile",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 10,
+    approval_status = "approved",
+    computation_id = "abc"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$value, c(1.2, 3.4))
+  expect_equal(out$percentile, c(10, 90))
+})
+
+test_that("value column is used when values is absent", {
+  DT <- data.table::data.table(
+    value = "2.5",
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    sample_count = 1,
+    ts_id = "1",
+    approval_status = "approved",
+    computation_id = "xyz"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$value, 2.5)
+  expect_true(is.na(out$percentile))
+})
+
+test_that("'nan' values are converted to NA_real_", {
+  DT <- data.table::data.table(
+    values = list("nan"),
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = "nan-test"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_true(is.na(out$value))
+})
+
+test_that("percentile is inferred from computation type", {
+  DT <- data.table::data.table(
+    value = "5",
+    computation = c("minimum", "median", "maximum"),
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = 1:3
+  )
+
+  out <- DT[, .j, by = computation, env = list(.j = clean_value_cols())]
+
+  expect_equal(
+    out[order(computation)]$percentile,
+    c(100, 50, 0) # maximum, median, minimum
+  )
+})
+
+test_that("scalar percentile is recycled to match values length", {
+  DT <- data.table::data.table(
+    values = list(c("1", "2", "3")),
+    percentiles = list("50"),
+    computation = "percentile",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "month",
+    ts_id = "1",
+    sample_count = 3,
+    approval_status = "approved",
+    computation_id = "recycle"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_equal(out$percentile, c(50, 50, 50))
+})
+
+test_that("missing value and values yields NA value", {
+  DT <- data.table::data.table(
+    computation = "arithmetic_mean",
+    start_date = "2000-01-01",
+    end_date = "2000-12-31",
+    interval_type = "year",
+    ts_id = "1",
+    sample_count = 1,
+    approval_status = "approved",
+    computation_id = "missing"
+  )
+
+  out <- DT[, .j, env = list(.j = clean_value_cols())]
+
+  expect_true(is.na(out$value))
+})
+
+# ------------------------------------------------------------------------------
+# construct_statistics_request()
+# ------------------------------------------------------------------------------
+
+test_that("construct_statistics_request builds correct URL path", {
+  req <- construct_statistics_request(service = "Normals", version = 0)
+
+  expect_true(grepl("statistics/v0/observationNormals", req$url))
+})
+
+test_that("Intervals service path is correct", {
+  req <- construct_statistics_request(service = "Intervals", version = 0)
+
+  expect_true(grepl("statistics/v0/observationIntervals", req$url))
+})
+
+# ------------------------------------------------------------------------------
+# explode_query integration (no HTTP)
+# ------------------------------------------------------------------------------
+
+test_that("arguments are passed as query parameters", {
+  args <- list(
+    state_code = "WI",
+    computation_type = "median",
+    page_size = 10
+  )
+
+  req <- construct_statistics_request("Normals", 0)
+  full <- explode_query(req, POST = FALSE, x = args)
+
+  qs <- full$url
+
+  expect_match(qs, "state_code=WI")
+  expect_match(qs, "computation_type=median")
+  expect_match(qs, "page_size=10")
+})
+
+# ------------------------------------------------------------------------------
+# get_statistics_data() (mocked)
+# ------------------------------------------------------------------------------
+
+test_that("get_statistics_data returns sf object with attributes", {
+  fake_page <-
+    sf::st_sf(
+      monitoring_location_id = c("USGS-01432160", "USGS-01432110"),
+      monitoring_location_name = c(
+        "DELAWARE RIVER AT BARRYVILLE NY",
+        "Lackawaxen River at Rowland, PA"
+      ),
+      site_type = c("Stream", "Stream"),
+      site_type_code = c("ST", "ST"),
+      country_code = c("US", "US"),
+      state_code = c("42", "42"),
+      county_code = c("103", "103"),
+      geometry = sf::st_sfc(
+        sf::st_point(c(0, 0)),
+        sf::st_point(c(1, 1))
+      ),
+      data = c(
+        # First site
+        jsonlite::toJSON(list(
+          list(
+            parameter_code = "00060",
+            unit_of_measure = "ft^3/s",
+            parent_time_series_id = "1692e9a29c8c4276add4497c5da872fa",
+            values = list(
+              list(
+                start_date = "2018-10-01",
+                end_date = "2018-10-31",
+                interval_type = "month",
+                value = "8032.903",
+                sample_count = 31,
+                approval_status = "approved",
+                computation_id = "d98ebe80-d476-4144-b7ce-c6ca32f015de",
+                computation = "arithmetic_mean"
+              ),
+              list(
+                start_date = "2018-11-01",
+                end_date = "2018-11-30",
+                interval_type = "month",
+                value = "12521.667",
+                sample_count = 30,
+                approval_status = "approved",
+                computation_id = "93f45097-93ad-4b48-83d4-b27b5fc137dc",
+                computation = "arithmetic_mean"
+              )
+            )
+          )
+        ), auto_unbox = TRUE),
+        # Second site
+        jsonlite::toJSON(list(
+          list(
+            parameter_code = "00060",
+            unit_of_measure = "ft^3/s",
+            parent_time_series_id = "7dda388ea270420dbe7324e56b6f907f",
+            values = list(
+              list(
+                start_date = "2007-07-27",
+                end_date = "2007-07-31",
+                interval_type = "month",
+                values = list("1", "2"),
+                percentiles = list("50", "100"),
+                sample_count = 5,
+                approval_status = "approved",
+                computation_id = "4579653d-f26f-4e9f-99ea-c3fa3fc68686",
+                computation = "arithmetic_mean"
+              )
+            )
+          )
+        ), auto_unbox = TRUE)
+      ),
+      stringsAsFactors = FALSE
+    )
+
+  with_mocked_bindings(
+    # walk_pages() returns a data.frame of pages, so the mock hands back the
+    # fake page directly rather than wrapping it in a list
+    walk_pages = function(...) fake_page,
+    {
+      out <- get_statistics_data(list(), "Normals")
+
+      expect_s3_class(out, "sf")
+      expect_true("request" %in% names(attributes(out)))
+      expect_true("queryTime" %in% names(attributes(out)))
+      expect_equal(nrow(out), 4)
+    }
+  )
+})
+
+test_that("get_statistics_data handles empty response", {
+  with_mocked_bindings(
+    walk_pages = function(...) data.frame(),
+    {
+      out <- get_statistics_data(list(), "Normals")
+
+      expect_s3_class(out, "sf")
+      expect_equal(nrow(out), 0)
+    }
+  )
+})
+
+# ------------------------------------------------------------------------------
+# High-level API tests (skipped on CRAN)
+# ------------------------------------------------------------------------------
+
+test_that("read_waterdata_stats_normal returns data", {
+  skip_on_cran()
+  skip_if_offline()
+
+  out <- read_waterdata_stats_normal(
+    monitoring_location_id = "USGS-01646500",
+    parameter_code = "00060",
+    computation_type = "median",
+    page_size = 5
+  )
+
+  expect_s3_class(out, "sf")
+  expect_true(nrow(out) > 0)
+})
+
+test_that("read_waterdata_stats_interval returns data", {
+  skip_on_cran()
+  skip_if_offline()
+
+  out <- read_waterdata_stats_interval(
+    monitoring_location_id = "USGS-01646500",
+    parameter_code = "00060",
+    computation_type = "maximum",
+    page_size = 5
+  )
+
+  expect_s3_class(out, "sf")
+  expect_true(nrow(out) > 0)
+})