openair-project · jack-davison · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025 · jack-davison
diff --git a/NEWS.md b/NEWS.md
@@ -19,6 +19,12 @@
     - When `n` isn't defined for a qualitative palette (e.g., "Dark2"), the full qualitative palette will be returned. Previously this errored with the default of `100`.
 
     - `openColours()` will now check whether the provided `scheme` is either a known scheme name *or* a vector of valid R colours, and provide an informative error if this is not the case.
+
+- Added new features for `calcPercentile()`:
+
+    - Added the `type` argument, in line with `timeAverage()`.
+
+    - Added the `prefix` argument to control the naming of the returned columns.
 
 - The `formula.label` argument of `polarPlot()` will now control whether concentration information is printed when `statistic = "cpf"`.
 
@@ -34,6 +40,8 @@
 
 - `importUKAQ()` will correctly append site meta data when `meta = TRUE`, `source` is a length greater than 1, and a single site is repeated in more than one source (e.g., `importUKAQ(source = c("waqn", "aurn"), data_type = "daqi", year = 2024L))`)
 
+- `calcPercentile()` will not correctly pass its arguments (e.g., `date.start`) to `timeAverage()`.
+
 # openair 2.18-2
 
 ## New Features

diff --git a/R/calcPercentile.R b/R/calcPercentile.R
@@ -1,71 +1,133 @@
 #' Calculate percentile values from a time series
 #'
 #' Calculates multiple percentile values from a time series, with flexible time
-#' aggregation.
+#' aggregation. This function is a wrapper for [timeAverage()], making it easier
+#' to calculate several percentiles at once. Like [timeAverage()], it requires a
+#' data frame with a `date` field and one other numeric variable.
 #'
-#' This is a utility function to calculate percentiles and is used in, for
-#' example, \code{timePlot}. Given a data frame with a \code{date} field and one
-#' other numeric variable, percentiles are calculated.
-#'
-#' @param mydata A data frame of data with a \code{date} field in the format
-#'   \code{Date} or \code{POSIXct}. Must have one variable to apply calculations
-#'   to.
-#' @param pollutant Name of variable to process. Mandatory.
-#' @param avg.time Averaging period to use. See [timeAverage()] for details.
-#' @param percentile A vector of percentile values. For example \code{percentile
-#'   = 50} for median values, \code{percentile = c(5, 50, 95} for multiple
-#'   percentile values.
-#' @param data.thresh Data threshold to apply when aggregating data. See
-#'   [timeAverage()] for details.
-#' @param start Start date to use - see [timeAverage()] for details.
+#' @inheritParams timeAverage
+#' @param pollutant Name of column containing variable to summarise, likely a
+#'   pollutant (e.g., `"o3"`).
+#' @param percentile A vector of percentile values; for example, `percentile =
+#'   50` will calculate median values. Multiple values may also be provided as a
+#'   vector, e.g., `percentile = c(5, 50, 95)` or `percentile = seq(0, 100,
+#'   10)`.
+#' @param prefix Each new column is named by appending a `prefix` to
+#'   `percentile`. For example, the default `"percentile."` will name the new
+#'   column as `percentile.95` when `percentile = 95`.
 #' @export
-#' @return Returns a data frame with new columns for each percentile level. New
-#'   columns are given names like percentile.95 e.g. when percentile = 95 is
-#'   chosen. See examples below.
+#'
+#' @return Returns a `data.frame` with a `date` column plus an additional
+#'   column for each given `percentile`.
 #' @author David Carslaw
-#' @seealso \code{\link{timePlot}}, [timeAverage()]
+#' @seealso [timePlot()], [timeAverage()]
 #' @examples
 #' # 95th percentile monthly o3 concentrations
-#' percentiles <- calcPercentile(mydata, pollutant ="o3",
-#' avg.time = "month", percentile = 95)
+#' percentiles <- calcPercentile(mydata,
+#'   pollutant = "o3",
+#'   avg.time = "month", percentile = 95
+#' )
 #'
 #' head(percentiles)
 #'
 #' # 5, 50, 95th percentile monthly o3 concentrations
 #' \dontrun{
-#' percentiles <- calcPercentile(mydata, pollutant ="o3",
-#' avg.time = "month", percentile = c(5, 50, 95))
+#' percentiles <- calcPercentile(mydata,
+#'   pollutant = "o3",
+#'   avg.time = "month", percentile = c(5, 50, 95)
+#' )
 #'
 #' head(percentiles)
 #' }
-calcPercentile <- function(mydata, pollutant = "o3", avg.time = "month", percentile = 50,
-                           data.thresh = 0, start = NA) {
-
-  make.percentile <- function(mydata, pollutant = "o3", avg.time = "month", percentile = 50,
-                              data.thresh = 0, start = NA) {
-    mydata <- timeAverage(
-      mydata, avg.time,
-      statistic = "percentile", percentile = percentile,
-      data.thresh = data.thresh, start.date = NA
+calcPercentile <- function(mydata,
+                           pollutant = "o3",
+                           avg.time = "month",
+                           percentile = 50,
+                           type = "default",
+                           data.thresh = 0,
+                           start.date = NA,
+                           end.date = NA,
+                           prefix = "percentile.") {
+  # check pollutant is valid
+  if (!pollutant %in% names(mydata)) {
+    cli::cli_abort(
+      c(
+        "x" = "{.field pollutant} '{pollutant}' not present in data.",
+        "i" = "{.emph Columns in {.field mydata}}: {names(mydata)}"
+      )
     )
-    ## change column name
-    new.name <- paste("percentile.", percentile, sep = "")
-    names(mydata)[names(mydata) == pollutant] <- new.name
-    results <- mydata[, new.name, drop = FALSE]
-    results <- data.frame(date = mydata$date, results)
+  }
+
+  # iterate over 'percentile's to calculate %tiles
+  mydata <- purrr::map(
+    .x = percentile,
+    .f = function(x) {
+      make.percentile(
+        mydata,
+        pollutant = pollutant,
+        avg.time = avg.time,
+        data.thresh = data.thresh,
+        percentile = x,
+        type = type,
+        start.date = start.date,
+        end.date = end.date,
+        prefix = prefix
+      )
+    }
+  )
 
-    results
+  # get joining columns
+  by <- "date"
+  if (type != "default") {
+    by <- append(by, type)
   }
 
-  mydata <- lapply(percentile, function(x)
-    make.percentile(
-      mydata,
-      pollutant = pollutant, avg.time = avg.time,
-      data.thresh = data.thresh, percentile = x
-    ))
+  # bind into one
+  mydata <- purrr::reduce(mydata, function(x, y) {
+    dplyr::full_join(x, y, by = by)
+  })
 
-  mydata <- Reduce(function(x, y, by = "date") merge(x, y, by = "date", all = TRUE), mydata)
+  # return
+  return(mydata)
+}
+
+#' Use timeAverage to calculate percentiles, with some custom naming
+#' @noRd
+make.percentile <- function(mydata,
+                            pollutant,
+                            avg.time,
+                            percentile,
+                            type,
+                            data.thresh,
+                            start.date,
+                            end.date,
+                            prefix) {
+  # summarise percentiles with timeAverage
+  mydata <- timeAverage(
+    mydata,
+    avg.time,
+    statistic = "percentile",
+    percentile = percentile,
+    type = type,
+    data.thresh = data.thresh,
+    start.date = start.date,
+    end.date = end.date,
+    progress = FALSE
+  )
+
+  # change output column name
+  new.name <- paste0(prefix, percentile)
+  names(mydata)[names(mydata) == pollutant] <- new.name
+
+  if (type == "default") {
+    cols <- c("date", new.name)
+  } else {
+    cols <- c("date", type, new.name)
+  }
 
+  # construct new dataframe with just new column & date
+  results <- mydata[, cols]
 
-  mydata
+  # return
+  return(results)
 }
diff --git a/man/calcPercentile.Rd b/man/calcPercentile.Rd