diff --git a/DESCRIPTION b/DESCRIPTION index 3daf6c2..2db8c6e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,9 +20,12 @@ Imports: fs, glue, magrittr, + janitor, + phsmethods, readr, rlang, - tibble + tibble, + tidyr Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index afd9c7d..618fb58 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(find_latest_file) export(get_hscp_locality) +export(get_pop_est) export(get_simd_datazone) export(get_simd_postcode) export(get_spd) diff --git a/R/find_latest_file.R b/R/find_latest_file.R index bc1fae5..5f3ecab 100644 --- a/R/find_latest_file.R +++ b/R/find_latest_file.R @@ -12,8 +12,9 @@ #' passed to [fs::dir_info()] to search for the file. #' @param selection_method Valid arguments are "modification_date" #' (the default) or "file_name". +#' @param ... Additional arguments passed to [fs::dir_info()]. #' -#' @return the [fs::path()] to the file +#' @return the [fs::path()] to the file. #' @export #' #' @examples @@ -25,13 +26,15 @@ #' } find_latest_file <- function(directory, regexp, - selection_method = "modification_date") { + selection_method = "modification_date", + ...) { if (selection_method == "modification_date") { latest_file <- fs::dir_info( path = directory, type = "file", regexp = regexp, - recurse = TRUE + recurse = TRUE, + ... ) |> dplyr::arrange( dplyr::desc(.data$birth_time), diff --git a/R/get_pop_est.R b/R/get_pop_est.R new file mode 100644 index 0000000..be190d2 --- /dev/null +++ b/R/get_pop_est.R @@ -0,0 +1,177 @@ +#' Get population estimates +#' +#' This function retrieves population estimates based on various parameters. +#' It reads population data from a specified file and filters it based on the +#' input parameters. The function also allows for grouping by age and pivoting +#' the data for wider format. +#' @param level The geographic level for which to retrieve population estimates. +#' One of "datazone", "intzone", "hscp", "ca", or "hb". +#' @param version The version of the population estimates to use (default: "latest"). +#' @param min_year,max_year (optional) The minimum and maximum years to include in the results. +#' @param age_groups Logical, indicating whether to aggregate population estimates by age groups. +#' If `TRUE`, the `phsmethods::create_age_groups` function is used. +#' @param pivot_wider Optionally reshape the data into a wider format, summarising population counts by the specified columns. +#' Allowed values: +#' * `FALSE` (default): Do not pivot. +#' * `TRUE` or `"all"`: Pivot by both sex and age/age group. +#' * `"age"`: Pivot by age/age group only. +#' * `"age-only"`: Pivot by age/age group and aggregate to remove sex. +#' * `"sex"`: Pivot by sex only. +#' * `"sex-only"`: Pivot by sex group and aggregate to remove age/age group +#' @param ... Additional arguments passed to [phsmethods::create_age_groups()]. +#' +#' @return A tibble containing the filtered and possibly transformed population data. +#' +#' @note +#' Depending on the values for `age_groups` and `pivot_wider`, the resulting +#' columns in the returned tibble will vary. Refer to the examples below for +#' illustration. +#' +#' @export +#' +#' @examples +#' # Basic Usage: Datazone Population Estimates (no filtering) +#' get_pop_est("datazone") +#' +#' # Filter by Year: +#' get_pop_est("ca", min_year = 1995, max_year = 2020) +#' +#' # Age Groups: Health Board (HB) Population Estimates by Age Group +#' get_pop_est("hb", age_groups = TRUE) +#' +#' # Age Groups with Custom Settings: +#' # Aggregate into 5-year age groups, with an open-ended final group "85+" +#' get_pop_est("hb", age_groups = TRUE, by = 5, to = 85) +#' +#' # Pivot Wider (All): CA Population Estimates, Reshaped by Sex and Age Group +#' # The result will have columns for each combination of sex and age group, +#' # e.g., "pop_f_0_4", "pop_m_5_9", etc. +#' get_pop_est("ca", age_groups = TRUE, pivot_wider = "all") +#' +#' # Pivot Wider (Age Only): CA Population Estimates, Reshaped by Age Group Only +#' # This is useful if you only need the total population for each age group, regardless of sex. +#' get_pop_est("ca", age_groups = TRUE, pivot_wider = "age-only") +#' +#' # Combined Filtering, Age Groups, and Pivoting: +#' # CA population from 2015-2020, aggregated by 10-year age groups, and pivoted by sex +#' # The result will have columns for each sex ("pop_f", "pop_m") and a row per age group. +#' get_pop_est("ca", min_year = 2015, max_year = 2020, age_groups = TRUE, by = 10, pivot_wider = "sex") +get_pop_est <- function( + level = c("datazone", "intzone", "hscp", "ca", "hb"), + version = "latest", + min_year = NULL, + max_year = NULL, + age_groups = FALSE, + pivot_wider = FALSE, + ...) { + level <- tolower(level) + level <- rlang::arg_match(level) + if (!inherits(pivot_wider, "logical")) { + pivot_wider <- rlang::arg_match( + pivot_wider, + values = c("all", "age", "age-only", "sex", "sex-only") + ) + } + + ext <- "rds" + pop_dir <- fs::path(get_lookups_dir(), "Populations", "Estimates") + + file_name_re <- paste0(level, "[0-9]{0,4}_pop_est_[0-9]{4}_[0-9]{4}\\.", ext) + + pop_path <- find_latest_file( + directory = pop_dir, + regexp = file_name_re, + ignore.case = TRUE + ) + + pop_est <- read_file(pop_path) + + # Validate year range and filter + if (!is.null(min_year) && !is.null(max_year) && min_year > max_year) { + cli::cli_abort( + "Invalid years: {.arg min_year} must not be greater than {.arg max_year}" + ) + } + + if (!is.null(min_year)) { + min_year_available <- min(pop_est$year) + if (min_year < min_year_available) { + cli::cli_abort( + "{.arg min_year} must be at least {min_year_available} when using the + {.file {fs::path_file(pop_path)}} file." + ) + } + pop_est <- dplyr::filter(pop_est, .data$year >= min_year) + } + + if (!is.null(max_year)) { + max_year_available <- max(pop_est$year) + if (max_year > max_year_available) { + cli::cli_abort( + "{.arg max_year} must be at most {max_year_available} when using the + {.file {fs::path_file(pop_path)}} file." + ) + } + pop_est <- dplyr::filter(pop_est, .data$year <= max_year) + } + + # Create age groups + if (age_groups) { + pop_est <- pop_est |> + dplyr::mutate( + age_group = phsmethods::create_age_groups(x = .data$age, ...), + .keep = "unused" + ) |> + dplyr::group_by(dplyr::across(!.data$pop)) |> + dplyr::summarise(pop = sum(.data$pop), .groups = "drop") + } + + # Pivot data + if (pivot_wider %in% list(TRUE, "all")) { + pop_est <- pop_est |> + pivot_data( + id_cols = -"sex", + names_from = c("sex_name", dplyr::if_else(age_groups, "age_group", "age")) + ) + } else if (pivot_wider == "sex") { + pop_est <- pop_est |> + pivot_data( + id_cols = c(-"sex", dplyr::if_else(age_groups, "age_group", "age")), + names_from = "sex_name" + ) + } else if (pivot_wider == "sex-only") { + pop_est <- pop_est |> + pivot_data( + id_cols = c(-"sex", -dplyr::if_else(age_groups, "age_group", "age")), + names_from = "sex_name" + ) + } else if (pivot_wider == "age") { + pop_est <- pop_est |> + pivot_data( + id_cols = c(-"sex", "sex_name"), + names_from = dplyr::if_else(age_groups, "age_group", "age") + ) + } else if (pivot_wider == "age-only") { + pop_est <- pop_est |> + pivot_data( + id_cols = c(-"sex", -"sex_name"), + names_from = dplyr::if_else(age_groups, "age_group", "age") + ) + } + + return(pop_est) +} + + +# Helper function to pivot data +pivot_data <- function(data, id_cols, names_from) { + tidyr::pivot_wider( + data, + id_cols = {{ id_cols }}, + names_from = !!names_from, + values_from = "pop", + values_fn = sum, + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) +} diff --git a/man/find_latest_file.Rd b/man/find_latest_file.Rd index cd54324..44dabd9 100644 --- a/man/find_latest_file.Rd +++ b/man/find_latest_file.Rd @@ -4,7 +4,12 @@ \alias{find_latest_file} \title{Find the latest version of a file} \usage{ -find_latest_file(directory, regexp, selection_method = "modification_date") +find_latest_file( + directory, + regexp, + selection_method = "modification_date", + ... +) } \arguments{ \item{directory}{The directory in which to search.} @@ -15,9 +20,11 @@ passed to \code{\link[fs:dir_ls]{fs::dir_info()}} to search for the file.} \item{selection_method}{Valid arguments are "modification_date" (the default) or "file_name".} + +\item{...}{Additional arguments passed to \code{\link[fs:dir_ls]{fs::dir_info()}}.} } \value{ -the \code{\link[fs:path]{fs::path()}} to the file +the \code{\link[fs:path]{fs::path()}} to the file. } \description{ This will return the latest created file matching diff --git a/man/get_pop_est.Rd b/man/get_pop_est.Rd new file mode 100644 index 0000000..d103fea --- /dev/null +++ b/man/get_pop_est.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_pop_est.R +\name{get_pop_est} +\alias{get_pop_est} +\title{Get population estimates} +\usage{ +get_pop_est( + level = c("datazone", "intzone", "hscp", "ca", "hb"), + version = "latest", + min_year = NULL, + max_year = NULL, + age_groups = FALSE, + pivot_wider = FALSE, + ... +) +} +\arguments{ +\item{level}{The geographic level for which to retrieve population estimates. +One of "datazone", "intzone", "hscp", "ca", or "hb".} + +\item{version}{The version of the population estimates to use (default: "latest").} + +\item{min_year, max_year}{(optional) The minimum and maximum years to include in the results.} + +\item{age_groups}{Logical, indicating whether to aggregate population estimates by age groups. +If \code{TRUE}, the \code{phsmethods::create_age_groups} function is used.} + +\item{pivot_wider}{Optionally reshape the data into a wider format, summarising population counts by the specified columns. +Allowed values: +\itemize{ +\item \code{FALSE} (default): Do not pivot. +\item \code{TRUE} or \code{"all"}: Pivot by both sex and age/age group. +\item \code{"age"}: Pivot by age/age group only. +\item \code{"age-only"}: Pivot by age/age group and aggregate to remove sex. +\item \code{"sex"}: Pivot by sex only. +\item \code{"sex-only"}: Pivot by sex group and aggregate to remove age/age group +}} + +\item{...}{Additional arguments passed to \code{\link[phsmethods:create_age_groups]{phsmethods::create_age_groups()}}.} +} +\value{ +A tibble containing the filtered and possibly transformed population data. +} +\description{ +This function retrieves population estimates based on various parameters. +It reads population data from a specified file and filters it based on the +input parameters. The function also allows for grouping by age and pivoting +the data for wider format. +} +\note{ +Depending on the values for \code{age_groups} and \code{pivot_wider}, the resulting +columns in the returned tibble will vary. Refer to the examples below for +illustration. +} +\examples{ +# Basic Usage: Datazone Population Estimates (no filtering) +get_pop_est("datazone") + +# Filter by Year: +get_pop_est("ca", min_year = 1995, max_year = 2020) + +# Age Groups: Health Board (HB) Population Estimates by Age Group +get_pop_est("hb", age_groups = TRUE) + +# Age Groups with Custom Settings: +# Aggregate into 5-year age groups, with an open-ended final group "85+" +get_pop_est("hb", age_groups = TRUE, by = 5, to = 85) + +# Pivot Wider (All): CA Population Estimates, Reshaped by Sex and Age Group +# The result will have columns for each combination of sex and age group, +# e.g., "pop_f_0_4", "pop_m_5_9", etc. +get_pop_est("ca", age_groups = TRUE, pivot_wider = "all") + +# Pivot Wider (Age Only): CA Population Estimates, Reshaped by Age Group Only +# This is useful if you only need the total population for each age group, regardless of sex. +get_pop_est("ca", age_groups = TRUE, pivot_wider = "age-only") + +# Combined Filtering, Age Groups, and Pivoting: +# CA population from 2015-2020, aggregated by 10-year age groups, and pivoted by sex +# The result will have columns for each sex ("pop_f", "pop_m") and a row per age group. +get_pop_est("ca", min_year = 2015, max_year = 2020, age_groups = TRUE, by = 10, pivot_wider = "sex") +} diff --git a/phslookups.Rproj b/phslookups.Rproj index 4f837c0..4885b3e 100644 --- a/phslookups.Rproj +++ b/phslookups.Rproj @@ -21,6 +21,6 @@ PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace -UseNativePipeOperator: No +UseNativePipeOperator: Yes SpellingDictionary: en_GB