Skip to content

Commit

Permalink
Updates from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
bsweger committed Jul 25, 2024
1 parent 59d8bc6 commit 3dbe9e9
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 70 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# hubData 1.2.0

* Adds a `skip_checks` parameter to the `connect_hub` and `connect_model_output` functions. When `skip_checks` is set to `TRUE`, these functions will bypass the default behavior of scanning the hub's model output directory for valid file types. Omitting these checks results in better performance when connecting to cloud-based hubs but can result in errors when querying the data. This option is only valid when connecting to hubs that use a single file format for model output data.
* Adds a `skip_checks` parameter to the `connect_hub` and `connect_model_output` functions. When `skip_checks` is set to `TRUE`, these functions will bypass the default behavior of scanning the hub's model output directory for invalid files. Omitting these checks results in better performance when connecting to cloud-based hubs but can result in errors when querying the data. This option is only valid when connecting to hubs that meet the following criteria:
- the model output directory contains only model output data (no `README.md`, for example)
- the model output files use a single file format.

# hubData 1.1.1

Expand Down
33 changes: 17 additions & 16 deletions R/connect_hub.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
#' hub's model output files. Also excludes invalid model output files when opening hub datasets.
#' Setting to `TRUE`` will improve performance but will result in an error if the model output
#' directory includes invalid files. Cannot be `TRUE` when there are multiple file formats in
#' the hub's model output directory.
#' the hub's model output directory or when the hub's model output directory contains files that
#' are not model output data (for example, a README).
#' @inheritParams create_hub_schema
#'
#' @return
Expand Down Expand Up @@ -124,9 +125,9 @@ connect_hub.default <- function(hub_path,
# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files_flag <- FALSE
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files_flag <- TRUE
exclude_invalid_files <- TRUE
}

if (length(file_format) == 0L) {
Expand All @@ -138,7 +139,7 @@ connect_hub.default <- function(hub_path,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions,
exclude_invalid_files_flag = exclude_invalid_files_flag
exclude_invalid_files = exclude_invalid_files
)
}
if (inherits(dataset, "UnionDataset")) {
Expand All @@ -160,7 +161,7 @@ connect_hub.default <- function(hub_path,
class = c("hub_connection", class(dataset)),
hub_name = hub_name,
file_format = file_format,
format_verified = exclude_invalid_files_flag,
checks = exclude_invalid_files,
file_system = file_system,
hub_path = hub_path,
model_output_dir = model_output_dir,
Expand Down Expand Up @@ -209,9 +210,9 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files_flag <- FALSE
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files_flag <- TRUE
exclude_invalid_files <- TRUE
}

if (length(file_format) == 0L) {
Expand All @@ -223,7 +224,7 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions,
exclude_invalid_files_flag = exclude_invalid_files_flag
exclude_invalid_files = exclude_invalid_files
)
}

Expand All @@ -246,7 +247,7 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
class = c("hub_connection", class(dataset)),
hub_name = hub_name,
file_format = file_format,
format_verified = exclude_invalid_files_flag,
checks = exclude_invalid_files,
file_system = file_system,
hub_path = hub_path$base_path,
model_output_dir = model_output_dir$base_path,
Expand All @@ -265,7 +266,7 @@ open_hub_dataset <- function(model_output_dir,
"logical", "Date"
),
partitions = list(model_id = arrow::utf8()),
exclude_invalid_files_flag) {
exclude_invalid_files) {
file_format <- rlang::arg_match(file_format)
schema <- create_hub_schema(config_tasks,
partitions = partitions,
Expand All @@ -280,23 +281,23 @@ open_hub_dataset <- function(model_output_dir,
col_types = schema,
unify_schemas = FALSE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
),
parquet = arrow::open_dataset(
model_output_dir,
format = "parquet",
partitioning = "model_id",
schema = schema,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
),
arrow = arrow::open_dataset(
model_output_dir,
format = "arrow",
partitioning = "model_id",
schema = schema,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
)
}
Expand All @@ -311,7 +312,7 @@ open_hub_datasets <- function(model_output_dir,
"logical", "Date"
),
partitions = list(model_id = arrow::utf8()),
exclude_invalid_files_flag,
exclude_invalid_files,
call = rlang::caller_env()) {
if (length(file_format) == 1L) {
open_hub_dataset(
Expand All @@ -320,7 +321,7 @@ open_hub_datasets <- function(model_output_dir,
config_tasks = config_tasks,
output_type_id_datatype,
partitions = partitions,
exclude_invalid_files_flag
exclude_invalid_files
)
} else {
cons <- purrr::map(
Expand All @@ -331,7 +332,7 @@ open_hub_datasets <- function(model_output_dir,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions,
exclude_invalid_files_flag
exclude_invalid_files
)
)

Expand Down
20 changes: 10 additions & 10 deletions R/connect_model_output.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ connect_model_output.default <- function(model_output_dir,
# Based on skip_checks param set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files_flag <- FALSE
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files_flag <- TRUE
exclude_invalid_files <- TRUE
}

if (file_format == "csv") {
Expand All @@ -46,7 +46,7 @@ connect_model_output.default <- function(model_output_dir,
col_types = schema,
unify_schemas = TRUE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
} else {
dataset <- arrow::open_dataset(
Expand All @@ -55,7 +55,7 @@ connect_model_output.default <- function(model_output_dir,
partitioning = partition_names,
schema = schema,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
}

Expand All @@ -66,7 +66,7 @@ connect_model_output.default <- function(model_output_dir,
structure(dataset,
class = c("mod_out_connection", class(dataset)),
file_format = file_format,
format_verified = exclude_invalid_files_flag,
checks = exclude_invalid_files,
file_system = class(dataset$filesystem)[1],
model_output_dir = model_output_dir
)
Expand All @@ -87,9 +87,9 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files_flag <- FALSE
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files_flag <- TRUE
exclude_invalid_files <- TRUE
}

if (file_format == "csv") {
Expand All @@ -100,7 +100,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
schema = schema,
unify_schemas = TRUE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
} else {
dataset <- arrow::open_dataset(
Expand All @@ -109,7 +109,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
partitioning = partition_names,
schema = schema,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = exclude_invalid_files_flag)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
}

Expand All @@ -121,7 +121,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
structure(dataset,
class = c("mod_out_connection", class(dataset)),
file_format = file_format,
format_verified = exclude_invalid_files_flag,
checks = exclude_invalid_files,
file_system = class(dataset$filesystem$base_fs)[1],
model_output_dir = model_output_dir$base_path
)
Expand Down
8 changes: 4 additions & 4 deletions R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ print.hub_connection <- function(x, verbose = FALSE, ...) {
"*" = "file_format: {.val {print_file_format_meta(x)}}"
)
}
if (!is.null(attr(x, "format_verified"))) {
if (!is.null(attr(x, "checks"))) {
print_msg <- c(print_msg,
"*" = "format_verified: {.val {attr(x, 'format_verified')}}"
"*" = "checks: {.val {attr(x, 'checks')}}"
)
}
if (!is.null(attr(x, "file_system"))) {
Expand Down Expand Up @@ -84,9 +84,9 @@ print.mod_out_connection <- function(x, verbose = FALSE, ...) {
"*" = "file_format: {.val {print_file_format_meta(x)}}"
)
}
if (!is.null(attr(x, "format_verified"))) {
if (!is.null(attr(x, "checks"))) {
print_msg <- c(print_msg,
"*" = "format_verified: {.val {attr(x, 'format_verified')}}"
"*" = "checks: {.val {attr(x, 'checks')}}"
)
}
if (!is.null(attr(x, "file_system"))) {
Expand Down
20 changes: 10 additions & 10 deletions tests/_snaps/hub-connection.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr [1:2] "csv" "parquet"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "LocalFileSystem"
- attr(*, "hub_path")= chr "test/hub_path"
- attr(*, "model_output_dir")= chr "test/model_output_dir"
Expand Down Expand Up @@ -181,7 +181,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr "parquet"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "LocalFileSystem"
- attr(*, "hub_path")= chr "test/hub_path"
- attr(*, "model_output_dir")= chr "test/model_output_dir"
Expand Down Expand Up @@ -311,7 +311,7 @@
* hub_name: "Simple Forecast Hub"
* hub_path: 'test/hub_path'
* format_verified: TRUE
* checks: TRUE
* file_system: "local"
* model_output_dir: "test/model_output_dir"
* config_admin: 'hub-config/admin.json'
Expand All @@ -327,7 +327,7 @@
* hub_name: "Simple Forecast Hub"
* hub_path: 'test/hub_path'
* format_verified: TRUE
* checks: TRUE
* file_system: "s3"
* model_output_dir: "test/model_output_dir"
* config_admin: 'hub-config/admin.json'
Expand Down Expand Up @@ -363,7 +363,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr [1:3] "csv" "parquet" "arrow"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "LocalFileSystem"
- attr(*, "hub_path")= chr "flusight"
- attr(*, "model_output_dir")= chr "forecasts"
Expand Down Expand Up @@ -510,7 +510,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr [1:3] "csv" "parquet" "arrow"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "LocalFileSystem"
- attr(*, "hub_path")= chr "test/hub_path"
- attr(*, "model_output_dir")= chr "test/model_output_dir"
Expand Down Expand Up @@ -730,7 +730,7 @@
* hub_name: "Simple Forecast Hub"
* hub_path: 'test/hub_path'
* file_format: "csv(3/3)" and "parquet(1/1)"
* format_verified: TRUE
* checks: TRUE
* file_system: "LocalFileSystem"
* model_output_dir: "test/model_output_dir"
* config_admin: 'hub-config/admin.json'
Expand Down Expand Up @@ -760,7 +760,7 @@
* hub_name: "Simple Forecast Hub"
* hub_path: 'test/hub_path'
* file_format: "csv(3/3)" and "parquet(1/1)"
* format_verified: TRUE
* checks: TRUE
* file_system: "LocalFileSystem"
* model_output_dir: "test/model_output_dir"
* config_admin: 'hub-config/admin.json'
Expand Down Expand Up @@ -803,7 +803,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr [1:2] "csv" "parquet"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "LocalFileSystem"
- attr(*, "hub_path")= chr "test/hub_path"
- attr(*, "model_output_dir")= chr "test/model_output_dir"
Expand Down Expand Up @@ -1027,7 +1027,7 @@
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "n_open" "n_in_dir"
.. ..$ : chr [1:2] "csv" "parquet"
- attr(*, "format_verified")= logi TRUE
- attr(*, "checks")= logi TRUE
- attr(*, "file_system")= chr "S3FileSystem"
- attr(*, "hub_path")= chr "test/hub_path"
- attr(*, "model_output_dir")= chr "test/model_output_dir"
Expand Down
Loading

0 comments on commit 3dbe9e9

Please # to comment.