| Title: | Core 'hubverse' Utilities |
|---|---|
| Description: | Core set of low-level utilities common across the 'hubverse'. Used to interact with 'hubverse' schema, Hub configuration files and model outputs and designed to be primarily used internally by other 'hubverse' packages. See Reich et al. (2022) <doi:10.2105/AJPH.2022.306831> for an overview of Collaborative Hubs. |
| Authors: | Anna Krystalli [aut, cre] (ORCID: <https://orcid.org/0000-0002-2378-4915>), Li Shandross [aut], Nicholas G. Reich [ctb] (ORCID: <https://orcid.org/0000-0003-3503-9899>), Evan L. Ray [ctb], Zhian N. Kamvar [ctb] (ORCID: <https://orcid.org/0000-0003-1458-7108>), Consortium of Infectious Disease Modeling Hubs [cph] |
| Maintainer: | Anna Krystalli <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 1.2.0 |
| Built: | 2026-05-15 08:59:16 UTC |
| Source: | https://github.com/hubverse-org/hubUtils |
Coerce a config list to a config class object
as_config(x)as_config(x)
x |
a list representation of the contents a |
a config list object with subclass <config>.
config_tasks <- read_config( hub_path = system.file("testhubs/simple", package = "hubUtils") ) # Remove all attributes except names to demonstrate functionality attributes(config_tasks) <- attributes(config_tasks)[ names(attributes(config_tasks)) == "names" ] # Convert to config object as_config(config_tasks)config_tasks <- read_config( hub_path = system.file("testhubs/simple", package = "hubUtils") ) # Remove all attributes except names to demonstrate functionality attributes(config_tasks) <- attributes(config_tasks)[ names(attributes(config_tasks)) == "names" ] # Convert to config object as_config(config_tasks)
Convert model output to a model_out_tbl class object.
as_model_out_tbl( tbl, model_id_col = NULL, output_type_col = NULL, output_type_id_col = NULL, value_col = NULL, sep = "-", trim_to_task_ids = FALSE, hub_con = NULL, task_id_cols = NULL, remove_empty = FALSE )as_model_out_tbl( tbl, model_id_col = NULL, output_type_col = NULL, output_type_id_col = NULL, value_col = NULL, sep = "-", trim_to_task_ids = FALSE, hub_con = NULL, task_id_cols = NULL, remove_empty = FALSE )
tbl |
a |
model_id_col |
character string. If a |
output_type_col |
character string. If an |
output_type_id_col |
character string. If an |
value_col |
character string. If a |
sep |
character string. Character used as separator when concatenating
|
trim_to_task_ids |
logical. Whether to trim |
hub_con |
a |
task_id_cols |
a character vector of column names. Only used if
|
remove_empty |
Logical. Whether to remove columns containing only |
A model_out_tbl class object.
as_model_out_tbl(hub_con_output)as_model_out_tbl(hub_con_output)
Compares the current schema version in a config file to a valid version. If the config file version is deprecated compared to the valid version, the function issues a lifecycle warning prompting the user to upgrade.
check_deprecated_schema( config_version, config, valid_version = "v2.0.0", hubutils_version = "0.0.0.9010" )check_deprecated_schema( config_version, config, valid_version = "v2.0.0", hubutils_version = "0.0.0.9010" )
config_version |
Character string of the schema version. |
config |
List representation of config file. |
valid_version |
Character string of minimum valid schema version. |
hubutils_version |
The version of the hubUtils package in which deprecation of
the schema version below |
Invisibly, TRUE if the schema version is deprecated, FALSE otherwise.
Primarily used for the side effect of issuing a lifecycle warning.
Transform between output types for each unique combination of task IDs for each model. Conversion must be from a single initial output type to one or more `to` output types, and the resulting output will only contain the `to` output types. See details for supported conversions.
convert_output_type(model_out_tbl, to)convert_output_type(model_out_tbl, to)
model_out_tbl |
an object of class |
to |
a named list indicating the desired output types and associated output type IDs. List item name and value pairs may be as follows:
|
Currently, only "sample" can be converted to "mean", "median",
or "quantile"
object of class model_out_tbl containing (only) predictions of the
to output_type(s) for each unique combination of task IDs for each model
# We illustrate the conversion between output types using normal distributions ex_quantiles <- c(0.25, 0.5, 0.75) model_out_tbl <- expand.grid( stringsAsFactors = FALSE, group1 = c(1, 2), model_id = "A", output_type = "sample", output_type_id = 1:100 ) |> dplyr::mutate(value = rnorm(200, mean = group1)) # Output type conversions with vector `to` elements convert_output_type(model_out_tbl, to = list("quantile" = ex_quantiles, "median" = NA) ) # Output type conversion with dataframe `to` element # Output type ID values (quantile levels) are determined by group1 value quantile_levels <- rbind( data.frame(group1 = 1, output_type_id = 0.5), data.frame(group1 = 2, output_type_id = c(0.25, 0.5, 0.75)) ) convert_output_type(model_out_tbl, to = list("quantile" = quantile_levels) )# We illustrate the conversion between output types using normal distributions ex_quantiles <- c(0.25, 0.5, 0.75) model_out_tbl <- expand.grid( stringsAsFactors = FALSE, group1 = c(1, 2), model_id = "A", output_type = "sample", output_type_id = 1:100 ) |> dplyr::mutate(value = rnorm(200, mean = group1)) # Output type conversions with vector `to` elements convert_output_type(model_out_tbl, to = list("quantile" = ex_quantiles, "median" = NA) ) # Output type conversion with dataframe `to` element # Output type ID values (quantile levels) are determined by group1 value quantile_levels <- rbind( data.frame(group1 = 1, output_type_id = 0.5), data.frame(group1 = 2, output_type_id = c(0.25, 0.5, 0.75)) ) convert_output_type(model_out_tbl, to = list("quantile" = quantile_levels) )
Create a URL to a file in an S3 bucket
create_s3_url(base_fs, base_path)create_s3_url(base_fs, base_path)
base_fs |
character string. Path of the base s3 file system (bucket) in the
cloud. Can be extracted from the object of class |
base_path |
character string. Path to the file in relation to |
A character string of the URL to the file in s3.
create_s3_url( base_fs = "hubverse/hubutils/testhubs/simple/", base_path = "hub-config/admin.json" ) # Create a URL from an object of class `<SubTreeFileSystem>` of an s3 hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") create_s3_url(hub_path$base_path, "hub-config/admin.json") config_path <- hub_path$path("hub-config/admin.json") # Create a URL from an object of class `<SubTreeFileSystem>` of the path to # a config file in an s3 hub create_s3_url(config_path$base_fs$base_path, config_path$base_path)create_s3_url( base_fs = "hubverse/hubutils/testhubs/simple/", base_path = "hub-config/admin.json" ) # Create a URL from an object of class `<SubTreeFileSystem>` of an s3 hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") create_s3_url(hub_path$base_path, "hub-config/admin.json") config_path <- hub_path$path("hub-config/admin.json") # Create a URL from an object of class `<SubTreeFileSystem>` of the path to # a config file in an s3 hub create_s3_url(config_path$base_fs$base_path, config_path$base_path)
Extract the schema version from a schema id or config schema_version property character string
extract_schema_version(id)extract_schema_version(id)
id |
A schema |
The schema version number as a character string.
extract_schema_version("schema_version: v3.0.0") extract_schema_version("refs/heads/main/v3.0.0")extract_schema_version("schema_version: v3.0.0") extract_schema_version("refs/heads/main/v3.0.0")
Version can be provided either directly through the config_version argument
or extracted from a config_tasks object.
get_config_tid(config_version, config_tasks)get_config_tid(config_version, config_tasks)
config_version |
Character string of the schema version. |
config_tasks |
a list version of the contents of a hub's |
character string of the name of the output type id column
get_config_tid("v3.0.0") get_config_tid("v2.0.0") # this will produce a warning because support for schema version 1.0.0 # has been dropped. get_config_tid("v1.0.0")get_config_tid("v3.0.0") get_config_tid("v2.0.0") # this will produce a warning because support for schema version 1.0.0 # has been dropped. get_config_tid("v1.0.0")
Get hub configuration fields
get_hub_timezone(hub_path) get_hub_model_output_dir(hub_path) get_hub_file_formats(hub_path, round_id = NULL) get_hub_derived_task_ids(hub_path, round_id = NULL)get_hub_timezone(hub_path) get_hub_model_output_dir(hub_path) get_hub_file_formats(hub_path, round_id = NULL) get_hub_derived_task_ids(hub_path, round_id = NULL)
hub_path |
Either a character string path to a local Modeling Hub directory,
a character string of a URL to a GitHub repository
or an object of class |
round_id |
Character string. Round identifier. If the round is set to
|
get_hub_timezone: The timezone of the hub
get_hub_model_output_dir: The model output directory name
get_hub_file_formats: character vector of accepted hub or round level
file formats. If round_id is NULL or the round does not have a round level
file_format setting, returns the hub level file_format setting.
get_hub_derived_task_ids: character vector of hub or round level derived
task ID names. If round_id is NULL or the round does not have a round level
derived_task_ids setting, returns the hub level derived_task_ids setting.
get_hub_timezone(): Get the hub timezone
get_hub_model_output_dir(): Get the model output directory name
get_hub_file_formats(): Get the hub or round level file formats
get_hub_derived_task_ids(): Get the hub or round level derived_task_ids
hub_path <- system.file("testhubs", "flusight", package = "hubUtils") get_hub_timezone(hub_path) get_hub_model_output_dir(hub_path) get_hub_file_formats(hub_path) get_hub_file_formats(hub_path, "2022-12-12")hub_path <- system.file("testhubs", "flusight", package = "hubUtils") get_hub_timezone(hub_path) get_hub_model_output_dir(hub_path) get_hub_file_formats(hub_path) get_hub_file_formats(hub_path, "2022-12-12")
Utilities for accessing round ID metadata
get_round_idx(config_tasks, round_id) get_round_ids( config_tasks, flatten = c("all", "model_task", "task_id", "none") )get_round_idx(config_tasks, round_id) get_round_ids( config_tasks, flatten = c("all", "model_task", "task_id", "none") )
config_tasks |
a list version of the contents of a hub's |
round_id |
Character string. Round identifier. If the round is set to
|
flatten |
Character. Whether and how much to flatten output.
|
the integer index of the element in config_tasks$rounds that a
character round identifier maps to
a list or character vector of hub round IDs
A character vector is returned only if flatten = "all"
A list is returned otherwise (see flatten for more details)
get_round_idx(): Get an integer index of the element in
config_tasks$rounds that a character round identifier maps to.
get_round_ids(): Get a list or character vector of hub round IDs.
For each round, if round_id_from_variable is TRUE, round IDs returned are
the values of the task ID defined in the round_id property. Otherwise, if
round_id_from_variable is FALSE, the value of the round_id property is
returned.
config_tasks <- read_config( hub_path = system.file("testhubs/simple", package = "hubUtils") ) # Get round IDs get_round_ids(config_tasks) get_round_ids(config_tasks, flatten = "model_task") get_round_ids(config_tasks, flatten = "task_id") get_round_ids(config_tasks, flatten = "none") # Get round integer index using a round_id get_round_idx(config_tasks, "2022-10-01") get_round_idx(config_tasks, "2022-10-29")config_tasks <- read_config( hub_path = system.file("testhubs/simple", package = "hubUtils") ) # Get round IDs get_round_ids(config_tasks) get_round_ids(config_tasks, flatten = "model_task") get_round_ids(config_tasks, flatten = "task_id") get_round_ids(config_tasks, flatten = "none") # Get round integer index using a round_id get_round_idx(config_tasks, "2022-10-01") get_round_idx(config_tasks, "2022-10-29")
Get the model tasks for a given round
get_round_model_tasks(config_tasks, round_id)get_round_model_tasks(config_tasks, round_id)
config_tasks |
a list version of the contents of a hub's |
round_id |
Character string. Round identifier. If the round is set to
|
a list representation of model tasks for a given round.
hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_round_model_tasks(config_tasks, round_id = "2022-10-08") get_round_model_tasks(config_tasks, round_id = "2022-10-15")hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_round_model_tasks(config_tasks, round_id = "2022-10-08") get_round_model_tasks(config_tasks, round_id = "2022-10-15")
Get task ID names for a given round
get_round_task_id_names(config_tasks, round_id)get_round_task_id_names(config_tasks, round_id)
config_tasks |
a list version of the contents of a hub's |
round_id |
Character string. Round identifier. If the round is set to
|
a character vector of task ID names
hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_round_task_id_names(config_tasks, round_id = "2022-10-08") get_round_task_id_names(config_tasks, round_id = "2022-10-15")hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_round_task_id_names(config_tasks, round_id = "2022-10-08") get_round_task_id_names(config_tasks, round_id = "2022-10-15")
Download a schema
get_schema(schema_url)get_schema(schema_url)
schema_url |
The download URL for a given config schema version. |
Contents of the JSON schema as a character string.
Other functions supporting config file validation:
get_schema_url(),
get_schema_valid_versions()
schema_url <- get_schema_url(config = "tasks", version = "v0.0.0.9") get_schema(schema_url)schema_url <- get_schema_url(config = "tasks", version = "v0.0.0.9") get_schema(schema_url)
Get the JSON schema download URL for a given config file version
get_schema_url( config = c("tasks", "admin", "model", "target-data"), version, branch = "main" )get_schema_url( config = c("tasks", "admin", "model", "target-data"), version, branch = "main" )
config |
Name of config file to validate. One of |
version |
A valid version of hubverse
schema
(e.g. |
branch |
The branch of the hubverse
schemas repository
from which to fetch schema. Defaults to |
The JSON schema download URL for a given config file version.
Other functions supporting config file validation:
get_schema(),
get_schema_valid_versions()
get_schema_url(config = "tasks", version = "v0.0.0.9")get_schema_url(config = "tasks", version = "v0.0.0.9")
Get a vector of valid schema versions
get_schema_valid_versions(branch = "main")get_schema_valid_versions(branch = "main")
branch |
The branch of the hubverse
schemas repository
from which to fetch schema. Defaults to |
a character vector of valid versions of hubverse schema.
Other functions supporting config file validation:
get_schema(),
get_schema_url()
get_schema_valid_versions()get_schema_valid_versions()
Get the latest schema version from the schema repository if "latest" is requested (default). If a specific version is provided instead, it is returned unchanged.
get_schema_version_latest(schema_version = "latest", branch = "main")get_schema_version_latest(schema_version = "latest", branch = "main")
schema_version |
A character vector. Either "latest" or a valid schema version. |
branch |
The branch of the hubverse
schemas repository
from which to fetch schema. Defaults to |
a schema version string. If schema_version is "latest", the latest schema
version from the schema repository. If a specific version is provided to schema_version, the same version is returned.
# Get the latest version of the schema get_schema_version_latest() get_schema_version_latest(schema_version = "v3.0.0")# Get the latest version of the schema get_schema_version_latest() get_schema_version_latest(schema_version = "v3.0.0")
Get hub task IDs
get_task_id_names(config_tasks)get_task_id_names(config_tasks)
config_tasks |
a list version of the contents of a hub's |
a character vector of all unique task ID names across all rounds.
hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_task_id_names(config_tasks)hub_path <- system.file("testhubs/simple", package = "hubUtils") config_tasks <- read_config(hub_path, "tasks") get_task_id_names(config_tasks)
Get hub config schema versions
get_version_config(config) get_version_file(config_path) get_version_hub(hub_path, config_type = c("tasks", "admin", "target-data"))get_version_config(config) get_version_file(config_path) get_version_hub(hub_path, config_type = c("tasks", "admin", "target-data"))
config |
A |
config_path |
Either a character string of a path to a local JSON config
file, a character string of the URL to the raw contents of a JSON config
file (e.g. on GitHub) or
an object of class |
hub_path |
Either a character string path to a local Modeling Hub directory,
a character string of a URL to a GitHub repository
or an object of class |
config_type |
Character vector specifying the type of config file to read. One of "tasks", "admin" or "target-data". Default is "tasks". |
The schema version number as a character string.
get_version_config(): Get schema version from config list representation.
get_version_file(): Get schema version from config file at specific path.
get_version_hub(): Get schema version from a config file in a hub.
config <- read_config_file( system.file("config", "tasks.json", package = "hubUtils") ) get_version_config(config) config_path <- system.file("config", "tasks.json", package = "hubUtils") get_version_file(config_path) # Get version from a URL of a hub config file url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) get_version_file(url) # Get version from an AWS S3 cloud hub config file hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") get_version_file(config_path) hub_path <- system.file("testhubs/simple", package = "hubUtils") get_version_hub(hub_path) get_version_hub(hub_path, "admin") # Get version from an AWS S3 cloud hub config file hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") get_version_hub(hub_path)config <- read_config_file( system.file("config", "tasks.json", package = "hubUtils") ) get_version_config(config) config_path <- system.file("config", "tasks.json", package = "hubUtils") get_version_file(config_path) # Get version from a URL of a hub config file url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) get_version_file(url) # Get version from an AWS S3 cloud hub config file hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") get_version_file(config_path) hub_path <- system.file("testhubs/simple", package = "hubUtils") get_version_hub(hub_path) get_version_hub(hub_path, "admin") # Get version from an AWS S3 cloud hub config file hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") get_version_hub(hub_path)
A subset of model output data accessed using hubData from the simple example
hub contained in the hubUtils package. The subset consists of "quantile" output
type data for "US" location and the most recent forecast date.
hub_con_outputhub_con_output
A tbl with 92 rows and 8 columns:
forecast_date: Origin date of the forecast.
horizon: Forecast horizon relative to the forecast_date.
target: Target variable.
location: Location of the forecast.
output_type: Output type of forecast.
output_type_id: Forecast output type level/identifier. In this case,
quantile level.
value: Forecast value.
model_id: Model identifier.
Detect if a URL is a GitHub repository URL
is_github_repo_url(url)is_github_repo_url(url)
url |
character string of the URL to check. |
Logical. TRUE if the URL is a GitHub repository URL, FALSE otherwise.
is_github_repo_url("https://github.com/hubverse-org/example-simple-forecast-hub") raw_url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) is_github_repo_url(raw_url) url_to_blob <- "https://github.com/hubverse-org/example-simple-forecast-hub/blob/main/README.md" is_github_repo_url(url_to_blob)is_github_repo_url("https://github.com/hubverse-org/example-simple-forecast-hub") raw_url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) is_github_repo_url(raw_url) url_to_blob <- "https://github.com/hubverse-org/example-simple-forecast-hub/blob/main/README.md" is_github_repo_url(url_to_blob)
Detect a URL on github.com
is_github_url(url)is_github_url(url)
url |
character string of the URL to check. |
Logical. TRUE if the URL is on github.com, FALSE otherwise.
# Returns TRUE is_github_url("https://github.com/hubverse-org/example-simple-forecast-hub") is_github_url("https://github.com/hubverse-org/schemas/tree/main/v5.0.0") # Returns FALSE is_github_url("https://gitlab.com/hubverse-org/schemas/tree/main/v5.0.0") raw_url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) is_github_url(raw_url)# Returns TRUE is_github_url("https://github.com/hubverse-org/example-simple-forecast-hub") is_github_url("https://github.com/hubverse-org/schemas/tree/main/v5.0.0") # Returns FALSE is_github_url("https://gitlab.com/hubverse-org/schemas/tree/main/v5.0.0") raw_url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) is_github_url(raw_url)
Detect whether an object of class <SubTreeFileSystem> represents the base
path of an S3 file system (i.e. the root of a cloud hub)
is_s3_base_fs(s3_fs)is_s3_base_fs(s3_fs)
s3_fs |
An object of class |
Logical. TRUE if the object represents the base path of an S3 file system,
FALSE otherwise.
hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") is_s3_base_fs(hub_path) is_s3_base_fs(config_path)hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") is_s3_base_fs(hub_path) is_s3_base_fs(config_path)
Determine if a string is a URL
is_url(x)is_url(x)
x |
character string to check if it is a URL. Must contain a protocol to be considered a URL. |
Logical. TRUE if x is a URL, FALSE otherwise.
is_url("https://docs.hubverse.io") is_url("www.hubverse.io")is_url("https://docs.hubverse.io") is_url("www.hubverse.io")
Is config list representation using v3.0.0 schema or greater?
is_v3_config(config)is_v3_config(config)
config |
List representation of the JSON config file. |
Logical, whether the config list representation is using v3.0.0 schema or greater.
config <- read_config_file( system.file("config", "tasks.json", package = "hubUtils") ) is_v3_config(config)config <- read_config_file( system.file("config", "tasks.json", package = "hubUtils") ) is_v3_config(config)
Is config file using v3.0.0 schema or greater?
is_v3_config_file(config_path)is_v3_config_file(config_path)
config_path |
Either a character string of a path to a local JSON config
file, a character string of the URL to the raw contents of a JSON config
file (e.g. on GitHub) or
an object of class |
Logical, whether the config file is using v3.0.0 schema or greater.
config_path <- system.file("config", "tasks.json", package = "hubUtils") is_v3_config_file(config_path)config_path <- system.file("config", "tasks.json", package = "hubUtils") is_v3_config_file(config_path)
Is hub configured using v3.0.0 schema or greater?
is_v3_hub(hub_path, config = c("tasks", "admin", "target-data"))is_v3_hub(hub_path, config = c("tasks", "admin", "target-data"))
hub_path |
Either a character string path to a local Modeling Hub directory,
a character string of a URL to a GitHub repository
or an object of class |
config |
Type of config file to read. One of |
Logical, whether the hub is configured using v3.0.0 schema or greater.
is_v3_hub(hub_path = system.file("testhubs", "flusight", package = "hubUtils"))is_v3_hub(hub_path = system.file("testhubs", "flusight", package = "hubUtils"))
Determine if a URL is valid and reachable
is_valid_url(url)is_valid_url(url)
url |
character string of the URL to check. |
Logical. TRUE if the URL is valid and reachable, FALSE otherwise.
is_valid_url("https://docs.hubverse.io") is_valid_url("https://docs.hubverse.io/invalid")is_valid_url("https://docs.hubverse.io") is_valid_url("https://docs.hubverse.io/invalid")
Merge/Split model output tbl model_id column
model_id_merge(tbl, sep = "-") model_id_split(tbl, sep = "-")model_id_merge(tbl, sep = "-") model_id_split(tbl, sep = "-")
tbl |
a |
sep |
character string. Character used as separator when concatenating
|
tbl with either team_abbr and model_abbr merged into a single model_id
column or model_id split into columns team_abbr and model_abbr.
a tibble with model_id column split into separate
team_abbr and model_abbr columns
model_id_merge(): merge team_abbr and model_abbr into a single
model_id column.
model_id_split(): split model_id column into separate team_abbr
and model_abbr columns.
tbl_split <- model_id_split(hub_con_output) tbl_split # Merge model_id tbl_merged <- model_id_merge(tbl_split) tbl_merged # Split / Merge using custom separator tbl_sep <- hub_con_output tbl_sep$model_id <- gsub("-", "_", tbl_sep$model_id) tbl_sep <- model_id_split(tbl_sep, sep = "_") tbl_sep tbl_sep <- model_id_merge(tbl_sep, sep = "_") tbl_septbl_split <- model_id_split(hub_con_output) tbl_split # Merge model_id tbl_merged <- model_id_merge(tbl_split) tbl_merged # Split / Merge using custom separator tbl_sep <- hub_con_output tbl_sep$model_id <- gsub("-", "_", tbl_sep$model_id) tbl_sep <- model_id_split(tbl_sep, sep = "_") tbl_sep tbl_sep <- model_id_merge(tbl_sep, sep = "_") tbl_sep
Read a hub config file into R
read_config( hub_path, config = c("tasks", "admin", "model-metadata-schema", "target-data"), silent = TRUE )read_config( hub_path, config = c("tasks", "admin", "model-metadata-schema", "target-data"), silent = TRUE )
hub_path |
Either a character string path to a local Modeling Hub directory,
a character string of a URL to a GitHub repository
or an object of class |
config |
Type of config file to read. One of |
silent |
Logical. If |
The contents of the config file as an R list. If possible, the output is
further converted to a <config> class object before returning. Note that
"model-metadata-schema" files are never converted to a <config> object.
# Read config files from local hub hub_path <- system.file("testhubs/simple", package = "hubUtils") read_config(hub_path, "tasks") read_config(hub_path, "admin") # Read config file from a GitHub hub repository github_url <- "https://github.com/hubverse-org/example-simple-forecast-hub" read_config(github_url) read_config(github_url, "admin") # Read config file from AWS S3 bucket hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") read_config(hub_path, "admin")# Read config files from local hub hub_path <- system.file("testhubs/simple", package = "hubUtils") read_config(hub_path, "tasks") read_config(hub_path, "admin") # Read config file from a GitHub hub repository github_url <- "https://github.com/hubverse-org/example-simple-forecast-hub" read_config(github_url) read_config(github_url, "admin") # Read config file from AWS S3 bucket hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") read_config(hub_path, "admin")
Read a JSON config file from a path
read_config_file(config_path, silent = TRUE)read_config_file(config_path, silent = TRUE)
config_path |
Either a character string of a path to a local JSON config
file, a character string of the URL to the raw contents of a JSON config
file (e.g. on GitHub) or
an object of class |
silent |
Logical. If |
The contents of the config file as an R list. If possible, the output is
further converted to a <config> class object before returning. Note that
"model-metadata-schema" files are never converted to a <config> object.
# Read local config file read_config_file(system.file("config", "tasks.json", package = "hubUtils")) # Read config file from URL url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) read_config_file(url) # Read config file from AWS S3 bucket hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") read_config_file(config_path)# Read local config file read_config_file(system.file("config", "tasks.json", package = "hubUtils")) # Read config file from URL url <- paste0( "https://raw.githubusercontent.com/hubverse-org/", "example-simple-forecast-hub/refs/heads/main/hub-config/tasks.json" ) read_config_file(url) # Read config file from AWS S3 bucket hub hub_path <- arrow::s3_bucket("hubverse/hubutils/testhubs/simple/") config_path <- hub_path$path("hub-config/admin.json") read_config_file(config_path)
A named character vector of standard column names used in hubverse model output data files. The terms currently used for standard column names in the hubverse are English. In future, however, this could be expanded to provide the basis for hub terminology localisation.
std_colnamesstd_colnames
An object of class character of length 4.
Subset a model_out_tbl or submission tbl.
subset_task_id_cols(model_out_tbl) subset_std_cols(model_out_tbl)subset_task_id_cols(model_out_tbl) subset_std_cols(model_out_tbl)
model_out_tbl |
A |
subset_task_id_cols: an object of the same class as model_out_tbl
which contains only task ID columns.
subset_std_cols: an object of the same class as model_out_tbl
which contains only hubverse standard columns (i.e. columns that are not
task_id columns).
subset_task_id_cols(): subset a model_out_tbl or submission
tbl to only include task_id columns
subset_std_cols(): subset a model_out_tbl or submission tbl
to only include hubverse standard columns (i.e. columns that are not task_id
columns)
model_out_tbl_path <- system.file("testhubs", "v4", "simple", "model-output", "hub-baseline", "2022-10-15-hub-baseline.parquet", package = "hubUtils" ) model_out_tbl <- arrow::read_parquet(model_out_tbl_path) subset_task_id_cols(model_out_tbl) subset_std_cols(model_out_tbl)model_out_tbl_path <- system.file("testhubs", "v4", "simple", "model-output", "hub-baseline", "2022-10-15-hub-baseline.parquet", package = "hubUtils" ) model_out_tbl <- arrow::read_parquet(model_out_tbl_path) subset_task_id_cols(model_out_tbl) subset_std_cols(model_out_tbl)
Subset a vector of column names to only include task IDs
subset_task_id_names(x)subset_task_id_names(x)
x |
character vector of column names |
a character vector of task ID names
x <- c( "origin_date", "horizon", "target_date", "location", "output_type", "output_type_id", "value" ) subset_task_id_names(x)x <- c( "origin_date", "horizon", "target_date", "location", "output_type", "output_type_id", "value" ) subset_task_id_names(x)
Utility functions for extracting properties from target-data.json configuration files (v6.0.0 schema). These functions handle defaults and inheritance patterns for target data configuration.
get_date_col(config_target_data) get_observable_unit( config_target_data, dataset = c("time-series", "oracle-output") ) get_versioned(config_target_data, dataset = c("time-series", "oracle-output")) get_has_output_type_ids(config_target_data) get_non_task_id_schema(config_target_data) has_target_data_config(hub_path) ## Default S3 method: has_target_data_config(hub_path) ## S3 method for class 'SubTreeFileSystem' has_target_data_config(hub_path)get_date_col(config_target_data) get_observable_unit( config_target_data, dataset = c("time-series", "oracle-output") ) get_versioned(config_target_data, dataset = c("time-series", "oracle-output")) get_has_output_type_ids(config_target_data) get_non_task_id_schema(config_target_data) has_target_data_config(hub_path) ## Default S3 method: has_target_data_config(hub_path) ## S3 method for class 'SubTreeFileSystem' has_target_data_config(hub_path)
config_target_data |
A target-data config object created by `read_config(hub_path, "target-data")`.
|
dataset |
Character string specifying the dataset type: either `"time-series"` or `"oracle-output"`.
|
hub_path |
Path to a hub. Can be a local directory path or cloud URL (S3, GCS). |
Some properties can be specified at both the global level and the dataset level:
observable_unit: Dataset-specific values override global when specified, otherwise the global value is used.
versioned: Dataset-specific values override global when specified,
otherwise inherits from global (default FALSE if not specified anywhere).
Other properties are dataset-specific only:
has_output_type_ids: Only for oracle-output dataset (default FALSE)
non_task_id_schema: Only for time-series dataset (default NULL)
get_date_col() returns a character string: the name of the date
column that stores the date on which observed data actually occurred.
get_observable_unit() returns a character vector: column names
whose unique value combinations define the minimum observable unit.
get_versioned() returns a logical value: whether the dataset is
versioned using as_of dates.
get_has_output_type_ids() returns a logical value: whether
oracle-output data has output_type and output_type_id columns (default
FALSE if not specified).
get_non_task_id_schema() returns a named list: key-value pairs of
non-task ID column names and their data types, or NULL if not specified.
has_target_data_config() returns a logical value: TRUE if the
target-data.json file exists in the hub-config directory of the hub,
FALSE otherwise.
get_date_col(): Get the name of the date column across hub data.
get_observable_unit(): Get observable unit column names. Returns
dataset-specific observable_unit if configured, otherwise falls back to global.
get_versioned(): Get whether target data is versioned for the
specified dataset. Returns dataset-specific setting if configured, otherwise
inherits from global (default FALSE if not specified).
get_has_output_type_ids(): Get whether oracle-output data has
output_type/output_type_id columns.
get_non_task_id_schema(): Get the schema for non-task ID columns in
time-series data.
has_target_data_config(): Check if target data config file exists in hub.
hub_path <- system.file("testhubs/v6/target_dir", package = "hubUtils")
config <- read_config(hub_path, "target-data")

# Get the date column name
get_date_col(config)

# Get observable unit (uses dataset-specific or falls back to global)
get_observable_unit(config, dataset = "time-series")
get_observable_unit(config, dataset = "oracle-output")

# Get versioned setting (inherits from global if not specified)
get_versioned(config, dataset = "time-series")

# Get oracle-output specific property
get_has_output_type_ids(config)

# Get time-series specific property
get_non_task_id_schema(config)

# Check if target data config exists
has_target_data_config(hub_path)
no_config_hub <- system.file("testhubs/v5/target_file/", package = "hubUtils")
has_target_data_config(no_config_hub)
Validate a model_out_tbl object.
validate_model_out_tbl(tbl)
tbl |
a `model_out_tbl` class object. |
If valid, returns a model_out_tbl class object. Otherwise, throws an
error.
md_out <- as_model_out_tbl(hub_con_output)
validate_model_out_tbl(md_out)
Compare hub config schema_versions to specific version numbers from
a variety of sources
version_equal(
  version,
  config = NULL,
  config_path = NULL,
  hub_path = NULL,
  schema_version = NULL
)

version_gte(
  version,
  config = NULL,
  config_path = NULL,
  hub_path = NULL,
  schema_version = NULL
)

version_gt(
  version,
  config = NULL,
  config_path = NULL,
  hub_path = NULL,
  schema_version = NULL
)

version_lte(
  version,
  config = NULL,
  config_path = NULL,
  hub_path = NULL,
  schema_version = NULL
)

version_lt(
  version,
  config = NULL,
  config_path = NULL,
  hub_path = NULL,
  schema_version = NULL
)
version |
Character string. Version number to compare against, must be in
the format `"v#.#.#"` (e.g. `"v3.0.0"`). |
config |
A `<config>` class object, e.g. as returned by `read_config()` or `read_config_file()`. |
config_path |
Either a character string of a path to a local JSON config
file, a character string of the URL to the raw contents of a JSON config
file (e.g. on GitHub) or
an object of class `<SubTreeFileSystem>` pointing to a JSON config file in cloud storage. |
hub_path |
Either a character string path to a local Modeling Hub directory,
a character string of a URL to a GitHub repository
or an object of class `<SubTreeFileSystem>` pointing to a cloud-hosted hub (S3, GCS). |
schema_version |
Character string. A config `schema_version` property. |
TRUE or FALSE depending on how the schema version compares to the
version number specified.
version_equal(): Check whether a schema version property is equal
to a specific version number.
version_gte(): Check whether a schema version property is equal to
or greater than a specific version number.
version_gt(): Check whether a schema version property is greater
than a specific version number.
version_lte(): Check whether a schema version property is equal to
or less than a specific version number.
version_lt(): Check whether a schema version property is less
than a specific version number.
# Actual version "v2.0.0"
hub_path <- system.file("testhubs/simple", package = "hubUtils")
# Actual version "v3.0.0"
config_path <- system.file("config", "tasks.json", package = "hubUtils")
config <- read_config_file(config_path)
schema_version <- config$schema_version

# Check whether schema_version equal to v3.0.0
version_equal("v3.0.0", config = config)
version_equal("v3.0.0", config_path = config_path)
version_equal("v3.0.0", hub_path = hub_path)
version_equal("v3.0.0", schema_version = schema_version)

# Check whether schema_version equal to or greater than v3.0.0
version_gte("v3.0.0", config = config)
version_gte("v3.0.0", config_path = config_path)
version_gte("v3.0.0", hub_path = hub_path)
version_gte("v3.0.0", schema_version = schema_version)

# Check whether schema_version greater than v3.0.0
version_gt("v3.0.0", config = config)
version_gt("v3.0.0", config_path = config_path)
version_gt("v3.0.0", hub_path = hub_path)
version_gt("v3.0.0", schema_version = schema_version)

# Check whether schema_version equal to or less than v3.0.0
version_lte("v3.0.0", config = config)
version_lte("v3.0.0", config_path = config_path)
version_lte("v3.0.0", hub_path = hub_path)
version_lte("v3.0.0", schema_version = schema_version)

# Check whether schema_version less than v3.0.0
version_lt("v3.0.0", config = config)
version_lt("v3.0.0", config_path = config_path)
version_lt("v3.0.0", hub_path = hub_path)
version_lt("v3.0.0", schema_version = schema_version)