If you don’t have the following packages, make sure to install them
remotes::install_github("hubverse-org/hubData")
remotes::install_github("hubverse-org/hubEnsembles")
remotes::install_github("hubverse-org/hubVis")
Once the required packages are installed, load the following:
CDC’s FluSight Forecast Hub, which began as a GitHub repository, has been mirrored to the cloud via a publicly-accessible AWS S3 bucket. This means that instead of relying on a local clone, the data can be directly accessed from the cloud.
In particular, hubData
makes it possible to access the
model output data as an already-formatted tibble in a few simple
steps.
hub_path_cloud <- s3_bucket("cdcepi-flusight-forecast-hub/") # connect to bucket
hub_con <- connect_hub(hub_path_cloud, file_format = "parquet") # connect to hub
#> ℹ Updating superseded URL `Infectious-Disease-Modeling-hubs` to `hubverse-org`
#> ℹ Updating superseded URL `Infectious-Disease-Modeling-hubs` to `hubverse-org`
data_cloud <- hub_con |>
collect() # collect all model output into single tibble
This collected model output can then be changed with the common
dplyr
operations as desired
# perform various operations on data
filtered_outputs <- data_cloud |>
filter(output_type == "quantile", location == "US", horizon > -1) |>
mutate(output_type_id = as.numeric(output_type_id)) |>
as_model_out_tbl()
# print data
head(filtered_outputs)
#> # A tibble: 6 × 9
#> model_id reference_date target horizon location target_end_date output_type
#> <chr> <date> <chr> <int> <chr> <date> <chr>
#> 1 CEPH-Rtren… 2023-10-14 wk in… 0 US 2023-10-14 quantile
#> 2 CEPH-Rtren… 2023-10-14 wk in… 1 US 2023-10-21 quantile
#> 3 CEPH-Rtren… 2023-10-14 wk in… 2 US 2023-10-28 quantile
#> 4 CEPH-Rtren… 2023-10-14 wk in… 3 US 2023-11-04 quantile
#> 5 CEPH-Rtren… 2023-10-14 wk in… 0 US 2023-10-14 quantile
#> 6 CEPH-Rtren… 2023-10-14 wk in… 1 US 2023-10-21 quantile
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>
Currently, there is no specific hubData
integration to
extract target data from the cloud; however, we can instead use the
aws.s3
package to read the contents of the most up-to-date
target data file stored in the cloud.
target_data <-
aws.s3::s3read_using(
readr::read_csv,
object = "s3://cdcepi-flusight-forecast-hub/target-data/target-hospital-admissions.csv"
) |>
select(date, location, value) # keep only required columns
head(target_data) # print
#> # A tibble: 6 × 3
#> date location value
#> <date> <chr> <dbl>
#> 1 2024-04-27 02 3
#> 2 2024-04-27 01 16
#> 3 2024-04-27 05 30
#> 4 2024-04-27 04 106
#> 5 2024-04-27 06 151
#> 6 2024-04-27 08 23
See hubEnsembles package for more information
Quantile mean:
mean_ens <- filtered_outputs |>
hubEnsembles::simple_ensemble(model_id = "mean-ensemble")
head(mean_ens)
#> # A tibble: 6 × 9
#> model_id reference_date target horizon location target_end_date output_type
#> <chr> <date> <chr> <int> <chr> <date> <chr>
#> 1 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 2 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 3 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 4 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 5 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 6 mean-ensem… 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>
Linear pool (with normal tails):
linear_pool <- filtered_outputs |>
hubEnsembles::linear_pool(model_id = "linear-pool")
head(linear_pool)
#> # A tibble: 6 × 9
#> model_id reference_date target horizon location target_end_date output_type
#> <chr> <date> <chr> <int> <chr> <date> <chr>
#> 1 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 2 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 3 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 4 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 5 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> 6 linear-pool 2023-10-07 wk in… 0 US 2023-10-07 quantile
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>
See hubVis package for more information
Modify and filter the target data for plotting:
target_data_plotted <- target_data |>
mutate(target = "wk inc flu hosp", observation = value) |>
filter(location == "US", date >= as.Date("2023-09-23"))
Modify and filter forecast data for plotting
Single plot, single set of forecasts for one reference date
model_outputs_plotted |>
filter(reference_date == as.Date("2024-04-20")) |>
hubVis::plot_step_ahead_model_output(
target_data_plotted,
x_col_name = "target_end_date",
use_median_as_point = TRUE,
group = "reference_date",
interactive = FALSE
)
Single plot, multiple sets of forecasts for reference dates every 4 weeks
model_outputs_plotted |>
filter(reference_date %in% reference_dates[seq(3, 31, 4)]) |>
hubVis::plot_step_ahead_model_output(
target_data_plotted,
x_col_name = "target_end_date",
use_median_as_point = TRUE,
group = "reference_date",
interactive = FALSE
)
Faceted plot, multiple sets of forecasts for reference states every 4 weeks