FluSight Hubverse Cloud Vignette

Library and system set up

If you don’t have the following packages, make sure to install them

remotes::install_github("hubverse-org/hubData")
remotes::install_github("hubverse-org/hubEnsembles")
remotes::install_github("hubverse-org/hubVis")

Once the required packages are installed, load the following:

library(hubData)
library(hubEnsembles)
library(hubVis)

library(arrow)
library(dplyr)
library(aws.s3)

Load the data

CDC’s FluSight Forecast Hub, which began as a GitHub repository, has been mirrored to the cloud via a publicly-accessible AWS S3 bucket. This means that instead of relying on a local clone, the data can be directly accessed from the cloud.

In particular, hubData makes it possible to access the model output data as an already-formatted tibble in a few simple steps.

hub_path_cloud <- s3_bucket("cdcepi-flusight-forecast-hub/") # connect to bucket
hub_con <- connect_hub(hub_path_cloud, file_format = "parquet") # connect to hub
#> ℹ Updating superseded URL `Infectious-Disease-Modeling-hubs` to `hubverse-org`
#> ℹ Updating superseded URL `Infectious-Disease-Modeling-hubs` to `hubverse-org`
data_cloud <- hub_con |>
  collect() # collect all model output into single tibble

This collected model output can then be changed with the common dplyr operations as desired

# perform various operations on data
filtered_outputs <- data_cloud |>
  filter(output_type == "quantile", location == "US", horizon > -1) |>
  mutate(output_type_id = as.numeric(output_type_id)) |>
  as_model_out_tbl()

# print data
head(filtered_outputs)
#> # A tibble: 6 × 9
#>   model_id    reference_date target horizon location target_end_date output_type
#>   <chr>       <date>         <chr>    <int> <chr>    <date>          <chr>      
#> 1 CEPH-Rtren… 2023-10-14     wk in…       0 US       2023-10-14      quantile   
#> 2 CEPH-Rtren… 2023-10-14     wk in…       1 US       2023-10-21      quantile   
#> 3 CEPH-Rtren… 2023-10-14     wk in…       2 US       2023-10-28      quantile   
#> 4 CEPH-Rtren… 2023-10-14     wk in…       3 US       2023-11-04      quantile   
#> 5 CEPH-Rtren… 2023-10-14     wk in…       0 US       2023-10-14      quantile   
#> 6 CEPH-Rtren… 2023-10-14     wk in…       1 US       2023-10-21      quantile   
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>

Currently, there is no specific hubData integration to extract target data from the cloud; however, we can instead use the aws.s3 package to read the contents of the most up-to-date target data file stored in the cloud.

target_data <-
  aws.s3::s3read_using(
    readr::read_csv,
    object = "s3://cdcepi-flusight-forecast-hub/target-data/target-hospital-admissions.csv"
  ) |>
  select(date, location, value) # keep only required columns

head(target_data) # print
#> # A tibble: 6 × 3
#>   date       location value
#>   <date>     <chr>    <dbl>
#> 1 2024-04-27 02           3
#> 2 2024-04-27 01          16
#> 3 2024-04-27 05          30
#> 4 2024-04-27 04         106
#> 5 2024-04-27 06         151
#> 6 2024-04-27 08          23

Calculate some ensembles

See hubEnsembles package for more information

Quantile mean:

mean_ens <- filtered_outputs |>
  hubEnsembles::simple_ensemble(model_id = "mean-ensemble")
head(mean_ens)
#> # A tibble: 6 × 9
#>   model_id    reference_date target horizon location target_end_date output_type
#>   <chr>       <date>         <chr>    <int> <chr>    <date>          <chr>      
#> 1 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 2 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 3 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 4 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 5 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 6 mean-ensem… 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>

Linear pool (with normal tails):

linear_pool <- filtered_outputs |>
  hubEnsembles::linear_pool(model_id = "linear-pool")
head(linear_pool)
#> # A tibble: 6 × 9
#>   model_id    reference_date target horizon location target_end_date output_type
#>   <chr>       <date>         <chr>    <int> <chr>    <date>          <chr>      
#> 1 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 2 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 3 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 4 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 5 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> 6 linear-pool 2023-10-07     wk in…       0 US       2023-10-07      quantile   
#> # ℹ 2 more variables: output_type_id <dbl>, value <dbl>

Plot the output

See hubVis package for more information

Data processing

Modify and filter the target data for plotting:

target_data_plotted <- target_data |>
  mutate(target = "wk inc flu hosp", observation = value) |>
  filter(location == "US", date >= as.Date("2023-09-23"))

Modify and filter forecast data for plotting

reference_dates <- unique(filtered_outputs$reference_date)
model_outputs_plotted <- filtered_outputs |>
  filter(model_id %in% c("FluSight-baseline", "MOBS-GLEAM_FLUH", "PSI-PROF")) |>
  rbind(mean_ens, linear_pool) # bind with ensembles

Plot

Single plot, single set of forecasts for one reference date

model_outputs_plotted |>
  filter(reference_date == as.Date("2024-04-20")) |>
  hubVis::plot_step_ahead_model_output(
    target_data_plotted,
    x_col_name = "target_end_date",
    use_median_as_point = TRUE,
    group = "reference_date",
    interactive = FALSE
  )

Single plot, multiple sets of forecasts for reference dates every 4 weeks

model_outputs_plotted |>
  filter(reference_date %in% reference_dates[seq(3, 31, 4)]) |>
  hubVis::plot_step_ahead_model_output(
    target_data_plotted,
    x_col_name = "target_end_date",
    use_median_as_point = TRUE,
    group = "reference_date",
    interactive = FALSE
  )

Faceted plot, multiple sets of forecasts for reference states every 4 weeks

model_outputs_plotted |>
  filter(reference_date %in% reference_dates[seq(3, 31, 4)]) |>
  hubVis::plot_step_ahead_model_output(
    target_data_plotted,
    x_col_name = "target_end_date",
    use_median_as_point = TRUE,
    show_legend = FALSE,
    facet = "model_id",
    group = "reference_date",
    interactive = FALSE
  )