【问题标题】:How to extract confidence intervals from modeltime recursive ensembles?如何从模型时间递归集成中提取置信区间?
【发布时间】:2022-01-07 21:21:11
【问题描述】:

由于我想在模型时间框架之外对预测数据进行一些可视化和分析,因此我需要提取置信度值、拟合值,也许还有残差。

文档表明,我需要使用函数 modeltime_calibrate() 来获取置信度值和残差。所以一个问题是,我从哪里提取拟合值?

我的主要问题是,如何对递归集成进行校准。对于任何非集成模型,我都能做到,但在递归集成的情况下,如果我想校准,我会遇到一些错误消息。

为了说明问题,请看下面的示例代码,它最终无法校准所有模型:

library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)


FORECAST_HORIZON <- 24

m4_extended <- m4_monthly %>%
  group_by(id) %>%
  future_frame(
    .length_out = FORECAST_HORIZON,
    .bind_data  = TRUE
  ) %>%
  ungroup()

lag_transformer_grouped <- function(data){
  data %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m4_lags <- m4_extended %>%
  lag_transformer_grouped()

test_data <- m4_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m4_lags %>%
  drop_na()

future_data <- m4_lags %>%
  filter(is.na(value))

model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data)

model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data)

recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6)) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id         = "id"
  )

model_tbl <- modeltime_table(
  recursive_ensemble_panel
)

calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)

model_tbl %>%
  modeltime_forecast(
    new_data    = future_data,
    actual_data = m4_lags,
    keep_data   = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .interactive        = FALSE,
    .conf_interval_show = TRUE,
    .facet_ncol         = 2
  )

【问题讨论】:

    标签: r time-series confidence-interval ensemble-learning tidymodels


    【解决方案1】:

    我不得不做一些实验来找到正确的方法来提取我需要的东西(置信区间和残差)。

    您可以从下面的示例代码中看到,需要对模型工作流程进行更改才能实现此目的。递归需要出现在工作流对象定义中,既不在模型中,也不在集成拟合/规范中。

    我仍然需要在这里做一些测试,但我想,我知道了我需要知道的:

    # Time Series ML
    library(tidymodels)
    library(modeltime)
    library(modeltime.ensemble)
    
    # Core
    library(tidyverse)
    library(timetk)
    
    
    # data def
    FORECAST_HORIZON <- 24
    
    lag_transformer_grouped <- function(m750){
      m750 %>%
        group_by(id) %>%
        tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
        ungroup()
    }
    
    m750_lags <- m750 %>%
      lag_transformer_grouped()
    
    test_data <- m750_lags %>%
      group_by(id) %>%
      slice_tail(n = 12) %>%
      ungroup()
    
    train_data <- m750_lags %>%
      drop_na()
    
    future_data <- m750_lags %>%
      filter(is.na(value))
    
    # rec
    recipe_spec <- recipe(value ~ date, train_data) %>%
      step_timeseries_signature(date) %>%
      step_rm(matches("(.iso$)|(.xts$)")) %>%
      step_normalize(matches("(index.num$)|(_year$)")) %>%
      step_dummy(all_nominal()) %>%
      step_fourier(date, K = 1, period = 12)
    
    recipe_spec %>% prep() %>% juice()
    
    # elnet 
    model_fit_glmnet <- linear_reg(penalty = 1) %>%
      set_engine("glmnet") 
    
    wflw_fit_glmnet <- workflow() %>%
      add_model(model_fit_glmnet) %>%
      add_recipe(recipe_spec %>% step_rm(date)) %>%
      fit(train_data) %>%
      recursive(
        transform  = lag_transformer_grouped,
        train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
        id         = "id"
      )
    
    # xgboost    
    model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
      set_engine("xgboost")
    
    wflw_fit_xgboost <- workflow() %>%
      add_model(model_fit_xgboost) %>%
      add_recipe(recipe_spec %>% step_rm(date)) %>%
      fit(train_data) %>%
      recursive(
        transform  = lag_transformer_grouped,
        train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
        id         = "id"
      )
    
    # mtbl
    m750_models <- modeltime_table(
      wflw_fit_xgboost,
      wflw_fit_glmnet
    )
    
    # mfit
    ensemble_fit <- m750_models %>%
     ensemble_average(type = "mean")
    
    # mcalib
    calibration_tbl <- modeltime_table(
      ensemble_fit
    ) %>%
      modeltime_calibrate(test_data)
    
    # residuals
    calib_out <- calibration_tbl$.calibration_data[[1]] %>% 
      left_join(test_data %>% select(id, date, value))
    
    
    # Forecast ex post
    ex_post_obj <- 
    calibration_tbl %>%
      modeltime_forecast(
        new_data    = test_data,
        actual_data = m750
      )
    
    
    # Forecast ex ante
    data_prepared_tbl <- bind_rows(train_data, test_data)
    
    future_tbl <- data_prepared_tbl %>%
      group_by(id) %>%
      future_frame(.length_out = "2 years") %>%
      ungroup()
    
    ex_ante_obj <- 
      calibration_tbl %>%
      modeltime_forecast(
        new_data    = future_tbl,
        actual_data = m750
      )
    

    【讨论】:

      【解决方案2】:

      问题在于您的recursive_ensemble_panel。您必须对模型本身而不是整体进行递归部分。像你一样,我本来希望一次完成递归,也许是通过modeltime_table

      # start of changes to your code.
      
      # added recursive to the model 
      model_fit_glmnet <- linear_reg(penalty = 1) %>%
        set_engine("glmnet") %>%
        fit(value ~ ., data = train_data) %>% 
        recursive(
          transform  = lag_transformer_grouped,
          train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
          id         = "id"
        )
      
      # added recursive to the model     
      model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
        set_engine("xgboost") %>%
        fit(value ~ ., data = train_data) %>% 
        recursive(
          transform  = lag_transformer_grouped,
          train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
          id         = "id"
        )
      
      # removed recursive part    
      recursive_ensemble_panel <- modeltime_table(
        model_fit_glmnet,
        model_fit_xgboost
      ) %>%
        ensemble_weighted(loadings = c(4, 6))
      
      # rest of your code
      

      【讨论】:

      • 我刚刚将您的代码嵌入到我的代码中,并将校准对象作为 modeltime_forecast() 的输入。但是,查看校准或预测对象,我可以找到 conf_hi 和 conf_lo 列,但没有置信度值,因为这些列仅包含 NA。完成代码后,您检查值了吗?
      • 我在您的示例中使用的所有内容都会导致置信度列中的 NA,无论我是否仅采用 model_fit_xgboost 而没有递归、没有集成等。按照小插曲中的示例,一切正常。不确定问题出在哪里,但您必须了解示例显示的内容与代码失败的地方之间的差异。
      • 也许值得在包的github页面上问这个问题/
      • 不需要。我会尽快发布我的解决方案
      猜你喜欢
      • 2021-03-10
      • 2019-04-23
      • 2022-01-09
      • 1970-01-01
      • 2016-11-04
      • 2020-12-07
      • 2017-09-08
      • 2019-07-07
      • 2019-04-09
      相关资源
      最近更新 更多