R：在 R 中添加进度条答案

【问题标题】：R: Adding Progress Bars in RR：在 R 中添加进度条
【发布时间】：2021-10-06 09:33:56
【问题描述】：

我正在使用 R 编程语言。我正在尝试学习如何添加“进度条”来估计函数运行时剩余的时间 (https://www.rdocumentation.org/packages/progress/versions/1.2.2/topics/progress_bar)。

例如：

library(progress)

pb <- progress_bar$new(total = 100)
for (i in 1:100) {
  pb$tick()
  Sys.sleep(1 / 100)
}

假设我有一个名为“grid_function”的函数和一个名为“DF_1”的数据集。我从“DF_1”中取出每一行并将这一行输入“grid_function”。 “grid_function”使用这一行执行一些计算，并将其存储到一个名为“resultdf1”的“列表”中。最后，“resultdf1”被转换成一个名为“final_output”的数据框。 “喂食过程”可见下图：

resultdf1 <- apply(DF_1,1, # 1 means rows
                   FUN=function(x){
                       do.call(
                           # Call Function grid_function with the arguments in
                           # a list
                           grid_function,
                           # force list type for the arguments
                           c(list(train_data_new), as.list(
                               # make the row to a named vector
                               unlist(x)
                           )
                           ))
                   }
)

l = resultdf1
final_output = rbindlist(l, fill = TRUE)

问题：我想在上面的代码中添加一个“进度条”。

我尝试了什么：我尝试如下：

library(doParallel)
library(future)

cl <- makePSOCKcluster(6) # 6 cpu cores out of 8

registerDoParallel(cl)

    pb <- progress_bar$new(total = 100)
    
    for (i in 1:100) {
    resultdf1 <- apply(DF_1,1, # 1 means rows
                       FUN=function(x){
                           do.call(
                               # Call Function grid_function2 with the arguments in
                               # a list
                               grid_function,
                               # force list type for the arguments
                               c(list(train_data_new), as.list(
                                   # make the row to a named vector
                                   unlist(x)
                               )
                               ))
                       }
    )
    
    l = resultdf1
    final_output = rbindlist(l, fill = TRUE)
    
    
     pb$tick()
      Sys.sleep(1 / 100)
    }

stopCluster(cl)

这似乎有效，但我不确定我是否正确执行了所有操作。有人可以告诉我我是否正确地做到了这一点？添加这个“进度条”是否有可能实际上会导致函数需要更多时间来运行？

谢谢

更新： @Serkan：这是完整的代码：

library(dplyr)
library(data.table)

results_table <- data.frame()

grid_function <- function(train_data, random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    
    
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    
    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)




#grid
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 =  seq(0,1,0.1)
split_2 =  seq(0,1,0.1)
split_3 =  seq(0,1,0.1)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)

#reduce the size of the grid for this example
DF_1 = DF_1[1:100,]

colnames(DF_1) <- c("random_1" , "random_2", "random_3",                     "random_4", "split_1", "split_2", "split_3")

train_data_new <- copy(train_data)


resultdf1 <- apply(DF_1,1, # 1 means rows
                   FUN=function(x){
                       do.call(
                           # Call Function grid_function2 with the arguments in
                           # a list
                           grid_function,
                           # force list type for the arguments
                           c(list(train_data_new), as.list(
                               # make the row to a named vector
                               unlist(x)
                           )
                           ))
                   }
)

l = resultdf1
final_output = rbindlist(l, fill = TRUE)

这是最终输出的样子：

head(final_output)
   random_1 random_2 random_3 random_4 split_1 split_2 split_3 b c total  a
1:       80       85       85       90       0       0       0 0 0     0 NA
2:       85       85       85       90       0       0       0 0 0     0 NA
3:       90       85       85       90       0       0       0 0 0     0 NA
4:       95       85       85       90       0       0       0 0 0     0 NA
5:      100       85       85       90       0       0       0 0 0     0 NA
6:       80       90       85       90       0       0       0 0 0     0 NA

【问题讨论】：

有什么理由相信你做错了？ “这看起来是否正确”并不是一个非常适合 Stack Overflow 的特定编程问题。你的代码要么做你想做的事，要么不做。如果您关心性能，请从分析您自己的代码开始：adv-r.hadley.nz/perf-measure.html。有很多方法可以给代码计时，看看它需要多长时间。您应该能够以这种方式回答您自己关于速度的问题。
@MrFlick：谢谢你的回复！我仍在学习编程——很多时候我的代码运行时不会出错，但我仍然非常怀疑它可能没有按预期运行。再次感谢！

标签： r function parallel-processing progress-bar cluster-computing

【解决方案1】：

我会在furrr:future_map 中选择.progress = TRUE，下面是R Script 的示例；

library(furrr)
library(magrittr)


plan(multisession, workers = 2)


1:10 %>% future_map(
        ~ Sys.sleep(2),
        .progress = TRUE
)

它会自动添加一个progress-bar，

Progress: ─────────────────────────────────────── 100%

没有您的data.frame 和特定功能，我无法复制您的问题来向您展示它是如何实现的，但是apply-family 很容易转换为map-family 功能。

【讨论】：

非常感谢您的回答！我更新了我的问题并从头到尾包含了完整的代码。你能告诉我如何在我的代码上实现这个答案吗？我不知道我会在你的答案中插入我的代码！非常感谢！
我不太明白？请记住包含您想要的输出，并且只包含您数据的一个子集。无需运行完整的R Script
感谢您的回复！请再检查一次。非常感谢！
@Serkan：有机会看吗？
是的，但你的代码太长了——所以我有点迷失了方向。请参阅此stackoverflow.com/questions/68479928/…。在这里，我展示了如何将for-loop 替换为map。如果它没有回答你的问题，那么减少你的问题并模仿它，那么我也可以向你展示:-)