R：函数只产生 1 行数据答案

【问题标题】：R: Function only Produces 1 Row of DataR：函数只产生 1 行数据
【发布时间】：2021-09-16 13:58:15
【问题描述】：

我正在使用 R。在之前的帖子 (R: Keeping the 5 Biggest Rows in a Table) 中，我生成了一些随机数据并编写了以下代码，该代码循环执行了一系列数据操作步骤并生成了一个包含结果的表（“final_results”）：

#load library
    library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
   
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

#keep the 5 biggest results (according to the "total" variable)
    final_results <- head(final_results[order(-total)], 5)
    
}

#view output (should only have 5 rows)

final_results

   iteration_number  random_1 random_2  random_3  random_4    split_1   split_2   split_3          a         b         c total
1:                3  81.02645 110.4645 116.42006 119.61718 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
2:                8 102.17487 117.1701  95.93786  96.80284 0.81599406 0.7785768 0.8593795 0.81300813 0.7795276 0.8586667 0.843
3:                2  92.31360 110.0762 106.46871 109.53428 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
4:                1  95.67371 111.8133  94.00313 102.05692 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
5:                4  90.35986 116.7089 114.15588 116.72312 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702

问题：现在，我正在尝试重新编写循环，以便在循环过程中：

对于每个独特的迭代：
“results_table”只保留与那些“总均值”最大值为 5 的迭代对应的行

例如

目标：对于上图的“results_table”，每个“黑框”代表一组迭代，每个“红框”显示“总均值”的值。为了防止“results_table”的大小随着每次迭代而增长，我只想保留与 5 个最大值相对应的行（在此示例中用红色框表示）。

我尝试使用以下代码将“results_table”和“final_table”合并为一个步骤：

#load library
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    
    results_table = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
    #keep the 5 biggest results (according to the "total" variable)
    results_table <- head(results_table[order(-total)], 5)
    
}

#view output (should only have 5 rows)

results_table

但这会导致错误并输出一个只有一行的表格：

Error in rbindlist(l, use.names, fill, idcol) : 
  Item 2 has 10 columns, inconsistent with item 1 which has 12 columns. To fill missing columns use fill=TRUE.

 #view output (should only have 5 rows)

  results_table
 
  iteration_number random_1 random_2 random_3 random_4   split_1   split_2   split_3         a         b         c total
1:                1 95.67371 111.8133 94.00313 102.0569 0.8404564 0.6882731 0.7749321 0.8205128 0.6870229 0.7734554  0.73

谁能告诉我如何解决这个问题？

谢谢

编辑

@Ronak Shah：这是你的意思吗？

#load library
    library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)

results_table <- vector('list', 10) 
####


for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
   
    
    results_table[[i]] <- final_table_2


    
}
Error in `[[<-.data.frame`(`*tmp*`, i, value = list(cat = c("a", "b",  : 
  replacement has 4 rows, data has 8 

#view output (should only have 5 rows)
res <- bind_rows(results_table)
final <- dcast(setDT(res), iteration_number + random_1 + random_2 + random_3 + 
               random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

Error in dcast.data.table(setDT(res), iteration_number + random_1 + random_2 +  : 
  Columns specified in formula can not be of type list
In addition: Warning message:
In setDT(res) :
  Some columns are a multi-column type (such as a matrix column): [1]. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column.

#view final result
final

Error: object 'final' not found

【问题讨论】：

这个错误可以用fill = TRUE in rbindlist 纠正
results_table <- rbind(results_table, final_table_2, fill = TRUE) 它是rbind，因为对象是一个data.table
您正在将setDT(results_table) 中的data.frame 转换为data.table，因此它使用rbind.data.table，其中包括fill = TRUE。检查methods("rbind") [1] rbind.data.frame rbind.data.table* rbind.grouped_df* ...
@akrun：谢谢！我会继续努力...
如果您在 for 循环中的赋值的 LHS 上没有索引，您将覆盖除这些命名对象的最后一个值之外的所有值。

标签： r loops dplyr data-manipulation

【解决方案1】：

您可以将其初始化为列表，而不是将 results_table 初始化为空数据框。

library(dplyr)
library(data.table)

results_table <- vector('list', 10)

从for 循环中删除dcast 行并将final_table_2 保存在列表中。

....
....
results_table[[i]] <- final_table_2

} #for loop end

在循环之后，您可以组合结果并使用dcast 进行整形。

res <- bind_rows(results_table)
final <- dcast(setDT(res), iteration_number + random_1 + random_2 + random_3 + 
               random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

【讨论】：

感谢您的回答！我试过了，但我遇到了一个错误 - 我在我的问题的编辑中发布了我对你的答案的理解。如果你有时间，你可以看看它吗？谢谢！
@stats555 也删除此行。 results_table <- rbind(results_table, final_table_2)。抱歉，我在帖子中没有说清楚。
谢谢！你认为这会让代码更容易在计算机上运行吗？
我认为是因为 1）我们没有在循环中增加对象，我们现在使用 list。 2）我们也只使用一次dcast，所以这也应该有所帮助。
非常感谢您的帮助！如果你有时间，你能看看这个问题吗？ stackoverflow.com/questions/68264265/…