【问题标题】:R: (Selectively) Trimming the Results of a Loop(R:选择性地修剪循环的结果)
【发布时间】:2021-05-05 05:18:16
【问题描述】:

我正在使用 R 编程语言。我正在学习如何迭代地循环执行一个过程(例如,生成一些随机数据并拟合不同的决策树)。在上一个问题 (R: Saving the Results of a Loop) 中,我学习了如何生成随机数据、拟合不同的决策树并记录它们的准确率:

library(caret)
library(rpart)

# Generate data: three numeric predictors, a random two-level label, and a
# running row index (group_1) from which the target classes are derived.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
c <- rnorm(1000, 5, 10)
group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
group_1 <- 1:1000

# Put data into a frame
d <- data.frame(a, b, c, group, group_1)
d$group <- as.factor(d$group)

e <- d
vec1 <- sample(200:300, 5)
vec2 <- sample(400:500, 5)
vec3 <- sample(700:800, 5)
z <- 0

# Result table: one row per (vec1, vec2, vec3) combination.
# expand.grid() varies its FIRST argument fastest, while the loops below vary
# vec3 (the innermost index k) fastest. Building the grid as (vec3, vec2, vec1)
# and then restoring the column order makes row z describe exactly the
# combination evaluated on the z-th iteration.
df <- expand.grid(Var3 = vec3, Var2 = vec2, Var1 = vec1)
df <- df[, c("Var1", "Var2", "Var3")]
df$Accuracy <- NA_real_

# Resampling setup does not depend on the loop indices, so build it once:
# 2-fold cross-validation, repeated once.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 1
)

for (i in seq_along(vec1)) {
  for (j in seq_along(vec2)) {
    for (k in seq_along(vec3)) {
      # d <- e
      # Bin the row index into classes 0..3 at the three cut points.
      # findInterval() uses half-open bins [cut, next_cut), so a row whose
      # index equals a cut point lands in the bin that starts there instead
      # of falling through every strict comparison into class 3.
      cuts <- c(vec1[i], vec2[j], vec3[k])
      d$group_2 <- as.factor(findInterval(d$group_1, cuts))

      # Column 5 is group_1; drop it so the tree cannot read the class
      # straight off the row index.
      model_data <- d[, -5]

      TreeFit <- train(group_2 ~ .,
        data = model_data,
        method = "rpart",
        trControl = fitControl
      )

      pred <- predict(TreeFit, model_data)

      # confusionMatrix() expects predictions first and the truth second.
      con <- confusionMatrix(data = pred, reference = d$group_2)

      # Update results into table
      z <- z + 1
      df$Accuracy[z] <- con$overall[["Accuracy"]]
    }
  }
}

# View the final results
head(df)

数据框“df”包含最终结果。我担心的是:如果你想多次迭代这个循环,“df”的大小会变得非常大。假设我只想保留“df”的“前 20 行”(基于 df$Accuracy 的降序值)。我可以这样做:

# Sort "df" by descending values of "Accuracy".
df_sort <- df[order(-df$Accuracy), ]

# Keep at most the first 20 rows. Unlike df_sort[1:20, ], head() does not
# pad the result with all-NA rows when df has fewer than 20 rows.
df_final <- head(df_sort, 20)

但我担心计算机内存的限制可能会阻止创建“df”(对于大量迭代)。

我的问题:有没有办法阻止“df”超过 20 行?例如

  1. 填充“df”的前 20 行
  2. 如果第 21 行的准确率不高于前 20 行中最小的准确率,则丢弃第 21 行
  3. 如果第 21 行的准确率大于前 20 行中最小的准确率,则保留第 21 行并删除准确率最小的那一行

这样,“df”的大小永远不会超过 20 行。

有人可以告诉我怎么做吗?

谢谢

【问题讨论】:

    标签: r loops for-loop iteration decision-tree


    【解决方案1】:

    你可以这样实现逻辑:

    library(caret)
    library(rpart)

    # Generate data: three numeric predictors, a random two-level label, and a
    # running row index (group_1) from which the target classes are derived.
    a <- rnorm(1000, 10, 10)
    b <- rnorm(1000, 10, 5)
    c <- rnorm(1000, 5, 10)
    group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
    group_1 <- 1:1000

    # Put data into a frame
    d <- data.frame(a, b, c, group, group_1)
    d$group <- as.factor(d$group)

    e <- d
    vec1 <- sample(200:300, 5)
    vec2 <- sample(400:500, 5)
    vec3 <- sample(700:800, 5)
    z <- 0

    # Maximum number of rows to retain. The first `keep_n` results are
    # collected in a preallocated list and bound into a data frame once.
    keep_n <- 20
    result <- vector("list", keep_n)

    # Resampling setup does not depend on the loop indices, so build it once:
    # 2-fold cross-validation, repeated once.
    fitControl <- trainControl(
      method = "repeatedcv",
      number = 2,
      repeats = 1
    )

    for (i in seq_along(vec1)) {
      for (j in seq_along(vec2)) {
        for (k in seq_along(vec3)) {
          # d <- e
          # Bin the row index into classes 0..3 at the three cut points.
          # findInterval() uses half-open bins [cut, next_cut), so a row whose
          # index equals a cut point lands in the bin that starts there
          # instead of falling through every strict comparison into class 3.
          cuts <- c(vec1[i], vec2[j], vec3[k])
          d$group_2 <- as.factor(findInterval(d$group_1, cuts))

          # Column 5 is group_1; drop it so the tree cannot read the class
          # straight off the row index.
          model_data <- d[, -5]

          TreeFit <- train(group_2 ~ .,
            data = model_data,
            method = "rpart",
            trControl = fitControl
          )

          pred <- predict(TreeFit, model_data)

          # confusionMatrix() expects predictions first and the truth second.
          con <- confusionMatrix(data = pred, reference = d$group_2)
          acc <- con$overall[["Accuracy"]]

          z <- z + 1
          # Record the cut points actually used on this iteration; vec3 is
          # indexed by k (the innermost loop), not by j.
          new_df <- data.frame(
            vec1 = vec1[i],
            vec2 = vec2[j],
            vec3 = vec3[k],
            Accuracy = acc
          )

          if (z <= keep_n) {
            # Until the buffer is full, just store the row.
            result[[z]] <- new_df
          } else {
            # First overflow: turn the list of rows into a data frame.
            if (z == keep_n + 1) {
              result <- do.call(rbind, result)
            }
            # Replace the currently worst row only if the new one beats it,
            # so the buffer never grows beyond keep_n rows.
            worst <- which.min(result$Accuracy)
            if (result$Accuracy[worst] < acc) {
              result[worst, ] <- new_df
            }
          }
        }
      }
    }

    # If the loop ran keep_n times or fewer, `result` is still a list; bind
    # the rows that were filled in.
    if (!is.data.frame(result)) {
      result <- do.call(rbind, Filter(Negate(is.null), result))
    }

    # Sort once at the end so the returned table is in descending accuracy
    # order, including any row inserted on the last iterations.
    result <- result[order(-result$Accuracy), ]
    rownames(result) <- NULL
    

    这应该会返回类似的输出:

    # Print the retained rows. The table below is from one sample run; the
    # visible script draws its data and cut points with rnorm()/sample() and
    # sets no seed, so the numbers will differ from run to run.
    result
    #           vec1 vec2 vec3 Accuracy
    #Accuracy2   258  402  706    0.376
    #Accuracy4   258  402  706    0.376
    #Accuracy9   200  402  706    0.376
    #Accuracy15  214  402  706    0.376
    #Accuracy16  236  402  706    0.376
    #Accuracy18  207  402  706    0.376
    #Accuracy11  258  414  779    0.364
    #Accuracy12  200  414  779    0.364
    #Accuracy6   214  414  779    0.364
    #Accuracy13  236  414  779    0.364
    #Accuracy10  200  402  706    0.360
    #Accuracy17  214  402  706    0.360
    #Accuracy3   236  402  706    0.360
    #Accuracy5   207  402  706    0.360
    #Accuracy19  258  414  779    0.348
    #Accuracy8   200  414  779    0.348
    #Accuracy7   214  414  779    0.348
    #Accuracy14  236  414  779    0.348
    #Accuracy    207  414  779    0.348
    #Accuracy1   207  414  779    0.364
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2023-01-14
      • 1970-01-01
      • 2022-01-20
      • 1970-01-01
      • 2021-05-04
      • 2018-10-10
      • 1970-01-01
      相关资源
      最近更新 更多