具有大型数据集的嵌套 for 循环答案

【问题标题】：nested for loops with large data sets具有大型数据集的嵌套 for 循环
【发布时间】：2016-05-22 03:12:03
【问题描述】：

我正在尝试使用一系列嵌套的 for-loops 和一个大型数据集：250,000 行和几百列来评估一系列有点复杂的 if 语句。

下面嵌套的for-loops 做我想做的事。不幸的是，我估计我的计算机需要 5 天以上才能运行此代码。请提出一种提高执行速度的方法。

我正在展示我正在尝试的简化版本。该示例仍然有些复杂，但我认为我不应该进一步简化它以传达if 语句的复杂性。

first.v <-  1
last.v  <- 10

all.data <- read.table(text = '
   est.d   c1   c2   c3   c4   c5   c6   c7   c8   c9  c10  first.p last.p first.c last.c my.two
     5      4    3   -1    9    1    5    2    0    2    5      3      8        1     2        8
     6      4    2   -2    8    1    4    2    0    4    4      2     10        1     2       10
     7      4    1   -3    7    2    4    2    0    6    3      1      9        1     2        9
     4      4    3   -4    6    2    3    2    0    8    6      1      2        1     1       NA
     9      5    2   -5    5    3    3    2    0    8    1      4     10        1     1       NA
     1      5    1   -6    4    3    2    2    0    8    9      1      9        2     2        1
    10      5    3   -7    3    4    2    2    0    6    0      8      9        1     1       NA
     8      4    2   -8    2    4    1    2    0    4    2      3      4        1     1       NA
     2      4    1    9    1    5    1    2    6    2    8      4      5        2     2        5
', header = TRUE, stringsAsFactors = FALSE)

my.cov <- all.data[, paste0('c', first.v : last.v)]

my.cov.again <- matrix(0, nrow = nrow(my.cov), ncol = ncol(my.cov))

for(i in 1:nrow(my.cov.again)) {
     for(j in 1:ncol(my.cov.again)) {

          if((j >= all.data$first.p[i]                       &
                   all.data$first.c[i] == 1               )  |

             (j >= all.data$first.p[i]                       &
                   all.data$first.c[i] == 1                  &
                   !is.na(all.data$my.two[i])                &
              j <= (all.data$my.two[i] + 1)               )  |

             (j >= all.data$first.p[i]                       &
              j <= all.data$last.p[i]                        &
                   all.data$first.c[i] == 1                  &
                   all.data$last.c[i]  == 1               )  |

             (j == all.data$first.p[i]                       &
                   all.data$first.c[i] == 2)) {my.cov.again[i,j] = my.cov[i,j] + 4}

     }
}

my.cov.again

【问题讨论】：

标签： r performance for-loop

【解决方案1】：

我想出的解决方案是将matrices 转换为vectors。然后使用ifelse 语句。此解决方案将预计执行时间从大约 5 天以上缩短到 6 分钟。

my.cov <- all.data[, paste0('c', first.v : last.v)]

j          <- rep(1:ncol(my.cov), nrow(all.data))

# convert matrix to vector    
my.cov <- c(t(my.cov))

first.p <- rep(all.data$first.p,  each = ((last.v - first.v) + 1))
last.p  <- rep(all.data$last.p,   each = ((last.v - first.v) + 1))

first.c <- rep(all.data$first.c,  each = ((last.v - first.v) + 1))
last.c  <- rep(all.data$last.c,   each = ((last.v - first.v) + 1))

my.two  <- rep(all.data$my.two,   each = ((last.v - first.v) + 1))
my.three <- rep(all.data$my.three, each = ((last.v - first.v) + 1))
my.four <- rep(all.data$my.four,  each = ((last.v - first.v) + 1))

est.d <- rep(all.data$est.d,    each = ((last.v - first.v) + 1))

add.this <- sample(c(-3, -2, -1, 0, 1, 2, 3), nrow(all.data), replace =  TRUE)
add.this <- rep(add.this, each = ((last.v - first.v) + 1))

my.cov.again.b <- rep(0, length(my.cov))
new.est.d <- rep(0, length(my.cov))

my.cov.again.b <- ifelse((j >= first.p                       &
                               first.c == 1               )  |

                         (j >= first.p                       &
                               first.c == 1                  &
                               !is.na(my.two)                &
                          j <= (my.two + 1)               )  |

                         (j >= first.p                       &
                          j <= last.p                        &
                               first.c == 1                  &
                               last.c  == 1               )  |

                         (j == first.p                       &
                               first.c == 2), (my.cov + 4), my.cov.again.b)

my.cov.again.b

my.cov.again.b <- matrix(my.cov.again.b, nrow = nrow(all.data), byrow = TRUE)
head(my.cov.again.b)

all.equal(my.cov.again, my.cov.again.b)

【讨论】：