【问题标题】:R replace multiple variables in a string using a lookup tableR使用查找表替换字符串中的多个变量
【发布时间】:2021-06-03 10:11:02
【问题描述】:

我正在使用 2 个数据框作为示例:

DF1 是主表,其中有一列包含通常有变量的方程

DF1 <- read.table(text = 
"Unit     Year  Equation
 1        2020  'x+2*y'
 1        2021  'x+2*y'
 1        2022  'x+2*y'
 2        2020  'x'
 3        2020  'max(y^2, y+2*z)'
 3        2021  'max(y^2, y+2*z)'
 4        2020  '5'
 5        2020  '(x/y)+z'",
header = TRUE, stringsAsFactors = FALSE)

DF2 是参考或查找表,它为给定年份的变量分配值

DF2 <- read.table(text = 
"Year  Variable  Value
 2020  x         10
 2021  x         15.5
 2022  x         50
 2020  y         1
 2021  y         2
 2022  y         3.5
 2020  z         20
 2021  z         34
 2022  z         11",
header = TRUE, stringsAsFactors = FALSE)

我们的目标是匹配两个数据帧之间的变量和年份,以便在应用 eval(parse(text=Equation)) 或类似的东西后可以得出下表:

Unit     Year  Equation
 1       2020  12
 1       2021  19.5
 1       2022  57
 2       2020  10
 3       2020  41
 3       2021  70
 4       2020  5
 5       2020  30

目前我正在使用 for 和 if-else 循环来匹配年份并逐行替换变量。它工作正常,但运行它变得非常慢,因为 DF1 可能包含数千行和几个变量。我可以使用其他函数来实现相同的输出吗?

编辑 - 添加提到的循环以帮助比较:

library(dplyr)
library(reshape2)
DF2 = dcast(DF2, Year~Variable, value.var='Value')

  #Adding in this line to avoid replacing "x" in "max":
  DF1$Equation = gsub("max","placeholder",DF1$Equation)

for(i in 1:nrow(DF1)) {
  for (j in 1:nrow(DF2)) {
    if (DF1[i,]$Year==DF2[j,]$Year) {
      #Every variable would be declared here:
      DF1[i,]$Equation = gsub("x",DF2[j,]$x,DF1[i,]$Equation)
      DF1[i,]$Equation = gsub("y",DF2[j,]$y,DF1[i,]$Equation)
      DF1[i,]$Equation = gsub("z",DF2[j,]$z,DF1[i,]$Equation)
    }
  }
}
  #Returning the function:
  DF1$Equation = gsub("placeholder","max",DF1$Equation)

Results_DF1 = DF1 %>% rowwise() %>%
              mutate(Equation = eval(parse(text=Equation)))

【问题讨论】:

  • 您能否编辑 DF1 以获得正确的 R 方程,即 1+2x 无效。但是 1+2*x 是有效的
  • 感谢您指出这一点!编辑以修复 DF1
  • 你更喜欢base R还是tidyverse?
  • 您能否为您的方法提供 if-else 循环以将其包含在基准比较中?

标签: r


【解决方案1】:

你可以这样做:

left_join(DF1, DF2, 'Year') %>%
  pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
  rowwise() %>%
  mutate(a = eval(parse(text = Equation)))

   Unit  Year Equation            x     y     z     a
  <int> <int> <chr>           <dbl> <dbl> <dbl> <dbl>
1     1  2020 x+2*y            10     1      20  12  
2     1  2021 x+2*y            15.5   2      34  19.5
3     1  2022 x+2*y            50     3.5    11  57  
4     2  2020 x                10     1      20  10  
5     3  2020 max(y^2, y+2*z)  10     1      20  41  
6     3  2021 max(y^2, y+2*z)  15.5   2      34  70  
7     4  2020 5                10     1      20   5  
8     5  2020 (x/y)+z          10     1      20  30  

【讨论】:

    【解决方案2】:

    我刚刚注意到您已经编辑了您的DF1,所以我使用它来代替,我不需要进行编辑:

    library(dplyr)
    library(rlang)
    
    DF1 %>%
      left_join(DF2 %>%
                  pivot_wider(names_from = Variable, values_from = Value), 
                by = "Year") %>%
      rowwise() %>%
      mutate(Result = eval(parse_expr(Equation)))
    
    # A tibble: 8 x 7
    # Rowwise: 
       Unit  Year Equation            x     y     z Result
      <int> <int> <chr>           <dbl> <dbl> <dbl>  <dbl>
    1     1  2020 x+2*y            10     1      20   12  
    2     1  2021 x+2*y            15.5   2      34   19.5
    3     1  2022 x+2*y            50     3.5    11   57  
    4     2  2020 x                10     1      20   10  
    5     3  2020 max(y^2, y+2*z)  10     1      20   41  
    6     3  2021 max(y^2, y+2*z)  15.5   2      34   70  
    7     4  2020 5                10     1      20    5  
    8     5  2020 (x/y)+z          10     1      20   30  
    

    【讨论】:

      【解决方案3】:

      要提供baseR 解决方案,您可以执行以下操作以获得所需的结果数据框:

        df1YearRows <- split(1:NROW(DF1),DF1$Year)
        df2YearRows <- split(1:NROW(DF2), DF2$Year)
        
        equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
          fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
          fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
          equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
          sapply(equationToEvaluate, function(eqTxt) {
            eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
          })
        }),names(df1YearRows))
        
        equationValueColumn <- Reduce(
          f = function(oldColumn,newYearId) {
            oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
            oldColumn
          }, 
          x = names(equationValues), 
          init = numeric(NROW(DF1))
        )
        
        resultDf <- data.frame(Year = DF1$Year, 
                               Equation = DF1$Equation, 
                               Equationvalue = equationValueColumn)
        
        resultDf
      

      导致

        Year        Equation Equationvalue
      1 2020           x+2*y          12.0
      2 2021           x+2*y          19.5
      3 2022           x+2*y          57.0
      4 2020               x          10.0
      5 2020 max(y^2, y+2*z)          41.0
      6 2021 max(y^2, y+2*z)          70.0
      7 2020               5           5.0
      8 2020         (x/y)+z          30.0
      

      【讨论】:

        【解决方案4】:

        base 中的一种可能性。首先使用setNamessplitDF2 转换为每年的命名列表。然后eval Equations 与特定年份的 Values 使用mapply

        L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
        x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
                    DF1$Equation, match(DF1$Year, names(L2)))
        cbind(DF1[1:2], Equation=x)
        #  Unit Year Equation
        #1    1 2020     12.0
        #2    1 2021     19.5
        #3    1 2022     57.0
        #4    2 2020     10.0
        #5    3 2020     41.0
        #6    3 2021     70.0
        #7    4 2020      5.0
        #8    5 2020     30.0
        

        如果有许多相同的方程,它可以带来一些改进,只解析 unique

        L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
        E <- unique(DF1$Equation)
        ES <- lapply(E, str2lang)
        x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
                    match(DF1$Equation, E), match(DF1$Year, names(L2)))
        cbind(DF1[1:2], Equation=x)
        

        并在数字和变量之间使用没有* 的原始方程,使用gsub

        DF1a <- read.table(text = 
        "Unit     Year  Equation
         1        2020  'x+2y'
         1        2021  'x+2y'
         1        2022  'x+2y'
         2        2020  'x'
         3        2020  'max(y^2, y+2z)'
         3        2021  'max(y^2, y+2z)'
         4        2020  '5'
         5        2020  '(x/y)+z'",
        header = TRUE, stringsAsFactors = FALSE)
        
        x <- mapply(function(e, y) {eval(parse(text=e), L2[[y]])},
               gsub("(\\d)([a-z])", "\\1*\\2", DF1a$Equation), as.character(DF1a$Year))
        

        比较时间和内存使用情况。

        years <- 20
        units <- 1000
        DF1 <- expand.grid( stringsAsFactors = FALSE, Unit = 1:units, Year = 1:years,
                   Equation = c("x+2*y", "x", "max(y^2, y+2*z)", "5", "(x/y)+z"))
        DF2 <- expand.grid(Year = 1:years, Variable = c("x", "y", "z"))
        set.seed(42)
        DF2$Value <- rnorm(nrow(DF2))
        
        library(dplyr)
        library(rlang)
        library(tidyr)
        
        onyambu <- function() {
          left_join(DF1, DF2, 'Year') %>%
          pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
          rowwise() %>%
            mutate(a = eval(parse(text = Equation)))
        }
        
        anoushiravan <- function() {
          DF1 %>%
          left_join(DF2 %>%
                      pivot_wider(names_from = Variable, values_from = Value), 
                    by = "Year") %>%
          rowwise() %>%
          mutate(Result = eval(parse_expr(Equation)))
        }
        
        jonas <- function() {
          df1YearRows <- split(1:NROW(DF1),DF1$Year)
          df2YearRows <- split(1:NROW(DF2), DF2$Year)
          
          equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
            fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
            fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
            equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
            sapply(equationToEvaluate, function(eqTxt) {
              eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
            })
          }),names(df1YearRows))
          
          equationValueColumn <- Reduce(
            f = function(oldColumn,newYearId) {
              oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
              oldColumn
            }, 
            x = names(equationValues), 
            init = numeric(NROW(DF1))
          )
          
          resultDf <- data.frame(Year = DF1$Year, 
                                 Equation = DF1$Equation, 
                                 Equationvalue = equationValueColumn)
          
          resultDf
        }
        
        GKi <- function() {
          L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
          x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
                      DF1$Equation, match(DF1$Year, names(L2)))
          cbind(DF1[1:2], Equation=x)
        }
        
        GKi2 <- function() {
          L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
          E <- unique(DF1$Equation)
          ES <- lapply(E, str2lang)
          x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
                      match(DF1$Equation, E), match(DF1$Year, names(L2)))
          cbind(DF1[1:2], Equation=x)
        }
        
        bench::mark(check = FALSE
                  , onyambu()
                  , anoushiravan()
                  , jonas()
                  , GKi()
                  , GKi2()
                    )
        #  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
        #  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
        #1 onyambu()         4.29s    4.29s     0.233  138.04MB     6.06     1    26
        #2 anoushiravan()    7.12s    7.12s     0.140  100.23MB     3.09     1    22
        #3 jonas()           3.49s    3.49s     0.287   89.98MB     3.44     1    12
        #4 GKi()          863.95ms 863.95ms     1.16      6.6MB     3.47     1     3
        #5 GKi2()         324.69ms 325.58ms     3.07     7.49MB     4.61     2     3
        

        【讨论】:

          猜你喜欢
          • 2019-03-12
          • 1970-01-01
          • 1970-01-01
          • 2013-05-17
          • 2015-05-11
          • 1970-01-01
          • 1970-01-01
          • 2014-10-28
          • 1970-01-01
          相关资源
          最近更新 更多