【问题标题】:Creating a panel data frame based on two other dataframes基于其他两个数据框创建面板数据框
【发布时间】:2016-09-11 04:19:50
【问题描述】:

我有两个数据框:

ANNUALSALARY <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame")

和:

WEEKLYPRODUCTIVITY <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))

我有兴趣为每个FIRM 创建一个从SLY_ADMINSLY_MKTSLY_FIN 中提取最小值的数据框。然后从PR_ADMINPR_MKTPR_FIN以及Z_ADMINZ_MKTZ_FIN中取出相应的值。例如如果 SLY_MKT 是 FIRM A 的最小值,那么它会在 5 周内返回 PR_MKTZ_MKT。面板数据框看起来像这样(我手动创建的):

REQUIRED <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR = c(1, 5, 4, 3, 2, 5, 0, 1, 2, 3, 5, 4, 3, 2, 1), MIN_SLY = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.003, 0.003, 0.003, 0.003, 0.003, 0.03, 0.03, 0.03, 0.03, 0.03), SLY_DEPT = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("SLY_ADMIN", "SLY_FIN", "SLY_MKT"), class = "factor"), Z = c(1, 2, 3, 4, 5, 4, 3, 2, 1, 9, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR", "MIN_SLY", "SLY_DEPT", "Z"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))

请帮忙。谢谢

【问题讨论】:

    标签: r dataframe panel-data


    【解决方案1】:

    这是一个棘手的问题!我提出了一个围绕 max.col()merge() 和索引矩阵构建的基本 R 解决方案。

    请注意,为了简洁起见,我使用了变量名 salprod

    sufs <- c('ADMIN','MKT','FIN');
    slys <- paste0('SLY_',sufs);
    mins <- max.col(-sal[slys]);
    res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)]));
    res.sufs <- sub('.*_','',res$SLY_DEPT);
    for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; };
    
    res;
    ##    FIRM WEEKS  SLY_DEPT MIN_SLY PR Z
    ## 1     A     1 SLY_ADMIN   0.100  1 1
    ## 2     A     2 SLY_ADMIN   0.100  5 2
    ## 3     A     3 SLY_ADMIN   0.100  4 3
    ## 4     A     4 SLY_ADMIN   0.100  3 4
    ## 5     A     5 SLY_ADMIN   0.100  2 5
    ## 6     B     1   SLY_MKT   0.003  5 4
    ## 7     B     2   SLY_MKT   0.003  0 3
    ## 8     B     3   SLY_MKT   0.003  1 2
    ## 9     B     4   SLY_MKT   0.003  2 1
    ## 10    B     5   SLY_MKT   0.003  3 9
    ## 11    C     1   SLY_FIN   0.030  5 1
    ## 12    C     2   SLY_FIN   0.030  4 2
    ## 13    C     3   SLY_FIN   0.030  3 3
    ## 14    C     4   SLY_FIN   0.030  2 4
    ## 15    C     5   SLY_FIN   0.030  1 5
    

    基准测试

    ## libraries
    library(data.table);
    library(microbenchmark);
    
    ## define inputs, including data.table instances for akrun and maximus solutions
    sal <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame");
    prod <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"));
    sal.dt <- as.data.table(sal);
    prod.dt <- as.data.table(prod);
    
    ## solutions
    bgoldst <- function(sal,prod) { sufs <- c('ADMIN','MKT','FIN'); slys <- paste0('SLY_',sufs); mins <- max.col(-sal[slys]); res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); res.sufs <- sub('.*_','',res$SLY_DEPT); for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; res; };
    akrun <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { i1 <- max.col(-1*ANNUALSALARY[,-1,with=F]); dN <- data.table(FIRM= ANNUALSALARY$FIRM, MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], SLY_DEPT = names(ANNUALSALARY)[-1][i1]); dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)][, gr1 := cumsum(WEEKS==1), FIRM][]; res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"][gr1==i1]; res[,!names(res)%in%c('i1','variable','gr1'),with=F]; };
    maximus <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']; sly.cols <- grep('^SLY_', names(res), value = TRUE); res[, `:=` (MIN_SLY = min(.SD), SLY_DEPT = sly.cols[which.min(.SD)]), by = 1:nrow(res), .SDcols = sly.cols][]; res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), measure.vars = patterns('^PR_','^Z_'), value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), by = .(FIRM,WEEKS)][, variable := NULL]; res2 <- res2[!duplicated(res2)]; };
    
    ## proofs of equivalence
    ex <- bgoldst(sal,prod); co <- names(ex);
    identical(ex,transform(as.data.frame(akrun(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
    ## [1] TRUE
    identical(ex,transform(as.data.frame(maximus(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
    ## [1] TRUE
    
    ## benchmark
    microbenchmark(bgoldst(sal,prod),akrun(sal.dt,prod.dt),maximus(sal.dt,prod.dt));
    ## Unit: milliseconds
    ##                      expr      min       lq     mean   median       uq       max neval
    ##        bgoldst(sal, prod) 1.639193 1.730070 1.883285 1.807047 1.881031  3.230917   100
    ##    akrun(sal.dt, prod.dt) 6.392125 6.666251 7.744077 6.901033 7.230752 53.621663   100
    ##  maximus(sal.dt, prod.dt) 5.002254 5.229979 5.853681 5.423492 6.034609 12.182544   100
    

    【讨论】:

    • 感谢基础 R 解决方案。当数据表与 plm 包一起使用时,通常会有什么问题吗?这个问题我想了很久。
    • 不客气。我从来没有使用过 plm 包,所以我问错人了。你可以试试问akrun,他对R非常了解。
    • 不错的基础 R 解决方案!
    • @bgoldst 请看看这个类似的问题:stackoverflow.com/questions/37293841/…
    【解决方案2】:

    我们可以使用data.table。使用max.col 获取“ANNUALSALARY”中数值列最​​小值的索引。然后,我们将 'data.frame' 转换为 'data.table' 和 melt 将它从 'wide' 转换为 'long' 格式,得到 "MIN_SLY" 和 "S

    library(data.table)
    i1 <- max.col(-1*ANNUALSALARY[-1])
    dN <- melt(setDT(ANNUALSALARY), id.var = "FIRM", value.name = "MIN_SLY", 
       variable.name = "SLY_DEPT")[ , .SD[which.min(MIN_SLY)], by = FIRM]
    setDT(WEEKLYPRODUCTIVITY)
    

    或者代替melting,我们可以使用'i1'创建'data.table'

    dN <- data.table(FIRM= ANNUALSALARY$FIRM, 
                    MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], 
                    SLY_DEPT = names(ANNUALSALARY)[-1][i1])
    

    然后,我们根据列名中的patternsjoin 'dN' 由'WEEKLYPRODUCTIVITY' 和melt 转换为'long' 格式。我们order by 'FIRM', 'variable', 'WEEKS',根据“WEEKS”值创建分组变量('gr1'),按'FIRM'分组。

    dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), 
        value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)
           ][, gr1 := cumsum(WEEKS==1), FIRM][]
    

    最后,我们加入使用 'i1'、on "FIRM" 创建的 data.table',对 'gr1' 等于 'i1' 的行进行子集化,然后选择感兴趣的列。

    res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"
                ][gr1==i1][,names(REQUIRED), with = FALSE]
    
    all.equal(as.data.frame(res), REQUIRED, check.attributes=FALSE)
    #[1] TRUE
    res
    #    FIRM WEEKS PR MIN_SLY  SLY_DEPT Z
    # 1:    A     1  1   0.100 SLY_ADMIN 1
    # 2:    A     2  5   0.100 SLY_ADMIN 2
    # 3:    A     3  4   0.100 SLY_ADMIN 3
    # 4:    A     4  3   0.100 SLY_ADMIN 4
    # 5:    A     5  2   0.100 SLY_ADMIN 5
    # 6:    B     1  5   0.003   SLY_MKT 4
    # 7:    B     2  0   0.003   SLY_MKT 3
    # 8:    B     3  1   0.003   SLY_MKT 2
    # 9:    B     4  2   0.003   SLY_MKT 1
    #10:    B     5  3   0.003   SLY_MKT 9
    #11:    C     1  5   0.030   SLY_FIN 1
    #12:    C     2  4   0.030   SLY_FIN 2
    #13:    C     3  3   0.030   SLY_FIN 3
    #14:    C     4  2   0.030   SLY_FIN 4
    #15:    C     5  1   0.030   SLY_FIN 5
    

    【讨论】:

    • 当我运行 dN2 行时:[.data.table(dN, WEEKLYPRODUCTIVITY, on = "FIRM") 中的错误:逻辑错误。 i 不是 data.table,但提供了 'on' 参数。
    • @PolarBear 我没有收到任何错误。我用data.table_1.9.6
    【解决方案3】:

    另一种方法,但也使用data.table 包:

    library(data.table)
    # convert the dataframes to datatables (which is an enhanced form of dataframe)
    setDT(ANNUALSALARY)
    setDT(WEEKLYPRODUCTIVITY)
    
    # join them on 'FIRM'
    res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']
    # create a convenience vector with the columnnames starting with 'SLY_
    sly.cols <- grep('^SLY_', names(res), value = TRUE)
    
    # create the 'MIN_SLY' & 'SLY_DEPT' columns
    res[, `:=` (MIN_SLY = min(.SD),
                SLY_DEPT = sly.cols[which.min(.SD)]), 
        by = 1:nrow(res), .SDcols = sly.cols][]
    
    # melt it in log format and create the 'PR' & 'Z' column
    res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), 
                 measure.vars = patterns('^PR_','^Z_'),
                 value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]
                                           ][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable],
                                                     Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), 
                                             by = .(FIRM,WEEKS)
                                             ][, variable := NULL]
    
    # removing the duplicates
    res2 <- res2[!duplicated(res2)]
    

    导致:

    > res2
        FIRM WEEKS MIN_SLY  SLY_DEPT PR Z
     1:    A     1   0.100 SLY_ADMIN  1 1
     2:    A     2   0.100 SLY_ADMIN  5 2
     3:    A     3   0.100 SLY_ADMIN  4 3
     4:    A     4   0.100 SLY_ADMIN  3 4
     5:    A     5   0.100 SLY_ADMIN  2 5
     6:    B     1   0.003   SLY_MKT  5 4
     7:    B     2   0.003   SLY_MKT  0 3
     8:    B     3   0.003   SLY_MKT  1 2
     9:    B     4   0.003   SLY_MKT  2 1
    10:    B     5   0.003   SLY_MKT  3 9
    11:    C     1   0.030   SLY_FIN  5 1
    12:    C     2   0.030   SLY_FIN  4 2
    13:    C     3   0.030   SLY_FIN  3 3
    14:    C     4   0.030   SLY_FIN  2 4
    15:    C     5   0.030   SLY_FIN  1 5
    

    【讨论】:

    • 谢谢。这很漂亮,而且更简单。
    • 您知道数据表与 plm 包一起使用时的任何已知问题吗?
    • @PolarBear 我以前从未使用过那个包。你遇到过问题吗?如果是这样,最好提出一个新问题。
    • @PolarBear 如果您认为该问题不是重复的:我可以重新打开它,但您必须先取消删除它。
    猜你喜欢
    • 1970-01-01
    • 2023-03-22
    • 2019-11-02
    • 2011-05-01
    • 2019-07-03
    • 2021-05-12
    • 1970-01-01
    • 2014-01-27
    • 1970-01-01
    相关资源
    最近更新 更多