【问题标题】:Transforming a Dataframe into a matrix keeping one column as attribute [duplicate]将数据框转换为矩阵,将一列作为属性[重复]
【发布时间】:2016-09-27 17:52:16
【问题描述】:

我有一个如下的数据框:

Destination    User     User_Price 
     A          a           5
     A          b           4
     B          c           6
     B          a           5
     C          b           4
     C          d           7

我想将其转换为一个矩阵,显示用户点击的目的地,如下所示:

   User    User_Price    A    B    C    
    a          5         1    1    0
    b          4         1    0    1
    c          6         0    1    0
    d          7         0    0    1

【问题讨论】:

标签: r matrix dataframe


【解决方案1】:

这看起来非常类似于正常的重塑操作,除了需要一些需要几行代码的特质来实现在基础r中。

首先,为参考和比较目的,这是最少的reshape()呼叫产生的东西:
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F);
reshape(df,dir='w',idvar='User',timevar='Destination');
##   User User_Price.A User_Price.B User_Price.C
## 1    a            5            5           NA
## 2    b            4           NA            4
## 3    c           NA            6           NA
## 6    d           NA           NA            7

显然,在我们到达所需的输出之前必须解决的几个问题:

  • 我们必须从多个加宽的列中计算所需的奇异User_Price @列。
  • 我们必须用0替换NA价格。
  • 我们必须用1替换非NA价格。
  • 我们必须修复列名以省略User_Price.前缀。

这是一个完整的解决方案,在上面使用df

res <- reshape(df,dir='w',idvar='User',timevar='Destination');
pre <- '^User_Price\\.';
cis <- grep(pre,names(res));
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T));
names(res)[cis] <- sub(pre,'',names(res)[cis]);
nas <- is.na(res[cis]);
res[cis][nas] <- 0;
res[cis][!nas] <- 1;
res;
  User A B C User_Price
1    a 1 1 0          5
2    b 1 0 1          4
3    c 0 1 0          6
6    d 0 0 1          7

基准测试

library(microbenchmark);
library(dplyr);
library(tidyr);
library(data.table);

bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; };
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); };
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); };
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination");

## OP's test case
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L));
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt));
## Unit: microseconds
##             expr      min       lq      mean   median        uq      max neval
##      bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196   100
##  thelatemail(df)  473.412  536.063  574.4233  578.186  608.1225  738.129   100
##     jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514   100
##        akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303   100

## scale test
set.seed(1L);
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L);
## Unit: milliseconds
##             expr        min         lq       mean     median         uq        max neval
##      bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153    10
##  thelatemail(df)   31.84727   37.56498   57.47417   44.54106   82.39749   92.63933    10
##     jazzurro(df)   79.18924   91.20755  117.20360  126.22693  136.13885  168.26623    10
##        akrun(dt)   52.06625   59.02158   79.59568   70.09136  106.93019  130.31208    10

## scale test 2
set.seed(1L);
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L);
## Unit: seconds
##             expr        min         lq       mean     median         uq        max neval
##      bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043     1
##  thelatemail(df)   3.377981   3.377981   3.377981   3.377981   3.377981   3.377981     1
##     jazzurro(df)  12.858542  12.858542  12.858542  12.858542  12.858542  12.858542     1
##        akrun(dt)   4.132785   4.132785   4.132785   4.132785   4.132785   4.132785     1

【讨论】:

  • 我不确定table工作时间有效,当行数增加到大约1E7左右时。 span>
  • @ akrun我添加了一个带有1e6行的基准。我尝试了1E7,但在运行@ 987654333 Error in table(df[, c("User", "Destination")]) : attempt to make a table with &gt;= 2^31 elements 987654333 @。 span>
【解决方案2】:

这是一个使用data.table的选项

library(data.table)
dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination")
#   User User_Price A B C
#1:    a          5 1 1 0
#2:    b          4 1 0 1
#3:    c          6 0 1 0
#4:    d          7 0 0 1

【讨论】:

    【解决方案3】:

    实现相同目的的另一种方法是使用 dcast。

    a

    稍后更改目标列的值

    【讨论】:

      【解决方案4】:

      dplyrtidyr 包的一种方法是:

      library(dplyr)
      library(tidyr)
      
      count(foo, User, User_Price, Destination) %>%
      spread(key = Destination, value = n, fill = 0)
      
      #    User User_Price     A     B     C
      #  (fctr)      (int) (dbl) (dbl) (dbl)
      #1      a          5     1     1     0
      #2      b          4     1     0     1
      #3      c          6     0     1     0
      #4      d          7     0     0     1
      

      如果需要矩阵,可以将此结果(数据框)转换为矩阵。

      数据

      foo <- structure(list(Destination = structure(c(1L, 1L, 2L, 2L, 3L, 
      3L), .Label = c("A", "B", "C"), class = "factor"), User = structure(c(1L, 
      2L, 3L, 1L, 2L, 4L), .Label = c("a", "b", "c", "d"), class = "factor"), 
      User_Price = c(5L, 4L, 6L, 5L, 4L, 7L)), .Names = c("Destination", 
      "User", "User_Price"), class = "data.frame", row.names = c(NA, 
      -6L))
      

      【讨论】:

        猜你喜欢
        • 2015-06-10
        • 2020-10-31
        • 2021-11-12
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2022-06-23
        • 2014-01-13
        相关资源
        最近更新 更多