R - 有条件地创建多个变量（因子百分比）答案

【问题标题】：R - conditionally creating multiple variables (factor percentages)R - 有条件地创建多个变量（因子百分比）
【发布时间】：2018-08-13 20:04:22
【问题描述】：

R 版本 3.4.2

我正在尝试根据基于同一数据帧的其他变量的条件创建 3 个新变量。我设法实现了我的预期，但它需要几行代码才能产生我认为其他方法（可能使用 dplyr）可以轻松提供的输出。

这是一个可复制的例子：

city <- c("London", "London", "Leeds","Leeds", "Leeds", "Nottingham", "Glasgow", "Belfast", "Belfast", "Oxford", "Oxford", "Southampton", "Aberdeen", "Bath", "Bath", "Bath", "Preston", "Preston", "Liverpool", "Derby","Hereford")

transport <- c("cars", "scooters", "cars", "scooters", "bikes", "cars", "scooters", "cars", "bikes", "scooters", "bikes", "bikes", "scooters", "cars", "scooters", "bikes", "scooters", "bikes", "bikes", "cars", "bikes")

number <- c("153", "21", "267", "87", "13", "95", "17", "199", "8", "34", "5", "23", "40", "142", "79", "28", "37", "22", "19", "83", "23")

df <- data.frame(city, transport, number)

我想知道每个城市中每种交通工具的百分比，如下所示：

> df
          city transport number pct.cars pct.scooters pct.bikes
1       London      cars    153    87.93        12.07      0.00
2       London  scooters     21    87.93        12.07      0.00
3        Leeds      cars    267    72.75        23.71      3.54
4        Leeds  scooters     87    72.75        23.71      3.54
5        Leeds     bikes     13    72.75        23.71      3.54
6   Nottingham      cars     95   100.00         0.00      0.00
7      Glasgow  scooters     17     0.00       100.00      0.00
8      Belfast      cars    199    96.14         0.00      3.86
9      Belfast     bikes      8    96.14         0.00      3.86
10      Oxford  scooters     34     0.00        87.18     12.82
11      Oxford     bikes      5     0.00        87.18     12.82
12 Southampton     bikes     23     0.00         0.00    100.00
13    Aberdeen  scooters     40     0.00       100.00      0.00
14        Bath      cars    142    57.03        31.73     11.24
15        Bath  scooters     79    57.03        31.73     11.24
16        Bath     bikes     28    57.03        31.73     11.24
17     Preston  scooters     37     0.00        62.71     37.29
18     Preston     bikes     22     0.00        62.71     37.29
19   Liverpool     bikes     19     0.00         0.00    100.00
20       Derby      cars     83   100.00         0.00      0.00
21    Hereford     bikes     23     0.00         0.00    100.00

产生上述数据框的代码如下：

df <- tbl_df(df) %>%
  mutate(., pct.cars = rep(as.numeric(0), length.out = length(df$city)),
            pct.scooters = rep(as.numeric(0), length.out = length(df$city)),
            pct.bikes = rep(as.numeric(0), length.out = length(df$city)))


for (i in 1:nrow(df)) {

  cur_city <- df$city[i]
  n_cars <- df$number[df$city == cur_city & df$transport == "cars"]
  n_scooters <- df$number[df$city == cur_city & df$transport == "scooters"]
  n_bikes <- df$number[df$city == cur_city & df$transport == "bikes"]

  if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) < 1) {
    # case: there are no scooters nor bikes

    df$pct.cars[i] <- 100
    df$pct.scooters[i] <- 0
    df$pct.bikes[i] <- 0

  } else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) == 1) {
    # case: there are no cars

    df$pct.cars[i] <- 0
    df$pct.scooters[i] <- (n_scooters/(n_scooters + n_bikes))*100
    df$pct.bikes[i] <- (n_bikes/(n_scooters + n_bikes))*100

  } else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
    # case: there are no bikes

    df$pct.cars[i] <- (n_cars/(n_cars + n_scooters))*100
    df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters))*100
    df$pct.bikes[i] <- 0

  } else if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
    # case: there are no scooters

    df$pct.cars[i] <- (n_cars/(n_cars + n_bikes))*100
    df$pct.scooters[i] <- 0
    df$pct.bikes[i] <- (n_bikes/(n_cars + n_bikes))*100

  } else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
    # case: there are no cars nor bikes

    df$pct.cars[i] <- 0
    df$pct.scooters[i] <- 100
    df$pct.bikes[i] <- 0

  } else if (length(n_cars) < 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
    # case: there are no cars nor scooters

    df$pct.cars[i] <- 0
    df$pct.scooters[i] <- 0
    df$pct.bikes[i] <- 100

  } else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) == 1 ) {
    # case: there are cars, scooters & bikes

    df$pct.cars[i] <- (n_cars/(n_cars + n_scooters + n_bikes))*100
    df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters + n_bikes))*100
    df$pct.bikes[i] <- (n_bikes/(n_cars + n_scooters + n_bikes))*100

  }
}

如果有人有更简单的解决方案或建议（可能使用 dplyr），将不胜感激。提前谢谢！

【问题讨论】：

基础 R 中的一个衬垫：prop.table(xtabs(as.numeric(as.character(number)) ~ city + transport, data=df), 1)
太棒了——就是这么简单！非常感谢@thelatemail！

标签： r for-loop if-statement dataframe dplyr

【解决方案1】：

这样的事情怎么样？

df %>%
    group_by(city) %>%
    mutate(
        number = as.numeric(as.character(number)),
        ntot = sum(number),
        percentage = number / ntot * 100,
        id = paste0("perc.", transport)) %>%
    spread(id, percentage) %>%
    replace(., is.na(.), 0) %>%
    select(-ntot) %>%
    ungroup()
#   city     transport number perc.bikes perc.cars perc.scooters
#   <fct>    <fct>      <dbl>      <dbl>     <dbl>         <dbl>
# 1 Aberdeen scooters   40.0        0          0           100
# 2 Bath     bikes      28.0       11.2        0             0
# 3 Bath     cars      142          0         57.0           0
# 4 Bath     scooters   79.0        0          0            31.7
# 5 Belfast  bikes       8.00       3.86       0             0
# 6 Belfast  cars      199          0         96.1           0
# 7 Derby    cars       83.0        0        100             0
# 8 Glasgow  scooters   17.0        0          0           100
# 9 Hereford bikes      23.0      100          0             0
#10 Leeds    bikes      13.0        3.54       0             0

或者，如果您想在所有百分比列中填写条目：

df %>%
    group_by(city) %>%
    mutate(
        number = as.numeric(as.character(number)),
        ntot = sum(number),
        percentage = number / ntot * 100,
        id = paste0("perc.", transport)) %>%
    spread(id, percentage) %>%
    select(-ntot) %>%
    fill(perc.bikes, perc.cars, perc.scooters, .direction = "up") %>%
    fill(perc.bikes, perc.cars, perc.scooters, .direction = "down") %>%
    replace(., is.na(.), 0) %>%
    ungroup()
## A tibble: 21 x 6
#   city     transport number perc.bikes perc.cars perc.scooters
#   <fct>    <fct>      <dbl>      <dbl>     <dbl>         <dbl>
# 1 Aberdeen scooters   40.0        0          0           100
# 2 Bath     bikes      28.0       11.2       57.0          31.7
# 3 Bath     cars      142         11.2       57.0          31.7
# 4 Bath     scooters   79.0       11.2       57.0          31.7
# 5 Belfast  bikes       8.00       3.86      96.1           0
# 6 Belfast  cars      199          3.86      96.1           0
# 7 Derby    cars       83.0        0        100             0
# 8 Glasgow  scooters   17.0        0          0           100
# 9 Hereford bikes      23.0      100          0             0
#10 Leeds    bikes      13.0        3.54      72.8          23.7
## ... with 11 more rows

更新

如果您只想要一个包含每个城市百分比的表格，您可以这样做：

df %>%
    group_by(city) %>%
    mutate(
        number = as.numeric(as.character(number)),
        number = number / sum(ntot) * 100,
        transport = paste0("perc.", transport)) %>%
    spread(transport, number, fill = 0)
    ungroup()
    ## A tibble: 13 x 4
    #   city        perc.bikes perc.cars perc.scooters
    #   <fct>            <dbl>     <dbl>         <dbl>
    # 1 Aberdeen          0          0           100
    # 2 Bath             11.2       57.0          31.7
    # 3 Belfast           3.86      96.1           0
    # 4 Derby             0        100             0
    # 5 Glasgow           0          0           100
    # 6 Hereford        100          0             0
    # 7 Leeds             3.54      72.8          23.7
    # 8 Liverpool       100          0             0
    # 9 London            0         87.9          12.1
    #10 Nottingham        0        100             0
    #11 Oxford           12.8        0            87.2
    #12 Preston          37.3        0            62.7
    #13 Southampton     100          0             0

【讨论】：

这正是我想要的！太感谢了！从这一点开始，我将使用 df %>% select(., -transport, -number) %>% filter(., !duplicated(city)) 之类的东西来处理数据框
@JoRom 没问题，很乐意提供帮助；顺便说一句，不需要select(., ...) %>% ...；看看我更新的解决方案。它仍然没有xtabs 解决方案那么短，但是很多多余的部分来自复制您原来的预期结果表。要获得一个简单的道具表，这基本上可以归结为单个 group_by、mutate 和 spread 调用。

【解决方案2】：

有时表格/矩阵运算可以提供一个简单的解决方案：

df$number <- as.numeric(as.character(df$number))
prop.table(xtabs(number ~ city + transport, data=df), 1)

#             transport
#city               bikes       cars   scooters
#  Aberdeen    0.00000000 0.00000000 1.00000000
#  Bath        0.11244980 0.57028112 0.31726908
#  Belfast     0.03864734 0.96135266 0.00000000
#  Derby       0.00000000 1.00000000 0.00000000
#  Glasgow     0.00000000 0.00000000 1.00000000
#  Hereford    1.00000000 0.00000000 0.00000000
#  Leeds       0.03542234 0.72752044 0.23705722
#  Liverpool   1.00000000 0.00000000 0.00000000
#  London      0.00000000 0.87931034 0.12068966
#  Nottingham  0.00000000 1.00000000 0.00000000
#  Oxford      0.12820513 0.00000000 0.87179487
#  Preston     0.37288136 0.00000000 0.62711864
#  Southampton 1.00000000 0.00000000 0.00000000

这也使您可以灵活地找到每个城市中一种交通工具的比例，只需更改一个值：

prop.table(xtabs(number ~ city + transport, data=df), 2)

【讨论】：

再次感谢@thelatemail！我不知道在 prop.table() 上使用不同的索引可能会产生不同的输出 - 非常有用且简单！