【发布时间】:2020-02-12 18:44:50
【问题描述】:
我有一个看起来像这样的数据框
user booking_date origin destination price sale_channel
1 user5 2018-11-01 MAD PMI 58.20 online
2 user7 2018-11-01 DUB MAD 147.50 online
3 user4 2018-11-02 TFS MAD 24.05 online
4 user7 2018-11-01 LPA MAD 37.30 call center
5 user1 2018-11-01 AMS MAD 149.74 travel agency
6 user1 2018-11-01 MAD PMI 19.95 online
现在我想:
- 按特定日期 (2018-11-02) 过滤,这一步我已经完成:
# Keep only the rows whose booking date is 2018-11-02.
df <- df[df[["booking_date"]] == "2018-11-02", ]
- 按用户分组,并汇总每个用户在机票上的总支出。
我尝试了几种使用 group_by 或 aggregate 的方法,但得到的只是多出来的一列,并没有按用户对数值进行分组或汇总。
- 理想情况下,最终输出应该能让我提取出在机票上花费最多的 10 个用户,例如:
'user1' 'user10' 'user7' 等。
示例数据
structure(list(user = c("user4", "user5", "user3", "user10",
"user1", "user2", "user7", "user6", "user5", "user6", "user6",
"user7", "user1", "user7", "user4", "user4", "user1", "user7",
"user7", "user8", "user4", "user10", "user4", "user8", "user3",
"user9", "user5", "user2", "user5", "user3", "user3", "user9",
"user6", "user10", "user9", "user5", "user3", "user5", "user7",
"user9", "user2", "user2", "user7", "user10", "user7", "user3",
"user1", "user2", "user8", "user6", "user6", "user10", "user4",
"user7", "user4", "user1", "user4", "user2", "user1", "user7",
"user5", "user4", "user4", "user7", "user10"), booking_date = structure(c(17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837,
17837), class = "Date"), origin = c("TFS", "MAD", "TFN", "MAD",
"LYS", "LPA", "MAD", "MAD", "AMS", "MAD", "MAD", "MAD", "MAD",
"LPA", "MAD", "MAD", "MAD", "TXL", "MAD", "MAD", "TXL", "TXL",
"TFS", "MAD", "NAP", "TFS", "PMI", "TLS", "TFS", "NTE", "AMS",
"FUE", "TFN", "CPH", "TFN", "MAD", "SVQ", "SCQ", "SVQ", "MAD",
"PMI", "MAD", "PMI", "MAD", "MAD", "MAD", "MAD", "MAD", "SVQ",
"NCE", "CDG", "MAD", "MAD", "MAD", "MAD", "MAD", "TFN", "LGW",
"LGW", "MAD", "TFN", "MAD", "RNS", "AGP", "CDG"), destination = c("MAD",
"DUB", "MAD", "TFS", "MAD", "MAD", "NAP", "TLS", "MAD", "SCQ",
"LPA", "TFN", "TXL", "MAD", "TFN", "SVQ", "ACE", "MAD", "TLS",
"SCQ", "MAD", "MAD", "MAD", "LPA", "MAD", "MAD", "MAD", "MAD",
"MAD", "MAD", "MAD", "MAD", "MAD", "MAD", "MAD", "CAG", "MAD",
"MAD", "MAD", "LPA", "MAD", "CDG", "MAD", "LPA", "TFS", "TFN",
"PMI", "NAP", "MAD", "MAD", "MAD", "LPA", "LGW", "LPA", "CDG",
"SPC", "MAD", "MAD", "MAD", "SCQ", "MAD", "SVQ", "MAD", "MAD",
"MAD"), price = c(24.0499992371, 41.5400009155, 251.199996948,
15.6000003815, 44.0099983215, 73.8499984741, 115.470001221, 69.4400024414,
81.3899993896, 15.2399997711, 41.1199989319, 274.559997559, 150,
29.3199996948, 332.440002441, 94.9100036621, 97.9800033569, 55.1199989319,
81.7399978638, 4.86000013351, 39.0299987793, 53.6300010681, 39.3199996948,
114.559997559, 65.4000015259, 96.2900009155, 41.75, 28.9099998474,
25.1900005341, 14, 50.3100013733, 47.2999992371, 53.8199996948,
91.3199996948, 77.6800003052, 17.8099994659, 96.5400009155, 27.6900005341,
34.1399993896, 34.3300018311, 15.5600004196, 158.449996948, 45.2999992371,
36.8100013733, 50.6800003052, 62.9000015259, 13.7399997711, 49.4399986267,
66.1100006104, 95.4400024414, 41.8400001526, 69.8300018311, 60.6599998474,
34.3300018311, 97.1399993896, 84.3099975586, 25.8099994659, 185.899993896,
34.8899993896, 304.380004883, 15.1300001144, 29.1399993896, 133.529998779,
208.910003662, 152.960006714), sale_channel = c("online", "online",
"travel agency", "online", "online", "online", "travel agency",
"online", "travel agency", "travel agency", "travel agency",
"online", "travel agency", "online", "online", "travel agency",
"online", "travel agency", "travel agency", "call center", "online",
"online", "online", "travel agency", "travel agency", "travel agency",
"online", "online", "online", "online", "online", "travel agency",
"travel agency", "online", "travel agency", "call center", "online",
"travel agency", "online", "online", "travel agency", "travel agency",
"online", "travel agency", "online", "online", "online", "travel agency",
"online", "travel agency", "travel agency", "online", "online",
"online", "online", "online", "online", "online", "online", "travel agency",
"travel agency", "travel agency", "online", "travel agency",
"online"), total = c(876.0300006858, 250.5000009536, 540.3500022886,
420.1500110628, 424.9299983977, 512.1099882118, 1233.9500045785,
316.9000034332, 250.5000009536, 316.9000034332, 316.9000034332,
1233.9500045785, 424.9299983977, 1233.9500045785, 876.0300006858,
876.0300006858, 424.9299983977, 1233.9500045785, 1233.9500045785,
185.52999830291, 876.0300006858, 420.1500110628, 876.0300006858,
185.52999830291, 540.3500022886, 255.6000022889, 250.5000009536,
512.1099882118, 250.5000009536, 540.3500022886, 540.3500022886,
255.6000022889, 316.9000034332, 420.1500110628, 255.6000022889,
250.5000009536, 540.3500022886, 250.5000009536, 1233.9500045785,
255.6000022889, 512.1099882118, 512.1099882118, 1233.9500045785,
420.1500110628, 1233.9500045785, 540.3500022886, 424.9299983977,
512.1099882118, 185.52999830291, 316.9000034332, 316.9000034332,
420.1500110628, 876.0300006858, 1233.9500045785, 876.0300006858,
424.9299983977, 876.0300006858, 512.1099882118, 424.9299983977,
1233.9500045785, 250.5000009536, 876.0300006858, 876.0300006858,
1233.9500045785, 420.1500110628)), row.names = c(3L,
26L, 37L, 42L, 48L, 82L, 89L, 100L, 112L, 124L, 133L, 144L, 148L,
150L, 166L, 167L, 173L, 182L, 217L, 243L, 259L, 285L, 300L, 304L,
306L, 336L, 341L, 366L, 388L, 397L, 413L, 417L, 423L, 452L, 457L,
473L, 474L, 478L, 482L, 483L, 486L, 496L, 499L, 504L, 510L, 513L,
529L, 531L, 558L, 605L, 615L, 628L, 629L, 664L, 669L, 672L, 684L,
722L, 730L, 752L, 766L, 767L, 779L, 804L, 819L), class = c("data.table",
"data.frame"))
关于如何解决这个问题的任何建议?
编辑: 最终解决方案
# Keep only the bookings made on 2018-11-02.
df <- df[df$booking_date == "2018-11-02", ]

# Total ticket spend per user: one row per user, columns `user` and `price`.
total <- aggregate(price ~ user, data = df, FUN = sum)

# Rank users from highest to lowest total spend.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
top_10 <- total[order(total$price, decreasing = TRUE), ]

# Names of the five biggest spenders. head() returns fewer elements instead of
# NA padding when fewer than five users remain after filtering.
head(top_10$user, 5)
这给了我
"user7" "user4" "user3" "user2" "user1"
【问题讨论】:
-
试试
df$total <- ave(df$price, df$user, FUN = sum)。 -
谢谢,这个方向是对的,但我现在该如何真正按用户分组,或者至少取出消费最多的 n 个用户?
-
请提供一份最小可重现的示例数据和您的预期输出。您可以生成示例数据,或者用
dput() 导出部分真实数据。 -
我编辑了我最初的问题,所需的输出在第 3 点下
-
aggregate(price ~ user, df, sum) 能解决您的问题吗?之后只需按 price 排序,即可看出谁花费最多。