【发布时间】:2019-11-13 23:03:40
【问题描述】:
我正在尝试一种新颖的做法:用 RFM 模型来划分手机应用使用的成瘾程度。先简单解释一下 RFM,因为它与我编写的代码相关:它是一种营销模型,R(Recency,最近一次消费)是自客户上次在网站上购买以来的天数;F(Frequency,频率)是客户在该网站上进行的交易次数;M(Monetary,货币)是每笔交易在该网站上花费的平均金额(总金额/频率)。通过将这些分数分成不同的组,您可以知道哪些组对您的品牌更忠诚,而我想尝试用它来衡量不同类型应用的成瘾程度。
我会将这些值替换为:
R:用户当天最后一次使用此类应用的时刻距离午夜(当天结束的 24:00)的秒数;
F:他一天用了多少次这种类型的应用;
M:此类应用使用的平均时长;
你可以在这里找到 dput 样本数据,抱歉数据只有两个用户的信息,我仍然不知道如何制作随机 dput 数据集:
structure(list(application = c("com.android.calculator2", "com.whatsapp",
"com.whatsapp", "com.android.mediacenter", "com.whatsapp", "com.whatsapp",
"com.android.mediacenter", "com.whatsapp", "com.facebook.orca",
"com.whatsapp", "com.android.chrome", "com.google.android.youtube",
"com.tinder", "com.android.vending", "com.android.mms", "com.google.android.youtube",
"com.whatsapp", "com.google.android.youtube", "com.facebook.orca",
"com.huawei.android.internal.app", "com.android.chrome", "com.android.calculator2",
"com.android.server.telecom", "com.android.incallui", "com.whatsapp",
"com.android.mediacenter", "com.android.mediacenter", "com.android.settings",
"com.google.android.youtube", "com.whatsapp", "com.facebook.orca",
"com.android.mediacenter", "com.whatsapp", "com.whatsapp", "com.ninegag.android.app",
"com.whatsapp", "com.huawei.android.internal.app", "com.whatsapp",
"com.facebook.orca", "com.android.server.telecom", "com.android.contacts",
"com.whatsapp", "com.whatsapp", "com.facebook.orca", "com.whatsapp",
"com.audible.application", "com.facebook.orca", "com.android.vending",
"com.android.mediacenter", "com.audible.application", "com.spotlightsix.zentimerlite2"
), battery = c(99L, 91L, 91L, 91L, 59L, 59L, 86L, 82L, 82L, 78L,
78L, 78L, 59L, 23L, 24L, 24L, 21L, 20L, 27L, 27L, 27L, 66L, 66L,
66L, 51L, 78L, 79L, 79L, 61L, 15L, 83L, 64L, 64L, 64L, 77L, 77L,
76L, 74L, 74L, 68L, 67L, 26L, 26L, 26L, 14L, 42L, 21L, 7L, 49L,
47L, 7L), endTime = structure(c(1552937669.979, 1552939304.982,
1552940267.085, 1552940491.247, 1552927214.751, 1552927358.731,
1552943502.52, 1552947058.616, 1552947085.757, 1552947640.862,
1552948140.615, 1552950642.956, 1552950670.904, 1552698488.211,
1552699286.179, 1552699661.943, 1552694622.527, 1552695838.488,
1552669634.35, 1552669720.844, 1552669759.436, 1552658315.76,
1552658392.324, 1552658435.825, 1552826238.709, 1552829407.296,
1552830394.329, 1552830666.554, 1552834920.948, 1552843002.461,
1552850435.957, 1552924112.501, 1552924305.967, 1552924485.245,
1552746587.447, 1552746621.156, 1552746808.486, 1552747504.807,
1552747525.748, 1552749348.81, 1552749531.786, 1552774429.995,
1552774593.78, 1552774601.257, 1552765986.942, 1552866265.965,
1552869582.984, 1552871863.451, 1552863539.106, 1552864201.43,
1552872500.501), class = c("POSIXct", "POSIXt"), tzone = ""),
session = c(1552929316L, 1552937670L, 1552937670L, 1552940489L,
1552926942L, 1552926942L, 1552942385L, 1552947023L, 1552947023L,
1552947023L, 1552947023L, 1552947023L, 1552947023L, 1552698280L,
1552698280L, 1552698280L, 1552694528L, 1552695704L, 1552669479L,
1552669479L, 1552669479L, 1552658249L, 1552658249L, 1552658249L,
1552825368L, 1552829142L, 1552830354L, 1552830378L, 1552830378L,
1552842287L, 1552849970L, 1552923851L, 1552924111L, 1552924284L,
1552745790L, 1552746579L, 1552746579L, 1552747501L, 1552747501L,
1552748903L, 1552748903L, 1552774264L, 1552774264L, 1552774264L,
1552765953L, 1552865369L, 1552869549L, 1552869549L, 1552862301L,
1552862301L, 1552869549L), startTime = structure(c(1552937669.974,
1552939288.014, 1552940265.404, 1552940489.402, 1552927083.565,
1552927349.671, 1552943488.401, 1552947031.581, 1552947061.03,
1552947572.997, 1552948109.636, 1552948146.197, 1552950662.47,
1552698481.19, 1552699269.439, 1552699288.018, 1552694548.992,
1552695764.75, 1552669520.073, 1552669719.309, 1552669722.031,
1552658293.438, 1552658391.914, 1552658392.34, 1552826236.588,
1552829400.281, 1552830375.788, 1552830660.017, 1552834299.004,
1552842297.013, 1552850071.788, 1552924108.617, 1552924282.513,
1552924479.884, 1552746579.19, 1552746590.718, 1552746807.361,
1552747501.668, 1552747507.62, 1552749347.688, 1552749522.781,
1552774269.867, 1552774430.015, 1552774600.383, 1552765963.791,
1552866265.186, 1552869577.804, 1552871854.773, 1552863054.623,
1552864194.888, 1552872479.38), class = c("POSIXct", "POSIXt"
), tzone = ""), user_id = c(10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L, 10161L,
10161L, 10161L, 10161L, 10161L, 10161L, 10162L, 10162L, 10162L,
10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L,
10162L, 10162L, 10162L, 10162L, 10162L, 10162L, 10162L),
categories = structure(c(6L, 1L, 1L, 3L, 1L, 1L, 3L, 1L,
1L, 1L, 6L, 2L, 3L, 6L, 1L, 2L, 1L, 2L, 1L, 6L, 6L, 6L, 6L,
6L, 1L, 3L, 3L, 6L, 2L, 1L, 1L, 3L, 1L, 1L, 5L, 1L, 6L, 1L,
1L, 6L, 6L, 1L, 1L, 1L, 1L, 6L, 1L, 6L, 3L, 6L, 3L), .Label = c("communication",
"games & entertainment", "lifestyle", "news & information outlet",
"social network", "utility & tools"), class = "factor"),
date = structure(c(17973, 17973, 17973, 17973, 17973, 17973,
17973, 17973, 17973, 17973, 17973, 17973, 17974, 17971, 17971,
17971, 17971, 17971, 17970, 17970, 17970, 17970, 17970, 17970,
17972, 17972, 17972, 17972, 17972, 17972, 17972, 17973, 17973,
17973, 17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971,
17971, 17971, 17971, 17973, 17973, 17973, 17972, 17973, 17973
), class = "Date"), duration = structure(c(0, 17, 1.7, 1.8,
131.2, 9.1, 14.1, 27, 24.7, 67.9, 31, 2496.8, 8.4, 7, 16.7,
373.9, 73.5, 73.7, 114.3, 1.5, 37.4, 22.3, 0.4, 43.5, 2.1,
7, 18.5, 6.5, 621.9, 705.4, 364.2, 3.9, 23.5, 5.4, 8.3, 30.4,
1.1, 3.1, 18.1, 1.1, 9, 160.1, 163.8, 0.9, 23.2, 0.8, 5.2,
8.7, 484.5, 6.5, 21.1), class = "difftime", units = "secs")), row.names = 162574:162624, class = "data.frame")
由于我不熟悉时间类型数据的处理,我在计算 Recency 部分时遇到了问题。到目前为止我只写出了下面这段代码,它并不按天计算 RFM,而只是相对于整个数据集的最后一天来计算。
# Attempt so far: the only grouping key is user_id, so every value below is
# computed over a user's whole history rather than per day or per category.
df_RFM <- summarise(
  group_by(df_data, user_id),
  # Calendar date of each endTime minus the user's latest endTime,
  # stripped down to a bare number.
  recency = as.numeric(as.Date(endTime) - max(endTime)),
  # Count of distinct app categories seen for the user.
  frequency = n_distinct(categories),
  # Summed duration divided by that same distinct-category count.
  monetary = sum(duration) / n_distinct(categories)
)
频率和货币也有同样的问题:它们是基于整个数据集计算的,而我需要按天计算。简而言之,我想得到一个包含每个用户每天、按应用类别划分的 RFM 的数据集,如下所示(仅为示例,并非实际数值):
user_id date recency frequency monetary categories
10161 2019-03-15 21040 sec 5 109.7 utility & tools
10161 2019-03-15 77538 sec 1 181.6 Communication
10161 2019-03-16 12345 sec 4 123.5 games&entertainment
10161 2019-03-16 77538 sec 1 181.6 communication
10162 2019-03-15 21040 sec 2 109.7 utility & tools
10162 2019-03-15 77538 sec 3 181.6 Communication
10162 2019-03-17 12345 sec 12 123.5 games&entertainment
10162 2019-03-17 77538 sec 2 181.6 utility & tools
通过阅读这张表,您可以获得如下信息:用户 10161 在 03-15 使用了 5 次 utility & tools(实用工具)类应用,平均每次 109.7 秒;他当天最后一次使用该类应用是在午夜前 21040 秒。
欢迎提出建议,谢谢!
【问题讨论】:
标签: r