【发布时间】:2019-12-08 04:30:45
【问题描述】:
我在 pandas 数据框(DataFrame)中有以下数据集。我希望对每个 user_id 统计 trajectory 中每个项目的出现次数,计算列表中每个项目的概率质量函数 (pmf),并返回 pmf 最高的前 3 个(或任意数量)项目。所有这些结果都应写回同一个数据框。
代码:
下面是我为想要实现的功能编写的原型代码:
import math
from collections import Counter
# Prototype for a single row: count how often each item occurs in one
# trajectory, then normalise the counts into a probability mass function (PMF).
# NOTE(review): the sample data names its column 'trajectory', while this
# snippet reads 'cluster_label' -- confirm which column name is correct.
cluster = trajs['cluster_label'][0]
# The cell appears to hold a nested list ([[...]]), so index 0 selects the
# inner list of items -- verify against the actual DataFrame contents.
counter = Counter(cluster[0])
print(counter)
print("Top 3 clusters: ", counter.most_common(3))
# Start the sum at 0.0 so the total is a float.
cluster_norm = sum(counter.values(), 0.0)
for key in counter:
    # Fixed: the original divided by the undefined name `user_norm`
    # (NameError); the normaliser computed above is `cluster_norm`.
    # Only values of existing keys are reassigned, so iterating is safe.
    counter[key] /= cluster_norm
print("PMF:", counter.most_common(5))
Counter({6: 907, 12: 728, 7: 716, 4: 638, 0: 594, 3: 551, 5: 352, 8: 335, 11: 236, 14: 207, 2: 206, 1: 195, 16: 190, 13: 150, 9: 115, 10: 78, 15: 55, 17: 52})
Top 3 clusters: [(6, 907), (12, 728), (7, 716)]
PMF: [(6, 0.14385408406026962), (12, 0.1154639175257732), (7, 0.11356066613798572), (4, 0.10118953211736717), (0, 0.09421094369547978)]
我已经能用上面的代码手动处理少数几行,但在遍历整个数据框(约 14k 行)并将结果整理成新的数据框时遇到了一些麻烦。
数据:
user_id,trajectory
431997632,[[28, 215, 278, 213, 432, 238, 122, 188, 401, 401, 289, 264, 468, 401, 537, 401, 34, 191, 469, 471, 609, 506, 254, 144, 451]]
4725565346,[[259, 585, 585, 513, 432, 662, 527, 527, 527, 527, 513, 513, 662, 527, 527, 662, 527, 662, 662, 662, 662, 513]]
1331881832,[[215, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 205, 432, 205, 432]]
58637506,[[432, 156, 109, 401, 401, 327, 205, 372, 536, 206, 633, 538, 384, 572, 401, 125]]
1396149025,[[537, 372, 372, 421, 440, 100, 645, 55, 401, 537, 384, 317, 317, 351, 459, 109]]
26412887,[[344, 335, 335, 286, 314, 335, 330, 283, 286, 307, 314, 432, 335, 335, 335]]
3259595431,[[400, 339, 401, 400, 28, 307, 327, 327, 537, 36, 472, 472, 522]]
45429271,[[473, 360, 314, 314, 314, 314, 314, 330, 330, 360, 330, 330]]
22536391,[[609, 407, 384, 470, 485, 415, 384, 384, 466, 403, 388, 388, 388, 432, 415, 408, 314, 513, 311, 464, 473, 473, 467]]
8.13E+17,[[421, 384, 401, 230, 330, 609, 401, 424, 264, 265, 384, 661, 445, 215, 257, 140, 601, 213, 265, 79, 378]]
7.63E+17,[[238, 238, 238, 457, 267, 474, 338, 401, 512, 401, 486, 278, 384, 133, 304, 537, 407, 304, 384]]
98221214,[[432, 432, 88, 432, 384, 215, 259, 384, 522, 259, 384, 432, 384, 384, 384, 384, 384, 384, 384]]
7.20E+17,[[465, 329, 465, 329, 432, 432, 432, 432, 432, 18, 465, 432, 432, 269, 465, 465, 288, 288, 152]]
323346248,[[407, 401, 603, 641, 521, 327, 0, 432, 262, 453, 628, 289, 634, 125, 63, 385, 395, 432, 327]]
4036152552,[[327, 632, 632, 168, 28, 168, 632, 632, 632, 168, 106, 28, 168, 106, 168, 168, 168, 168, 28]]
17281102,[[225, 225, 225, 225, 225, 225, 225, 225, 384, 384, 628, 628, 628, 628, 628, 628, 708, 708]]
24685146,[[396, 330, 330, 330, 396, 330, 330, 330, 330, 330, 330, 330, 330, 330, 330, 264, 264, 330]]
24822125,[[401, 354, 314, 360, 432, 360, 432, 360, 689, 689, 314, 314, 689, 689, 300, 432, 300, 300]]
28477232,[[432, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385]]
136675517,[[410, 404, 466, 527, 473, 403, 410, 503, 503, 485, 467, 463, 458, 384, 384, 384, 384, 384]]
228534984,[[432, 213, 432, 213, 432, 432, 432, 432, 205, 432, 432, 432, 432, 552, 432, 432, 432, 432]]
237564656,[[327, 254, 213, 254, 254, 254, 432, 213, 213, 213, 254, 167, 254, 228, 240, 509, 254, 213]]
423924903,[[267, 432, 609, 342, 432, 432, 199, 122, 150, 372, 265, 432, 194, 456, 401, 401, 432, 401]]
496539092,[[609, 597, 597, 597, 597, 432, 597, 597, 597, 634, 634, 609, 597, 597, 597, 597, 634, 311]]
18193826,[[299, 299, 299, 427, 299, 28, 595, 401, 401, 432, 299, 299, 197, 401]]
【问题讨论】: