本文由于没有现成的数据,就自己生成了一些商品订单数据,基于该数据进行了RFM和聚类的构建
1.数据的生成
数据库表操作
1 use my_work; 2 3 -- 创建商品订单表 4 CREATE table goods_orders_ful( 5 user_id varchar(100), -- 用户id 6 order_id varchar(100), -- 订单id 7 is_paid bool, -- 用户是否实际支付,1支付;0未支付 8 amount double, -- 订单金额 9 created_date date, -- 订单生成日期 yyyy-mm-dd 10 created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss 11 business_type varchar(10), -- 业务类型 12 region_name varchar(10), -- 所属区域:如 东部地区 13 order_source_name varchar(10), -- 订单渠道:Web、H5、App 14 is_done bool -- 订单是否完成 15 ); 16 17 -- 创建用户订单行为中间表 18 drop table if exists user_info_frm_01; 19 CREATE table user_info_frm_01 20 as 21 select gof.user_id, 22 sum(gof.amount) all_of_money, 23 max(gof.created_date) latest_date, 24 count(gof.order_id) all_of_orders 25 FROM goods_orders_ful gof 26 where gof.is_paid = 1 27 and gof.is_done = 1 28 and gof.created_date >= \'2020-01-01\' 29 and gof.created_date < \'2020-07-01\' 30 group by gof.user_id; 31 32 SELECT count(*) from user_info_frm_01 uif; 33 SELECT * from user_info_frm_01 uif limit 10 34 35 -- 创建行为指标均值表 36 create table if not exists user_info_frm_02 37 as 38 select avg(uif.all_of_money) all_of_money_avg, 39 avg(datediff(\'2020-07-22\', uif.latest_date)) latest_days_avg, 40 avg(uif.all_of_orders) orders_avg 41 from user_info_frm_01 uif; 42 43 SELECT * from user_info_frm_02; 44 -- 消费均值1107.10,最小天数均值86.9,订单数量均值2.1 45 46 -- 将用户进行rfm一级打标 47 create table if not exists user_info_frm_03 48 as 49 SELECT uif.user_id, 50 case when uif.all_of_money >= 1107.10 51 then \'高\' 52 else \'低\' 53 end money, 54 case when datediff(\'2020-07-22\', uif.latest_date) >= 86.9 55 then \'高\' 56 else \'低\' 57 end recency, 58 case when uif.all_of_orders >= 2.1 59 then \'高\' 60 else \'低\' 61 end frequency 62 from user_info_frm_01 uif; 63 64 -- 将用户进行二级打标 65 create table if not exists user_info_frm_04 66 as 67 select uif.user_id, 68 uif.recency, 69 uif.frequency, 70 uif.money, 71 case when uif.recency = \'高\' and uif.frequency = \'高\' and uif.money = \'高\' 72 then \'重要价值用户\' 73 when uif.recency = \'低\' and uif.frequency = \'高\' and uif.money = \'高\' 74 then \'重要保持用户\' 75 when uif.recency = \'高\' and uif.frequency = \'低\' and uif.money = \'高\' 76 then \'重要发展用户\' 77 when uif.recency = \'低\' and uif.frequency = \'低\' and uif.money = \'高\' 78 then \'重要挽留用户\' 79 when uif.recency = \'高\' and uif.frequency = \'高\' and uif.money = \'低\' 80 then \'一般价值用户\' 81 when uif.recency = \'低\' and uif.frequency = \'高\' and uif.money = \'低\' 82 then \'一般保持用户\' 83 when uif.recency = \'高\' and uif.frequency = \'低\' and uif.money = \'低\' 84 then \'一般发展用户\' 85 when uif.recency = \'低\' and uif.frequency = \'低\' and uif.money = \'低\' 86 then \'一般挽留用户\' 87 else NULL 88 end type 89 90 from user_info_frm_03 uif;
python 程序生成数据
1 # _*_ coding: utf-8 _*_ # 2 # @Time :2020/7/25 7:30 下午 3 # @Author :Zhx 4 5 6 import pymysql 7 import uuid 8 import random 9 import time 10 11 12 class CreateData(object): 13 14 def __init__(self): 15 pass 16 17 @staticmethod 18 def create(): 19 user_id_ = random.randint(1, 5000) 20 order_id_ = uuid.uuid1() 21 is_paid_ = random.choice([1, 0, 1, 1, 1, 1, 1, 1, 1, 1]) 22 amount_ = random.uniform(10, 1000) 23 a1 = (2020, 1, 1, 0, 0, 0, 0, 0, 0) 24 a2 = (2020, 6, 31, 23, 59, 59, 0, 0, 0) 25 26 start = time.mktime(a1) # 生成开始时间戳 27 end = time.mktime(a2) # 生成结束时间戳 28 29 # 随机生成10个日期字符串 30 t = random.randint(start, end) # 在开始和结束时间戳中随机取出一个 31 date_tuple = time.localtime(t) # 将时间戳生成时间元组 32 created_date_ = time.strftime("%Y-%m-%d", date_tuple) # 将时间元组转成格式化字符串 33 created_time_ = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple) 34 business_type_ = random.randint(0, 20) 35 region_name_ = random.choice([\'东\', \'西\', \'南\', \'北\']) 36 order_source_name_ = random.choice([\'Web\', \'app\', \'H5\']) 37 is_done_ = is_paid_ 38 return user_id_, order_id_, is_paid_, amount_, created_date_, created_time_, \ 39 business_type_, region_name_, order_source_name_, is_done_ 40 41 42 if __name__ == \'__main__\': 43 database = \'my_work\' 44 table = \'goods_orders_ful\' 45 counts = 10000 46 create_data = CreateData() 47 con = pymysql.connect(database=database, host=\'localhost\', 48 user=\'root\', port=3306, password=\'199498zhx@\') 49 cur = con.cursor() 50 for i in range(counts): 51 user_id, order_id, is_paid, amount, created_date, created_time, \ 52 business_type, region_name, order_source_name, is_done = create_data.create() 53 sql = """insert into %s.%s values(\'%s\', \'%s\', %d, %f, \'%s\', \'%s\', \'%s\', \'%s\', \'%s\', %d)""" % \ 54 (database, table, user_id, order_id, is_paid, amount, created_date, created_time, business_type, 55 region_name, order_source_name, is_done) 56 try: 57 cur.execute(sql) 58 print(i, i % 1000) 59 con.commit() 60 except Exception as e: 61 print(e) 62 con.rollback() 63 con.close() 64 cur.close()
源数据字段有:
user_id varchar(100), -- 用户id
order_id varchar(100), -- 订单id
is_paid bool, -- 用户是否实际支付,1支付;0未支付
amount double, -- 订单金额
created_date date, -- 订单生成日期 yyyy-mm-dd
created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss
business_type varchar(10), -- 业务类型
region_name varchar(10), -- 所属区域:如 东部地区
order_source_name varchar(10), -- 订单渠道:Web、H5、App
is_done bool -- 订单是否完成
RFM 模型最终表数据
最终的可视化分析使用jupyter完成