資料分析之兩種使用者分群方法(RFM和聚類)
阿新 • • 發佈:2020-07-26
本文由於沒有現成的資料,就自己生成了一些商品訂單資料,基於該資料進行了RFM和聚類的構建
1.資料的生成
資料庫表操作
1 use my_work; 2 3 -- 建立商品訂單表 4 CREATE table goods_orders_ful( 5 user_id varchar(100), -- 使用者id 6 order_id varchar(100), -- 訂單id 7 is_paid bool, -- 使用者是否實際支付,1支付;0未支付 8 amount double, -- 訂單金額 9 created_date date, --訂單生成日期 yyyy-mm-dd 10 created_time timestamp, -- 訂單生成時間 yyyy-mm-dd hh:mm:ss 11 business_type varchar(10), -- 業務型別 12 region_name varchar(10), -- 所屬區域:如 東部地區 13 order_source_name varchar(10), -- 訂單渠道:Web、H5、App 14 is_done bool -- 訂單是否完成 15 ); 16 17-- 建立使用者訂單行為中間表 18 drop table if exists user_info_frm_01; 19 CREATE table user_info_frm_01 20 as 21 select gof.user_id, 22 sum(gof.amount) all_of_money, 23 max(gof.created_date) latest_date, 24 count(gof.order_id) all_of_orders 25 FROM goods_orders_ful gof 26 wheregof.is_paid = 1 27 and gof.is_done = 1 28 and gof.created_date >= '2020-01-01' 29 and gof.created_date < '2020-07-01' 30 group by gof.user_id; 31 32 SELECT count(*) from user_info_frm_01 uif; 33 SELECT * from user_info_frm_01 uif limit 10 34 35 -- 建立行為指標均值表 36 create table if not exists user_info_frm_02 37 as 38 select avg(uif.all_of_money) all_of_money_avg, 39 avg(datediff('2020-07-22', uif.latest_date)) latest_days_avg, 40 avg(uif.all_of_orders) orders_avg 41 from user_info_frm_01 uif; 42 43 SELECT * from user_info_frm_02; 44 -- 消費均值1107.10,最小天數均值86.9,訂單數量均值2.1 45 46 -- 將使用者進行rfm一級打標 47 create table if not exists user_info_frm_03 48 as 49 SELECT uif.user_id, 50 case when uif.all_of_money >= 1107.10 51 then '高' 52 else '低' 53 end money, 54 case when datediff('2020-07-22', uif.latest_date) >= 86.9 55 then '高' 56 else '低' 57 end recency, 58 case when uif.all_of_orders >= 2.1 59 then '高' 60 else '低' 61 end frequency 62 from user_info_frm_01 uif; 63 64 -- 將使用者進行二級打標 65 create table if not exists user_info_frm_04 66 as 67 select uif.user_id, 68 uif.recency, 69 uif.frequency, 70 uif.money, 71 case when uif.recency = '高' and uif.frequency = '高' and uif.money = '高' 72 then '重要價值使用者' 73 when uif.recency = '低' and uif.frequency = '高' and uif.money = '高' 74 then '重要保持使用者' 75 when uif.recency = '高' and uif.frequency = '低' and uif.money = '高' 76 then '重要發展使用者' 77 when uif.recency = '低' and uif.frequency = '低' and uif.money = '高' 78 then '重要挽留使用者' 79 when uif.recency = '高' and uif.frequency = '高' and uif.money = '低' 80 then '一般價值使用者' 81 when uif.recency = '低' and uif.frequency = '高' and uif.money = '低' 82 then '一般保持使用者' 83 when uif.recency = '高' and uif.frequency = '低' and uif.money = '低' 84 then '一般發展使用者' 85 when uif.recency = '低' and uif.frequency = '低' and uif.money = '低' 86 then '一般挽留使用者' 87 else NULL 88 end type 89 90 from user_info_frm_03 uif;
python 程式生成資料
1 # _*_ coding: utf-8 _*_ # 2 # @Time :2020/7/25 7:30 下午 3 # @Author :Zhx 4 5 6 import pymysql 7 import uuid 8 import random 9 import time 10 11 12 class CreateData(object): 13 14 def __init__(self): 15 pass 16 17 @staticmethod 18 def create(): 19 user_id_ = random.randint(1, 5000) 20 order_id_ = uuid.uuid1() 21 is_paid_ = random.choice([1, 0, 1, 1, 1, 1, 1, 1, 1, 1]) 22 amount_ = random.uniform(10, 1000) 23 a1 = (2020, 1, 1, 0, 0, 0, 0, 0, 0) 24 a2 = (2020, 6, 31, 23, 59, 59, 0, 0, 0) 25 26 start = time.mktime(a1) # 生成開始時間戳 27 end = time.mktime(a2) # 生成結束時間戳 28 29 # 隨機生成10個日期字串 30 t = random.randint(start, end) # 在開始和結束時間戳中隨機取出一個 31 date_tuple = time.localtime(t) # 將時間戳生成時間元組 32 created_date_ = time.strftime("%Y-%m-%d", date_tuple) # 將時間元組轉成格式化字串 33 created_time_ = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple) 34 business_type_ = random.randint(0, 20) 35 region_name_ = random.choice(['東', '西', '南', '北']) 36 order_source_name_ = random.choice(['Web', 'app', 'H5']) 37 is_done_ = is_paid_ 38 return user_id_, order_id_, is_paid_, amount_, created_date_, created_time_, \ 39 business_type_, region_name_, order_source_name_, is_done_ 40 41 42 if __name__ == '__main__': 43 database = 'my_work' 44 table = 'goods_orders_ful' 45 counts = 10000 46 create_data = CreateData() 47 con = pymysql.connect(database=database, host='localhost', 48 user='root', port=3306, password='199498zhx@') 49 cur = con.cursor() 50 for i in range(counts): 51 user_id, order_id, is_paid, amount, created_date, created_time, \ 52 business_type, region_name, order_source_name, is_done = create_data.create() 53 sql = """insert into %s.%s values('%s', '%s', %d, %f, '%s', '%s', '%s', '%s', '%s', %d)""" % \ 54 (database, table, user_id, order_id, is_paid, amount, created_date, created_time, business_type, 55 region_name, order_source_name, is_done) 56 try: 57 cur.execute(sql) 58 print(i, i % 1000) 59 con.commit() 60 except Exception as e: 61 print(e) 62 con.rollback() 63 con.close() 64 cur.close()
源資料欄位有:
user_idvarchar(100),-- 使用者id
order_idvarchar(100),-- 訂單id
is_paidbool, -- 使用者是否實際支付,1支付;0未支付
amountdouble, -- 訂單金額
created_datedate, -- 訂單生成日期 yyyy-mm-dd
created_timetimestamp, -- 訂單生成時間 yyyy-mm-dd hh:mm:ss
business_typevarchar(10),-- 業務型別
region_namevarchar(10), -- 所屬區域:如 東部地區
order_source_namevarchar(10), -- 訂單渠道:Web、H5、App
is_donebool-- 訂單是否完成
RFM 模型最終表資料
最終的視覺化分析使用jupyter完成