1. 程式人生 > 實用技巧 >資料分析之兩種使用者分群方法(RFM和聚類)

資料分析之兩種使用者分群方法(RFM和聚類)

本文由於沒有現成的資料,就自己生成了一些商品訂單資料,基於該資料進行了RFM和聚類的構建

1.資料的生成

資料庫表操作

 -- RFM user-segmentation pipeline (MySQL dialect).
-- Builds the order table, then four derived tables:
--   user_info_frm_01  per-user totals over paid + completed orders
--   user_info_frm_02  one-row table holding the averages of those totals
--   user_info_frm_03  per-user high/low flag for each of R, F and M
--   user_info_frm_04  per-user segment label derived from the three flags
use my_work;

-- Reference date against which "days since last order" is measured.
set @analysis_date = '2020-07-22';

-- Goods order table: one row per order.
create table if not exists goods_orders_ful (
    user_id           varchar(100),  -- user id
    order_id          varchar(100),  -- order id
    is_paid           bool,          -- whether the user actually paid: 1 = paid, 0 = not paid
    amount            double,        -- order amount; NOTE(review): DECIMAL is the usual choice for money
    created_date      date,          -- order creation date, yyyy-mm-dd
    created_time      timestamp,     -- order creation time, yyyy-mm-dd hh:mm:ss
    business_type     varchar(10),   -- business type
    region_name       varchar(10),   -- region, e.g. 東部地區
    order_source_name varchar(10),   -- order channel: Web, H5, App
    is_done           bool           -- whether the order was completed
);

-- Step 1: per-user behaviour totals.
-- Only paid and completed orders inside the half-open window
-- [2020-01-01, 2020-07-01) are counted.
drop table if exists user_info_frm_01;
create table user_info_frm_01 as
select
    gof.user_id,
    sum(gof.amount)       as all_of_money,
    max(gof.created_date) as latest_date,
    count(gof.order_id)   as all_of_orders
from goods_orders_ful as gof
where gof.is_paid = 1
  and gof.is_done = 1
  and gof.created_date >= '2020-01-01'
  and gof.created_date < '2020-07-01'
group by gof.user_id;

-- Sanity checks on step 1.
select count(*)
from user_info_frm_01;

select
    uif.user_id,
    uif.all_of_money,
    uif.latest_date,
    uif.all_of_orders
from user_info_frm_01 as uif
order by uif.user_id
limit 10;

-- Step 2: averages of the three indicators (always a single row).
-- Dropped and rebuilt so it can never lag behind a rebuilt user_info_frm_01.
drop table if exists user_info_frm_02;
create table user_info_frm_02 as
select
    avg(uif.all_of_money)                           as all_of_money_avg,
    avg(datediff(@analysis_date, uif.latest_date))  as latest_days_avg,
    avg(uif.all_of_orders)                          as orders_avg
from user_info_frm_01 as uif;

select
    all_of_money_avg,
    latest_days_avg,
    orders_avg
from user_info_frm_02;
-- In the author's run: average spend 1107.10, average days since the
-- last order 86.9, average order count 2.1.

-- Step 3: first-level RFM tagging.
-- Each indicator is compared with its average taken from user_info_frm_02
-- rather than with constants copied by hand from one run.
-- NOTE(review): the label literals were empty strings in the source, which
-- made every branch of the step-4 CASE identical; '高' (high) / '低' (low)
-- are used here. Recency is '低' when the last order is at least as old as
-- the average, i.e. the user has not bought recently.
drop table if exists user_info_frm_03;
create table user_info_frm_03 as
select
    uif.user_id,
    case
        when uif.all_of_money >= av.all_of_money_avg then '高'
        else '低'
    end as money,
    case
        when datediff(@analysis_date, uif.latest_date) >= av.latest_days_avg then '低'
        else '高'
    end as recency,
    case
        when uif.all_of_orders >= av.orders_avg then '高'
        else '低'
    end as frequency
from user_info_frm_01 as uif
cross join user_info_frm_02 as av;  -- av has exactly one row, so no row multiplication

-- Step 4: second-level tagging; maps the eight R/F/M combinations to a segment.
-- High monetary value yields the "重要" (important) segments, low the "一般" (general) ones.
drop table if exists user_info_frm_04;
create table user_info_frm_04 as
select
    uif.user_id,
    uif.recency,
    uif.frequency,
    uif.money,
    case
        when uif.recency = '高' and uif.frequency = '高' and uif.money = '高'
            then '重要價值使用者'
        when uif.recency = '低' and uif.frequency = '高' and uif.money = '高'
            then '重要保持使用者'
        when uif.recency = '高' and uif.frequency = '低' and uif.money = '高'
            then '重要發展使用者'
        when uif.recency = '低' and uif.frequency = '低' and uif.money = '高'
            then '重要挽留使用者'
        when uif.recency = '高' and uif.frequency = '高' and uif.money = '低'
            then '一般價值使用者'
        when uif.recency = '低' and uif.frequency = '高' and uif.money = '低'
            then '一般保持使用者'
        when uif.recency = '高' and uif.frequency = '低' and uif.money = '低'
            then '一般發展使用者'
        when uif.recency = '低' and uif.frequency = '低' and uif.money = '低'
            then '一般挽留使用者'
        else null
    end as type
from user_info_frm_03 as uif;

python 程式生成資料

 1 # _*_ coding: utf-8 _*_ #
 2 # @Time     :2020/7/25 7:30 下午
 3 # @Author   :Zhx
 4 
 5 
 6 import pymysql
 7 import uuid
 8 import random
 9 import time
10 
11 
class CreateData(object):
    """Generates random rows for the ``goods_orders_ful`` order table."""

    # Inclusive local-time bounds for generated orders, as ``time.mktime``
    # tuples. June has 30 days: a day-of-month of 31 is normalised by mktime
    # to July 1, which falls outside the ``created_date < '2020-07-01'``
    # filter used by the SQL pipeline. ``tm_isdst=-1`` lets the C library
    # decide whether DST applies instead of forcing standard time.
    _RANGE_START = (2020, 1, 1, 0, 0, 0, 0, 0, -1)
    _RANGE_END = (2020, 6, 30, 23, 59, 59, 0, 0, -1)

    def __init__(self):
        pass

    @staticmethod
    def create():
        """Build one random order row.

        Returns:
            A 10-tuple ``(user_id, order_id, is_paid, amount, created_date,
            created_time, business_type, region_name, order_source_name,
            is_done)`` in the column order of ``goods_orders_ful``.
        """
        user_id_ = random.randint(1, 5000)
        order_id_ = uuid.uuid1()
        # Nine of the ten choices are 1, so roughly 90% of orders are paid.
        is_paid_ = random.choice([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])
        amount_ = random.uniform(10, 1000)

        # time.mktime returns a float; random.randint needs ints (passing
        # floats raises TypeError on Python 3.12+).
        start = int(time.mktime(CreateData._RANGE_START))
        end = int(time.mktime(CreateData._RANGE_END))

        # Pick one random instant in the window and render it both as a
        # date and as a full timestamp so the two columns always agree.
        t = random.randint(start, end)
        date_tuple = time.localtime(t)
        created_date_ = time.strftime("%Y-%m-%d", date_tuple)
        created_time_ = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)

        business_type_ = random.randint(0, 20)
        # NOTE(review): three of the four region labels were empty strings in
        # the source (apparently lost in extraction); the four compass
        # directions are assumed here - confirm against the original.
        region_name_ = random.choice(['東', '西', '南', '北'])
        order_source_name_ = random.choice(['Web', 'app', 'H5'])
        # An order is marked done exactly when it was paid.
        is_done_ = is_paid_
        return user_id_, order_id_, is_paid_, amount_, created_date_, created_time_, \
            business_type_, region_name_, order_source_name_, is_done_
41 
if __name__ == '__main__':
    # Script entry point: inserts `counts` random orders into
    # `database`.`table`, one row per statement, committing after each
    # successful insert so a failing row never discards earlier ones.
    database = 'my_work'
    table = 'goods_orders_ful'
    counts = 10000
    create_data = CreateData()
    # NOTE(review): the database password is hard-coded in source; consider
    # loading it from configuration or the environment instead.
    con = pymysql.connect(database=database, host='localhost',
                          user='root', port=3306, password='199498zhx@')

    # Identifiers cannot be bound as parameters, so the (constant) schema and
    # table names are formatted in; every value goes through driver-side
    # placeholders. The explicit column list keeps the insert correct even if
    # the table's column order changes.
    sql = (
        "insert into `%s`.`%s` "
        "(user_id, order_id, is_paid, amount, created_date, created_time, "
        "business_type, region_name, order_source_name, is_done) "
        "values (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)"
    ) % (database, table)

    try:
        with con.cursor() as cur:
            for i in range(counts):
                user_id, order_id, is_paid, amount, created_date, created_time, \
                    business_type, region_name, order_source_name, is_done = create_data.create()
                # user_id, order_id and business_type target varchar columns;
                # order_id is a uuid.UUID, which the driver cannot escape
                # directly, so convert all three to str explicitly.
                params = (str(user_id), str(order_id), is_paid, amount,
                          created_date, created_time, str(business_type),
                          region_name, order_source_name, is_done)
                try:
                    cur.execute(sql, params)
                    con.commit()
                    # Report progress once every 1000 rows.
                    if (i + 1) % 1000 == 0:
                        print(i + 1)
                except Exception as e:
                    # Best-effort load: report the failing row and carry on.
                    print(e)
                    con.rollback()
    finally:
        # The cursor is closed by the `with` block above; the connection is
        # closed last, and also when the loop aborts unexpectedly.
        con.close()

源資料欄位有:

  user_id varchar(100), -- 使用者id

  order_id varchar(100), -- 訂單id

  is_paid bool, -- 使用者是否實際支付,1支付;0未支付

  amount double, -- 訂單金額

  created_date date, -- 訂單生成日期 yyyy-mm-dd

  created_time timestamp, -- 訂單生成時間 yyyy-mm-dd hh:mm:ss

  business_type varchar(10), -- 業務型別

  region_name varchar(10), -- 所屬區域:如 東部地區

  order_source_name varchar(10), -- 訂單渠道:Web、H5、App

  is_done bool -- 訂單是否完成

RFM 模型最終表資料

最終的視覺化分析使用jupyter完成