1. 程式人生 > >京東JData演算法大賽高潛使用者購買意向預測——復現

京東JData演算法大賽高潛使用者購買意向預測——復現

一、前言

  完全是重現別人的過程,學習思路和處理方式,僅供記錄,具體請看參考連結,更完善清晰

  參考連結      http://izhaoyi.top/2017/06/25/JData/#%E6%95%B0%E6%8D%AE%E9%9B%86%E8%A7%A3%E6%9E%90

  嘗試重現別人的挖掘過程,學習別人的思路

 

二、具體過程

  資料集介紹等前期資訊可以看參考連結,或是演算法大賽的官網,這裡直接進行操作

  

  資料預處理:

    異常值判斷

#檔名
#coding=utf-8
import matplotlib
import matplotlib.pyplot as plt
import numpy as np import pandas as pd ACTION_201602_FILE = "D:\data\JData_Action_201602.csv" #讀取資料 ACTION_201603_FILE = "D:\data\JData_Action_201603.csv" ACTION_201604_FILE = "D:\data\JData_Action_201604.csv" COMMENT_FILE = "D:\data\JData_Comment.csv" PRODUCT_FILE = "D:\data\JData_Product.csv" USER_FILE
= "D:\data\JData_User.csv" #USER_TABLE_FILE = "D:\data\ User_table.csv" #ITEM_TABLE_FILE = "D:\data\Item_table.csv"

    

    判斷是否空值

def check_empty(file_path,file_name):           #判斷是否存在空值
    file = open(file_path)                      #直接用pd.read_csv會報錯,因此先用file open
    df_file = pd.read_csv(file)
    
print('判斷missing value in {0},{1}'.format(file_name,df_file.isnull().any().any())) ''' isnull()判斷是否空值,但是直接使用的話得到的是一個矩陣, 因此用.any()得到每列是否存在空值的情況, 再使用.any()得到整個檔案是否存在空值的情況 '''
check_empty(USER_FILE,'user')
check_empty(ACTION_201602_FILE,'Action 2')
check_empty(ACTION_201603_FILE,'Action 3')
check_empty(ACTION_201604_FILE,'Action 4')
check_empty(COMMENT_FILE,'Comment')
check_empty(PRODUCT_FILE,'Product')

    得到結果

判斷missing value in user,True
判斷missing value in Product,False
判斷missing value in Action 2,True
判斷missing value in Action 3,True
判斷missing value in Action 4,True
判斷missing value in Comment,False

 

    檢視每個表空值的情況,也就是列列空值情況

def empty_detail(file_path,file_name):
    file = open(file_path)
    df_file = pd.read_csv(file)
    print('空值詳細資訊 of {0}'.format(file_name))
    print(pd.isnull(df_file).any())         #.any()檢視列情況

empty_detail(USER_FILE,'User')
empty_detail(ACTION_201604_FILE,'Action 2')
empty_detail(ACTION_201603_FILE,'Action 3')
empty_detail(ACTION_201602_FILE,'Action 4')

    得到結果

空值詳細資訊 of User
user_id        False
age             True
sex             True
user_lv_cd     False
user_reg_tm     True
dtype: bool
空值詳細資訊 of Action 2
user_id     False
sku_id      False
time        False
model_id     True
type        False
cate        False
brand       False
dtype: bool
空值詳細資訊 of Action 3
user_id     False
sku_id      False
time        False
model_id     True
type        False
cate        False
brand       False
dtype: bool
空值詳細資訊 of Action 4
user_id     False
sku_id      False
time        False
model_id     True
type        False
cate        False
brand       False
dtype: bool

  可得,存在空值的情況為

    User

      age,sex,user_reg_tm

    Action

      model_id

 

  接著檢視缺失值的數量和佔比

def empty_records(file_path,file_name,col_name):
    file = open(file_path)
    df_file = pd.read_csv(file)
    missing = df_file[col_name].isnull().sum().sum()        #使用.sum()

    print('缺失數 of {0} in {1} is {2}'.format(col_name,file_name,missing))
    print('佔百分比為:',missing*1.0/df_file.shape[0])
                #df.shape 獲取df的size
                #df.shape[0] 獲取df的行數    df.shape[1] 獲取列數


empty_records(USER_FILE,'User','age')
empty_records(USER_FILE,'User','sex')
empty_records(USER_FILE,'User','user_reg_tm')
empty_records(ACTION_201602_FILE,'Action 2','model_id')
empty_records(ACTION_201602_FILE,'Action 3','model_id')
empty_records(ACTION_201602_FILE,'Action 4','model_id')

  結果為

缺失數 of age in User is 3
佔百分比為: 2.8484347850855955e-05
缺失數 of sex in User is 3
佔百分比為: 2.8484347850855955e-05
缺失數 of user_reg_tm in User is 3
佔百分比為: 2.8484347850855955e-05
缺失數 of model_id in Action 2 is 4959617
佔百分比為: 0.4318183638671067
缺失數 of model_id in Action 3 is 10553261
佔百分比為: 0.4072043168995297
缺失數 of model_id in Action 4 is 5143018
佔百分比為: 0.38962452388019514

 

填充user檔案的空值,age用-1,sex用2

userfile = open(USER_FILE)
user = pd.read_csv(userfile)           #填充空值,age用-1,sex用2
user['age'].fillna('-1',inplace=True)
user['sex'].fillna('2',inplace=True)

print(pd.isnull(user).any())

檢視結果

user_id        False
age            False
sex            False
user_lv_cd     False
user_reg_tm     True
dtype: bool

 

檢視各個檔案中未知記錄所佔比重

print('未知檔案 of age in user:{0} 所佔比重:{1}'.format(user[user['age']=='-1'].shape[0],\
                                                user[user['age']=='-1'].shape[0]/user.shape[0]))
print('未知檔案 of sex in user: {0} 所佔比重: {1} '.format(user[user['sex']==2].shape[0],\
                                                  user[user['sex']==2].shape[0]/user.shape[0] ))

結果

未知檔案 of age in user:14415 所佔比重:0.13686729142336287
未知檔案 of sex in user: 54735 所佔比重: 0.5196969265388669
def unknown_records(file_path, file_name, col_name):
    file_path1 = open(file_path)
    df_file = pd.read_csv(file_path1)
    missing = df_file[df_file[col_name] == -1].shape[0]
    print( 'No. of unknown {0} in {1} is {2}'.format(col_name, file_name, missing))
    print ('percent: ', missing  / df_file.shape[0])

'''
unknown_records(PRODUCT_FILE, 'Product', 'a1')
unknown_records(PRODUCT_FILE, 'Product', 'a2')
unknown_records(PRODUCT_FILE, 'Product', 'a3')
'''

 

資料一致性驗證:利用pd.Merge連線sku 和 Action中的sku, 觀察Action中的資料是否減少

def user_action_check():
    user_f = open(USER_FILE)
    df_user = pd.read_csv(user_f)
    df_sku = df_user.ix[:,'user_id'].to_frame()
    Ac2 = open(ACTION_201602_FILE)
    df_month2 = pd.read_csv(Ac2)
    Ac3 = open(ACTION_201603_FILE)
    print ('Is action of Feb. from User file? ', len(df_month2) == len(pd.merge(df_sku,df_month2)))
    df_month3 = pd.read_csv(Ac3)
    print ('Is action of Mar. from User file? ', len(df_month3) == len(pd.merge(df_sku,df_month3)))
    Ac4 = open(ACTION_201604_FILE)
    df_month4 = pd.read_csv(Ac4)
    print ('Is action of Apr. from User file? ', len(df_month4) == len(pd.merge(df_sku,df_month4)))


user_action_check()

結果

Is action of Feb. from User file?  True
Is action of Mar. from User file?  True
Is action of Apr. from User file?  True

結論: User資料集中的使用者和互動行為資料集中的使用者完全一致

 

#重複記錄分析

#檢查是否存在註冊時間在2016年-4月-15號之後的使用者

 

將user_id轉換為int

import pandas as pd
df_month = pd.read_csv('data\JData_Action_201602.csv')
df_month['user_id'] = df_month['user_id'].apply(lambda x:int(x))
print df_month['user_id'].dtype
df_month.to_csv('data\JData_Action_201602.csv',index=None)
df_month = pd.read_csv('data\JData_Action_201603.csv')
df_month['user_id'] = df_month['user_id'].apply(lambda x:int(x))
print df_month['user_id'].dtype
df_month.to_csv('data\JData_Action_201603.csv',index=None)
df_month = pd.read_csv('data\JData_Action_201604.csv')
df_month['user_id'] = df_month['user_id'].apply(lambda x:int(x))
print df_month['user_id'].dtype
df_month.to_csv('data\JData_Action_201604.csv',index=None)

 

按照星期對使用者進行分析

def get_from_action_data(fname, chunk_size=100000):
    reader = pd.read_csv(fname, header=0, iterator=True)
    chunks = []
    loop = True
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)[
                ["user_id", "sku_id", "type", "time"]]
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print("Iteration is stopped")
    df_ac = pd.concat(chunks, ignore_index=True)
    # type=4,為購買
    df_ac = df_ac[df_ac['type'] == 4]
    return df_ac[["user_id", "sku_id", "time"]]



df_ac = []
df_ac.append(get_from_action_data(fname=ACTION_201602_FILE))
df_ac.append(get_from_action_data(fname=ACTION_201603_FILE))
df_ac.append(get_from_action_data(fname=ACTION_201604_FILE))
df_ac = pd.concat(df_ac, ignore_index=True)

print(df_ac.dtypes)




# 將time欄位轉換為datetime型別
df_ac['time'] = pd.to_datetime(df_ac['time'])
# 使用lambda匿名函式將時間time轉換為星期(週一為1, 週日為7)
df_ac['time'] = df_ac['time'].apply(lambda x: x.weekday() + 1)


# 週一到週日每天購買使用者個數
df_user = df_ac.groupby('time')['user_id'].nunique()
df_user = df_user.to_frame().reset_index()
df_user.columns = ['weekday', 'user_num']


# 週一到週日每天購買商品個數
df_item = df_ac.groupby('time')['sku_id'].nunique()
df_item = df_item.to_frame().reset_index()
df_item.columns = ['weekday', 'item_num']


# 週一到週日每天購買記錄個數
df_ui = df_ac.groupby('time', as_index=False).size()
df_ui = df_ui.to_frame().reset_index()
df_ui.columns = ['weekday', 'user_item_num']


# 條形寬度
bar_width = 0.2
# 透明度
opacity = 0.4
plt.bar(df_user['weekday'], df_user['user_num'], bar_width,
        alpha=opacity, color='c', label='user')
plt.bar(df_item['weekday']+bar_width, df_item['item_num'],
        bar_width, alpha=opacity, color='g', label='item')
plt.bar(df_ui['weekday']+bar_width*2, df_ui['user_item_num'],
        bar_width, alpha=opacity, color='m', label='user_item')
plt.xlabel('weekday')
plt.ylabel('number')
plt.title('A Week Purchase Table')
plt.xticks(df_user['weekday'] + bar_width * 3 / 2., (1,2,3,4,5,6,7))
plt.tight_layout()
plt.legend(prop={'size':10})
#plt.show()

結果