R語言-基於電商平臺真實使用者-商品行為資料的移動推薦演算法模型
推薦模型以阿里移動電商平臺的真實使用者-商品行為資料為基礎訓練。
該資料包含了抽樣出來的一定量使用者在一定時間之內的移動端行為資料,評分資料是這些使用者在之後的一天對商品子集的購買資料。任務是使用訓練資料建立推薦模型,並輸出使用者在接下來一天對商品子集購買行為的預測結果。
##########讀取資料####### install.packages('readr',dependencies = TRUE) library(readr) df2 <- read.csv(file="F://Data/UserBehavior.csv", nrow = 100000,header = F, col.names = c('user_ID','item_ID', 'item_category','behavior_type','time')) head(df2) #檢視前幾行資料 user_ID item_ID item_category behavior_type time 1 1 2268318 2520377 pv 1511544070 2 1 2333346 2520771 pv 1511561733 3 1 2576651 149192 pv 1511572885 4 1 3830808 4181361 pv 1511593493 5 1 4365585 2520377 pv 1511596146 6 1 4606018 2735466 pv 1511616481 str(df2) # 檢視資料集結構 table(is.na(df2)) #檢查是否有缺失值,直接用is.na()會返回一大堆布林值 FALSE 500000
載入資料:str(df2)可以大致瞭解原始資料集結構,資料集有4個變數分別為 user_ID:使用者ID;item_ID:商品ID; item_category:商品類別;behavior_type:行為型別,有四種分別為pv、fav、cart、buy;time:時間戳
df2$time_s <- as.POSIXct(df2$time, origin="1970-01-01 00:00:00") #時間戳轉換為標準時間 df2$time_day <- as.factor(substr(df2$time_s, 1,10)) #日期 df2$time_hour <- as.factor(substr(df2$time_s,11,13)) #小時 df2$time_weekday <- as.factor(weekdays(df2$time_s)) #星期幾 table(df2$time_day) #看看時間的分佈 library(plyr) df2$behavior_type <- mapvalues(df2$behavior_type, c('pv','fav','cart','buy'), c('1','2','3','4')) str(df2) user_all <- count(df2$user_ID) #該使用者的總行為數 user_all <- rename(user_all,c(x = 'user_ID', freq = 'user_all')) item_all <- count(df2$item_ID) #該商品的總行為數 item_all <- rename(item_all,c(x = 'item_ID', freq = 'item_all')) category_all <- count(df2$item_category) #商品類別的總行為數 category_all <- rename(category_all,c(x = 'item_category', freq = 'category_all')) library(dplyr) #lefet_join()函式類似於excel的vlookup函式 df2 <- left_join(df2,user_all) #使用者總活躍度匹配到df2 df2 <- left_join(df2,item_all) #商品熱度匹配到df2 df2 <- left_join(df2,category_all) #商品類別熱度匹配到df2
初步分析:用ggplot2包繪圖以觀察整個資料集
library(ggplot2) #載入ggplot2包 df2$time_day <- factor(df2$time_day) ggplot(data = df2,aes(x = df2$time_weekday)) + geom_bar(fill = "brown", colour = "black") #總的行為頻率圖(星期) ggplot(data = df2,aes(x = df2$time_weekday)) + geom_bar(fill = "brown", colour = "black") + facet_grid(df2$behavior_type~.) #基於行為分面的行為頻率圖(星期) ggplot(data = df2,aes(x = df2$time_hour)) + geom_bar(fill = "brown", colour = "black") #總的行為頻率圖(小時) ggplot(data = df2,aes(x = df2$time_hour)) + geom_bar(fill = "brown", colour = "black") + facet_grid(df2$behavior_type~.) #基於行為分面的行為頻率圖(小時) ggplot(data = df2,aes(x = df2$time_day)) + geom_bar(fill = "brown", colour = "black") #總的行為頻率圖(日期) ggplot(data = df2,aes(x = df2$time_day)) + geom_bar(fill = "brown", colour = "black") + facet_grid(df2$behavior_type~.) #基於行為分面的行為頻率圖(日期)
從星期-行為頻率圖可以看出使用者在週六日活躍度相對於週一至週五有較大提升,構建特徵時考慮將是否週末設定為啞變數。
從小時-行為頻率圖可看出行為頻率隨使用者作息時間變化明顯,在晚上20-22時使用者活躍度相對最高,可對時間切片來構建特徵。
以上對資料集進行了初步分析。至此,我們對所要分析的資料物件有了一個基本的輪廓認知。為進一步的資料探勘工作做出了先驗鋪墊,方便特徵工程中特徵的構建。
特徵工程:我一直都覺得,沒有最好的演算法,只有更好的思路,思路決定了模型的上限,而思路就反映在一些細節和特徵的構建中,包括資料集的預處理,著重於特徵工程有時可以起到事半功倍的奇效,所以這就是為什麼資料預處理佔了幾乎80%的工作量的原因。
因為這裡的資料比較乾淨,沒有缺失值,異常值的話取決於我們自己的定義,可能是一些點選量大而購買量幾乎為0的樣本等等,需要結合實際剔除異常值。這裡主要構建了幾個系列互動特徵:
使用者U特徵:使用者的點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映使用者長期屬性)
商品I特徵:商品的被點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映商品長期屬性)
商品分類C特徵:商品的被點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映商品類長期屬性)
使用者-時間UT特徵:使用者在最近的n天內、週末/非週末、每天不同時間段的點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映使用者近期屬性)
使用者-商品-時間UIT特徵:使用者在最近的n天內對該商品所屬類的點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映使用者對該商品的喜愛程度等等)
使用者-商品分類-時間UCT特徵:使用者在最近的n天內對該商品的點選量/收藏量/加購量/購買量/點選-購買轉化率/加購-購買轉化率,等等;(反映使用者對該類別商品的需求程度)
還可以提取很多其他的互動特徵,應儘量結合所擁有的資料集,找出相對較合適的有意義的特徵。
(多做時間切片的特徵對提高模型準確率有很大幫助),程式碼如下:
##########使用者U特徵###########
library(dplyr) #主要使用了for迴圈+ dplyr包的filter()篩選函式
user_pv <- c() #該使用者的總點選量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$user_ID == df2[i,1]& df2$behavior_type == '1')
countdata <- nrow(use_data)
user_pv <- rbind(user_pv,countdata)
}
write.csv(user_pv,file = 'F://Data/user_pv.csv')
user_pv <- read.csv(file = 'F://Data/user_pv.csv')
user_fav <- c() #該使用者的總收藏量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$user_ID == df2[i,1]& df2$behavior_type == '2')
countdata <- nrow(use_data)
user_fav <- rbind(user_fav,countdata)
}
write.csv(user_fav,file = 'F://Data/user_fav.csv')
user_fav <- read.csv(file = 'F://Data/user_fav.csv')
user_cart <- c() #該使用者的總加購量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$user_ID == df2[i,1]& df2$behavior_type == '3')
countdata <- nrow(use_data)
user_cart <- rbind(user_cart,countdata)
}
write.csv(user_cart,file = 'F://Data/user_cart.csv')
user_cart <- read.csv(file = 'F://Data/user_cart.csv')
user_buy <- c() #該使用者的總購買量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$user_ID == df2[i,1]& df2$behavior_type == '4')
countdata <- nrow(use_data)
user_buy <- rbind(user_buy,countdata)
}
write.csv(user_buy,file = 'F://Data/user_buy.csv')
user_buy <- read.csv(file = 'F://Data/user_buy.csv')
#
df_U <- data.frame(df2[,12],user_pv[,2],user_fav[,2],user_cart[,2],user_buy[,2])
colnames(df_U) <- c('user_all','user_pv','user_fav','user_cart','user_buy')
write.csv(df_U, file = 'F://Data/df_U.csv')
df_U <- read.csv(file = 'F://Data/df_U.csv')[,-1]
df_U$user_pa <- round(df_U$user_pv/df_U$user_all, digits = 3)
df_U$user_fa <- round(df_U$user_fav/df_U$user_all, digits = 3)
df_U$user_ca <- round(df_U$user_cart/df_U$user_all, digits = 3)
df_U$user_ba <- round(df_U$user_buy/df_U$user_all, digits = 3)
user_all_mean <- mean(df_U$user_all,trim = 0.05,na.rm = TRUE) #捨棄前後5%的樣本和缺失值後的平均值
df_U$user_activity <- round(df_U$user_all/user_all_mean, digits = 3)
write.csv(df_U, file = 'F://Data/df_U_rate.csv')
df_U <- read.csv(file = 'F://Data/df_U_rate.csv')[,-1]
##########商品I特徵###########
item_pv <- c() #該商品的總點選量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_ID == df2[i,2] & df2$behavior_type == '1')
countdata <- nrow(use_data)
item_pv <- rbind(item_pv,countdata)
}
write.csv(item_pv,file = 'F://Data/item_pv.csv')
item_pv <- read.csv(file = 'F://Data/item_pv.csv')
item_fav <- c() #該商品的總收藏量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_ID == df2[i,2] & df2$behavior_type == '2')
countdata <- nrow(use_data)
item_fav <- rbind(item_fav,countdata)
}
write.csv(item_fav,file = 'F://Data/item_fav.csv')
item_fav <- read.csv(file = 'F://Data/item_fav.csv')
item_cart <- c() #該商品的總加購量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_ID == df2[i,2] & df2$behavior_type == '3')
countdata <- nrow(use_data)
item_cart <- rbind(item_cart,countdata)
}
write.csv(item_cart,file = 'F://Data/item_cart.csv')
item_cart <- read.csv(file = 'F://Data/item_cart.csv')
item_buy <- c() #該商品的總購買量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_ID == df2[i,2]& df2$behavior_type == '4')
countdata <- nrow(use_data)
item_buy <- rbind(item_buy,countdata)
}
write.csv(item_buy,file = 'F://Data/item_buy.csv')
item_buy <- read.csv(file = 'F://Data/item_buy.csv')
df_I <- data.frame(df2[,13],item_pv[,2],item_fav[,2],item_cart[,2],item_buy[,2])
colnames(df_I) <- c('item_all','item_pv','item_fav','item_cart','item_buy')
write.csv(df_I, file = 'F://Data/df_I.csv')
df_I <- read.csv(file = 'F://Data/df_I.csv')[,-1]
df_I$item_pa <- round(df_I$item_pv/df_I$item_all, digits = 3)
df_I$item_fa <- round(df_I$item_fav/df_I$item_all, digits = 3)
df_I$item_ca <- round(df_I$item_cart/df_I$item_all, digits = 3)
df_I$item_ba <- round(df_I$item_buy/df_I$item_all, digits = 3)
item_all_mean <- mean(df_I$item_all,trim = 0.05,na.rm = TRUE) #捨棄前後5%的樣本和缺失值後的平均值
df_I$item_hot <- round(df_I$item_all/item_all_mean, digits = 3)
write.csv(df_I, file = 'F://Data/df_I_rate.csv')
df_I <- read.csv(file = 'F://Data/df_I_rate.csv')[,-1]
##########商品分類C特徵###########
category_pv <- c() #該商品分類的總點選量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_category == df2[i,3]
& df2$behavior_type == '1')
countdata <- nrow(use_data)
category_pv <- rbind(category_pv,countdata)
}
write.csv(category_pv,file = 'F://Data/category_pv.csv')
category_pv <- read.csv(file = 'F://Data/category_pv.csv')
category_fav <- c() #該商品分類的總收藏量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_ID == df2[i,2] &
df2$behavior_type == '2')
countdata <- nrow(use_data)
category_fav <- rbind(category_fav,countdata)
}
write.csv(category_fav,file = 'F://Data/category_fav.csv')
category_fav <- read.csv(file = 'F://Data/category_fav.csv')
category_cart <- c() #該商品分類的總加購量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_category == df2[i,3]
& df2$behavior_type == '3')
countdata <- nrow(use_data)
category_cart <- rbind(category_cart,countdata)
}
write.csv(category_cart,file = 'F://Data/category_cart.csv')
category_cart <- read.csv(file = 'F://Data/category_cart.csv')
category_buy <- c() #該商品分類的總購買量
for (i in 1:dim(df2)[1]){
use_data <- filter(df2, df2$item_category == df2[i,3]
& df2$behavior_type == '4')
countdata <- nrow(use_data)
category_buy <- rbind(category_buy,countdata)
}
write.csv(category_buy,file = 'F://Data/category_buy.csv')
category_buy <- read.csv(file = 'F://Data/category_buy.csv')
df_C <- data.frame(df2[,14],category_pv[,2],category_fav[,2],
category_cart[,2],category_buy[,2])
colnames(df_C) <- c('category_all','category_pv','category_fav',
'category_cart','category_buy')
write.csv(df_C, file = 'F://Data/df_C.csv')
df_C <- read.csv(file = 'F://Data/df_C.csv') #商品分類C特徵
df_C$pa_rate <- round(df_C$category_pv/df_C$category_all, digits = 3)
df_C$fa_rate <- round(df_C$category_fav/df_C$category_all, digits = 3)
df_C$ca_rate <- round(df_C$category_cart/df_C$category_all, digits = 3)
df_C$ba_rate <- round(df_C$category_buy/df_C$category_all, digits = 3)
category_all_mean <- mean(df_C$category_all,trim = 0.05,na.rm = TRUE) #捨棄前後5%的樣本和缺失值後的平均值
df_C$C_activity <- round(df_C$category_all/category_all_mean, digits = 3)
write.csv(df_C, file = 'F://Data/df_C_rate.csv')
df_C <- read.csv(file = 'F://Data/df_C_rate.csv')[,-1]
colnames(df_C)[6:10] <- c('category_pa','category_fa','category_ca','category_ba','category_hot')
##########使用者-商品分類-時間UCT特徵###########
#最近5天該使用者的總行為數
user_5day=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5)
& df2$user_ID == df2[i,1])
countdata <- nrow(use_data)
user_5day <- rbind(user_5day,countdata)
}
head(user_5day)
write.csv(user_5day,file = 'F://Data/user_5day.csv')
user_5day <- read.csv(file = 'F://Data/user_5day.csv')
#最近5天該使用者對該商品類別的總行為數
user_category_5day=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_category == df2[i,3])
countdata <- nrow(use_data)
user_category_5day <- rbind(user_category_5day,countdata)
}
head(user_category_5day)
write.csv(user_category_5day,file = 'F://Data/user_category_5day.csv')
user_category_5day <- read.csv(file = 'F://Data/user_category_5day.csv')
#最近5天該使用者對該商品類別的點選數(pv1)
user_category_pv_5day <- c()
for (i in 1:dim(df2)[1]){
use_datapv <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_category == df2[i,3]
& df2$behavior_type == '1')
countdatapv <- nrow(use_datapv)
item5daypv <- rbind(item5daypv,countdatapv)
}
head(user_category_pv_5day)
write.csv(user_category_pv_5day,file = 'F://Data/user_category_pv_5day.csv')
user_category_pv_5day <- read.csv(file = 'F://Data/user_category_pv_5day.csv')
#最近5天該使用者對該商品類別的收藏數(fav2)
user_category_fav_5day <- c()
for (i in 1:dim(df2)[1]){
use_datafav <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_category == df2[i,3]
& df2$behavior_type == '2')
countdatafav <- nrow(use_datafav)
user_category_fav_5day <- rbind(user_category_fav_5day,countdatafav)
}
head(user_category_fav_5day)
write.csv(user_category_fav_5day,file = 'F://Data/user_category_fav_5day.csv')
user_category_fav_5day <- read.csv(file = 'F://Data/user_category_fav_5day.csv')
#最近5天該使用者對該商品類別的加購數(cart3)
user_category_cart_5day <- c()
for (i in 1:dim(df2)[1]){
use_datacart <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_category == df2[i,3]
& df2$behavior_type == '3')
countdatacart <- nrow(use_datacart)
user_category_cart_5day <- rbind(user_category_cart_5day,countdatacart)
}
head(user_category_cart_5day)
write.csv(user_category_cart_5day,file = 'F://Data/user_category_cart_5day.csv')
user_category_cart_5day <- read.csv(file = 'F://Data/user_category_cart_5day.csv')
#最近5天該使用者對該商品類別的購買數(buy4)
user_category_buy_5day <- c()
for (i in 1:dim(df2)[1]){
use_databuy <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_category == df2[i,3]
& df2$behavior_type == '4')
countdatabuy <- nrow(use_databuy)
user_category_buy_5day <- rbind(user_category_buy_5day,countdatabuy)
}
head(user_category_buy_5day)
write.csv(user_category_buy_5day,file = 'F://Data/user_category_buy_5day.csv')
user_category_buy_5day <- read.csv(file = 'F://Data/user_category_buy_5day.csv')
df_UCT <- data.frame(user_category_5day[,2],user_category_pv_5day[,2],
user_category_fav_5day[,2],user_category_cart_5day[,2],
user_category_buy_5day[,2])
colnames(df_UCT) <- c('UC5day_all','UC5day_pv','UC5day_fav','UC5day_cart','UC5day_buy')
write.csv(df_UCT, file = 'F://Data/df_UCT.csv')
df_UCT <- read.csv(file = 'F://Data/df_UCT.csv')[,-1] #使用者-商品分類-時間UCT特徵
df_UCT$UC5day_all <- user_5day[,1]
df_UCT$aa_rate <- round(df_UCT$UC5day_category/df_UCT$UC5day_all, digits = 3)
df_UCT$aa_rate[is.na(df_UCT$aa_rate)] <- 0
df_UCT$pa_rate <- round(df_UCT$UC5day_pv/df_UCT$UC5day_all, digits = 3)
df_UCT$pa_rate[is.na(df_UCT$pa_rate)] <- 0
df_UCT$fa_rate <- round(df_UCT$UC5day_fav/df_UCT$UC5day_all, digits = 3)
df_UCT$fa_rate[is.na(df_UCT$fa_rate)] <- 0
df_UCT$ca_rate <- round(df_UCT$UC5day_cart/df_UCT$UC5day_all, digits = 3)
df_UCT$ca_rate[is.na(df_UCT$ca_rate)] <- 0
df_UCT$ba_rate <- round(df_UCT$UC5day_buy/df_UCT$UC5day_all, digits = 3)
df_UCT$ba_rate[is.na(df_UCT$ba_rate)] <- 0
UC5day_all_mean <- mean(df_UCT$UC5day_all,trim = 0.05,na.rm = TRUE) #捨棄前後5%的樣本和缺失值後的平均值
df_UCT$UCT_activity <- round(df_UCT$UC5day_all/UC5day_all_mean, digits = 3)#使用者近5天活躍度(相對近五天所有人)
write.csv(df_UCT, file = 'F://Data/df_UCT_rate.csv')
colnames(df_UCT) <- c('uc5_all','uc5_pv','uc5_fav','uc5_cart','uc5_buy','u5_all',
'uc5_pa','uc5_fa','uc5_ca','uc5_ba','uc5_u5','u5_activity')
df_UCT <- read.csv(file = 'F://Data/df_UCT_rate.csv')[,-1]
##########使用者-時間UT特徵###########
#是否週末對該使用者行為的影響
library(plyr)
df2$time_week <- mapvalues(df2$time_weekday,c('星期一','星期二','星期三','星期四','星期五','星期六','星期日'),
c('0','0','0','0','0','1','1'))
str(df2)
library(dplyr)
#週末該使用者平均每天的總行為數
user_holiday=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$user_ID == df2[i,1]
& df2$time_week == '1')
n_day <- nrow(data.frame(unique(use_data$time_day)))
countdata <- round(nrow(use_data)/n_day,digits = 1)
user_holiday <- rbind(user_holiday,countdata)
}
head(user_holiday)
write.csv(user_holiday,file = 'F://Data/user_holiday.csv')
user_holiday <- read.csv(file = 'F://Data/user_holiday.csv')
#週末該使用者平均每天的購買數
user_buy_holiday=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$user_ID == df2[i,1]
& df2$time_week == '1'
& df2$behavior_type == '4')
n_day <- nrow(data.frame(unique(use_data$time_day)))
countdata <- round(nrow(use_data)/n_day, digits = 1)
user_buy_holiday <- rbind(user_buy_holiday,countdata)
}
head(user_buy_holiday)
write.csv(user_buy_holiday,file = 'F://Data/user_buy_holiday.csv')
user_buy_holiday <- read.csv(file = 'F://Data/user_buy_holiday.csv')
#非週末該使用者平均每天的總行為數
user_workday=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$user_ID == df2[i,1]
& df2$time_week == '0')
n_day <- nrow(data.frame(unique(use_data$time_day)))
countdata <- round(nrow(use_data)/n_day,digits = 1)
user_workday <- rbind(user_workday,countdata)
}
head(user_workday)
write.csv(user_workday,file = 'F://Data/user_workday.csv')
user_workday <- read.csv(file = 'F://Data/user_workday.csv')
#非週末該使用者平均每天的購買數
user_buy_workday=c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$user_ID == df2[i,1]
& df2$time_week == '0'
& df2$behavior_type == '4')
n_day <- nrow(data.frame(unique(use_data$time_day)))
countdata <- round(nrow(use_data)/n_day,digits = 1)
user_buy_workday <- rbind(user_buy_workday,countdata)
}
head(user_buy_workday)
write.csv(user_buy_workday,file = 'F://Data/user_buy_workday.csv')
user_buy_workday <- read.csv(file = 'F://Data/user_buy_workday.csv')
df_UT <- data.frame(user_holiday[,2],user_buy_holiday[,2],
user_workday[,2],user_buy_workday[,2])
colnames(df_UT) <- c('user_holiday','user_buy_holiday','user_workday','user_buy_workday')
df_UT[is.na(df_UT)] <- 0
df_UT$holiday_ba <- round(df_UT$user_buy_holiday/df_UT$user_holiday,digits = 3)
df_UT$workday_ba <- round(df_UT$user_buy_workday/df_UT$user_workday,digits = 3)
df_UT[is.na(df_UT)] <- 0
table(is.na(df_UT))
write.csv(df_UT, file = 'F://Data/df_UT.csv')
df_UT <- read.csv(file = 'F://Data/df_UT.csv',header = TRUE)[,-1]
##########使用者-商品-時間UIT特徵###########
# 最近5天該使用者對該商品的行為總數(pv1/fav2/cart4/buy4)
library(dplyr)
user_item_5day_all <- c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_ID == df2[i,2])
countdata <- nrow(use_data)
user_item_5day_all <- rbind(user_item_5day_all,countdata)
}
head(user_item_5day_all)
write.csv(user_item_5day_all,file = 'F://Data/user_item_5day_all.csv')
user_item_5day_all <- read.csv(file = 'F://Data/user_item_5day_all.csv')
# 該使用者對該商品最近5天的點選數pv
user_item_pv_5day <- c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_ID == df2[i,2]
& df2$behavior_type == '1')
countdata <- nrow(use_data)
user_item_pv_5day <- rbind(user_item_pv_5day,countdata)
}
head(user_item_pv_5day)
write.csv(user_item_pv_5day,file = 'F://Data/user_item_pv_5day.csv')
user_item_pv_5day <- read.csv(file = 'F://Data/user_item_pv_5day.csv')
# 該使用者對該商品最近5天的收藏數fav
user_item_fav_5day <- c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_ID == df2[i,2]
& df2$behavior_type == '2')
countdata <- nrow(use_data)
user_item_fav_5day <- rbind(user_item_fav_5day,countdata)
}
head(user_item_fav_5day)
write.csv(user_item_fav_5day,file = 'F://Data/user_item_fav_5day.csv')
user_item_fav_5day <- read.csv(file = 'F://Data/user_item_fav_5day.csv')
# 該使用者對該商品最近5天的加購數cart
user_item_cart_5day <- c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_ID == df2[i,2]
& df2$behavior_type == '3')
countdata <- nrow(use_data)
user_item_cart_5day <- rbind(user_item_cart_5day,countdata)
}
head(user_item_cart_5day)
write.csv(user_item_cart_5day,file = 'F://Data/user_item_cart_5day.csv')
user_item_cart_5day <- read.csv(file = 'F://Data/user_item_cart_5day.csv')
# 該使用者對該商品最近5天的購買數buy
user_item_buy_5day <- c()
for (i in 1:dim(df2)[1]){
use_data <- filter(df2,df2$time < df2[i,5]
& df2$time > (df2[i,5]- 60*60*24*5) #注意factor不可比
& df2$user_ID == df2[i,1]
& df2$item_ID == df2[i,2]
& df2$behavior_type == '4')
countdata <- nrow(use_data)
user_item_buy_5day <- rbind(user_item_buy_5day,countdata)
}
head(user_item_buy_5day)
write.csv(user_item_buy_5day,file = 'F://Data/user_item_buy_5day.csv')
user_item_buy_5day <- read.csv(file = 'F://Data/user_item_buy_5day.csv')
df_UIT <- data.frame(user_item_5day_all[,2],user_item_pv_5day[,2],
user_item_fav_5day[,2],user_item_cart_5day[,2],
user_item_buy_5day[,2])
colnames(df_UIT) <- c('UI5day_all','UI5day_pv','UI5day_fav','UI5day_cart','UI5day_buy')
write.csv(df_UIT, file = 'F://Data/df_UIT.csv')
df_UIT <- read.csv(file = 'F://Data/df_UIT.csv')[,-1]
df_UIT$U5day_all <- user_5day[,1]
df_UIT$aa_rate <- round(df_UIT$UI5day_all/df_UIT$U5day_all, digits = 3)
df_UIT$aa_rate[is.na(df_UIT$aa_rate)] <- 0
df_UIT$ic_rate <- round(df_UIT$UI5day_all/df_UCT$UC5day_all, digits = 3)
df_UIT$ic_rate[is.na(df_UIT$ic_rate)] <- 0
df_UIT$pa_rate <- round(df_UIT$UI5day_pv/df_UIT$U5day_all, digits = 3)
df_UIT$pa_rate[is.na(df_UIT$pa_rate)] <- 0
df_UIT$fa_rate <- round(df_UIT$UI5day_fav/df_UIT$U5day_all, digits = 3)
df_UIT$fa_rate[is.na(df_UIT$fa_rate)] <- 0
df_UIT$ca_rate <- round(df_UIT$UI5day_cart/df_UIT$U5day_all, digits = 3)
df_UIT$ca_rate[is.na(df_UIT$ca_rate)] <- 0
df_UIT$ba_rate <- round(df_UIT$UI5day_buy/df_UIT$U5day_all, digits = 3)
df_UIT$ba_rate[is.na(df_UIT$ba_rate)] <- 0
write.csv(df_UIT, file = 'F://Data/df_UIT_rate.csv')
df_UIT <- read.csv(file = 'F://Data/df_UIT_rate.csv')[,-1]
colnames(df_UIT) <- c('ui5_all','ui5_pv','ui5_fav','ui5_cart','ui5_buy','u5_all',
'ui5_u5','ui5_pa','ui5_fa','ui5_ca','ui5_ba')
##########總特徵############
data_final <- cbind( df2,df_U,df_I,df_C,df_UT,df_UCT,df_UIT )
table(is.na(data_final))
write.csv(data_origin,file = 'F://Data/data_final.csv' )
特徵選擇:減少特徵數量、降維可使模型泛化能力更強,減少過擬合。特徵初步選擇有很多種不同的方法,基於特徵的方差:方差越小包含的資訊越少,可用於移除那些取值變化小的特徵;基於單個特徵與響應變數的測試得分,但可能剔除掉組合效果好的特徵;基於特徵間相關係數,移除強相關的特徵,等等這些都可以初步剔除對模型作用微乎其微的特徵。
這裡衡量特徵間相關係數來移除冗餘特徵,即移除高度關聯的特徵,R語言程式碼如下:
#Caret R包提供findCorrelation函式,分析特徵的關聯矩陣,移除冗餘特徵
set.seed(333)
library(mlbench)
library(caret)
data_final <- read.csv(file = 'F://Data/data_final.csv' )[,-1] #載入資料
str(data_final)
#去掉非數值型別的屬性,然後通過相關性計算得到一個關聯度矩陣
n1 <- grep('user_all',colnames(data_final))
n2 <- ncol(data_final)
correlation_matrix <- cor(data_final[,n1:n2]) #計算相關矩陣
#print(correlation_matrix)
# 查詢高度相關的屬性(相關係數>0.9),names = TRUE 返回屬性名,否則返回對應列序號
highly_correlated <- findCorrelation(correlation_matrix,
cutoff=0.9,names = TRUE)
# names = TRUE 返回列名,否則返回列序,(cutoff一般為0.75,這裡取0.9)
print(highly_correlated)
#[1] "user_activity" "user_all" "uc5_all" "u5_all"
#[5] "ui5_all" "item_hot" "item_all" "category_hot"
#[9] "category_all" "ui5_u5" "category_cart" "uc5_u5"
#[13] "item_fav"
data_final <- data_final[,-grep('user_activity',colnames(data_final))]
data_final <- data_final[,-grep('user_all',colnames(data_final))]
data_final <- data_final[,-grep('u5_all',colnames(data_final))]
data_final <- data_final[,-grep('item_all',colnames(data_final))]
data_final <- data_final[,-grep('category_all',colnames(data_final))]
data_final <- data_final[,-grep('ui5_u5',colnames(data_final))]
data_final <- data_final[,-grep('category_cart',colnames(data_final))]
data_final <- data_final[,-grep('uc5_u5',colnames(data_final))]
data_final <- write.csv(data_final,file = 'F://Data/data_final.csv')
劃分資料集:訓練集,驗證集和測試集,劃分資料集意義很重要,這裡也不多說,直接上程式碼:
############劃分資料集#########
rm(list = ls())
data_final <- read.csv(file = 'F://Data/data_final.csv' )[,-1]
data_final <- data_final[,-(5:9)]
library(plyr)
data_final$behavior_type <- mapvalues(data_final$behavior_type, c('1','2','3','4'),
c('0','0','0','1'))
data_final$behavior_type <- as.factor(data_final$behavior_type)
class(data_final$behavior_type)
set.seed(333) #隨便設定一個隨機種子
trainsample <- sample(nrow(data_final),0.7*nrow(data_final))
traindata <- data_final[trainsample,] #訓練集
write.csv(traindata,file = 'F://Data/traindata.csv' )
other <- data_final[-trainsample,] #驗證+測試
set.seed(333)
testsample <- sample(nrow(other),0.5*nrow(other))
testdata <- other[testsample,] #測試集
validatedata <- other[-testsample,]
write.csv(testdata,file = 'F://Data/testdata.csv' )
write.csv(validatedata,file = 'F://Data/validatedata.csv' )
訓練模型:到了訓練模型的步驟,預測使用者購買行為可以用LR、RF、GBDT等模型,也可以加入並行的神經網路演算法,這裡採用LR模型,R語言程式碼如下:
rm(list=ls())
logit_df <- read.csv(file = 'F://Data/traindata.csv' )[,-(1:4)]
str(logit_df)
logit_df$behavior_type <- as.factor(logit_df$behavior_type)
logit_df$time_week <- as.factor(logit_df$time_week)
str(logit_df)
table(is.na(logit_df))
logit_fit <- glm(behavior_type~.,data = logit_df, family = binomial(link = 'logit'))
summary(logit_fit)
logit_fit2 <- glm(behavior_type~.-category_fav-uc5_buy-ui5_buy-category_pv-
category_hot-user_all_w-user_buy_w,
data = logit_df,family = binomial(link = 'logit'))
#響應變數~.-特徵A, 去除變數A後對所有變數進行迴歸
summary(logit_fit2)
logit_fit3 <- glm(behavior_type~.-category_fav-uc5_buy-ui5_buy-category_pv-
category_hot-user_all_w-user_buy_w-item_pa-item_fa-
item_ca-item_ba-category_ba-uc5_ca,
data = logit_df,family = binomial(link = 'logit'))
summary(logit_fit3)
logit_fit3 <- glm(behavior_type~.-category_fav-uc5_buy-ui5_buy-category_pv-
category_hot-user_all_w-user_buy_w-item_pa-
item_ca-item_ba-category_ba-uc5_ca-item_fa-
user_pv-user_fav-user_cart,
data = logit_df,family = binomial(link = 'logit'))
summary(logit_fit3)
logit_fit4 <- glm(behavior_type~.-category_fav-uc5_buy-ui5_buy-category_pv-
category_hot-user_all_w-user_buy_w-item_pa-
item_ca-item_ba-category_ba-uc5_ca-item_fa-
user_pv-user_fav-user_cart-uc5_pa-uc5_fa-user_all_h,
data = logit_df,family = binomial(link = 'logit'))
summary(logit_fit4)
rm(logit_fit4)
logit_fit <- logit_fit4
n <- nrow(logit_df)
R2 <- 1-exp((logit_fit$deviance - logit_fit$null.deviance)/n)
cat("Cox-Snell R2=",R2,"\n") #計算Cox-Snell擬合優度
R2<-R2/(1-exp((-logit_fit$null.deviance)/n))
R2 #計算Nagelkerke擬合優度,最後輸出Nagelkerke擬合優度值
# R2= 0.7373435
logit_validate <- read.csv(file = "F://Data/validatedata.csv")[,-(1:4)]
logit_validate$behavior_type <- as.factor(logit_validate$behavior_type)
logit_validate$time_week <- as.factor(logit_validate$time_week)
str(logit_validate)
prob <- predict(logit_fit,logit_validate,type = 'response')
head(prob)
df_a <- c()
i = 0
a <- 0
for (i in 0:99){
i <- i + 1
a <- a +0.01
df_a <- rbind(df_a,a)
}
F1 <- c()
for (sp in df_a[1:100]){
#設定閾值
logit_pred <- factor(prob > sp,levels = c(FALSE , TRUE),
labels = c('0','1'))
# 評估模型
logit_evaluation <- table(logit_validate$behavior_type,logit_pred,
dnn = c('Actual','Predicted'))
logit_evaluation
TP <- logit_evaluation[2,2] #判定為1實際為1的樣本數
FP <- logit_evaluation[1,2] #判定為1實際為0的樣本數
TN <- logit_evaluation[1,1] #判定為0實際為0的樣本數
FN <- logit_evaluation[2,1] #判定為0實際為1的樣本數
precision <- TP/(TP + FP) #準確率
recall <- TP/(TP + FN) #召回率
rluofa <- 1 #F_measure 取 F1
F_measure <- ((rluofa**2 + 1)*precision*recall) / ((rluofa**2)*(precision + recall))
use_data <- cbind(sp, precision, recall, F_measure)
F1 <- rbind(F1, use_data)
cat('閾值P=',sp,'準確率=',precision,"召回率=",recall,'F1值=',F_measure,'\n')
}
df_f1 <- data.frame(F1)
ggplot(data = df_f1,aes(x = sp, y = F_measure))+
geom_line()+geom_point()
write.csv(df_f1,file = 'F://Data/df_f1.csv')
# 根據F1的變化,選擇其最大值F1=0.72,對應劃分N/P閾值sp = 0.38
#根據對驗證集的預測調好引數後可以用測試集試試模型效果如何
用for迴圈把LR模型的N/P劃分閾值從0.00到1.00都跑了一遍,得到不同劃分閾值對應的準確率、召回率、F1值資料,如下圖:
MySQL儲存資料:最最重要的當然還是儲存資料啊
##########資料庫MySQL儲存資料##########
install.packages('RMySQL',dependencies = TRUE)
library(RMySQL,DBI)
#資料操作方法:
library(RMySQL)
help(package = "RMySQL") #檢視RMySQL的說明文件,裡面有RMySQL所有可用的方法
#建立資料庫連線
con <- dbConnect(MySQL(),
username="root",
dbname="ali",
password="123456",
host="localhost",
port=3306)
summary(con)
dbGetInfo(con) #獲取連線資訊
dbWriteTable(con, "logit_fit", logit_fit,row.names = TRUE) #在資料庫'ali'下寫表'logit_fit'
dbReadTable(con, "ali.logit_fit") #讀表'ali.logit_fit
dbDisconnect(conn) #關閉連線
大致的流程就是如此,其中還有很多錯誤之處,歡迎各位前輩指出!