資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸
阿新 • • 發佈:2019-01-05
作者: 江俊
日期: 2018年3月27日
主要介紹批量生成profiling圖以及五數概括的自建函式。
專案背景
某保養品公司目前有一款產品線銷售情況一直不景氣,公司預算有限,希望在現有的客戶中挖掘出最有可能在30天內購買該產品的使用者群
使用語言
R語言
使用模型
邏輯迴歸+線性迴歸
建模步驟
一、 瞭解資料
- 資料結構
- Y變數定義
- X變數型別
- 響應率情況
- 花費金額分佈
程式碼:
rm(list=ls())
setwd("./") #change the location
getwd() #check the location
list.files() #list the files under your location
#########################################################################
######################## Part1 read data ###########################
#########################################################################
filepath<-"./Exercise_Response_data.csv"
raw<-read.csv(filepath,stringsAsFactors = F )
dim(raw)
str(raw)
summary(raw)
var<-data.frame(var=colnames(raw),type=sapply(raw,class))
# 將結果匯出到 xlsx表格
require(XLConnect)
#xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
xlsx <- loadWorkbook('myhomework.xlsx',create = T)
createSheet(xlsx,name='variable') #name the worksheet as 'correlation'
writeWorksheet(xlsx,var,'variable',startRow=1,startCol=1, header=TRUE) #define the startrow,startcol,header
saveWorkbook(xlsx)
# dv_revenue
summary(raw$dv_revenue)
raw$dv_revenue<-ifelse(is.na(raw$dv_revenue),0,raw$dv_revenue)
# table 自動忽略缺失值
View(table(raw$dv_revenue))
hist(raw$dv_revenue) # dv_revenue hist
quantile(raw$dv_revenue,(1:20)/20,na.rm = T) # dv_revenue quantile
View(t(mean_rev<-quantile(raw$dv_revenue,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm = T)))
hist(raw[raw$dv_revenue>0 & raw$dv_revenue<=50,"dv_revenue"],main="dev_revenue <=50",xlab = "dev_revenue")
# dv_response
table(raw$dv_response)
prop.table(table(raw$dv_response))
執行結果:
0 1
22878 1220
0 1
0.94937339 0.05062661
二、拆分資料
- train:訓練集
- test:驗證集
程式碼:
#########################################################################
######################## Part2 split into two ######################
#########################################################################
# modeling segments
table(raw$segment)
prop.table(table(raw$segment))
#separate build sample
train<-raw[raw$segment=="build",]
table(train$segment)
#separate inval sample
test<-raw[raw$segment=="inval",]
table(test$segment)
執行結果:
build inval
16898 7200
build inval
0.70122 0.29878
build
16898
inval
7200
三、 探索資料
- 分型別、數值型
- X內部表現
- X與Y關係
- 缺失值
批量生成profiling圖
程式碼:
#########################################################################
######################## Part3 profile ############################
#########################################################################
#overall performance
#總體人數,計算總體樣本響應情況
overrall<-dim(train)[1]
#相應人數,因為響應的記為1,所以可以直接使用sum()求和
over_responder<-sum(train$dv_response)
#responder<-length(train$dv_response[train$dv_response==1])
#響應率
over_response_rate<-over_responder/overrall
overall_perf<-data.frame(overrall,responder=over_responder,response_rate=over_response_rate)
overall_perf
#variable type
data.frame(table(sapply(train[,4:27],class)))
#character
#檢視資料型別為某種的資料名,類似的有:is.character,is.numeric,is.factor
chavar_name<-colnames(train[,4:27])[unlist(lapply(train[,4:27],is.character))]
#字元型資料索引
charater_index<-which(colnames(train) %in% chavar_name)
lapply(train[,chavar_name],table)
#整數型,注意可能是分型別數值
intvar_name<-colnames(train[,4:27])[unlist(lapply(train[4:27],is.integer))]
summary(train[,intvar_name])
#根據結果記錄分型別數值的變數名
var_fenlei<-c(chavar_name,"Occupation","Education","Frequency_of_last_mth")
lapply(train[,var_fenlei],table)
#根據分型別和連續型將原資料集分成兩類,方便後續profile的批量處理
#分型別數值的索引
fenlei_index<-which(colnames(train) %in% var_fenlei)
#除開id列,響應變數列,字元型,分型別數值以外的連續數值型變數
#which(colnames(train[,4:27]) %in% c("rid","dv_response","dv_revenue"))
numvar_name<-colnames(train[,-c(1:3,fenlei_index,28:ncol(train))])
#數值型數值的索引
lianxu_index<-which(colnames(train) %in% numvar_name)
############################################### 1. Profiling for category variables####################################################
#install.packages('plyr')
library(plyr)
###################################### 1.profile 分型別數值 #########################################
#封裝函式,分型別數值
#資料集,索引,索引長度
profile_fenlei<-function(x,y,n){
results<-data.frame(var=NA,category=NA,count=NA,responder=NA,
percent=NA,response_rate=NA,index=NA)
for(i in 1:n){
prof<-ddply(x,.(x[,y[i]]),summarise,count=length(id),responder=sum(dv_response)) #group by hh_gender_m_flg
#prof
#新增百分比結果
propf<-within(prof,{
index<-responder/count/over_response_rate*100
response_rate<-responder/count*100
percent<-count/overrall*100
}) #add response_rate,index, percentage
propf<-data.frame(var=colnames(train)[y[i]],propf)
colnames(propf)[2]<-"category"
#行連線
results<-rbind(results,propf)
}
#去除首行的空值
results<-results[-1,]
row.names(results)<-1:nrow(results)
return(results)
}
#分類數值的profile
results_fenlei<-profile_fenlei(train[,1:28],fenlei_index,length(fenlei_index))
results_fenlei$category[is.na(results_fenlei$category)]<-"unknown"
results_fenlei$category[results_fenlei$category==""]<-"unknown"
View(results_fenlei)
# #xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
# xlsx <- loadWorkbook('myhomework.xlsx')
# createSheet(xlsx,name='profile') #name the worksheet as 'correlation'
# writeWorksheet(xlsx,results_fenlei,'profile',startRow=1,startCol=1, header=TRUE) #define the startrow,startcol,header
# saveWorkbook(xlsx)
#
#
###################################### 1.profile 分型別數值 #########################################
##################################### 2.profile 連續型數值 #########################################
######封裝函式
#資料集,索引,索引長度,分段個數
profile_lianxu<-function(x,y,n,m){
var_data=x
results<-data.frame(var=NA,category=NA,count=NA,responder=NA,
percent=NA,response_rate=NA,index=NA)
for(i in 1:n){
#分離成兩部分:缺失值和無缺失值
nomissing<-data.frame(var_data[!is.na(var_data[,y[i]]),]) #select the no missing value records
missing<-data.frame(var_data[is.na(var_data[,y[i]]),]) #select the missing value records
##################3.2.1 numeric Profiling:missing part
missing2<-ddply(missing,.(missing[,y[i]]),summarise,count=length(id),responder=sum(dv_response)) #group by pos_revenue_base_sp_6mo
colnames(missing2)[1]<-"category"
#View(missing2)
missing_perf<-within(missing2,{
index<-responder/count/over_response_rate*100
response_rate<-responder/count*100
percent<-count/overrall*100
})
#View(missing_perf)
nomissing_value<-nomissing[,y[i]] #put the nomissing values into a variable
nomissing$category<-cut(nomissing_value,unique(quantile(nomissing_value,(0:m)/m)),include.lowest = T) #separte into 10 groups
#View(table(nomissing$var_category)) #take a look at the 10 category
prof2<-ddply(nomissing,.(category),summarise,count=length(id),responder=sum(dv_response))#group by the 10 groups
#View(prof2)
nonmissing_perf<-within(prof2,{
index<-responder/count/over_response_rate*100
response_rate<-responder/count*100
percent<-count/overrall*100
})#add avg_revenue,index,percent
#View(nonmissing_perf)
#set missing_perf and non-missing_Perf together
#View(missing_perf)
#View(nonmissing_perf)
#colnames(nonmissing_perf)[3]<-"responder"
lastprofile<-rbind(nonmissing_perf,missing_perf) #set 2 data together
lastprofile<-data.frame(var=colnames(train)[y[i]],lastprofile)
#行連線
results<-rbind(results,lastprofile)
}
#去除首行的空值
results<-results[-1,]
row.names(results)<-1:nrow(results)
return(results)
}
#連續數值的profile
results_lianxu<-profile_lianxu(train[,1:34],lianxu_index,length(lianxu_index),10)
results_lianxu$category[is.na(results_lianxu$category)]<-"unknown"
View(results_lianxu)
######封裝函式
##################################### 2.profile 連續型數值 #########################################
#將兩個 profile 合成一個整體,輸出到xlsx表格
#xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
final_profile<-rbind(results_fenlei,results_lianxu)
View(final_profile)
xlsx <- loadWorkbook('myhomework.xlsx')
createSheet(xlsx,name='profile') #name the worksheet as 'correlation'
writeWorksheet(xlsx,final_profile,'profile',startRow=1,startCol=1, header=T) #define the startrow,startcol,header
saveWorkbook(xlsx)
執行結果(部分截圖):
生成連續型數值的五數概括
程式碼:
#########################################################################
######################## Part4 means ##############################
#########################################################################
# 連續性資料的五數概括
dat_n<-train[,lianxu_index]
mean_var<-data.frame(var=1:ncol(dat_n),mean=NA,median=NA,"0%"=NA,
"1%"=NA,"10%"=NA,"25%"=NA,"50%"=NA,
"75%"=NA,"90%"=NA,"99%"=NA,"100%"=NA,
max=NA,missing=NA)
colnames(mean_var)[4:12]<-c("Minimum","1st Pthl","10th Pctl","25th Pctl","50th Pctl","75th Pctl","90th Pctl",
"99th Pctl","Maximum")
for(i in 1:ncol(dat_n)){
mean_var$var[i]=colnames(dat_n)[i]
mean_var$mean[i]=mean(dat_n[,i],na.rm=TRUE) #na.rm=TRUE去除NA的影響
mean_var$median[i]=median(dat_n[,i],na.rm=TRUE)
mean_var[i,4:12]=quantile(dat_n[,i],c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE)
mean_var$max[i]=max(dat_n[,i],na.rm=TRUE)
mean_var$missing[i]=sum(is.na(dat_n[,i]))
}
# #銷燬臨時變數
# dat_n<-NULL
#在列表中檢視數值變數的統計資訊
View(mean_var)
# 匯出到 xlsx 表格
xlsx <- loadWorkbook('myhomework.xlsx')
createSheet(xlsx,name='means') #name the worksheet as 'correlation'
writeWorksheet(xlsx,mean_var,'means',startRow=1,startCol=1, header=T) #define the startrow,startcol,header
saveWorkbook(xlsx)
執行結果(部分截圖):
未完待續。。。
轉載請註明出處