資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸

阿新 • • 發佈：2019-01-05

作者: 江俊
日期: 2018年3月27日

主要介紹批量生成profiling圖以及五數概括的自建函式。

專案背景

某保養品公司目前有一款產品線銷售情況一直不景氣，公司預算有限，希望在現有的客戶中挖掘出最有可能在30天內購買該產品的使用者群。

使用語言

R語言

使用模型

邏輯迴歸+線性迴歸

建模步驟

一、瞭解資料

資料結構
Y變數定義
X變數型別
響應率情況
花費金額分佈

程式碼：

# Session setup: empty the global environment, point the working directory at
# the folder that holds the data (edit the path as needed), then print the
# directory and its contents as a sanity check.
rm(list = ls())
setwd("./")
getwd()
list.files()


#########################################################################
########################   Part1 read data    ###########################
#########################################################################
filepath <- "./Exercise_Response_data.csv"
# Keep text columns as character vectors instead of factors.
raw <- read.csv(filepath, stringsAsFactors = FALSE)

# Quick look at size, structure and per-column summaries.
dim(raw)
str(raw)
summary(raw)

# One row per column of `raw`: its name and its (first) class. class() can
# return several values for one column, which would make sapply() return a
# list; taking the first value keeps `type` a plain character vector.
var <- data.frame(
  var = colnames(raw),
  type = vapply(raw, function(column) class(column)[1], character(1))
)

# Export the variable list to the 'variable' sheet of myhomework.xlsx.
# library() stops right here when XLConnect is not installed; require() would
# only return FALSE and let loadWorkbook() fail later.
library(XLConnect)
xlsx <- loadWorkbook("myhomework.xlsx", create = TRUE)
createSheet(xlsx, name = "variable")
writeWorksheet(xlsx, var, "variable", startRow = 1, startCol = 1, header = TRUE)
saveWorkbook(xlsx)

# dv_revenue: distribution of the revenue target.
  summary(raw$dv_revenue)
  # A missing revenue is recoded as 0 before any further tabulation.
  raw$dv_revenue <- ifelse(is.na(raw$dv_revenue), 0, raw$dv_revenue)
  View(table(raw$dv_revenue))
  hist(raw$dv_revenue)
  # Every 5th percentile.
  quantile(raw$dv_revenue, (1:20) / 20, na.rm = TRUE)

  # Selected percentiles, kept in `mean_rev` and shown as a single row.
  mean_rev <- quantile(raw$dv_revenue,
                       c(0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1),
                       na.rm = TRUE)
  View(t(mean_rev))

  # Zoom in on positive revenue up to 50. The title and axis now name the
  # plotted column (dv_revenue) and state both bounds of the filter.
  hist(raw[raw$dv_revenue > 0 & raw$dv_revenue <= 50, "dv_revenue"],
       main = "0 < dv_revenue <= 50", xlab = "dv_revenue")

# dv_response: counts and shares of the response flag.
  table(raw$dv_response)
  prop.table(table(raw$dv_response))

執行結果：
這裡寫圖片描述

 0     1 
22878  1220 

         0          1 
0.94937339 0.05062661

二、拆分資料

train：訓練集
test：驗證集
程式碼：

#########################################################################
########################   Part2 split into two    ######################
#########################################################################
# Modeling segments: size and share of each segment label.
  table(raw$segment)
  prop.table(table(raw$segment))

# Build (training) sample. %in% never yields NA, so a row whose segment is
# missing is simply left out; with `raw$segment == "build"` such a row would
# come back as a row filled with NA.
  train <- raw[raw$segment %in% "build", ]
  table(train$segment)

# Validation sample ("inval"), selected the same way.
  test <- raw[raw$segment %in% "inval", ]
  table(test$segment)

執行結果：

build inval 
16898  7200 

  build   inval 
0.70122 0.29878 

build 
16898 

inval 
 7200

三、探索資料

分型別、數值型
X內部表現
X與Y關係
缺失值

批量生成profiling圖

程式碼：

#########################################################################
########################   Part3 profile     ############################
#########################################################################
  # Overall performance of the build sample: number of rows, number of
  # responders and the resulting response rate.
  overrall <- nrow(train)
  # dv_response is a 0/1 flag, so its sum is the number of responders.
  over_responder <- sum(train$dv_response)
  over_response_rate <- over_responder / overrall
  overall_perf <- data.frame(
    overrall = overrall,
    responder = over_responder,
    response_rate = over_response_rate
  )
  overall_perf

  # Candidate predictors are columns 4:27 of the build sample.
  candidate_x <- train[, 4:27]

  # Number of candidate predictors per storage class. Only the first class of
  # each column is used so the result is always a plain character vector.
  data.frame(table(vapply(candidate_x, function(v) class(v)[1], character(1))))

  # Character columns: names, positions within `train`, and value counts.
  # train[names] (list-style selection) always returns a data frame, whereas
  # train[, names] collapses to a bare vector when exactly one name is given
  # and lapply() would then tabulate every single element.
  chavar_name <- colnames(candidate_x)[vapply(candidate_x, is.character, logical(1))]
  charater_index <- which(colnames(train) %in% chavar_name)
  lapply(train[chavar_name], table)

  # Integer columns; some of them may be category codes rather than
  # measurements, so inspect their summaries.
  intvar_name <- colnames(candidate_x)[vapply(candidate_x, is.integer, logical(1))]
  summary(train[intvar_name])

  # Categorical variables: every character column plus the integer-coded ones
  # chosen by hand after looking at the summaries above.
  var_fenlei <- c(chavar_name, "Occupation", "Education", "Frequency_of_last_mth")
  lapply(train[var_fenlei], table)
  # Positions of the categorical variables within `train`; used for the
  # batch profiling of categorical variables.
  fenlei_index <- which(colnames(train) %in% var_fenlei)

  # Continuous variables: what is left after dropping columns 1:3 (id and
  # response columns, per the original note), the categorical variables and
  # columns 28 onwards. Indexing the name vector instead of a column subset
  # of `train` still works when only one column remains.
  numvar_name <- colnames(train)[-c(1:3, fenlei_index, 28:ncol(train))]

  # Positions of the continuous variables within `train`.
  lianxu_index <- which(colnames(train) %in% numvar_name)

  ############################################### 1. Profiling for category variables####################################################
  #install.packages('plyr')
  library(plyr)

  ###################################### 1.profile 分型別數值 #########################################

  # Response profile of categorical variables.
  #
  # For each column of `x` selected by `y`, groups the rows by the column's
  # values and reports per value: row count, responder count, share of the
  # overall sample (percent), response rate (percent) and the response rate
  # relative to the overall response rate (index, 100 = same as overall).
  #
  # Args:
  #   x: data frame containing the profiled columns and `dv_response`.
  #   y: integer vector of column positions in `x` to profile.
  #   n: number of entries of `y` to process (the caller passes length(y)).
  #
  # Returns:
  #   Data frame with columns var, category, count, responder, percent,
  #   response_rate, index; one row per (variable, category) pair, and zero
  #   rows when n is 0.
  #
  # Reads `overrall` and `over_response_rate` from the enclosing (global)
  # environment and needs plyr to be attached for ddply()/summarise().
  profile_fenlei <- function(x, y, n) {
    # Seed row: fixes the column layout and, because rbind() takes column
    # classes from its first argument, keeps `category` from becoming a
    # factor. It is removed again after binding.
    seed <- data.frame(var = NA, category = NA, count = NA, responder = NA,
                       percent = NA, response_rate = NA, index = NA)
    pieces <- vector("list", n)
    for (i in seq_len(n)) {
      # Group size is taken from dv_response so the count does not depend on
      # the data having a column literally named `id`.
      # NOTE(review): plyr appears to export a function called id(); with no
      # `id` column, length(id) would then silently be 1 - confirm.
      prof <- ddply(x, .(x[, y[i]]), summarise,
                    count = length(dv_response), responder = sum(dv_response))
      propf <- within(prof, {
        index <- responder / count / over_response_rate * 100
        response_rate <- responder / count * 100
        percent <- count / overrall * 100
      })
      # Label rows with the column name taken from `x` itself, not from the
      # global `train`, so the label always matches the profiled column.
      propf <- data.frame(var = colnames(x)[y[i]], propf)
      colnames(propf)[2] <- "category"
      pieces[[i]] <- propf
    }
    # Bind everything once instead of growing the result inside the loop.
    results <- do.call(rbind, c(list(seed), pieces))
    results <- results[-1, ]
    row.names(results) <- NULL
    results
  }
  # Profile every categorical variable of the build sample, then label
  # missing or empty category values as "unknown".
  results_fenlei <- profile_fenlei(
    train[, 1:28],
    fenlei_index,
    length(fenlei_index)
  )
  results_fenlei$category[
    is.na(results_fenlei$category) | results_fenlei$category == ""
  ] <- "unknown"
  View(results_fenlei)

  # #xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
  # xlsx <- loadWorkbook('myhomework.xlsx')
  # createSheet(xlsx,name='profile')  #name the worksheet as 'correlation'
  # writeWorksheet(xlsx,results_fenlei,'profile',startRow=1,startCol=1, header=TRUE)  #define the startrow,startcol,header
  # saveWorkbook(xlsx)
  # 
  # 
  ###################################### 1.profile 分型別數值 #########################################

  #####################################  2.profile 連續型數值 #########################################

  # Response profile of continuous variables.
  #
  # For each column of `x` selected by `y`, the non-missing values are cut
  # into up to `m` quantile bands and the missing values form one extra row
  # (category NA). Per band it reports: row count, responder count, share of
  # the overall sample (percent), response rate (percent) and the response
  # rate relative to the overall response rate (index, 100 = same as overall).
  #
  # Args:
  #   x: data frame containing the profiled columns and `dv_response`.
  #   y: integer vector of column positions in `x` to profile.
  #   n: number of entries of `y` to process (the caller passes length(y)).
  #   m: number of quantile bands requested; fewer are produced when
  #      quantile break points coincide.
  #
  # Returns:
  #   Data frame with columns var, category, count, responder, percent,
  #   response_rate, index; zero rows when n is 0.
  #
  # Reads `overrall` and `over_response_rate` from the enclosing (global)
  # environment and needs plyr to be attached for ddply()/summarise().
  profile_lianxu <- function(x, y, n, m) {
    # Adds percent, response_rate and index to a frame that already has
    # `count` and `responder`.
    add_rates <- function(prof) {
      within(prof, {
        index <- responder / count / over_response_rate * 100
        response_rate <- responder / count * 100
        percent <- count / overrall * 100
      })
    }

    # Seed row: fixes the column layout and, because rbind() takes column
    # classes from its first argument, keeps `category` from becoming a
    # factor (the caller later writes "unknown" into it). Removed below.
    seed <- data.frame(var = NA, category = NA, count = NA, responder = NA,
                       percent = NA, response_rate = NA, index = NA)
    pieces <- vector("list", n)
    for (i in seq_len(n)) {
      is_missing <- is.na(x[[y[i]]])
      nomissing <- x[!is_missing, , drop = FALSE]
      missing <- x[is_missing, , drop = FALSE]

      # Missing part: all rows share the same (NA) value, so they form a
      # single row. Skipped when the column has no missing values.
      missing_perf <- NULL
      if (nrow(missing) > 0) {
        missing_perf <- add_rates(data.frame(
          category = NA,
          count = nrow(missing),
          responder = sum(missing$dv_response)
        ))
      }

      # Non-missing part: quantile bands. Skipped when every value is missing.
      nonmissing_perf <- NULL
      if (nrow(nomissing) > 0) {
        nomissing_value <- nomissing[[y[i]]]
        breaks <- unique(quantile(nomissing_value, (0:m) / m))
        if (length(breaks) > 1) {
          nomissing$category <- cut(nomissing_value, breaks, include.lowest = TRUE)
        } else {
          # Constant column: cut() would read a single break as a number of
          # intervals, so use one band covering that single value.
          nomissing$category <- factor(paste0("[", breaks, ",", breaks, "]"))
        }
        # Group size is taken from dv_response so the count does not depend
        # on the data having a column literally named `id`.
        prof2 <- ddply(nomissing, .(category), summarise,
                       count = length(dv_response), responder = sum(dv_response))
        nonmissing_perf <- add_rates(prof2)
      }

      lastprofile <- rbind(nonmissing_perf, missing_perf)
      if (is.null(lastprofile)) {
        next  # `x` has no rows at all
      }
      # Label rows with the column name taken from `x` itself, not from the
      # global `train`.
      pieces[[i]] <- data.frame(var = colnames(x)[y[i]], lastprofile)
    }
    # Bind everything once instead of growing the result inside the loop.
    results <- do.call(rbind, c(list(seed), pieces))
    results <- results[-1, ]
    row.names(results) <- NULL
    results
  }
  # Profile every continuous variable of the build sample with 10 requested
  # quantile bands, then label the missing-value rows as "unknown".
  results_lianxu <- profile_lianxu(
    train[, 1:34],
    lianxu_index,
    length(lianxu_index),
    10
  )
  results_lianxu$category <- replace(
    results_lianxu$category,
    is.na(results_lianxu$category),
    "unknown"
  )
  View(results_lianxu)
  ######封裝函式
#####################################  2.profile 連續型數值 #########################################

    # Stack the categorical and continuous profiles and write them to the
    # 'profile' sheet of the existing workbook myhomework.xlsx.
  final_profile <- rbind(results_fenlei, results_lianxu)
  View(final_profile)
  xlsx <- loadWorkbook("myhomework.xlsx")
  createSheet(xlsx, name = "profile")
  writeWorksheet(
    xlsx,
    final_profile,
    "profile",
    startRow = 1,
    startCol = 1,
    header = TRUE
  )
  saveWorkbook(xlsx)

執行結果（部分截圖）：
所有變數的profiling圖

生成連續型數值的五數概括

程式碼：

#########################################################################
########################   Part4 means     ##############################
#########################################################################
  # Summary statistics for the continuous variables of the build sample:
  # mean, median, selected percentiles, maximum and number of missing values.
  # drop = FALSE keeps `dat_n` a data frame even when only one continuous
  # variable is selected.
  dat_n <- train[, lianxu_index, drop = FALSE]
  pctl_probs <- c(0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1)
  # Output column headers. "1st Pctl" was misspelled "1st Pthl" before.
  stat_names <- c("mean", "median",
                  "Minimum", "1st Pctl", "10th Pctl", "25th Pctl", "50th Pctl",
                  "75th Pctl", "90th Pctl", "99th Pctl", "Maximum",
                  "max", "missing")
  # One row per variable, filled column by column below; seq_len() makes the
  # loop a no-op when there is no continuous variable.
  summary_stats <- matrix(NA_real_, nrow = ncol(dat_n), ncol = length(stat_names),
                          dimnames = list(NULL, stat_names))
  for (i in seq_len(ncol(dat_n))) {
    values <- dat_n[[i]]
    # na.rm = TRUE: statistics are computed on the non-missing values only.
    summary_stats[i, ] <- c(
      mean(values, na.rm = TRUE),
      median(values, na.rm = TRUE),
      quantile(values, pctl_probs, na.rm = TRUE, names = FALSE),
      max(values, na.rm = TRUE),
      sum(is.na(values))
    )
  }
  # check.names = FALSE preserves headers that contain spaces.
  mean_var <- data.frame(var = colnames(dat_n), summary_stats,
                         check.names = FALSE, stringsAsFactors = FALSE)
  mean_var$missing <- as.integer(mean_var$missing)
  # Inspect the statistics in the data viewer.
  View(mean_var)

  # Write the summary statistics to the 'means' sheet of the existing
  # workbook myhomework.xlsx.
  xlsx <- loadWorkbook("myhomework.xlsx")
  createSheet(xlsx, name = "means")
  writeWorksheet(
    xlsx,
    mean_var,
    "means",
    startRow = 1,
    startCol = 1,
    header = TRUE
  )
  saveWorkbook(xlsx)

執行結果（部分截圖）：
僅連續型數值變數的五數概括

未完待續。。。
轉載請註明出處

資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸

作者: 江俊；日期: 2018年3月27日。主要介紹批量生成profiling圖以及五數概括的自建函式。專案背景：某保養品公司目前有一款產品線銷售情況一直不景氣，公司預算有限，希望在現有的客戶中挖掘出最有可能在30天內購買該產品的使用者群

資料分析學習體驗——特徵變數相關係數和主成分分析

如何處理資料集中高度相關的特徵變數。作者: 江俊；時間: 2018/03/25。以下所有程式碼均使用R語言。資料集下載地址：https://download.csdn.net/download/smallernovice/10307411

資料分析學習筆記part_4

資料分析 Lesson 4 : 統計學描述性統計學 - 第一部分資料型別數值型別數值資料採用允許我們執行數學運算（例如計算狗的數量）的數值。分類資料分類資料用於標記一個群體或一組條目（例如狗的品種 —— 牧羊

資料分析學習筆記part_1

資料分析 Lesson 1 : SQL初探 SQL和移動平均值 SQL簡介實體關係圖(ERD) 是檢視資料庫中資料的常用方式。下面是我們將用於 Parch & Posey 資料庫的 ERD。包括：1. 表的名稱 2. 每個表中的列 3. 表配合工作的方式。如下圖所

Python資料分析學習路徑圖

本文摘自同行說使用者“風一樣的男子”，原文連結：http://www.yidianzixun.com/n/0CAz84ve?s=1&appid=yidian，如涉及版權問題請及時聯絡小編！ Python是一種面向物件、直譯式計算機程式設計語言，由Guido van Rossum於1989

資料分析學習之不得不知的八大演算法詳解

學習資料分析的朋友們都知道，演算法是不可或缺的，或者說演算法在一定程度上可以更好的量化的一個人的學習能力和水平，本文感謝科多大資料的馮老師，由他整理了經典的八大演算法，相關的資料希望能幫助大家瞭解。演算法一：快速排序法快速排序是由東尼 · 霍爾所發展的一種排序演算法。在平均狀況下，排序

資料分析學習筆記(1):工作環境以及建模理論基礎

一、環境部署　　1.python包管理：　　　　(1)安裝：pip install xxx,conda install xxx 　　　　(2)解除安裝：pip uninstall xxx,　　conda uninstall xxx 　　　　(3)升級：pip install -upgrade xx

大資料分析學習之路

一、大資料分析的五個基本方面二、如何選擇適合的資料分析工具三、如何區分三個大資料熱門職業四、從菜鳥成為資料科學家的 9步養成方案五、從入門到精通——快速學會大資料分析推薦下小編的大資料學習群；

大資料分析學習筆記（Z檢驗，分類器以及Association Rule）

大資料分析學習筆記（Z檢驗，分類器以及Association Rule） Task 1 – Hypothesis Testing To improve student learning performance, a teacher developed two new learning app

spark快速大資料分析學習筆記（1）

本文是《spark快速大資料分析學習》第三章學習筆記，文中大量摘抄書中原本，僅為個人學習筆記。 RDD基礎： RDD是一個不可變的分散式物件集合。每個RDD都被分為多個分割槽，這個分割槽執行在叢集的不同節點上。RDD可以包含Python、Java、Scala中任意型別的物件。建立RDD的方式：

史上最全Python資料分析學習路徑圖

Python是一種面向物件、直譯式計算機程式設計語言，由Guido van Rossum於1989年底發明。由於他簡單、易學、免費開源、可移植性、可擴充套件性等特點，Python又被稱之為膠水語言。下圖為主要程式語言近年來的流行趨勢，Python受歡迎程度扶搖直上。圖

Python資料分析學習筆記（1）numpy模組基礎入門

numpy模組可以進行高效的資料處理，並提供了陣列的支援，很多模組都依賴他，比如pandas、scipy、matplotlib等，因此這個模組是基礎。（1）匯入： import numpy （2）建立一維和二維陣列： #建立一維陣列 x=numpy.

大資料、資料分析學習資料合集（含學習路線圖）

給大家整理一下本年度一些優質的文章，根據大資料相關的知識點一個個整理的，整理的內容包括知識點普及、學習書籍、學習路線圖、學習筆記、學習資料、學習視訊等等。網際網路科技發展蓬勃興起，人工智慧時代來臨，抓住下一個風口。為幫助那些往想網際網路方向轉行想學習，卻因為時間不夠，資源不足而放棄的人。我自己

Python資料分析學習筆記（6）資料規約實戰--以主成分分析PCA為例

一、相關理論： 1、資料規約：產生更小且保持資料完整性的新資料集。意義在於降低無效、錯誤資料；降低儲存成本；少量且具有代表性的資料大幅加快，主要分為以下兩類： ①屬性規約：屬性合併或刪除無關維，目標是尋找最小子集使子集概率分佈儘可能與原來相同。常用方法：（

大資料分析學習的詳細解讀

以大資料分析師為目標，從資料分析基礎、JAVA語言入門和linux作業系統入門知識學起，系統介紹Hadoop、HDFS、MapReduce和Hbase等理論知識和hadoop的生態環境。一、大資料分析的五個基本方面 1，視覺化分析大資料分析的使用者有大資

Python資料分析學習總結

Python資料分析基礎 numpy 開源、資料計算擴充套件；ndarray、多維操作、線性代數 numpy使用程式： import numpy as np def main(): lst=[[1,3,5],[2,4,6]] print(type(lst)) np_lst=n

資料分析學習方向（一）

很多人看到了資料分析行業的火爆形勢，於是就想進入資料分析行業.他們在學習資料分析知識的時候可能有點迷茫，這是因為資料分析知識有很多內容，不知道如何去學習或者不知道從何處下手。今天我們中給大家介紹一系列的資料分析的學習方向，希望這篇文章能夠給大家帶來幫助。資料分析這四個字讓人們覺得這是一個高

資料分析學習方向（三）

在上一篇文章中我們簡單給大家介紹了資料分析工作中的資料獲取以及資料提取，這兩個步驟是十分重要的。要知道，資料分析就是分析資料，我們只有獲取了資料才能夠做好資料分析工作。但是我們提取了資料還是需要進一步整理的，下面我們就給大家講解一下資料分析中後續步驟。當我們獲取了資料之後，我們需要做資料預處理工作。很

Python資料分析學習筆記——DataFrame(還在更新中)

pandas的官方文件 1.DataFrame DataFrame是一個表格型的資料結構，它含有一組有序的列，每列可以是不同的值型別（數值、字串、布林值等）。DataFrame既有行索引也有列索引，它可以被看做由Series組成的字典（共用同一個索引）。 DataFrame可以通過類

資料分析學習筆記（七）-- 股價分析

本例子，通過numpy分析股價 csv檔案讀寫 CSV（Comma-Separated Value，逗號分隔值）是一種常見的檔案格式，通常資料庫的轉存檔案就是csv格式，檔案中的各個欄位對應於資料庫表中的列。這裡有一份csv格式的檔案，本文一該檔案資

資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸

專案背景

使用語言

使用模型

建模步驟

相關推薦