sklearn-woe/iv-乳腺癌分類器實戰
阿新 • • 發佈:2018-04-21
pandas 有變 備份 def 沒有 log ... 腳本 col
sklearn實戰-乳腺癌細胞數據挖掘
https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share
醫藥統計項目聯系QQ:231469242
如果樣本量太小,數據必須做分段化處理,否則會有很多空缺數據,woe效果不能有效發揮
隨機森林結果
iv>0.02的因子在隨機森林結果裏都屬於有效因子,但是隨機森林重要性最強的因子沒有出現在有效iv參數裏,說明這些缺失的重要變量是因為沒有做分段處理、數據過於離散所造成。
數據文件
腳本備份
step1_customers_split_goodOrBad.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 21:45:43 2018

@author QQ:231469242

Split the source workbook into two Excel files: one with the "good"
customers (diagnosis == "B") and one with the "bad" customers
(diagnosis == "M").
"""
import pandas as pd

# Input workbook
readFileName = "breast_cancer_總.xlsx"
# Output workbooks
saveFileName_good = "result_good.xlsx"
saveFileName_bad = "result_bad.xlsx"


def split_good_bad(df):
    """Split *df* by its ``diagnosis`` column.

    Args:
        df: DataFrame with a ``diagnosis`` column.

    Returns:
        A ``(df_good, df_bad)`` tuple: the rows whose diagnosis is ``"B"``
        and the rows whose diagnosis is ``"M"``. Rows with any other
        diagnosis value appear in neither frame.
    """
    diagnosis = df["diagnosis"]
    return df[diagnosis == "B"], df[diagnosis == "M"]


def main():
    """Read the source workbook and write the good/bad workbooks."""
    df = pd.read_excel(readFileName)
    df_good, df_bad = split_good_bad(df)
    # The DataFrame index is written as the first column, exactly as the
    # original script did; downstream column positions depend on it.
    df_good.to_excel(saveFileName_good, sheet_name="Sheet1")
    df_bad.to_excel(saveFileName_bad, sheet_name="Sheet1")


if __name__ == "__main__":
    main()
step2_automate_find_informative_variables.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 22:13:30 2018

@author: QQ:231469242

Compute the weight of evidence (WOE) and information value (IV) of each
variable by comparing the good-customer workbook with the bad-customer
workbook, write one detail workbook per variable into ``save/`` and collect
the variables whose total IV exceeds the threshold.

Negative WOE: good customers < bad customers.
Positive WOE: good customers > bad customers.
"""
import os

import numpy as np
import pandas as pd

# Input workbooks
FileName_good = "result_good.xlsx"
FileName_bad = "result_bad.xlsx"
# Output file name declared by the original script (it is never written to)
saveFileName = "result_woe_iv.xlsx"
# Directory that receives one workbook per variable
SAVE_DIR = "save/"

# Module-level state filled in by main(); the functions below fall back to
# these frames when no explicit frames are passed.
df_good = None
df_bad = None
# Names of all variables to evaluate
list_columns = []
# Names of the variables judged informative
list_Informative_variables = []


def _resolve_frames(good, bad):
    """Return the frames to use, falling back to the module-level ones."""
    good = df_good if good is None else good
    bad = df_bad if bad is None else bad
    if good is None or bad is None:
        raise ValueError("good/bad customer data has not been loaded")
    return good, bad


def Ratio_goodDevideBad(index, good=None, bad=None):
    """Compute per-value good/bad shares for one variable.

    Args:
        index: Positional index of the variable among the columns of the
            good-customer frame (first variable is 0).
        good: Good-customer DataFrame; defaults to the module-level
            ``df_good``.
        bad: Bad-customer DataFrame; defaults to the module-level ``df_bad``.

    Returns:
        An 8-tuple ``(columnName, num_goodCustomers, num_badCustomers,
        frenquency_goodCustomers, frenquency_badCustomers,
        ratio_goodCustomers, ratio_badCustomers, ratio_goodDevideBad)``.
        The counts exclude NaN; the frequency and ratio items are Series
        indexed by the distinct values of the variable.

    Raises:
        ValueError: If no frames were passed and none have been loaded.
    """
    good, bad = _resolve_frames(good, bad)
    columnName = good.columns[index]
    column_goodCustomers = good[columnName]
    column_badCustomers = bad[columnName]

    # Number of non-NaN observations on each side
    num_goodCustomers = column_goodCustomers.dropna().size
    num_badCustomers = column_badCustomers.dropna().size

    # Occurrences of every distinct value
    frenquency_goodCustomers = column_goodCustomers.value_counts()
    frenquency_badCustomers = column_badCustomers.value_counts()

    # Share of every distinct value within its own group
    ratio_goodCustomers = frenquency_goodCustomers / num_goodCustomers
    ratio_badCustomers = frenquency_badCustomers / num_badCustomers

    # Values present on only one side do not align and come out as NaN
    ratio_goodDevideBad = ratio_goodCustomers / ratio_badCustomers
    return (
        columnName,
        num_goodCustomers,
        num_badCustomers,
        frenquency_goodCustomers,
        frenquency_badCustomers,
        ratio_goodCustomers,
        ratio_badCustomers,
        ratio_goodDevideBad,
    )


def Woe(ratio_goodDevideBad):
    """Return the element-wise natural log of the good/bad ratio (WOE)."""
    return np.log(ratio_goodDevideBad)


def Iv_estimate(iv_sum, threshold=0.02):
    """Classify a variable by its total IV and print the verdict.

    Args:
        iv_sum: Total IV of the variable.
        threshold: IV must be strictly greater than this to be informative.

    Returns:
        ``"A"`` when the variable is informative, ``"B"`` otherwise.
    """
    if iv_sum > threshold:
        print("informative")
        return "A"
    print("not informative")
    return "B"


def build_woe_iv_table(index, good=None, bad=None):
    """Build the WOE/IV detail table of one variable.

    Args:
        index: Positional index of the variable (first variable is 0).
        good: Good-customer DataFrame; defaults to ``df_good``.
        bad: Bad-customer DataFrame; defaults to ``df_bad``.

    Returns:
        ``(columnName, df_sort, iv_sum)`` where ``df_sort`` is the detail
        table sorted by ``iv`` in descending order and ``iv_sum`` is the sum
        of the non-NaN IV contributions.
    """
    (
        columnName,
        num_goodCustomers,
        num_badCustomers,
        frenquency_goodCustomers,
        frenquency_badCustomers,
        ratio_goodCustomers,
        ratio_badCustomers,
        ratio_goodDevideBad,
    ) = Ratio_goodDevideBad(index, good, bad)

    woe = Woe(ratio_goodDevideBad)
    iv = (ratio_goodCustomers - ratio_badCustomers) * woe

    columns = [
        "num_goodCustomers",
        "num_badCustomers",
        "frenquency_goodCustomers",
        "frenquency_badCustomers",
        "ratio_goodCustomers",
        "ratio_badCustomers",
        "ratio_goodDevideBad",
        "woe",
        "iv",
    ]
    df_woe_iv = pd.DataFrame(
        {
            "num_goodCustomers": num_goodCustomers,
            "num_badCustomers": num_badCustomers,
            "frenquency_goodCustomers": frenquency_goodCustomers,
            "frenquency_badCustomers": frenquency_badCustomers,
            "ratio_goodCustomers": ratio_goodCustomers,
            "ratio_badCustomers": ratio_badCustomers,
            "ratio_goodDevideBad": ratio_goodDevideBad,
            "woe": woe,
            "iv": iv,
        },
        columns=columns,
    )
    df_sort = df_woe_iv.sort_values(by="iv", ascending=False)

    # Values seen on only one side have NaN IV and are left out of the total
    iv_sum = sum(value for value in iv if not np.isnan(value))
    return columnName, df_sort, iv_sum


def Write_singleVariable_to_Excel(index, good=None, bad=None):
    """Save the detail table of one variable and evaluate its total IV.

    Writes ``save/<columnName>.xlsx`` and prints the variable name, the
    verdict and the total IV.

    Args:
        index: Positional index of the variable (first variable is 0).
        good: Good-customer DataFrame; defaults to ``df_good``.
        bad: Bad-customer DataFrame; defaults to ``df_bad``.

    Returns:
        ``(iv_estimate, columnName)`` where ``iv_estimate`` is ``"A"`` or
        ``"B"`` as returned by :func:`Iv_estimate`.
    """
    columnName, df_sort, iv_sum = build_woe_iv_table(index, good, bad)
    df_sort.to_excel(os.path.join(SAVE_DIR, columnName + ".xlsx"))
    print("變量:", columnName)
    iv_estimate = Iv_estimate(iv_sum)
    print("iv_sum", iv_sum)
    return iv_estimate, columnName


def main():
    """Evaluate every variable and return the list of informative ones."""
    global df_good, df_bad, list_columns

    # exist_ok keeps a second run from failing because save/ already exists
    os.makedirs(SAVE_DIR, exist_ok=True)

    df_good = pd.read_excel(FileName_good)
    df_bad = pd.read_excel(FileName_bad)

    # The last column is skipped, as in the original script
    # NOTE(review): presumably it is the target/label column - confirm.
    list_columns = list(df_good.columns[:-1])

    for i, _ in enumerate(list_columns):
        # Process each variable once; calling the writer twice per variable
        # would repeat the computation, the file write and the console output.
        status, columnName = Write_singleVariable_to_Excel(i)
        if status == "A":
            list_Informative_variables.append(columnName)
    return list_Informative_variables


if __name__ == "__main__":
    main()
最終得到一部分有效因子,共12個,經過數據分段化處理,會得到更多有效因子。
python風控評分卡建模和風控常識
https://study.163.com/course/introduction.htm?courseId=1005214003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share
sklearn-woe/iv-乳腺癌分類器實戰