1. 程式人生 > >使用者貸款風險預測-datacastle競賽題目

使用者貸款風險預測-datacastle競賽題目

##自己是大菜鳥一枚,datacastle比賽題目,用的是Logistic,做出的結果不好,目前只排在200多名。先放在部落格上面,專案比較緊張,還得學一些javaweb的東西,就怕以後沒時間做了。。。。

# -*- coding: utf-8 -*-
"""
Created on Tue Jan 10 09:54:12 2017
###Datacastle的‘使用者貸款風險預測’競賽題目###
#初步想法是利用邏輯斯蒂迴歸,特徵的選擇對結果影響很大,有時間的話多看看特徵選擇方面的東西
"""
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

class DataCastle(object):
    def __init__(self):
        self.name = "<<- User loan forecast match ->>"
        self.result = "result.csv"
        
    #讀取使用者資訊表 並返回
    def readUserInfo(self):
        user_info_train = readData("train/user_info_train.txt")
        user_info_test = readData("test/user_info_test.txt")
        col_names = ['userid', 'sex', 'occupation', 'education', 'marriage', 'household']
        user_info_train.columns = col_names
        user_info_test.columns = col_names
        user_info = pd.concat([user_info_train, user_info_test])
        user_info.index = user_info['userid']
        user_info.drop('userid',axis=1,inplace=True)
        return user_info
    
    #讀取使用者銀行賬單表 對賬單資料求和並返回
    def readBankDetail(self):
        bank_detail_train = readData("train/bank_detail_train.txt")
        bank_detail_test = readData("test/bank_detail_test.txt")
        col_names = ['userid', 'time_bank', 'tradeType', 'tradeMoney', 'incomeTag']
        bank_detail_train.columns = col_names
        bank_detail_test.columns = col_names
        bank_detail_pre = pd.concat([bank_detail_train,bank_detail_test])
        bank_detail = (bank_detail_pre.loc[:,['userid','tradeType', 'tradeMoney']]).groupby(['userid','tradeType']).sum()
        bank_detail = bank_detail.unstack()
        bank_detail.columns = ['income','outcome']
        return bank_detail
        
    #讀取使用者的瀏覽歷史 對瀏覽資料求和並返回
    def readBrowseHistory(self):
        browse_history_train = readData("train/browse_history_train.txt")
        browse_history_test = readData("test/browse_history_test.txt")
        col_names = ['userid', 'time_browse', 'browseData', 'browseTag']
        browse_history_train.columns = col_names
        browse_history_test.columns = col_names
        browse_history_pre = pd.concat([browse_history_train, browse_history_test])
        browse_history = (browse_history_pre.loc[:,['userid','browseData']]).groupby(['userid']).sum()
        return browse_history
        
    #讀取信用卡賬單記錄 取均值並返回
    def readBillDetail(self):
        bill_detail_train = readData("train/bill_detail_train.txt")
        bill_detail_test = readData("test/bill_detail_test.txt")
        col_names = ['userid', 'time_bill', 'bank_id', 'prior_account', 'prior_repay',
             'credit_limit', 'account_balance', 'minimun_repay', 'consume_count',
             'account', 'adjust_account', 'circulated_interest', 'avaliable_balance',
             'cash_limit', 'repay_state']
        bill_detail_train.columns = col_names
        bill_detail_test.columns = col_names
        bill_detail_pre = pd.concat([bill_detail_train,bill_detail_test])
        bill_detail_pre.drop('bank_id',axis=1,inplace=True)
        bill_detail = bill_detail_pre.groupby(['userid']).mean()
        return bill_detail
        
    #讀取使用者發放貸款時間 並返回
    def readLoanTime(self):
        loan_time_train = readData("train/loan_time_train.txt")
        loan_time_test = readData("test/loan_time_test.txt")
        col_names = ['userid','loanTime']
        loan_time_train.columns = col_names
        loan_time_test.columns = col_names
        loan_time = pd.concat([loan_time_train,loan_time_test])
        loan_time.index = loan_time['userid']
        loan_time.drop('userid',axis=1,inplace=True)
        return loan_time
        
     #讀取類別資訊
    def readTarget(self):
        target = readData("train/overdue_train.txt")
        target.columns = ['userid', 'label']
        target.index = target['userid']
        target.drop('userid',axis = 1,inplace = True)
        return target
    
    #利用邏輯斯蒂迴歸
    def logisticMethod(self):
        
        user_info = self.readUserInfo()
        bank_detail = self.readBankDetail()
        bill_detail = self.readBillDetail()
        loan_time = self.readLoanTime()
        browse_history = self.readBrowseHistory()
        target = self.readTarget()
        
        loan_data = user_info.join(bank_detail,how='outer')
        loan_data = loan_data.join(bill_detail,how='outer')
        loan_data = loan_data.join(browse_history,how='outer')
        loan_data = loan_data.join(loan_time,how='outer')
        loan_data = loan_data.fillna(0.0)
        
        #對資料進行歸一化
        datas = loan_data.values
        datas = preprocessing.scale(datas)
        col_names = list(loan_data.columns)
        nums=0
        for col in col_names:
            loan_data.loc[:,[col]] = datas[:,nums]
            nums += 1
        
        #對資料進行劃分並且進行訓練
        train = loan_data.iloc[0: 55596, :]
        test = loan_data.iloc[55596:, :]
        train_X, test_X, train_y, test_y = train_test_split(train,target,test_size = 0.2,random_state = 0)
        train_y = train_y['label']
        test_y = test_y['label']
        lr_model = LogisticRegression(C = 1.0,penalty = 'l2')
        lr_model.fit(train_X, train_y)
        #驗證集進行預測
        pred_test = lr_model.predict(test_X)
        #對預測結果進行評估
        print classification_report(test_y, pred_test)
        
        #對測試集生成結果並存儲為csv格式
        pred = lr_model.predict_proba(test)
        result = pd.DataFrame(pred)
        result.index = test.index
        result.columns = ['0', 'probability']
        result.drop('0',axis = 1,inplace = True)
        print result.head(5)      
        result.to_csv(self.result)
        
#資料讀取
def readData(filename):
    filepath = './'+filename
    data = pd.read_csv(filepath,header=None)
    return data