機器學習-貝葉斯演算法
阿新 • • 發佈:2018-12-14
1、 data_handler.py
import pandas as pd


def Data_disperse(data, disperse_col, money_split_rule):
    """Count how many rows of ``data`` fall into each amount range.

    Args:
        data: DataFrame holding the column to discretise.
        disperse_col: Name of the numeric column to bucket.
        money_split_rule: Sequence of ``[lower, separator, upper]`` triples.
            A row belongs to a range when ``lower < value <= upper``.

    Returns:
        A ``(money_dist_dict, money_split_map)`` tuple. Both dicts are keyed
        by the rule rendered as a string (its three items concatenated, e.g.
        ``'0-5000.0'``). The first maps each range to the number of rows in
        it, the second maps each range to its 1-based position in
        ``money_split_rule``.
    """
    money_dist_dict = {}
    money_split_map = {}
    for ordinal, cur_split_rule in enumerate(money_split_rule, start=1):
        range_lower = cur_split_rule[0]
        range_upper = cur_split_rule[2]
        in_range = (data[disperse_col] > range_lower) & (data[disperse_col] <= range_upper)
        range_key = "".join(str(part) for part in cur_split_rule)
        money_dist_dict[range_key] = len(data[in_range])
        money_split_map[range_key] = ordinal
    return money_dist_dict, money_split_map


def _money_in_range(money, range_key):
    """Return True when ``money`` lies in the ``'lower-upper'`` range key."""
    bounds = range_key.split('-')
    return float(bounds[0]) < money <= float(bounds[1])


def Locate_money(money, money_dist_dict, money_split_map):
    """Look up the range ordinal and the range row count for one amount.

    Args:
        money: Amount to locate.
        money_dist_dict: Range-string -> row-count dict from ``Data_disperse``.
        money_split_map: Range-string -> ordinal dict from ``Data_disperse``.

    Returns:
        ``(money_map, map_num)``. When no range contains ``money`` the
        defaults ``1`` and ``0`` are returned.
    """
    money_map = 1
    map_num = 0
    for money_range, ordinal in money_split_map.items():
        if _money_in_range(money, money_range):
            money_map = ordinal
    for money_range, num in money_dist_dict.items():
        if _money_in_range(money, money_range):
            map_num = num
    return money_map, map_num


def Sort_data_col(data, col, loc, pre):
    """Move column ``col`` next to column ``loc`` (modifies ``data`` in place).

    Args:
        data: DataFrame to reorder.
        col: Name of the column to move.
        loc: Name of the anchor column.
        pre: Truthy to place ``col`` directly before ``loc``, falsy to place
            it directly after.

    Returns:
        The same DataFrame object, with ``col`` repositioned.
    """
    if col == loc:
        # A column cannot be moved relative to itself; nothing to do.
        return data
    data_col = data.pop(col)
    # Compute the anchor position AFTER removing ``col``: if ``col`` was to
    # the left of ``loc``, the anchor has shifted one place and a position
    # taken beforehand would be off by one (or out of bounds).
    position = data.columns.tolist().index(loc)
    if not pre:
        position += 1
    data.insert(position, col, data_col)
    return data


def Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map):
    """Attach the amount-range ordinal and range row count to every row.

    Args:
        data: DataFrame with the amount column, an ``'index'`` column holding
            0-based row positions, and a ``'內容'`` column.
        disperse_rule: Name of the amount column.
        money_dist_dict: Range-string -> row-count dict from ``Data_disperse``.
        money_split_map: Range-string -> ordinal dict from ``Data_disperse``.

    Returns:
        A new DataFrame with the columns ``'金額對映'`` and ``'對映數量'``
        inserted directly before ``'內容'``.
    """
    maps_nums = [
        list(Locate_money(float(money), money_dist_dict, money_split_map))
        for money in data[disperse_rule]
    ]
    temp_data = pd.DataFrame(columns=['金額對映', '對映數量'], data=maps_nums)
    # temp_data is indexed 0..n-1, so it lines up with the 'index' column.
    data = data.join(temp_data, on='index')
    data = Sort_data_col(data, '金額對映', '內容', True)
    data = Sort_data_col(data, '對映數量', '內容', True)
    return data


def Date_handler(data, start_col, end_col):
    """Placeholder for date processing; currently returns ``data`` unchanged."""
    return data


def Data_pre_handle(data):
    """Clean, filter, discretise and label-encode the raw contract data.

    Steps: drop all-empty rows and duplicates, keep the rows whose
    ``'分管領導'`` equals ``'李博洋'``, add a 0-based ``'index'`` column,
    bucket ``'金額'`` into fixed ranges, and add integer mappings for
    ``'合作方'`` and ``'結果'``.

    Args:
        data: Raw DataFrame containing at least the columns ``'我方主體'``,
            ``'金額'``, ``'內容'``, ``'合作方'``, ``'分管領導'``, ``'結果'``.

    Returns:
        The processed DataFrame.
    """
    # Drop rows whose values are all empty.
    data = data.dropna(axis=0, how='all')
    # Drop duplicated rows.
    data = data.drop_duplicates()
    # Keep only the rows of one supervising leader.
    select_col = '分管領導'
    col_value = '李博洋'
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the caller's data.
    data = data[data[select_col] == col_value].copy()
    data['index'] = list(range(data.shape[0]))
    data = Sort_data_col(data, 'index', '我方主體', True)

    # Discretise the amount column.
    disperse_rule = '金額'
    money_split_rule = [
        [0, '-', 5000.0],
        [5000.0, '-', 10000.0],
        [10000.0, '-', 15000.0],
        [15000.0, '-', 20000.0],
        [20000.0, '-', 25000.0],
        [25000.0, '-', 35000.0],
        [35000.0, '-', 50000.0],
        [50000.0, '-', 80000.0],
        [80000.0, '-', 120000.0],
        [120000, '-', 999999999999],
    ]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    data = Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)

    # Date handling (currently a no-op).
    start_col = '合同開始日期'
    end_col = '合同結束日期'
    data = Date_handler(data, start_col, end_col)

    # Encode each distinct partner as a 1-based integer, in order of first
    # appearance.
    partner_map_dict = {
        partner: code
        for code, partner in enumerate(data['合作方'].drop_duplicates(), start=1)
    }
    data['合作方對映'] = [partner_map_dict[partner] for partner in data['合作方']]
    data = Sort_data_col(data, '合作方對映', '合作方', False)

    # Encode the result: 1 for '分管領導', 0 for anything else.
    data['結果對映'] = [1 if result == '分管領導' else 0 for result in data['結果']]
    data = Sort_data_col(data, '結果對映', '結果', False)
    return data


if __name__ == '__main__':
    # Only the script entry point needs the loader, so it is imported here;
    # the helpers above stay importable without load_data.
    from load_data import From_file

    # Load the raw data through load_data.
    filename = 'data/data.xlsx'
    pick_list = ['我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
                 '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果']
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)
    handled_data.to_csv('data/handled_data.csv', encoding='gbk', index=False)
2、 get_properties.py
import pandas as pd


def Get_properties(data, property_dict, properties_data_file='data/properties.csv'):
    """Extract and rename selected columns, and save them as a CSV file.

    Args:
        data: Source DataFrame.
        property_dict: Maps each source column name to the name it should
            carry in the output.
        properties_data_file: Path of the CSV file to write (GBK encoded,
            without the index). Defaults to ``'data/properties.csv'``.

    Returns:
        ``(properties_data_file, properties_data)``: the path written and the
        DataFrame holding the renamed columns.
    """
    properties_data = pd.DataFrame(
        {new_name: data[old_name] for old_name, new_name in property_dict.items()}
    )
    properties_data.to_csv(properties_data_file, encoding='gbk', index=False)
    return properties_data_file, properties_data


if __name__ == '__main__':
    # Only the script entry point needs the loader and the preprocessing
    # pipeline, so they are imported here.
    from data_handler import Data_pre_handle
    from load_data import From_file

    filename = 'data/data.xlsx'
    pick_list = ['我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
                 '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果']
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)
    property_dict = {'金額對映': 'money_map', '合作方對映': 'partner_map', '結果對映': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)
    print(properties_data_file)
3、 solver.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import Get_properties
from domain import Domain
from data_handler import Data_disperse, Data_disp_construct


def Training(data):
    """Fit a Gaussian naive Bayes classifier on the property data.

    ``data`` is wrapped in ``Domain(data, 'result_map')``, whose ``x`` / ``y``
    attributes are split 80/20 and the training part is used for fitting.
    NOTE(review): ``Domain`` is defined elsewhere; presumably ``x`` holds the
    feature columns and ``y`` the ``'result_map'`` labels -- confirm.

    The split uses no fixed random state, so the fitted model can differ
    between runs.

    Args:
        data: Property DataFrame containing a ``'result_map'`` column.

    Returns:
        The fitted ``GaussianNB`` classifier.
    """
    model = Domain(data, 'result_map')
    x_train, _x_test, y_train, _y_test = train_test_split(model.x, model.y, test_size=0.2)
    clf = GaussianNB().fit(x_train, y_train)
    # A disabled draft here collected precision / recall / f1 and the
    # confusion matrix from sklearn.metrics into an assessment dict; it
    # referenced names that are not defined in this function, so it stays
    # out of the code path.
    return clf


def Handle_input(data):
    """Discretise the contract amount of raw input rows.

    Args:
        data: DataFrame with a ``'合同金額'`` column.
            NOTE(review): the training pipeline in data_handler uses the
            column name ``'金額'``, and ``Data_disp_construct`` additionally
            reads ``'index'`` and ``'內容'`` columns -- confirm that input
            frames really have this layout.

    Returns:
        The DataFrame with the amount mapping columns added.
    """
    disperse_rule = '合同金額'
    money_split_rule = [
        [0, '-', 5000.0],
        [5000.0, '-', 10000.0],
        [10000.0, '-', 15000.0],
        [15000.0, '-', 20000.0],
        [20000.0, '-', 25000.0],
        [25000.0, '-', 35000.0],
        [35000.0, '-', 50000.0],
        [50000.0, '-', 80000.0],
        [80000.0, '-', 120000.0],
        [120000, '-', 999999999999],
    ]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    data = Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)
    # Return the result; previously it was computed and then discarded.
    return data


def Solver(data, input_data):
    """Train on ``data`` and predict the result class of ``input_data``.

    This is the entry point that score.py imports from this module.

    Args:
        data: Property DataFrame passed to ``Training``.
        input_data: Feature rows to classify, e.g. ``[[8, 28], [4, 40]]``.

    Returns:
        The predictions returned by the classifier's ``predict``.
    """
    clf = Training(data)
    return clf.predict(input_data)


if __name__ == '__main__':
    filename = 'data/data.xlsx'
    pick_list = ['我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
                 '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果']
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)
    property_dict = {'金額對映': 'money_map', '合作方對映': 'partner_map', '結果對映': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)
    print(properties_data)
    # Further sample rows that were tried: [9,40], [8,30], [7,40], [7,33], [10,33]
    input_data = [[8, 28], [4, 40], [3, 31], [2, 40], [10, 41]]
    solver_result = Solver(properties_data, input_data)
    print(solver_result)
    # Disabled evaluation draft, kept for reference:
    #   model = Domain(properties_data, 'result_map')
    #   x_train, x_test, y_train, y_test = train_test_split(model.x, model.y, test_size=0.2)
    #   clf = GaussianNB().fit(x_train, y_train)
    #   doc_class_predicted = clf.predict(x_test)
    #   expected = y_test
    #   print(y_test)                # actual labels
    #   print(doc_class_predicted)   # predicted labels
    #   print(metrics.classification_report(expected, doc_class_predicted))  # precision, recall, f1
    #   print(metrics.confusion_matrix(expected, doc_class_predicted))       # confusion matrix
4、 score.py
import pandas as pd
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import Get_properties
from solver import Solver
from utils.send_mail import Mail
from utils.all_data_statis import All_data_statis
import time
def Score(data, input_data, solver_result, constraints):
    """Check whether the predictions keep the coverage figures within limits.

    The predictions are attached to ``input_data``, appended to ``data``,
    written to ``'data/all_data.csv'`` and passed to ``All_data_statis``. The
    two coverage values it returns are printed and compared against
    ``constraints``.

    Args:
        data: Already handled historical DataFrame.
        input_data: DataFrame with the rows that were predicted. It is not
            modified; an internal copy receives the ``'index'`` column.
        solver_result: One prediction per input row; ``0`` is labelled
            ``'財務部'``, anything else ``'分管領導'``.
        constraints: Dict with ``'hard_constraint' -> 'money_coverage'`` and
            ``'soft_constraint' -> 'amount_coverage'`` / ``'pardon_range'``.

    Returns:
        ``False`` when the money coverage is below the hard constraint or the
        amount coverage deviates from the soft constraint by more than the
        pardon range; ``True`` otherwise.
    """
    first_index = len(data)
    # Number the new rows after the existing ones. The range is sized from
    # the input, so any number of rows works (not only exactly two).
    input_data = input_data.copy()
    input_data['index'] = list(range(first_index, first_index + len(input_data)))

    result_data_dict = {
        'index': [],
        '結果': [],
        '結果對映': [],
    }
    for index, result in enumerate(solver_result, start=first_index):
        if result == 0:
            label, mapped = '財務部', 0
        else:
            label, mapped = '分管領導', 1
        result_data_dict['index'].append(index)
        result_data_dict['結果'].append(label)
        result_data_dict['結果對映'].append(mapped)

    result_data = pd.DataFrame(result_data_dict)
    recovery_data = pd.merge(input_data, result_data, on='index')
    all_data = pd.concat([data, recovery_data])
    all_data.to_csv('data/all_data.csv', encoding='gbk', index=False)

    money_coverage, amount_coverage = All_data_statis(all_data)
    print(money_coverage, amount_coverage)

    money_constraint = constraints.get('hard_constraint').get('money_coverage')
    amount_constraint = constraints.get('soft_constraint').get('amount_coverage')
    amount_pardon_range = constraints.get('soft_constraint').get('pardon_range')

    if money_coverage < money_constraint:
        return False
    amount_lower = amount_constraint - amount_pardon_range
    amount_upper = amount_constraint + amount_pardon_range
    return not (amount_coverage < amount_lower or amount_coverage > amount_upper)
if __name__ == '__main__':
    filename = 'data/data.xlsx'
    pick_list = ['我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
                 '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果']
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)
    handled_money_coverage, handled_amount_coverage = All_data_statis(handled_data)
    print(handled_money_coverage, handled_amount_coverage)
    property_dict = {'金額對映': 'money_map', '合作方對映': 'partner_map', '結果對映': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)

    # Use the first two handled rows, stripped of their known result, as the
    # rows to predict. This section used to be wrapped in a string literal,
    # which left input_data and x_input undefined for the code below.
    input_data = handled_data.iloc[0:2, ].copy()
    input_data.drop(labels=['結果', '結果對映'], axis=1, inplace=True)
    x_input = input_data[['金額對映', '合作方對映']].values.tolist()

    solver_result = Solver(properties_data, x_input)
    constraints = {
        'hard_constraint': {
            'money_coverage': 0.70,
        },
        'soft_constraint': {
            'amount_coverage': 0.30,
            'pardon_range': 0.05,
        },
    }
    standarded = Score(handled_data, input_data, solver_result, constraints)
    if standarded:
        mail_data = []
        # Pair each input row's contract number with its own prediction.
        for contract_no, right_data in zip(input_data['合同號'], solver_result):
            print(right_data)
            mail_data.append([contract_no, right_data])
        print(mail_data)
        # str() because the contract number is not guaranteed to be a string.
        mail_result = ''.join(
            str(contract_no) + '\t' + ('財務部' if result == 0 else '分管領導') + '\n'
            for contract_no, result in mail_data
        )
        print(mail_result)
        ret = Mail(mail_result, 'Notify!')
        if ret:
            print('Mail Done!')
        else:
            print('Mail Failed, try again!')
    else:
        print('penalty properties, and try solver again, plz!')