程式人生 > 機器學習-貝葉斯演算法

機器學習-貝葉斯演算法

1、 data_handler.py

from load_data import From_file
import pandas as pd

def Data_disperse(data, disperse_col, money_split_rule):
    """Count how many rows of ``data`` fall into each value bucket.

    Args:
        data: DataFrame that contains the column to bucket.
        disperse_col: Name of the column whose values are compared
            against the bucket bounds.
        money_split_rule: Sequence of rules. Element 0 of a rule is the
            exclusive lower bound and element 2 the inclusive upper bound.
            The string forms of all elements of a rule are concatenated
            to form the bucket label (e.g. ``[0, '-', 5000.0]`` becomes
            ``'0-5000.0'``).

    Returns:
        A pair ``(money_dist_dict, money_split_map)``: the first maps each
        bucket label to its row count, the second maps each bucket label
        to the 1-based position of its rule.
    """
    money_dist_dict = {}
    money_split_map = {}

    for bucket_id, rule in enumerate(money_split_rule, start=1):
        lower, upper = rule[0], rule[2]
        in_bucket = (data[disperse_col] > lower) & (data[disperse_col] <= upper)
        row_count = len(data[in_bucket])

        label = "".join(str(part) for part in rule)
        money_dist_dict[label] = row_count
        money_split_map[label] = bucket_id

    return money_dist_dict, money_split_map

def Locate_money(money, money_dist_dict, money_split_map):
    """Look up the bucket id and bucket row count for one amount.

    Both dictionaries are keyed by labels of the form ``'<lower>-<upper>'``;
    an amount belongs to a bucket when ``lower < money <= upper``.

    Args:
        money: The amount to locate.
        money_dist_dict: Bucket label -> number of rows in that bucket.
        money_split_map: Bucket label -> bucket id.

    Returns:
        ``(money_map, map_num)``. When no bucket matches, ``money_map``
        stays at 1 and ``map_num`` stays at 0. When several buckets match,
        the last matching entry of each dictionary wins.
    """

    def _covers(range_label):
        # Only the first two '-'-separated pieces are used as the bounds.
        bounds = range_label.split('-')
        return money > float(bounds[0]) and money <= float(bounds[1])

    money_map = 1
    for range_label, bucket_id in money_split_map.items():
        if _covers(range_label):
            money_map = bucket_id

    map_num = 0
    for range_label, row_count in money_dist_dict.items():
        if _covers(range_label):
            map_num = row_count

    return money_map, map_num

def Sort_data_col(data, col, loc, pre):
    """Move column ``col`` so that it sits directly next to column ``loc``.

    The frame is modified in place and also returned.

    Args:
        data: DataFrame whose columns are reordered.
        col: Name of the column to move.
        loc: Name of the reference column.
        pre: If truthy, ``col`` is placed immediately before ``loc``;
            otherwise immediately after it.

    Returns:
        The same DataFrame object with ``col`` repositioned.

    Raises:
        ValueError: If ``loc`` is not one of the columns.
        KeyError: If ``col`` is not one of the columns.
    """
    col_names = data.columns.tolist()
    target = col_names.index(loc)
    data_col = data[col]

    # Dropping ``col`` shifts every column to its right one step left, so
    # the reference position must be corrected when ``col`` was left of it.
    if col_names.index(col) < target:
        target -= 1

    data.drop(labels=[col], axis=1, inplace=True)
    data.insert(target if pre else target + 1, col, data_col)

    return data

def Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map):
    """Attach the bucket id and bucket row count of every amount to ``data``.

    Each value of ``data[disperse_rule]`` is converted to ``float`` and
    located with ``Locate_money``. The results form a two-column frame
    ('金額對映', '對映數量') that is joined to ``data`` by matching the
    values of ``data``'s 'index' column against the new frame's row labels.
    Both new columns are then moved in front of the '內容' column.

    Args:
        data: DataFrame; must contain the columns ``disperse_rule``,
            'index' and '內容'.
        disperse_rule: Name of the amount column.
        money_dist_dict: Bucket label -> row count.
        money_split_map: Bucket label -> bucket id.

    Returns:
        The joined DataFrame with the two extra columns.
    """
    maps_nums = [
        list(Locate_money(float(money), money_dist_dict, money_split_map))
        for money in data[disperse_rule]
    ]
    temp_data = pd.DataFrame(data=maps_nums, columns=['金額對映', '對映數量'])

    data = data.join(temp_data, on='index')
    for new_col in ('金額對映', '對映數量'):
        data = Sort_data_col(data, new_col, '內容', True)

    return data

def Date_handler(data, start_col, end_col):
    """Placeholder for date processing; currently returns ``data`` unchanged.

    ``start_col`` and ``end_col`` are accepted but not used.
    """

    return data
            

def Data_pre_handle(data):
    """Clean the raw records and add the numeric helper columns.

    Steps, in order:
      1. drop rows that are entirely empty and drop duplicate rows;
      2. keep only rows whose '分管領導' column equals '李博洋' and add a
         0-based 'index' column placed before '我方主體';
      3. bucket the '金額' column, adding '金額對映' and '對映數量';
      4. run the (currently no-op) date handler;
      5. number the distinct '合作方' values from 1 in order of first
         appearance ('合作方對映') and encode '結果' as 1 when it equals
         '分管領導', else 0 ('結果對映').

    Args:
        data: Raw DataFrame containing at least the columns named above
            plus '內容', '合同開始日期' and '合同結束日期'.

    Returns:
        A new DataFrame with the helper columns inserted; the caller's
        frame is not modified.
    """
    # Drop rows in which every field is empty.
    data = data.dropna(axis=0, how='all')

    # Drop duplicate rows.
    data = data.drop_duplicates()

    # Keep only the rows for one value of the '分管領導' column. The explicit
    # copy makes the result an independent frame, so the column assignments
    # below never write to a slice of another frame.
    select_col = '分管領導'
    col_value = '李博洋'
    data = data[data[select_col] == col_value].copy()
    data['index'] = list(range(data.shape[0]))
    data = Sort_data_col(data, 'index', '我方主體', True)

    # Discretise the '金額' column into buckets.
    disperse_rule = '金額'
    money_split_rule = [[0, '-', 5000.0], [5000.0, '-', 10000.0], [10000.0, '-', 15000.0], [15000.0, '-', 20000.0],
                        [20000.0, '-', 25000.0],
                        [25000.0, '-', 35000.0], [35000.0, '-', 50000.0], [50000.0, '-', 80000.0],
                        [80000.0, '-', 120000.0],
                        [120000, '-', 999999999999]]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    data = Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)

    # Date handling.
    start_col = '合同開始日期'
    end_col = '合同結束日期'
    data = Date_handler(data, start_col, end_col)

    # Encode '合作方': distinct values are numbered from 1 in order of
    # first appearance.
    partner_map_dict = {
        partner: code
        for code, partner in enumerate(data['合作方'].drop_duplicates(), start=1)
    }
    data['合作方對映'] = [partner_map_dict[partner] for partner in data['合作方']]
    data = Sort_data_col(data, '合作方對映', '合作方', False)

    # Encode '結果': 1 when the value is '分管領導', otherwise 0.
    data['結果對映'] = [1 if result == '分管領導' else 0 for result in data['結果']]
    data = Sort_data_col(data, '結果對映', '結果', False)

    return data

if __name__ == '__main__':
    # Load the raw records through load_data, restricted to the listed columns.
    source_path = 'data/data.xlsx'
    wanted_columns = [
        '我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
        '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果',
    ]
    raw_data = From_file(source_path, wanted_columns)

    # Pre-process, then store the result as a GBK-encoded CSV without the
    # DataFrame row labels.
    cleaned_data = Data_pre_handle(raw_data)
    cleaned_data.to_csv('data/handled_data.csv', encoding='gbk', index=False)

2、 get_properties.py

from data_handler import Data_pre_handle
from load_data import From_file
import pandas as pd

def Get_properties(data, property_dict):
    """Extract the selected columns under new names and save them as CSV.

    Args:
        data: Source DataFrame.
        property_dict: Mapping of source column name -> output column name.
            Output columns appear in the mapping's iteration order.

    Returns:
        ``(properties_data_file, properties_data)``: the fixed output path
        'data/properties.csv' and the extracted DataFrame. The file is
        written GBK-encoded without the DataFrame row labels.
    """
    properties_data_file = 'data/properties.csv'

    properties_data = pd.DataFrame()
    for source_col, feature_name in property_dict.items():
        properties_data[feature_name] = data[source_col]

    properties_data.to_csv(properties_data_file, encoding='gbk', index=False)
    return properties_data_file, properties_data

if __name__ == '__main__':
    # Load and pre-process the raw records.
    source_path = 'data/data.xlsx'
    wanted_columns = [
        '我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
        '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果',
    ]
    cleaned_data = Data_pre_handle(From_file(source_path, wanted_columns))

    # Export the three encoded columns under their feature names and show
    # where the CSV was written.
    feature_names = {'金額對映':'money_map', '合作方對映':'partner_map', '結果對映':'result_map'}
    properties_data_file, properties_data = Get_properties(cleaned_data, feature_names)
    print (properties_data_file)

3、 solver.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import  Get_properties
from domain import Domain
from data_handler import Data_disperse, Data_disp_construct

def Training(data):
    """Fit a Gaussian naive Bayes classifier on ``data``.

    ``data`` is wrapped in ``Domain`` together with the name 'result_map'
    (presumably the label column -- confirm against ``Domain``). The
    resulting ``x``/``y`` are split with ``test_size=0.2`` and only the
    training part is used for fitting.

    Returns:
        The value returned by ``GaussianNB().fit`` on the training part.
    """
    domain = Domain(data, 'result_map')
    # The held-out part of the split is currently unused.
    x_train, _x_test, y_train, _y_test = train_test_split(domain.x, domain.y, test_size=0.2)
    estimator = GaussianNB()
    clf = estimator.fit(x_train, y_train)
    # Disabled evaluation sketch, kept as an inert string literal.
    '''
    precision, recall, f1_score = metrics.classification_report(expected, doc_class_predicted)
    confuse_matrix = metrics.confusion_matrix(expected, doc_class_predicted)
    algorithm_assess = {
        'precision' : precision,
        'recall' : recall,
        'f1_score' : f1_score,
        'confuse_matrix' : confuse_matrix
    }
    '''
    return clf

def Handle_input(data):
    """Bucket the '合同金額' column of ``data`` and return the enriched frame.

    The same bucket boundaries as in the pre-processing step are applied
    through ``Data_disperse`` and ``Data_disp_construct``.

    NOTE(review): ``Data_disp_construct`` joins on an 'index' column and
    positions the new columns relative to '內容', so ``data`` presumably
    needs both columns in addition to '合同金額' -- confirm with callers.

    Args:
        data: DataFrame of new records to prepare for prediction.

    Returns:
        The DataFrame produced by ``Data_disp_construct``. (The processed
        frame used to be discarded, so the function always returned None.)
    """
    disperse_rule = '合同金額'
    money_split_rule = [[0, '-', 5000.0], [5000.0, '-', 10000.0], [10000.0, '-', 15000.0], [15000.0, '-', 20000.0],
                        [20000.0, '-', 25000.0],
                        [25000.0, '-', 35000.0], [35000.0, '-', 50000.0], [50000.0, '-', 80000.0],
                        [80000.0, '-', 120000.0],
                        [120000, '-', 999999999999]]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    data = Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)

    return data



if __name__ == '__main__':
    # Load and pre-process the raw records.
    source_path = 'data/data.xlsx'
    wanted_columns = [
        '我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容',
        '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果',
    ]
    cleaned_data = Data_pre_handle(From_file(source_path, wanted_columns))

    # Export the encoded columns and train the classifier on them.
    feature_names = {'金額對映': 'money_map', '合作方對映': 'partner_map', '結果對映': 'result_map'}
    properties_data_file, properties_data = Get_properties(cleaned_data, feature_names)
    print (properties_data)
    clf = Training(properties_data)

    # Hand-written two-value samples (presumably [money_map, partner_map] --
    # confirm against Domain). Further samples that were tried earlier:
    # [9,40], [8,30], [7,40], [7,33], [10,33]
    # Handle_input(input_data) is not applied here.
    input_data = [[8,28], [4,40], [3,31], [2,40], [10,41]]
    solver_result = clf.predict(input_data)
    print (solver_result)
    # Disabled evaluation run, kept as an inert string literal.
    '''
    model = Domain(properties_data, 'result_map')
    #print (model.x, model.y)

    x_train, x_test, y_train, y_test = train_test_split(model.x, model.y, test_size=0.2)

    clf = GaussianNB().fit(x_train, y_train)
    doc_class_predicted = clf.predict(x_test)
    expected = y_test

    print(y_test)  # 輸出實際結果
    print(doc_class_predicted)  # 輸出測試結果
    print(metrics.classification_report(expected, doc_class_predicted))  # 輸出結果,精確度、召回率、f-1分數
    print(metrics.confusion_matrix(expected, doc_class_predicted))  # 混淆矩陣
    '''

4、 score.py

import pandas as pd
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import  Get_properties
from solver import Solver
from utils.send_mail import Mail
from utils.all_data_statis import All_data_statis
import time

def Score(data, input_data, solver_result, constraints):
    """Merge predictions into the data set and check the coverage constraints.

    The predicted rows are numbered right after the existing rows, combined
    with ``data``, written to 'data/all_data.csv' (GBK, no row labels) and
    passed to ``All_data_statis`` to obtain the money and amount coverage.

    Args:
        data: Already handled records.
        input_data: DataFrame of the new records that were predicted. An
            'index' column is added to it in place.
        solver_result: One prediction per new record; 0 is stored as
            '財務部', any other value as '分管領導'.
        constraints: Nested dict with
            ``['hard_constraint']['money_coverage']`` (minimum money
            coverage), ``['soft_constraint']['amount_coverage']`` (target
            amount coverage) and ``['soft_constraint']['pardon_range']``
            (allowed deviation from that target).

    Returns:
        True when the money coverage is at least the hard minimum and the
        amount coverage lies within target +/- pardon range, else False.
    """
    first_new_index = len(data)

    # Number the new rows after the existing ones. This used to be a fixed
    # two-element list, which failed for any other number of input rows.
    input_data['index'] = list(range(first_new_index, first_new_index + len(input_data)))

    result_data_dict = {
        'index' : [],
        '結果': [],
        '結果對映' : []
    }
    for offset, result in enumerate(solver_result):
        is_zero = result == 0
        result_data_dict['index'].append(first_new_index + offset)
        result_data_dict['結果'].append('財務部' if is_zero else '分管領導')
        result_data_dict['結果對映'].append(0 if is_zero else 1)
    result_data = pd.DataFrame(result_data_dict)

    # Re-attach the predictions to their input rows and append to the data set.
    recovery_data = pd.merge(input_data, result_data, on='index')
    all_data = pd.concat([data, recovery_data])
    all_data.to_csv('data/all_data.csv', encoding='gbk', index=False)

    money_coverage, amount_coverage = All_data_statis(all_data)
    print (money_coverage, amount_coverage)

    money_constraint = constraints.get('hard_constraint').get('money_coverage')
    amount_constraint = constraints.get('soft_constraint').get('amount_coverage')
    amount_pardon_range = constraints.get('soft_constraint').get('pardon_range')

    if money_coverage < money_constraint:
        return False
    lowest_amount = amount_constraint - amount_pardon_range
    highest_amount = amount_constraint + amount_pardon_range
    if amount_coverage < lowest_amount or amount_coverage > highest_amount:
        return False

    return True


if __name__ == '__main__':
    # Load and pre-process the raw records.
    filename = 'data/data.xlsx'
    pick_list = ['我方主體', '合同號', '合同開始日期', '合同結束日期', '金額', '內容', '合作方', '經辦部門', '經辦人員', '經辦人上級', '分管領導', '結果']
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)

    # Coverage of the handled data before any prediction is merged in.
    handled_money_coverage, handled_amount_coverage = All_data_statis(handled_data)
    print (handled_money_coverage, handled_amount_coverage)

    property_dict = {'金額對映': 'money_map', '合作方對映': 'partner_map', '結果對映': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)

    # Build the prediction input from the first two handled rows with their
    # result columns removed. This section had been disabled inside a string
    # literal, which left input_data and x_input undefined further down.
    input_data = handled_data.iloc[0:2,].copy()
    input_data.drop(labels=['結果', '結果對映'], axis=1, inplace=True)
    x_input = input_data[['金額對映', '合作方對映']].values.tolist()

    # NOTE(review): the solver.py listing shown in this document defines
    # Training/Handle_input but no Solver -- confirm the imported name exists.
    solver_result = Solver(properties_data, x_input)

    constraints = {
        'hard_constraint': {
            'money_coverage': 0.70
        },
        'soft_constraint': {
            'amount_coverage': 0.30,
            'pardon_range': 0.05
        }
    }

    standarded = Score(handled_data, input_data, solver_result, constraints)

    if standarded:
        # Pair each input row's contract number with its own prediction.
        # The old loop read '合同號' from the whole `data` frame instead of
        # the current row.
        mail_data = []
        for contract_no, predicted in zip(input_data['合同號'], solver_result):
            print (predicted)
            mail_data.append([contract_no, predicted])

        print (mail_data)
        # One tab-separated line per contract; str.format also copes with
        # contract numbers that are not strings.
        mail_result = ''.join(
            '{}\t{}\n'.format(contract_no, '財務部' if predicted == 0 else '分管領導')
            for contract_no, predicted in mail_data
        )
        print (mail_result)
        ret = Mail(mail_result, 'Notify!')
        if ret:
            print ('Mail Done!')
        else:
            print ('Mail Failed, try again!')
    else:
        print('penalty properties, and try solver again, plz!')