樸素貝葉斯分類Python演示
阿新 • 發佈:2019-01-08
# -*- coding: utf-8 -*-
"""Naive Bayes classification demo.

Principle (translated from the original Chinese notes):
    P(H|X) = P(X|H) * P(H) / P(X)
For classes C1..Ck and a sample X = {x1..xn} whose attributes are assumed
mutually independent, P(X) is the same for every class, so the predicted
class is the one maximizing
    P(x1|Ci) * P(x2|Ci) * ... * P(xn|Ci) * P(Ci).
For a discrete attribute Ai, P(xi|Ci) is the fraction of Ci's training
tuples with Ai == xi.  For a continuous attribute, Ai is assumed Gaussian
within Ci and P(xi|Ci) is the density at xi.
"""
import math
from collections import Counter, defaultdict
from random import random as rdn

import numpy as np

# Input files.
#   attribute.dat       : attr_id,is_continuous[yes|no],description
#   trainning_data.dat  : rec_id,attr1,attr2,attr3,class_id
#   class_desc.dat      : class_id,description
ATTRIBUTE_FILE = 'F:\\bayes_categorize\\attribute.dat'
TRAINING_FILE = 'F:\\bayes_categorize\\trainning_data.dat'
CLASS_DESC_FILE = 'F:\\bayes_categorize\\class_desc.dat'
DATA_TO_CLASSIFY_FILE = 'F:\\bayes_categorize\\trainning_data_new.dat'


def gaussian_pdf(x, mean, std):
    """Normal density at x.

    Replaces matplotlib.mlab.normpdf, which was removed in matplotlib 3.1.
    NOTE: like the original, std == 0 is not handled (division by zero).
    """
    var = std * std
    return math.exp(-(x - mean) ** 2 / (2.0 * var)) / math.sqrt(2.0 * math.pi * var)


def load_attributes(path):
    """Return {attr_id: (is_continuous, description, ...)}."""
    attrs = {}
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            attrs[int(fields[0])] = tuple(fields[1:])
    return attrs


def load_class_descriptions(path):
    """Return {class_id: description}."""
    descs = {}
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            descs[int(fields[0])] = fields[1]
    return descs


def load_training_data(path):
    """Return ({rec_id: (a1, a2, a3)}, {class_id: set(rec_ids)})."""
    training = {}
    members = defaultdict(set)
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            rec_id = int(fields[0])
            training[rec_id] = (int(fields[1]), int(fields[2]), float(fields[3]))
            members[int(fields[4])].add(rec_id)
    return training, dict(members)


def load_records_to_classify(path):
    """Return {rec_id: (a1, a2, a3, expected_class_id)}."""
    records = {}
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            records[int(fields[0])] = (
                int(fields[1]), int(fields[2]), float(fields[3]), int(fields[4]))
    return records


def train(training, members):
    """Precompute per-class model parameters once.

    (The original recomputed the counts and Gaussian parameters inside the
    loop over every record to classify — O(records x classes x training).)

    Returns {class_id: (p_a1, p_a2, a3_mean, a3_std)} where p_a1/p_a2 map an
    attribute value to P(value | class), and (a3_mean, a3_std) are the
    Gaussian parameters of the continuous attribute within the class.
    """
    stats = {}
    for c_id, rec_ids in members.items():
        n = len(rec_ids)
        a1_counts = Counter(training[r][0] for r in rec_ids)
        a2_counts = Counter(training[r][1] for r in rec_ids)
        a3_values = [training[r][2] for r in rec_ids]
        stats[c_id] = (
            {v: cnt / n for v, cnt in a1_counts.items()},
            {v: cnt / n for v, cnt in a2_counts.items()},
            float(np.mean(a3_values)),
            # Population std (ddof=0), matching the original np.var usage.
            float(np.sqrt(np.var(a3_values))),
        )
    return stats


def classify_record(a1, a2, a3, stats, priors):
    """Return the class id maximizing P(X|Ci) * P(Ci).

    Returns 0 when every class scores zero (an attribute value never seen in
    any class), matching the original sentinel.
    """
    best_class = 0
    best_score = 0.0
    for c_id, (p_a1, p_a2, a3_mean, a3_std) in stats.items():
        score = (p_a1.get(a1, 0.0) * p_a2.get(a2, 0.0)
                 * gaussian_pdf(a3, a3_mean, a3_std) * priors[c_id])
        if score > best_score:
            best_score = score
            best_class = c_id
    return best_class


def main():
    """Train on TRAINING_FILE, classify DATA_TO_CLASSIFY_FILE, report diffs."""
    load_attributes(ATTRIBUTE_FILE)            # loaded as in the original; unused below
    load_class_descriptions(CLASS_DESC_FILE)   # idem
    training, members = load_training_data(TRAINING_FILE)
    # Class priors P(Ci) from the observed class frequencies.
    priors = {c: len(m) / len(training) for c, m in members.items()}
    stats = train(training, members)

    diff_cnt = 0
    for rec_id, (a1, a2, a3, expected) in load_records_to_classify(
            DATA_TO_CLASSIFY_FILE).items():
        predicted = classify_record(a1, a2, a3, stats, priors)
        if predicted == 0:
            print('error 2')
        if predicted != expected:
            print('different')
            print(predicted)
            print((a1, a2, a3, expected))
            diff_cnt += 1
    print(diff_cnt)


def generate_test_data(path=DATA_TO_CLASSIFY_FILE, n=1000):
    """Generate n labelled records (the original's separate generator script).

    a1, a2: discrete, uniform on 1..10; a3: continuous, N(50, 30^2)
    (so a3 may fall outside the nominal 1..100 range, as in the original).
    Class rules:
        c1: a1<=3, a2>=4, a3<=50     c5: a1>3, a2>=4, a3<=50
        c2: a1<=3, a2>=4, a3>50      c6: a1>3, a2>=4, a3>50
        c3: a1<=3, a2<4,  a3>30      c7: a1>3, a2<4,  a3>30
        c4: a1<=3, a2<4,  a3<=30     c8: a1>3, a2<4,  a3<=30
    These rules are exhaustive, so the original's unreachable 'error' branch
    is dropped.
    """
    a3_data = np.random.randn(n) * 30 + 50
    with open(path, 'w') as out:
        for i in range(1, n + 1):
            a1 = min(int(rdn() * 10) + 1, 10)
            a2 = min(int(rdn() * 10) + 1, 10)
            a3 = a3_data[i - 1]
            low_a1 = a1 <= 3
            if a2 >= 4:
                if a3 <= 50:
                    c_id = 1 if low_a1 else 5
                else:
                    c_id = 2 if low_a1 else 6
            else:
                if a3 > 30:
                    c_id = 3 if low_a1 else 7
                else:
                    c_id = 4 if low_a1 else 8
            out.write('{},{},{},{},{}\n'.format(i, a1, a2, a3, c_id))


# Contents of attribute.dat used by the demo ("配置檔案"):
#   1,no,
#   2,no,
#   3,yes,

if __name__ == '__main__':
    main()