遺傳演算法應用於隨機森林的調參過程
阿新 • • 發佈:2019-01-10
背景
其實不管調參的物件的是隨機森林,還是其他分類器,遺傳演算法都是作為分類器對其超引數進行調優的工具。當然,遺傳演算法是一個貪心演算法,只能接近於最優解;類似的演算法還有退火演算法、蟻群演算法等等。關於遺傳演算法的詳解這裡不再多說,網上參考資料有很多。
例項程式碼
本例項程式碼應用於隨機森林的引數(最大樹深、基分類器個數)調優過程,歡迎一起交流:
# coding=utf-8
"""Genetic algorithm (GA) for tuning RandomForest hyper-parameters.

Each individual is an 8-bit chromosome:
  - bits 0..3 (little-endian) encode n_estimators in {10, 20, ..., 160}
  - bits 4..7 (little-endian) encode max_depth    in {1, 2, ..., 16}
Fitness is the ROC-AUC of a RandomForestClassifier on a 30% hold-out split
of "data.csv" (columns 'ID' = row id, 'Kind' = class label — assumed from
the code; confirm against the data file).
"""
import numpy as np  # kept from the original file (not used directly here)
import pandas as pd
import random
import math

generations = 10   # number of generations to evolve (original suggests 100)
pop_size = 20      # population size (original suggests 500)
max_value = 10     # max value allowed in a gene (kept for compatibility; unused)
chrom_length = 8   # chromosome length: 4 bits per hyper-parameter
pc = 0.6           # crossover probability
pm = 0.01          # mutation probability

results = [[]]     # per-generation best triple (auc, n_estimators, max_depth)
fit_value = []     # individual fitness values
fit_mean = []      # mean fitness (kept for compatibility; unused)

# Initial population: every individual starts from the same bit pattern.
# (geneEncoding() below can be used instead for a random start.)
pop = [[0, 1, 0, 1, 0, 1, 0, 1] for i in range(pop_size)]


def randomForest(n_estimators_value, max_depth_value):
    """Train a RandomForestClassifier with the given hyper-parameters and
    return its ROC-AUC on a fixed 70/30 train/validation split of data.csv.
    """
    # Lazy import: keeps the pure-GA helpers importable without sklearn.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    train_xy = loadFile("data.csv")
    train_xy = train_xy.drop('ID', axis=1)  # drop the row-id column
    # Fixed random_state so every individual is scored on the same split.
    train, val = train_test_split(train_xy, test_size=0.3, random_state=80)
    train_y = train['Kind']                 # training labels
    val_y = val['Kind']                     # validation labels
    train = train.drop('Kind', axis=1)
    val = val.drop('Kind', axis=1)
    rf = RandomForestClassifier(n_estimators=n_estimators_value,
                                max_depth=max_depth_value, n_jobs=2)
    rf.fit(train, train_y)
    predict_test = rf.predict_proba(val)[:, 1]  # P(positive class)
    return metrics.roc_auc_score(val_y, predict_test)


def loadFile(filePath):
    """Read a CSV file into a pandas DataFrame."""
    return pd.read_csv(filePath)


# Step 1: random chromosome initialisation (optional alternative to the
# fixed `pop` above).
def geneEncoding(pop_size, chrom_length):
    """Return `pop_size` random chromosomes of `chrom_length` bits each."""
    return [[random.randint(0, 1) for _ in range(chrom_length)]
            for _ in range(pop_size)]


# Step 2: objective value of every individual.
def cal_obj_value(pop):
    """Return the ROC-AUC objective for each chromosome in `pop`.

    objvalue[m] corresponds to pop[m].
    """
    objvalue = []
    for raw_pair in decodechrom(pop):
        n_estimators_value = (raw_pair[0] + 1) * 10  # 0..15 -> 10..160
        max_depth_value = raw_pair[1] + 1            # 0..15 -> 1..16
        objvalue.append(randomForest(n_estimators_value, max_depth_value))
    return objvalue


def decodechrom(pop):
    """Decode each chromosome into its two raw integers.

    Bits are little-endian: bit i carries weight 2**i.  Returns a list of
    [raw_n_estimators (0..15), raw_max_depth (0..15)] pairs.
    """
    variable = []
    for chrom in pop:
        first = 0
        for i in range(4):
            first += chrom[i] * (2 ** i)
        second = 0
        for i in range(4):
            second += chrom[4 + i] * (2 ** i)
        variable.append([int(first), int(second)])
    return variable


# Step 3: fitness = objective clipped at zero (we maximise, so negative
# objectives are simply discarded).
def calfitvalue(obj_value):
    """Return fitness values: max(objective + Cmin, 0) for each individual."""
    fit_value = []
    Cmin = 0
    for v in obj_value:
        fit_value.append(Cmin + v if v + Cmin > 0 else 0.0)
    return fit_value


# Step 4: best individual of the current generation.
def best(pop, fit_value):
    """Return [best_individual, best_fitness] over the population."""
    best_individual = pop[0]
    best_fit = fit_value[0]
    for i in range(1, len(pop)):
        if fit_value[i] > best_fit:
            best_fit = fit_value[i]
            best_individual = pop[i]
    return [best_individual, best_fit]


# Step 5: decode the best chromosome for reporting.
def b2d(best_individual):
    """Decode a chromosome into (n_estimators, max_depth)."""
    preValue = 0
    for i in range(4):
        preValue += best_individual[i] * (2 ** i)
    aftValue = 0
    for i in range(4):
        aftValue += best_individual[4 + i] * (2 ** i)
    return int((preValue + 1) * 10), int(aftValue + 1)


# Step 6: natural selection via roulette wheel.
def selection(pop, fit_value):
    """Roulette-wheel selection; replaces the contents of `pop` in place.

    BUG FIX vs. original: `newpop = pop` aliased the population, so
    selections read from an already-overwritten list, and the final
    `pop = newpop` only rebound a local name.  We now select from an
    untouched copy and write the survivors back with `pop[:] = newpop`.
    """
    pop_len = len(pop)
    total_fit = sum(fit_value)
    if total_fit <= 0:
        return  # degenerate generation: no selection pressure, keep pop
    new_fit_value = [f / total_fit for f in fit_value]
    cumsum(new_fit_value)  # in place -> cumulative selection probabilities
    # Sorted uniform randoms let one sweep over the wheel pick all parents.
    ms = sorted(random.random() for _ in range(pop_len))
    fitin = 0
    newin = 0
    newpop = [individual[:] for individual in pop]
    while newin < pop_len:
        # Clamp at the last slot to guard against float rounding in cumsum.
        if fitin == pop_len - 1 or ms[newin] < new_fit_value[fitin]:
            newpop[newin] = pop[fitin][:]
            newin += 1
        else:
            fitin += 1
    pop[:] = newpop


def sum(fit_value):
    """Total of a list of numbers.

    NOTE: shadows the builtin `sum`; kept under this name for backward
    compatibility with the original script.
    """
    total = 0
    for v in fit_value:
        total += v
    return total


def cumsum(fit_value):
    """Replace each element with the running (cumulative) sum, in place.

    O(n) single pass (the original recomputed each prefix, O(n^2)).
    """
    running = 0.0
    for i, v in enumerate(fit_value):
        running += v
        fit_value[i] = running


# Step 7: single-point crossover between neighbouring individuals.
def crossover(pop, pc):
    """With probability `pc`, swap gene tails between pop[i] and pop[i+1].

    BUG FIX vs. original: the cut point is drawn from 1..len-1 so a
    triggered crossover always exchanges at least one bit (the original
    allowed 0 and len, both no-ops).
    """
    poplen = len(pop)
    for i in range(poplen - 1):
        if random.random() < pc:
            cpoint = random.randint(1, len(pop[0]) - 1)
            child_a = pop[i][:cpoint] + pop[i + 1][cpoint:]
            child_b = pop[i + 1][:cpoint] + pop[i][cpoint:]
            pop[i] = child_a
            pop[i + 1] = child_b


# Step 8: bit-flip mutation.
def mutation(pop, pm):
    """With probability `pm` per individual, flip one random bit in place."""
    py = len(pop[0])
    for individual in pop:
        if random.random() < pm:
            mpoint = random.randint(0, py - 1)
            individual[mpoint] = 1 - individual[mpoint]


if __name__ == '__main__':
    # pop = geneEncoding(pop_size, chrom_length)  # optional random start
    for i in range(generations):
        print("Generation " + str(i) + " breeding starts......")
        obj_value = cal_obj_value(pop)            # objective (ROC-AUC) per individual
        fit_value = calfitvalue(obj_value)        # fitness per individual
        [best_individual, best_fit] = best(pop, fit_value)
        temp_n_estimator, temp_max_depth = b2d(best_individual)
        # Record this generation's champion.
        results.append([best_fit, temp_n_estimator, temp_max_depth])
        print(str(best_individual) + " " + str(best_fit))
        selection(pop, fit_value)                 # cull low-fitness individuals
        crossover(pop, pc)                        # recombine
        # BUG FIX vs. original: mutation was called with pc (0.6) instead of
        # the mutation probability pm (0.01).
        mutation(pop, pm)
    results.sort()
    print(results[-1])