# Binary classification model of compound activity based on a random forest
# Author: 阿新 • Published: 2019-01-06
# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.EState import Fingerprinter
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle
# Define the per-molecule feature function
def get_fps(mol):
    """Return a 1-D feature vector for ``mol``.

    The vector is the first element of RDKit's EState fingerprint for the
    molecule, followed by the value of every descriptor listed in
    ``Descriptors._descList`` (clogP, PSA, etc.).

    Args:
        mol: an RDKit molecule object.

    Returns:
        A NumPy array: fingerprint entries first, then descriptor values.
    """
    # Build a calculator over every descriptor name RDKit registers.
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    ds = np.asarray(calc.CalcDescriptors(mol))
    # FingerprintMol returns a tuple; only its first element is used here.
    arr = Fingerprinter.FingerprintMol(mol)[0]
    return np.append(arr, ds)
# Load the data, keeping only columns 0, 1 and 4 of the CSV file.
df = pd.read_csv('mol_IC50.csv', usecols=[0, 1, 4])
# Shuffle the rows randomly.
df = shuffle(df)
# Preview the data.
df.head()
# Add the molecule objects to the data frame: read the 'mol' column and
# store the resulting molecules in a new 'Molecule' column.
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='mol', molCol='Molecule')
# Preview the data.
df.head()
# Compute the descriptors and fingerprint of every molecule and add them to the frame.
df['Descriptors'] = df['Molecule'].apply(get_fps)
# Add the label: pIC50 > 6 marks an active molecule (Active = 1).
df['Active'] = np.where(df['pIC50'] > 6, 1, 0)
# Convert the descriptors and the activity labels to arrays.
X = np.array(list(df['Descriptors']))
y = df['Active'].values
# Split into training and test sets (25% of the rows are held out for testing).
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
# Build the random-forest model and fit it to the training data.
# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = RandomForestClassifier(max_features='sqrt')
rf.fit(X_train, y_train)
# Predict class labels on the test set.
y_pred = rf.predict(X_test)
# Compute the ROC statistics.
# A ROC curve needs a continuous score to sweep thresholds over; hard 0/1
# predictions collapse it to a single operating point. Use the predicted
# probability of the active class (label 1) instead.
active_col = list(rf.classes_).index(1)
y_score = rf.predict_proba(X_test)[:, active_col]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Draw the ROC curve on the current axes.
ax = plt.gca()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
# Red dashed diagonal from (0, 0) to (1, 1) for reference.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([-0.1, 1.2])
ax.set_ylim([-0.1, 1.2])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
# Save the figure to disk before showing it.
plt.savefig("ROC.jpg", dpi=300)
plt.show()