程式人生 > Python機器學習之特徵工程

Python機器學習之特徵工程

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

1、資料載入

# Load the dataset (pd.read_table parses tab-separated text by default)
fruits_df = pd.read_table('fruit_data_with_colors.txt')
print(fruits_df.head())
print('樣本個數:', len(fruits_df))

# Build a lookup from the numeric target label to the fruit name
fruit_name_dict = dict(zip(fruits_df['fruit_label'], fruits_df['fruit_name']))
print(fruit_name_dict)

# Split into features and target, holding out a quarter of the rows for
# testing; random_state is fixed so the split is reproducible
X = fruits_df[['mass', 'width', 'height', 'color_score']]
y = fruits_df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('資料集樣本數:{},訓練集樣本數:{},測試集樣本數:{}'.format(len(X), len(X_train), len(X_test)))

2、特徵歸一化

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report every feature's max/min before and after scaling
for i in range(X_train.shape[1]):
    print('歸一化前,訓練資料第{}維特徵最大值:{:.3f},最小值:{:.3f}'.format(
        i + 1, X_train.iloc[:, i].max(), X_train.iloc[:, i].min()))
    print('歸一化後,訓練資料第{}維特徵最大值:{:.3f},最小值:{:.3f}'.format(
        i + 1, X_train_scaled[:, i].max(), X_train_scaled[:, i].min()))
    print()

# NOTE(review): Axes3D is not referenced below; presumably imported so that
# older Matplotlib releases register the '3d' projection — confirm before removing
from mpl_toolkits.mplot3d import Axes3D

# One colour per class label, in the same row order as the training samples
label_color_dict = {1: 'red', 2: 'green', 3: 'blue', 4: 'yellow'}
colors = [label_color_dict[label] for label in y_train]

fig = plt.figure(figsize=(10, 5))

# Left panel: raw features
# NOTE(review): some Matplotlib releases reject aspect='equal' on 3D axes
# (NotImplementedError) — confirm against the installed version
ax1 = fig.add_subplot(121, projection='3d', aspect='equal')
ax1.scatter(X_train['width'], X_train['height'], X_train['color_score'],
            c=colors, marker='o', s=100)
ax1.set_xlabel('width')
ax1.set_ylabel('height')
ax1.set_zlabel('color_score')

# Right panel: scaled features; the scaler returns a plain array, so the
# columns are addressed by position (1=width, 2=height, 3=color_score,
# assuming X_train keeps the mass/width/height/color_score column order)
ax2 = fig.add_subplot(122, projection='3d', aspect='equal')
ax2.scatter(X_train_scaled[:, 1], X_train_scaled[:, 2], X_train_scaled[:, 3],
            c=colors, marker='o', s=100)
ax2.set_xlabel('width')
ax2.set_ylabel('height')
ax2.set_zlabel('color_score')

plt.show()

3、歸一化對結果的影響

from sklearn.neighbors import KNeighborsClassifier

# A single 5-nearest-neighbour classifier, refitted for each feature set
knn = KNeighborsClassifier(n_neighbors=5)

# Baseline: train and evaluate on the features as loaded (no scaling)
knn.fit(X_train, y_train)
raw_accuracy = knn.score(X_test, y_test)
print('未歸一化特徵,測試準確率:{:.3f}'.format(raw_accuracy))

# Refit the same classifier on the scaled features and evaluate again
knn.fit(X_train_scaled, y_train)
scaled_accuracy = knn.score(X_test_scaled, y_test)
print('歸一化特徵後,測試準確率:{:.3f}'.format(scaled_accuracy))

4、標籤編碼和獨熱編碼

# Hand-written toy samples used as an example: column 0 is a categorical
# feature (male/female), column 1 is an ordinal feature (low/middle/high)
X_train = np.array([
    ('male', 'low'),
    ('female', 'low'),
    ('female', 'middle'),
    ('male', 'low'),
    ('female', 'high'),
    ('male', 'low'),
    ('female', 'low'),
    ('female', 'high'),
    ('male', 'low'),
    ('male', 'high'),
])

# Held-out samples with the same two columns
X_test = np.array([
    ('male', 'low'),
    ('male', 'low'),
    ('female', 'middle'),
    ('female', 'low'),
    ('female', 'high'),
])
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Fit the encoders on the training set
label_enc1 = LabelEncoder()    # maps male/female to integer codes first
one_hot_enc = OneHotEncoder()  # expands those integer codes into one-hot columns
label_enc2 = LabelEncoder()    # maps low/middle/high to integer codes
# NOTE(review): LabelEncoder numbers classes in sorted order (high=0, low=1,
# middle=2), so these codes do not follow low < middle < high — confirm this
# is acceptable for the ordinal feature.

# reshape(-1, 1) turns the 1-D code vector into a 2-D single-column array,
# which is the layout OneHotEncoder expects
tr_feat1_tmp = label_enc1.fit_transform(X_train[:, 0]).reshape(-1, 1)
# toarray() gives a plain ndarray; todense() would return np.matrix and make
# the hstack result a matrix as well
tr_feat1 = one_hot_enc.fit_transform(tr_feat1_tmp).toarray()
tr_feat2 = label_enc2.fit_transform(X_train[:, 1]).reshape(-1, 1)
X_train_enc = np.hstack((tr_feat1, tr_feat2))
print(X_train_enc)

# Encode the test set with the already-fitted encoders (transform only, no refit)
te_feat1_tmp = label_enc1.transform(X_test[:, 0]).reshape(-1, 1)
te_feat1 = one_hot_enc.transform(te_feat1_tmp).toarray()
te_feat2 = label_enc2.transform(X_test[:, 1]).reshape(-1, 1)
X_test_enc = np.hstack((te_feat1, te_feat2))
print(X_test_enc)