資料準備和特徵工程
1.感知資料
1.1.1CSV檔案
pd.read_csv(csv_file, index_col=0)
index_col=0 表示將資料的第一列作為索引(預設 index_col=None,即不把任何一列當作索引;index_col=1 則是第二列)
df_new.to_csv("work/files/ten_bicycle.csv")
儲存成csv檔案
1.1.2Excel檔案
jiangsu = pd.read_excel("/home/aistudio/data/data20465/jiangsu.xls")
jiangsu.to_excel('work/files/jiangsu.xlsx')
cpi.drop([11, 12], axis=0, inplace=True)
刪除索引標籤為 11、12 的行(axis=0 表示按行刪除),inplace=True 表示直接在原 DataFrame 上修改
cpi.reset_index(drop=True, inplace=True)
重置索引
cpi.columns.rename('', inplace=True)
重新命名列索引(columns)本身的名稱(axis name),並非逐一重新命名各個列的列名
for column in cpi.columns[:-1]:
cpi[column] = pd.to_numeric(cpi[column])
cpi.dtypes
將資料轉換為數字
ax.boxplot(js['population'], showmeans=True)
畫箱線圖並顯示均值
1.1.3圖形檔案
from PIL import Image
color_image = Image.open("work/images/laoqi.png")
讀取圖片1
gray_image = Image.open("work/images/laoqi.png").convert("L")
彩色影象轉灰度圖
convert()
是影象例項物件的一個方法,接受一個 mode 引數,用以指定一種色彩模式
1 ------------------(1位畫素,黑白,每位元組一個畫素儲存)
L ------------------(8位畫素,黑白)
P ------------------(8位畫素,使用調色盤對映到任何其他模式)
RGB------------------(3x8位畫素,真彩色)
RGBA------------------(4x8位畫素,帶透明度掩模的真彩色)
CMYK--------------------(4x8位畫素,分色)
YCbCr--------------------(3x8位畫素,彩色視訊格式)
I-----------------------(32位有符號整數畫素)
F------------------------(32位浮點畫素)
import numpy as np color_array = np.array(color_image) color_array.shape 輸出:(407, 396, 4)
將彩色圖片轉為np矩陣
gray_array = np.array(gray_image) gray_array.shape 輸出:(407, 396)
將灰色圖片轉為np矩陣
import cv2 img = cv2.imread('work/images/laoqi.png', 0)
讀取圖片2(常用)
plt.imshow(img, cmap = 'gray', interpolation = 'bicubic')
顯示圖片
from PIL import Image Image.fromarray(img)
實現array到image的轉換
part_img = img[50:260, 100:280]
裁剪圖片
reverse_img = 255 - img Image.fromarray(reverse_img)
負片
part1 = img1[50:260, 100:280] part2 = img2[300:, 100:280] new_img = np.vstack((part1, part2))
拼接兩張圖片
1-2資料庫中的資料(可不看)
# Load city rows from a CSV file and bulk-insert them into a MySQL table.
import pandas as pd
import pymysql

# Open the database connection.
mydb = pymysql.connect(
    host="localhost",
    user='root',
    password='1q2w3e4r5t',
    db="books",
)
cursor = mydb.cursor()

path = "/Users/qiwsir/Documents/Codes/DataSet"
df = pd.read_csv(path + "/jiangsu/cities.csv")

# Insert each dataframe row into `city`.
# Use parameterized execute() instead of %-interpolating values into the
# SQL string: the driver quotes/escapes values itself, which avoids SQL
# injection and the type problems of hand-quoted "%s" placeholders.
sql = ("insert into city (name, area, population, longd, latd) "
       "values (%s, %s, %s, %s, %s)")
for idx in df.index:
    row = df.iloc[idx]
    cursor.execute(sql, (row['name'], row['area'], row['population'],
                         row['longd'], row['latd']))
mydb.commit()  # commit the transaction (this does NOT close the connection)
sql_count = "SELECT COUNT(1) FROM city" cursor.execute(sql_count) n = cursor.fetchone() # 獲得一個返回值 n
sql_columns = 'SELECT name, area FROM city' cursor.execute(sql_columns) cursor.fetchall() #以area欄位值從大到小查詢全部記錄; sql_sort = "SELECT * FROM city ORDER BY area DESC" cursor.execute(sql_sort) cursor.fetchall()
#更簡便的寫法 import pandas as pd import pymysql mydb = pymysql.connect(host="localhost", user='root', password='1q2w3e4r5t', db="books",) cities = pd.read_sql_query("Select * FROM city", con=mydb, index_col='id') cities
1-3網頁上的資料(可不看)
1-4來自API的資料(可不看)
2資料清理
2-0基本概念
import pandas as pd df = pd.read_csv("/home/aistudio/data/data20505/pm2.csv") df.sample(10) df.shape df.info() df.dtypes
2-1轉化資料型別
# Demonstrations of converting string columns to numeric / datetime dtypes.
import pandas as pd

# String data loads as dtype `object`; convert explicitly.
df = pd.DataFrame([{'col1': 'a', 'col2': '1'},
                   {'col1': 'b', 'col2': '2'}])   # dict-of-rows constructor
s = pd.Series(['1', '2', '4.7', 'pandas', '10'])  # list-like constructor

# Cast a clean all-numeric column to int.
df['col2-int'] = df['col2'].astype(int)

# For dirty data prefer to_numeric: errors='coerce' turns invalid values
# ('pandas' here) into NaN.  (astype(..., errors='ignore') is deprecated.)
num = pd.to_numeric(s, errors='coerce')

# to_datetime can assemble dates from Month/Day/Year columns.  The original
# note called it on `df`, which has no such columns and raises KeyError;
# build a frame that actually has them.
dates_df = pd.DataFrame({'Month': [1, 2], 'Day': [15, 20], 'Year': [2020, 2021]})
dates = pd.to_datetime(dates_df[['Month', 'Day', 'Year']])
#替換資料 def convert_money(value): new_value = value.replace("$","").replace(",","") return float(new_value) df['2016'].apply(convert_money) #替換資料2 df['Percent Growth'].apply(lambda x: float(x.replace("%", "")) / 100)
np.where(df['Active']=='Y', 1, 0) #條件查詢,滿足輸出1,不滿足輸出0
bras['creationTime'].str.split().apply(pd.Series, 0)#將axis=0字元分割並轉換成pd.Series bras['productColor'].str.findall("[\u4E00-\u9FFF]+").str[0]#正則表示式匹配 bras2.str.findall("[a-zA-Z]+").str[0] bras2 = bras['productSize'].str.upper()#轉換成大寫字母
2-2處理重複資料
df.duplicated('Age', keep='last')#保留重複資料的後一個,返回:指定列重複行boolean Series df.drop_duplicates('Age', keep='last')# 返回:副本或替代
df[df.duplicated()].count() / df.count() #檢視重複資料所佔比例 輸出:Name 0.142857 Age 0.142857 Score 0.142857 dtype: float64 df.duplicated().any() #檢視是否有重複資料 輸出:True
2-3處理缺失資料
hitters.isna().any() #檢視是否有缺失資料 hitters.isnull().sum() (hitters.shape[0] - hitters.count()) / hitters.shape[0] #檢視缺失資料比例 df.dropna(axis=0, how='all') # how宣告刪除條件 df.dropna(thresh=2) # 非缺失值小於2的刪除 df['ColA'].fillna(method='bfill') #用指定值填補缺失資料
pdf2 = persons.sample(20) pdf2['Height-na'] = np.where(pdf2['Height'] % 5 == 0, np.nan, pdf2['Height']) # 製造缺失值 from sklearn.impute import SimpleImputer imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') #用均值替換缺失值 col_values = imp_mean.fit_transform(pdf2['Height-na'].values.reshape((-1, 1))) col_values #使用固定值替換缺失值 imp = SimpleImputer(missing_values=-1, strategy='constant', fill_value=110) imp.fit_transform(df['price'].values.reshape((-1, 1)))
#根據規律填補缺失值1 df = pd.DataFrame({"one":np.random.randint(1, 100, 10), "two": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "three":[5, 9, 13, np.nan, 21, np.nan, 29, 33, 37, 41]}) from sklearn.linear_model import LinearRegression df_train = df.dropna() #訓練集 df_test = df[df['three'].isnull()] #測試集 regr = LinearRegression() regr.fit(df_train['two'].values.reshape(-1, 1), df_train['three'].values.reshape(-1, 1)) df_three_pred = regr.predict(df_test['two'].values.reshape(-1, 1)) # 將所得數值填補到原資料集中 df.loc[(df.three.isnull()), 'three'] = df_three_pred df #根據規律填補缺失值2 from sklearn.datasets import load_iris # 引入鳶尾花資料集 import numpy as np iris = load_iris() X = iris.data # 製造含有缺失值的資料集 rng = np.random.RandomState(0) X_missing = X.copy() mask = np.abs(X[:, 2] - rng.normal(loc=5.5, scale=0.7, size=X.shape[0])) < 0.6 X_missing[mask, 3] = np.nan # X_missing是包含了缺失值的資料集 from missingpy import KNNImputer # 引入KNN填充缺失值的模型 imputer = KNNImputer(n_neighbors=3, weights="uniform") X_imputed = imputer.fit_transform(X_missing)
2-4處理離群資料
%matplotlib inline import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv("/home/aistudio/data/data20510/experiment.csv", index_col=0) fig, ax = plt.subplots() ax.scatter(df['alpha'], df['belta']) #通過散點圖檢視離散值 sns.boxplot(x="day", y="tip", data=tips, palette="Set3")#通過箱線圖檢視離散值 #箱線圖和散點圖結合檢視離散值 ax = sns.boxplot(x="day", y="tip", data=tips) ax = sns.swarmplot(x="day", y="tip", data=tips, color=".25")
# Drop outliers from boston_df using the box-plot (IQR) rule.
percentlier = boston_df.quantile([0, 0.25, 0.5, 0.75, 1], axis=0)
IQR = percentlier.iloc[3] - percentlier.iloc[1]  # box height in a box plot
Q1 = percentlier.iloc[1]                         # first quartile (25%)
Q3 = percentlier.iloc[3]                         # third quartile (75%)

# Values below Q1 - 1.5*IQR fall under the LOWER fence; values above
# Q3 + 1.5*IQR exceed the UPPER fence.  Either way they count as outliers.
(boston_df < (Q1 - 1.5 * IQR)).any()             # below the lower fence
(boston_df > (Q3 + 1.5 * IQR)).any()             # above the upper fence

# Keep only the rows with no out-of-fence value in any column.
out_of_fence = ((boston_df < (Q1 - 1.5 * IQR)) |
                (boston_df > (Q3 + 1.5 * IQR))).any(axis=1)
boston_df_out = boston_df[~out_of_fence]
boston_df_out.shape
四分位數(Quartile),即統計學中,把所有數值由小到大排列並分成四等份,處於三個分割點位置的得分就是四分位數。
第一四分位數 (Q1),又稱“較小四分位數”,等於該樣本中所有數值由小到大排列後第25%的數字。
第二四分位數 (Q2),又稱“中位數”,等於該樣本中所有數值由小到大排列後第50%的數字。
第三四分位數 (Q3),又稱“較大四分位數”,等於該樣本中所有數值由小到大排列後第75%的數字。
第三四分位數與第一四分位數的差距又稱四分位距(InterQuartile Range,IQR)。
首先確定四分位數的位置:
Q1**的位置= (n+1) × 0.25**
Q2**的位置= (n+1) × 0.5**
Q3**的位置= (n+1) × 0.75**
n表示項數
對於四分位數的確定,有不同的方法,另外一種方法基於N-1 基礎。即
Q1的位置=(n-1)x 0.25
Q2的位置=(n-1)x 0.5
Q3的位置=(n-1)x 0.75
# Drop outliers via the normal-distribution (z-score) rule.
# Assumes boston_df is loaded elsewhere in the notes.
from scipy import stats  # statistics helpers
import numpy as np

rm = boston_df['RM']
# z-score: how many standard deviations each value lies from the mean.
z = np.abs(stats.zscore(rm))

# z is ALREADY expressed in standard-deviation units, so the conventional
# cutoff is a plain 3.  The original note used `threshold = 3 * st`
# (3 times the raw-scale std), which wrongly compares z-scores against a
# quantity in the data's original units.
threshold = 3
print(np.where(z > threshold))  # indices of the outlying rows

rm_in = rm[z < threshold]       # keep only the non-outliers
rm_in.shape
3特徵變換
3-1特徵數值化
df.replace({"N": 0, 'Y': 1}) #直接替換
from sklearn.preprocessing import LabelEncoder #自動轉換 le = LabelEncoder() le.fit_transform(df['hypertension']) le.inverse_transform([0, 1, 1, 2, 1, 0]) #將標準化後的資料轉換為原始資料
# Hand-rolled bag-of-words: count each unique word's occurrences per document.
import re
import pandas as pd

d1 = "I am Laoqi. I am a programmer."
d2 = "Laoqi is in Soochow. It is a beautiful city."

# Extract words with a regex (avoids the trailing-period problem of split()).
# Lowercase BEFORE deduplicating — the original built a lowercased list but
# discarded it, so capitalized words ('Laoqi', 'I') then counted 0 against
# the lowercased document.  sorted() makes the column order deterministic.
words = sorted({w.lower() for w in re.findall(r"\w+", d1 + " " + d2)})

def count_word(document, unique_words):
    """Return whole-word occurrence counts of each unique word in `document`."""
    # Tokenize instead of using str.count(): str.count matches substrings,
    # e.g. 'am' inside 'programmer'.
    tokens = re.findall(r"\w+", document.lower())
    return [tokens.count(word) for word in unique_words]

count1 = count_word(d1, words)
count2 = count_word(d2, words)
print(count1)
print(count2)

# One row per document, one column per unique word.
df = pd.DataFrame([count1, count2], columns=words, index=['d1', 'd2'])
df
from sklearn.feature_extraction.text import CountVectorizer #使用自帶的庫進行詞頻統計 count_vect = CountVectorizer() tf1 = count_vect.fit_transform([d1, d2]) tf1.shape 輸出:(2, 9) count_vect.get_feature_names() # 相對前面方法少了2個,因為I 和 a作為常用詞停詞了。 輸出:['am', 'beautiful', 'city', 'in', 'is', 'it', 'laoqi', 'programmer', 'soochow'] tf1.toarray() # 顯示記錄數值 輸出:array([[2, 0, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 1, 2, 1, 1, 0, 1]])
3-2特徵二值化
#閾值將數值型轉變為二進位制型,閾值可以進行設定,另外只能對數值型資料進行處理,且傳入的引數必須為2D陣列,也就是不能是Series這種型別,shape為(m,n)而不是(n,)型別的陣列 from sklearn.preprocessing import Binarizer bn = Binarizer(threshold=pm25["Exposed days"].mean()) # ① result = bn.fit_transform(pm25[["Exposed days"]]) # ② pm25['sk-bdays'] = result pm25.sample(10)
from sklearn.preprocessing import binarize fbin = binarize(pm25[['Exposed days']], threshold=pm25['Exposed days'].mean()) fbin[[1, 50, 100, 150, 200]]
圖片部分(略)
3-3One-Hot編碼
pd.get_dummies(g) #pandas提供對one-hot編碼的函式 persons.merge(df_dum, left_index=True, right_index=True) #組合資料
from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder() fs = ohe.fit_transform(df[['color']]) fs_ohe = pd.DataFrame(fs.toarray()[:, 1:], columns=["color_green", 'color_red']) df = pd.concat([df, fs_ohe], axis=1) df 輸出: color size price classlabel color_green color_red 0 green 1 29.9 class1 1.0 0.0 1 red 2 69.9 class2 0.0 1.0 2 blue 3 99.9 class1 0.0 0.0 3 red 2 59.9 class1 0.0 1.0
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder import numpy as np encoded_x = None for i in range(0, X.shape[1]): label_encoder = LabelEncoder() # 數值化 feature = label_encoder.fit_transform(X[:,i]) feature = feature.reshape(X.shape[0], 1) onehot_encoder = OneHotEncoder(sparse=False) # OneHot編碼 feature = onehot_encoder.fit_transform(feature) if encoded_x is None: encoded_x = feature else: encoded_x = np.concatenate((encoded_x, feature), axis=1) print("X shape: : ", encoded_x.shape)
3-4資料變換
# Common ways to map data from a NON-normal distribution toward a normal one.
data['logtime'] = np.log10(data['time'])  # method 1: log transform

from scipy import stats
dft = stats.boxcox(transform)[0]          # method 2: Box-Cox via scipy

from sklearn.preprocessing import power_transform
dft2 = power_transform(dc_data[['AIR_TIME']], method='box-cox')  # method 3: sklearn
#使用sklearn.preprocessing.PolynomialFeatures來進行特徵的構造 from sklearn.preprocessing import PolynomialFeatures # ③ poly = PolynomialFeatures(2) # ④ poly.fit_transform(X) 原始資料: array([[0, 1], [2, 3], [4, 5]]) 構造特徵後的資料: array([[ 1., 0., 1., 0., 0., 1.], [ 1., 2., 3., 4., 6., 9.], [ 1., 4., 5., 16., 20., 25.]])
#將資料從任意分佈對映到儘可能接近高斯分佈,以穩定方差和最小化偏度 from sklearn.preprocessing import power_transform dft2 = power_transform(dc_data[['AIR_TIME']], method='box-cox') hbcs = plt.hist(dft2, bins=100)
#為了簡化構建變換和模型鏈的過程,Scikit-Learn提供了pipeline類,可以將多個處理步驟合併為單個Scikit-Learn估計器 %matplotlib inline import pandas as pd import matplotlib.pyplot as plt from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline df = pd.read_csv("/home/aistudio/data/data20514/xsin.csv") colors = ['teal', 'yellowgreen', 'gold'] plt.scatter(df['x'], df['y'], color='navy', s=30, marker='o', label="training points") for count, degree in enumerate([3, 4, 5]): model = make_pipeline(PolynomialFeatures(degree), Ridge()) # ③ model.fit(df[['x']], df[['y']]) y_pre = model.predict(df[['x']]) plt.plot(df['x'], y_pre, color=colors[count], linewidth=2, label="degree %d" % degree) plt.legend()
3-5特徵離散化
#無監督離散等分分箱 pd.cut(ages['years'],3) #可新增引數如:bins=[9, 30, 50],labels=[0, 1, 2] 輸出: 0 (9.943, 29.0] 1 (9.943, 29.0] 2 (29.0, 48.0] 3 (48.0, 67.0] 4 (48.0, 67.0] 5 (29.0, 48.0] 6 (29.0, 48.0] Name: years, dtype: category Categories (3, interval[float64]): [(9.943, 29.0] < (29.0, 48.0] < (48.0, 67.0]] #分成三部分 pd.qcut(ages['years'],3) #與cut類似
#無監督離散2 from sklearn.preprocessing import KBinsDiscretizer kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform') #n_bins=3:劃分區間個數、encode='ordinal'編碼方式:整數數值、strategy='uniform'離散化採用的特質是分割槽的寬度相同 trans = kbd.fit_transform(ages[['years']]) ages['kbd'] = trans[:, 0] ages
#有監督離散化 import entropy_based_binning as ebb A = np.array([[1,1,2,3,3], [1,1,0,1,0]]) ebb.bin_array(A, nbins=2, axis=1) 輸出:array([[0, 0, 1, 1, 1], [1, 1, 0, 1, 0]])
#有監督離散化2 from mdlp.discretization import MDLP from sklearn.datasets import load_iris transformer = MDLP() iris = load_iris() X, y = iris.data, iris.target X_disc = transformer.fit_transform(X, y) X_disc
3-6資料規範化
from sklearn import datasets from sklearn.preprocessing import StandardScaler #標準化 iris = datasets.load_iris() iris_std = StandardScaler().fit_transform(iris.data)
from sklearn.preprocessing import MinMaxScaler #最小最大區間化 iris_mm = MinMaxScaler().fit_transform(iris.data) iris_mm[:5]
from sklearn.preprocessing import RobustScaler, MinMaxScaler #RobustScaler基於原始資料的均值和標準差進行的標準化 robust = RobustScaler() robust_scaled = robust.fit_transform(X) robust_scaled = pd.DataFrame(robust_scaled, columns=['x1', 'x2'])
from sklearn.preprocessing import Normalizer #歸一化 可新增引數norm='l1'、norm='max' norma = Normalizer() norma.fit_transform([[3, 4]]) array([[0.6, 0.8]])
4特徵選擇
4-0特徵選擇概述
from sklearn.model_selection import train_test_split #分割資料集 from sklearn.preprocessing import StandardScaler X, y = df_wine.iloc[:, 1:], df_wine.iloc[:, 0].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y) std = StandardScaler() X_train_std = std.fit_transform(X_train) X_test_std = std.fit_transform(X_test)
4-1封裝器法
#循序特徵選擇 from mlxtend.feature_selection import SequentialFeatureSelector as SFS X_train, X_test, y_train, y_test= train_test_split(X, y, stratify=y, test_size=0.3, random_state=1) std = StandardScaler() X_train_std = std.fit_transform(X_train) knn = KNeighborsClassifier(n_neighbors=3) # ① sfs = SFS(estimator=knn, # ② k_features=4, forward=True, floating=False, verbose=2, scoring='accuracy', cv=0) sfs.fit(X_train_std, y_train)
#窮舉特徵選擇 from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS efs = EFS(RandomForestRegressor(),min_features=1,max_features=5,scoring='r2',n_jobs=-1) efs.fit(np.array(mini_data),y_train) mini_data.columns[list(efs.best_idx_)] #窮舉特徵選擇2 from mlxtend.feature_selection import ExhaustiveFeatureSelector from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.metrics import roc_auc_score feature_selector = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=-1), min_features=2, max_features=4, scoring='roc_auc', print_progress=True, cv=2) features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels) filtered_features= train_features.columns[list(features.best_idx_)] filtered_features
#遞迴特徵消除 from sklearn.feature_selection import RFE rfe = RFE(RandomForestRegressor(), n_features_to_select=5) rfe.fit(np.array(mini_data),y_train) rfe.ranking_
4-2過濾器法
#方法一 from sklearn.datasets import load_iris from sklearn.feature_selection import SelectKBest # ① from sklearn.feature_selection import chi2 iris = load_iris() X, y = iris.data, iris.target skb = SelectKBest(chi2, k=2) # ② result = skb.fit(X, y) # ③
#方法二 from sklearn.feature_selection import VarianceThreshold vt = VarianceThreshold(threshold=(0.8 * (1 - 0.8))) # ⑤ vt.fit_transform(X)
4-3嵌入法
# 用嵌入法選擇特徵 from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import LogisticRegression #使用logistic迴歸模型 embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median') embeded_lr_selector.fit(X_norm, y) embeded_lr_support = embeded_lr_selector.get_support() embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist() print(str(len(embeded_lr_feature)), 'selected features')
可以看下例項瞭解
5特徵抽取
5-1無監督特徵抽取
#主成分分析 from sklearn.decomposition import PCA import numpy as np pca = PCA() # ① X_pca = pca.fit_transform(X) # ② np.round(X_pca[: 4], 2)
#因子分析 from sklearn.decomposition import FactorAnalysis fa = FactorAnalysis(n_components=2) iris_two = fa.fit_transform(iris.data) iris_two[: 4]
5-2有監督特徵抽取
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis lda = LinearDiscriminantAnalysis(n_components=2) X_lda = lda.fit_transform(X, y) plt.scatter(X_lda[:, 0], X_lda[:, 1], c=y)