Datawhale學資料分析第二章第一節
阿新 • • 發佈:2020-08-21
import numpy as np
import pandas as pd

# Titanic training data; every step below adds or overwrites columns on this
# DataFrame.  Bare expressions (no assignment) are notebook-style inspections.
df = pd.read_csv('/Users/mofashipython/test/train.csv')

# --- Missing-value statistics ---
df.info()
# Output recorded in the original notes:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# PassengerId    891 non-null int64
# Survived       891 non-null int64
# Pclass         891 non-null int64
# Name           891 non-null object
# Sex            891 non-null object
# Age            714 non-null float64
# SibSp          891 non-null int64
# Parch          891 non-null int64
# Ticket         891 non-null object
# Fare           891 non-null float64
# Cabin          204 non-null object
# Embarked       889 non-null object
# dtypes: float64(2), int64(5), object(5)
# memory usage: 83.6+ KB

# --- Locating missing values ---
# Comparing with `== None` or `== np.nan` is False for every row (NaN never
# compares equal to anything), so a mask built that way selects nothing and
# assigning through it is a no-op.  isna() is the test that actually finds
# the missing entries.
df['Age'].isna().sum()   # number of passengers with no recorded age
df[df['Age'].isna()]     # the rows themselves
# The missing ages are intentionally left as NaN so the binning steps below
# only see recorded ages (pd.cut / pd.qcut propagate NaN).  To impute
# instead, use df['Age'].fillna(<value>).

# --- Duplicate rows ---
# Rows that repeat an earlier row in every column.
df[df.duplicated()]
# drop_duplicates() returns a new frame rather than modifying df in place,
# so rebind the result for the removal to take effect.
df = df.drop_duplicates()

# --- Binning (discretisation) ---
# Split Age into 5 equal-width bands, labelled '1'..'5'.
df['AgeBand'] = pd.cut(df['Age'], 5, labels=['1', '2', '3', '4', '5'])

# Split Age into [0,5) [5,15) [15,30) [30,50) [50,80), labelled '1'..'5'.
# right=False makes every interval closed on the left and open on the right,
# as specified; pd.cut's default (right=True) would give (0,5] ... (50,80].
# Ages outside the edges -- including exactly 80 -- become NaN.
df['AgeBand'] = pd.cut(df['Age'], [0, 5, 15, 30, 50, 80], right=False,
                       labels=['1', '2', '3', '4', '5'])
df.to_csv('test_cut.csv')

# Split Age at the 10% / 30% / 50% / 70% / 90% quantiles, labelled '1'..'5'.
# The last edge is the 0.9 quantile, so ages above it get NaN.
df['AgeBand'] = pd.qcut(df['Age'], [0, 0.1, 0.3, 0.5, 0.7, 0.9],
                        labels=['1', '2', '3', '4', '5'])

# --- Encoding text variables ---
# Inspect the category names and how often each occurs.
df['Sex'].value_counts()
df['Sex'].unique()

# Convert the category text to integer codes.  Two ways are shown and the
# second assignment overwrites the first: replace() leaves any value that is
# not listed unchanged, whereas map() turns it into NaN.
df['Sex_num'] = df['Sex'].replace(['male', 'female'], [1, 2])
df['Sex_num'] = df['Sex'].map({'male': 1, 'female': 2})

# Convert category text to one-hot columns, appended to df with the source
# column name as prefix.
for feat in ["Age", "Embarked"]:
    # Alternatives kept from the original notes:
    # x = pd.get_dummies(df["Age"] // 6)
    # x = pd.get_dummies(pd.cut(df['Age'],5))
    x = pd.get_dummies(df[feat], prefix=feat)
    df = pd.concat([df, x], axis=1)
    #df[feat] = pd.get_dummies(df[feat], prefix=feat)