Python 檢測和過濾異常值
阿新 • • 發佈:2019-02-13
# Detecting and filtering outliers in a DataFrame of standard-normal samples.
#
# The script builds a reproducible 1000 x 4 table of random values, shows the
# values / rows whose magnitude exceeds 3, and finally caps every such value
# to the interval [-3, 3].

import numpy as np
from pandas import DataFrame, Series  # Series is unused here; import kept as in the original

# Seed the global NumPy random generator so the sample is reproducible.
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
print(data.describe())
# Expected output:
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean     -0.067684     0.067924     0.025598    -0.002298
# std       0.998035     0.992106     1.006835     0.996794
# min      -3.428254    -3.548824    -3.184377    -3.745356
# 25%      -0.774890    -0.591841    -0.641675    -0.644144
# 50%      -0.116401     0.101143     0.002073    -0.013611
# 75%       0.616366     0.780282     0.680391     0.654328
# max       3.366626     2.653656     3.260383     3.927528

# --- 1. Find outliers in a single column --------------------------------
col = data[3]
# Boolean-mask the column: keep only entries whose absolute value exceeds 3.
print(col[np.abs(col) > 3])
# Expected output (index, value):
# 97     3.927528
# 305   -3.399312
# 400   -3.745356
# Name: 3, dtype: float64

# --- 2. Find every row that contains at least one outlier ----------------
# `any` reduces a boolean table to True where at least one element is True;
# `all` would instead require every element to be True.
# `axis=1` reduces across the columns, yielding one flag per row.  The axis
# is passed by keyword because pandas 2.0+ rejects it as a positional
# argument (the original `.any(1)` raises TypeError there).
print(data[(np.abs(data) > 3).any(axis=1)])
# Expected output:
#             0         1         2         3
# 5   -0.539741  0.476985  3.248944 -1.021228
# 97  -0.774363  0.552936  0.106061  3.927528
# 102 -0.655054 -0.565230  3.176873  0.959533
# 305 -2.315555  0.457246 -0.025907 -3.399312
# 324  0.050188  1.951312  3.260383  0.963301
# 400  0.146326  0.508391 -0.196713 -3.745356
# 499 -0.293333 -0.242459 -3.056990  1.918403
# 523 -3.428254 -0.296336 -0.439938 -0.867165
# 586  0.275144  1.179227 -3.184377  1.369891
# 808 -0.362528 -3.548824  1.553205 -2.186301
# 900  3.366626 -2.372214  0.851010  1.332846

# --- 3. Cap outliers to the interval [-3, 3] -----------------------------
# np.sign returns 1.0 for positive, -1.0 for negative and 0.0 for zero
# values, so `np.sign(data) * 3` is +3 or -3 with the sign of the original
# value.  Only the cells selected by the boolean mask are overwritten.
data[np.abs(data) > 3] = np.sign(data) * 3
print(data.describe())
# Expected output:
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean     -0.067623     0.068473     0.025153    -0.002081
# std       0.995485     0.990253     1.003977     0.989736
# min      -3.000000    -3.000000    -3.000000    -3.000000
# 25%      -0.774890    -0.591841    -0.641675    -0.644144
# 50%      -0.116401     0.101143     0.002073    -0.013611
# 75%       0.616366     0.780282     0.680391     0.654328
# max       3.000000     2.653656     3.000000     3.000000