1. 程式人生 > 其它 > python資料分析八_Na值的資料處理

python資料分析八_Na值的資料處理

    # -*- coding: utf-8 -*-
    import pandas as pd
    
    from pandas import Series,DataFrame
    
    import numpy as np
    
    # A small string Series whose last entry is missing (NaN).
    string_data = pd.Series(['aa', 'bb', 'cc', np.nan])
    print(string_data)
    # 0     aa
    # 1     bb
    # 2     cc
    # 3    NaN

    # Boolean mask that is True where a value is missing.
    print(string_data.isnull())
    # 0    False
    # 1    False
    # 2    False
    # 3     True
    # dtype: bool

    # Python's built-in None is treated as an NA value as well.
    #
    # NA-handling methods used in this script:
    #   dropna()  - filter out missing entries; a threshold can be supplied
    #               to tune how much missing data is tolerated
    #   fillna()  - replace missing entries with a default value
    #   isnull()  - mask that is True where a value is missing
    #   notnull() - mask that is True where a value is present

    # Two equivalent ways to keep only the non-missing entries.
    print(string_data.loc[string_data.notnull()])
    print(string_data.dropna())
    # 0    aa
    # 1    bb
    # 2    cc

    # Replace the missing entry with the constant 1.
    print(string_data.fillna(value=1))
    # 0    aa
    # 1    bb
    # 2    cc
    # 3     1
    # dtype: object
    
    # 4x3 frame: row 0 is complete, rows 1 and 3 miss column 0,
    # and row 2 is missing entirely.
    data = pd.DataFrame([
        [1, 2, 3],
        [np.nan, 2, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, 2, 3],
    ])
    print(data)
    #      0    1    2
    # 0  1.0  2.0  3.0
    # 1  NaN  2.0  3.0
    # 2  NaN  NaN  NaN
    # 3  NaN  2.0  3.0

    # Drop every row that contains at least one NA value.
    print(data.dropna(how='any'))
    #      0    1    2
    # 0  1.0  2.0  3.0

    # Drop only the rows in which every value is NA.
    print(data.dropna(how='all'))
    #      0    1    2
    # 0  1.0  2.0  3.0
    # 1  NaN  2.0  3.0
    # 3  NaN  2.0  3.0

    # Add a column (label 4) that is entirely NA ...
    data[4] = np.nan
    print(data)
    #      0    1    2   4
    # 0  1.0  2.0  3.0 NaN
    # 1  NaN  2.0  3.0 NaN
    # 2  NaN  NaN  NaN NaN
    # 3  NaN  2.0  3.0 NaN

    # ... then drop the columns in which every value is NA.
    print(data.dropna(axis='columns', how='all'))
    #      0    1    2
    # 0  1.0  2.0  3.0
    # 1  NaN  2.0  3.0
    # 2  NaN  NaN  NaN
    # 3  NaN  2.0  3.0
    
    # 7x3 frame of standard-normal samples. The values are random, so the
    # sample outputs shown below differ from run to run (and between prints,
    # since they were captured on separate runs).
    data = DataFrame(np.random.randn(7, 3))
    print(data)
    #           0         1         2
    # 0  0.329393 -0.849128  1.864103
    # 1 -1.413807  0.461709  1.097039
    # 2  0.191843  0.654831 -0.527389
    # 3 -1.012322 -0.210649 -0.226674
    # 4  0.864600  0.960556 -1.436670
    # 5 -1.411272 -0.315413  0.575377
    # 6 -0.819563 -0.893195  0.057541

    # Blank out column 1 for row labels 0..4. `.loc` slices by label and
    # includes the end label. (`.ix` was removed in pandas 1.0.)
    data.loc[:4, 1] = np.nan
    print(data)
    #           0         1         2
    # 0  0.051068       NaN  0.333383
    # 1  0.793481       NaN -1.240897
    # 2  0.705667       NaN  0.797441
    # 3  0.088447       NaN  0.059333
    # 4 -1.638566       NaN -0.853278
    # 5  0.676200  0.151795 -1.329852
    # 6 -0.849945  0.955315 -0.526976

    # Blank out column 2 for row labels 0..2 (end label inclusive).
    data.loc[:2, 2] = np.nan
    print(data)
    #           0         1         2
    # 0 -0.429629       NaN       NaN
    # 1  0.071094       NaN       NaN
    # 2  0.735083       NaN       NaN
    # 3 -2.396363       NaN  0.236465
    # 4 -2.817603       NaN -0.919750
    # 5 -1.031900  0.941620  1.547814
    # 6  1.290588  0.116789  0.375252

    # thresh=3 keeps only rows with at least 3 non-NA values; with three
    # columns that leaves just the fully populated rows 5 and 6.
    print(data.dropna(thresh=3))
    #           0         1         2
    # 5  0.559282  0.406619 -0.633786
    # 6  0.061291 -0.586182  0.123497

    # --- Filling in missing data ---

    # Replace every NA with a single constant.
    print(data.fillna(0))
    #           0         1         2
    # 0 -0.621390  0.000000  0.000000
    # 1 -1.483077  0.000000  0.000000
    # 2 -0.948426  0.000000  0.000000
    # 3  1.641440  0.000000  2.015218
    # 4 -1.036951  0.000000  0.990668
    # 5 -0.396387 -0.043747 -0.579406
    # 6 -0.272858 -1.523178 -0.494554

    # Per-column fill values: column label 1 gets 0.5, column label 2 gets -1.
    print(data.fillna({1: 0.5, 2: -1}))
    #           0         1         2
    # 0 -0.857152  0.500000 -1.000000
    # 1 -1.054372  0.500000 -1.000000
    # 2  0.930643  0.500000 -1.000000
    # 3 -1.130063  0.500000  0.240622
    # 4 -0.623378  0.500000  0.524232
    # 5  0.662496 -0.101754  0.170956
    # 6  0.213570  0.864930 -0.383311

    # inplace=True modifies `data` itself instead of returning a new frame,
    # so there is no result worth binding to a name.
    data.fillna(0, inplace=True)
    print(data)
    #           0         1         2
    # 0 -1.440240  0.000000  0.000000
    # 1  0.100231  0.000000  0.000000
    # 2 -0.660957  0.000000  0.000000
    # 3  0.004898  0.000000 -1.313950
    # 4  1.110324  0.000000 -0.276177
    # 5  2.536283 -0.294194 -0.706136
    # 6 -2.313634 -0.270051  0.295415
    
    # The fill strategies known from reindex (e.g. forward fill) can also be
    # used to fill NA values. Values are random, so the sample outputs below
    # differ from run to run.
    df = DataFrame(np.random.randn(6, 3))
    # `.loc` slices by label; `.ix` was removed in pandas 1.0.
    df.loc[2:, 1] = np.nan  # column 1 is missing from row label 2 onward
    df.loc[4:, 2] = np.nan  # column 2 is missing from row label 4 onward
    print(df)
    #           0         1         2
    # 0 -1.292891  0.977053 -1.339258
    # 1 -0.981534  0.643460 -0.699660
    # 2 -0.343731       NaN  0.812251
    # 3  0.446141       NaN -0.824229
    # 4  0.389609       NaN       NaN
    # 5  0.716714       NaN       NaN

    # Forward fill: carry the last valid value in each column downward.
    # ffill() replaces the deprecated fillna(method='ffill') spelling.
    print(df.ffill())
    #           0         1         2
    # 0 -0.672379  0.088150 -0.765589
    # 1  0.225561  1.370398 -1.211027
    # 2  0.379040  1.370398 -1.009322
    # 3 -0.388188  1.370398 -0.986014
    # 4  0.387574  1.370398 -0.986014
    # 5 -0.196254  1.370398 -0.986014

    # limit=2 fills at most two consecutive NA values per gap, so column 1
    # is still NA in rows 4 and 5.
    print(df.ffill(limit=2))
    #           0         1         2
    # 0  0.816393  0.031747  1.354395
    # 1 -0.940994  0.093215  0.837312
    # 2 -0.163731  0.093215  0.349830
    # 3  0.268226  0.093215 -0.762212
    # 4 -2.650622       NaN -0.762212
    # 5 -1.195725       NaN -0.762212
    
    
    # Fill the NA entries with the mean of the values that are present.
    data = pd.Series([1, np.nan, 2, np.nan, 3])

    print(data.fillna(value=data.mean()))
    # 0    1.0
    # 1    2.0
    # 2    2.0
    # 3    2.0
    # 4    3.0
    # dtype: float64

    # fillna parameters covered by these notes:
    #   value   - the fill value (a scalar, or a dict keyed by column label)
    #   method  - fill strategy such as 'ffill', which carries the previous
    #             valid value forward
    #   axis    - axis to fill along (not exercised in this script)
    #   inplace - modify the object itself instead of producing a copy
    #   limit   - cap on how many consecutive NA values get filled