Python for Data Science - Treating missing values
阿新 • • 發佈:2021-01-02
Chapter 2 - Data Preparation Basics
Segment 2 - Treating missing values
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
Figuring out what data is missing
missing = np.nan series_obj = Series(['row 1','row 2',missing,'row 4','row 5','row 6',missing,'row 8']) series_obj
0 row 1
1 row 2
2 NaN
3 row 4
4 row 5
5 row 6
6 NaN
7 row 8
dtype: object
series_obj.isnull()
0 False
1 False
2 True
3 False
4 False
5 False
6 True
7 False
dtype: bool
Filling in for missing values
np.random.seed(25) DF_obj = DataFrame(np.random.rand(36).reshape(6,6)) DF_obj
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | 0.113041 |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | 0.699186 |
3 | 0.366395 | 0.836375 | 0.481343 | 0.516502 | 0.383048 | 0.997541 |
4 | 0.514244 | 0.559053 | 0.034450 | 0.719930 | 0.421004 | 0.436935 |
5 | 0.281701 | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | NaN |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | NaN |
3 | NaN | 0.836375 | 0.481343 | 0.516502 | 0.383048 | NaN |
4 | NaN | 0.559053 | 0.034450 | 0.719930 | 0.421004 | NaN |
5 | NaN | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
filled_DF = DF_obj.fillna(0)
filled_DF
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | 0.000000 |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | 0.000000 |
3 | 0.000000 | 0.836375 | 0.481343 | 0.516502 | 0.383048 | 0.000000 |
4 | 0.000000 | 0.559053 | 0.034450 | 0.719930 | 0.421004 | 0.000000 |
5 | 0.000000 | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
filled_DF = DF_obj.fillna({0:0.1, 5:1.25})
filled_DF
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | 1.250000 |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | 1.250000 |
3 | 0.100000 | 0.836375 | 0.481343 | 0.516502 | 0.383048 | 1.250000 |
4 | 0.100000 | 0.559053 | 0.034450 | 0.719930 | 0.421004 | 1.250000 |
5 | 0.100000 | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
fill_DF = DF_obj.fillna(method='ffill')
fill_DF
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | 0.117376 |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | 0.117376 |
3 | 0.447031 | 0.836375 | 0.481343 | 0.516502 | 0.383048 | 0.117376 |
4 | 0.447031 | 0.559053 | 0.034450 | 0.719930 | 0.421004 | 0.117376 |
5 | 0.447031 | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
Counting missing values
np.random.seed(25)
DF_obj = DataFrame(np.random.rand(36).reshape(6,6))
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.411100 | 0.117376 |
1 | 0.684969 | 0.437611 | 0.556229 | 0.367080 | 0.402366 | NaN |
2 | 0.447031 | 0.585445 | 0.161985 | 0.520719 | 0.326051 | NaN |
3 | NaN | 0.836375 | 0.481343 | 0.516502 | 0.383048 | NaN |
4 | NaN | 0.559053 | 0.034450 | 0.719930 | 0.421004 | NaN |
5 | NaN | 0.900274 | 0.669612 | 0.456069 | 0.289804 | 0.525819 |
DF_obj.isnull().sum()
0 3
1 0
2 0
3 0
4 0
5 4
dtype: int64
Filtering out missing values
DF_no_NaN = DF_obj.dropna()
DF_no_NaN
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0.870124 | 0.582277 | 0.278839 | 0.185911 | 0.4111 | 0.117376 |
DF_no_NaN = DF_obj.dropna(axis=1)
DF_no_NaN
1 | 2 | 3 | 4 | |
---|---|---|---|---|
0 | 0.582277 | 0.278839 | 0.185911 | 0.411100 |
1 | 0.437611 | 0.556229 | 0.367080 | 0.402366 |
2 | 0.585445 | 0.161985 | 0.520719 | 0.326051 |
3 | 0.836375 | 0.481343 | 0.516502 | 0.383048 |
4 | 0.559053 | 0.034450 | 0.719930 | 0.421004 |
5 | 0.900274 | 0.669612 | 0.456069 | 0.289804 |