【Pandas-Cookbook】07:資料清洗
阿新 • • 發佈:2019-02-17
# -*-coding:utf-8-*-
# by kevinelstri
# 2017.2.17
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ---------------------
# Chapter 7 - Cleaning up messy data.ipynb 清理垃圾資料
# ---------------------
# Load the raw 311 service-request data using pandas' default parsing.
requests = pd.read_csv('../data/311-service-requests.csv')
# Uncomment to preview the first rows:
# print(requests.head())
'''
7.1 How do we know if it's messy?
'''
# unique() lists every distinct value in the column, which is the quickest
# way to spot inconsistent entries. The result is deliberately not called
# `zip`, so the builtin zip() stays usable in this module.
raw_unique_zips = requests['Incident Zip'].unique()
# print(raw_unique_zips)
'''
zip中存在的問題:
1、資料型別問題,有些是字串型,有些是浮點型
2、有一些值不存在nan
3、有些值不正確 83 29616-0759
4、有N/A值,pandas不能夠識別,'N/A','NO CLUE'
處理方法:
1、使'N/A','NO CLUE'變成規則的nan
2、使所有格式都變成字串
'''
'''
7.3 Fixing the nan values and string/float confusion
'''
# Placeholder strings that read_csv should parse as missing values.
# NOTE: the third entry is the digit '0' (not the letter 'O'), matching the
# sentinel list used in section 7.5 of this script.
na_value = ['N/A', 'NO CLUE', '0', 'nan']
# Re-read the CSV so the placeholders above become NaN, and force the zip
# column to be read as strings so it no longer mixes str and float values.
requests = pd.read_csv(
    '../data/311-service-requests.csv',
    na_values=na_value,
    dtype={'Incident Zip': str}
)
# Independent copy of the column for inspection; named to avoid shadowing the
# builtin zip().
incident_zips = requests['Incident Zip'].copy()
# print(incident_zips.unique())
'''
7.4 What's up with the dashes? 處理數字之間的橫槓29616-0759
'''
# Flag every row whose zip contains a dash (e.g. 29616-0759). na=False makes
# missing zips count as "no dash", so the mask is purely boolean; regex=False
# treats '-' as a literal character.
row_with_dashs = requests['Incident Zip'].str.contains('-', regex=False, na=False)
# print(len(requests[row_with_dashs]))
# print(requests[row_with_dashs])
# Blank out the dashed zips. A single .loc assignment is used instead of
# chained indexing (df[col][mask] = ...), which may write to a temporary copy.
requests.loc[row_with_dashs, 'Incident Zip'] = np.nan
# print(requests['Incident Zip'].unique())
long_zip_codes = requests['Incident Zip'].str.len() > 5
# print(requests['Incident Zip'][long_zip_codes].unique())
# Keep only the first five characters of every zip
# (equivalent to requests['Incident Zip'].str[0:5]).
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)
# print(requests['Incident Zip'].unique())
# Replace the all-zero zip '00000' with NaN.
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan
# Sort only the non-missing values: NaN is a float and cannot be ordered
# against strings on Python 3.
unique_zips = np.sort(requests['Incident Zip'].dropna().unique())
print(unique_zips)
zips = requests['Incident Zip']
# "Close" zips are those beginning with 0 or 1. na=False keeps the mask
# strictly boolean so it can be negated with ~ even when zips are missing.
is_close = zips.str.startswith('0', na=False) | zips.str.startswith('1', na=False)
# "Far" zips are the remaining non-missing ones.
is_far = ~is_close & zips.notnull()
print(zips[is_far])
# sort_values replaces the removed DataFrame.sort method.
print(requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip'))
# Upper-case the city names so differently-cased spellings are counted together.
print(requests['City'].str.upper().value_counts())
'''
7.5 Putting it together
'''
# Final load: declare the placeholder strings that stand for "missing" and
# read the zip column as text, all in a single read_csv call.
na_values = ['NO CLUE', 'N/A', '0']
read_options = dict(na_values=na_values, dtype={'Incident Zip': str})
requests = pd.read_csv('../data/311-service-requests.csv', **read_options)
# Normalise zip codes to at most five characters.
def fix_zip_codes(zips):
    """Return a cleaned copy of a Series of zip-code strings.

    Each value is truncated to its first five characters, and the
    placeholder zip '00000' is replaced with NaN. Missing values stay
    missing. The input Series is not modified.

    Args:
        zips: pandas Series of zip codes stored as strings (may contain NaN).

    Returns:
        A new pandas Series with the cleaned zip codes.
    """
    truncated = zips.str.slice(0, 5)
    # mask() builds a new Series rather than assigning into an existing one.
    return truncated.mask(truncated == '00000', np.nan)
# Clean the zip column, then show the distinct values that remain.
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(requests['Incident Zip'].unique())