python中的pandas包的資料清洗能力
阿新 • • 發佈:2019-01-25
pandas 很強大。前幾天公司要求根據已知的使用者身份證號、手機號,推算出客戶的星座、性別、年齡等相關資訊,用來做使用者畫像。一開始想用 R 語言來實現,但考慮到日後的效能問題就放棄了;由於公司沒有 SAS,最後用 pandas 快速實現,參考程式碼如下:
# coding: UTF-8
"""Decode resident ID numbers into profile attributes with pandas.

Created on 2015-08-25

@author: ZHOUMEIXU204

The script reads the ``table_id`` table from the ``analyse`` MySQL database,
derives region, zodiac animal, constellation, sex, birthday and age from the
``cert_id`` column, and writes the result (without ``usr_name``) to the
``cert_info`` table of ``analyse_dev``.

ID layout relied on by the helpers below:

* 18 characters: region code (6) + birth date YYYYMMDD (8) + sequence (3)
  + check character (1); the 17th character is odd for male.
* 15 characters: region code (6) + birth date YYMMDD (6) + sequence (3);
  the last character is odd for male.  The century is assumed to be 19xx.
"""
import datetime
import os
import time

import pandas as pd

# Spreadsheet mapping region codes (column ``BM``) to region names
# (column ``DQ``).
REGION_XLSX = u'D:\\Users\\zhoumeixu204\\Desktop\\全國身份證歸屬地資料庫.xlsx'

# NOTE(security): these connection settings were hard-coded in the original
# script.  They are kept only as fallbacks so existing runs behave the same;
# supply real values through the environment and rotate the embedded password.
DB_HOST = os.environ.get("ANALYSE_DB_HOST", "202.69.27.239")
DB_PORT = int(os.environ.get("ANALYSE_DB_PORT", "8443"))
DB_USER = os.environ.get("ANALYSE_DB_USER", "root")
DB_PASSWD = os.environ.get("ANALYSE_DB_PASSWD", "Pa123456!")

_ASCII_DIGITS = u"0123456789"

# (last MMDD value covered, label), in ascending order.  Capricorn appears
# twice because it wraps around the new year.
# NOTE(review): u'天枰座' and u'魔蠍座' look like misspellings of the usual
# names for Libra and Capricorn; they are reproduced unchanged because they
# are the values the script has always written to the database.
_CONSTELLATION_BOUNDS = (
    (119, u'魔蠍座'),
    (218, u'水瓶座'),
    (320, u'雙魚座'),
    (419, u'白羊座'),
    (520, u'金牛座'),
    (621, u'雙子座'),
    (722, u'巨蟹座'),
    (822, u'獅子座'),
    (922, u'處女座'),
    (1023, u'天枰座'),
    (1121, u'天蠍座'),
    (1221, u'射手座'),
    (1231, u'魔蠍座'),
)

# Zodiac animal indexed by ``birth_year % 12`` (e.g. 2016 % 12 == 0 -> monkey).
_ZODIAC_ANIMALS = (
    u'猴', u'雞', u'狗', u'豬', u'鼠', u'牛',
    u'虎', u'兔', u'龍', u'蛇', u'馬', u'羊',
)


def _id_text(x):
    """Return *x* when it is string-like, otherwise an empty string.

    Missing values (``None``/NaN) therefore fall into the "too short" branch
    of every decoder instead of raising.
    """
    return x if hasattr(x, "isdigit") else u""


def _birth_digits(x):
    """Return the birth date in ID *x* as 8 digits ``YYYYMMDD``, or ``None``.

    ``None`` is returned when the ID is shorter than 15 characters or the
    birth-date positions are not all ASCII digits.
    """
    text = _id_text(x)
    if len(text) < 15:
        return None
    if len(text) == 15:
        # Old-style IDs store a two-digit year; they predate 2000.
        digits = u"19" + text[6:12]
    else:
        digits = text[6:14]
    if len(digits) != 8 or any(ch not in _ASCII_DIGITS for ch in digits):
        return None
    return digits


def constellation(x):
    """Return the constellation label for the birth date in ID *x*.

    Returns u'無法識別' when no birth date can be read and u'其他' when the
    month/day value is outside 0101..1231.
    """
    digits = _birth_digits(x)
    if digits is None:
        return u'無法識別'
    monthday = int(digits[4:8])
    if monthday < 101:
        return u'其他'
    for last_day, label in _CONSTELLATION_BOUNDS:
        if monthday <= last_day:
            return label
    return u'其他'


def zodiac(x):
    """Return the zodiac animal for the birth year in ID *x*.

    The animal is chosen from the Gregorian year alone (``year % 12``); the
    lunar new year boundary is not taken into account.  Returns u'無法獲得'
    when no birth date can be read.
    """
    digits = _birth_digits(x)
    if digits is None:
        return u'無法獲得'
    return _ZODIAC_ANIMALS[int(digits[:4]) % 12]


def sex(x):
    """Return u'男' or u'女' from the parity digit of ID *x*.

    The parity digit is the last character of a 15-character ID and the
    second-to-last of an 18-character ID; odd means male.  Any other length,
    or a non-digit in that position, yields u'無法識別'.
    """
    text = _id_text(x)
    if len(text) == 15:
        marker = text[-1]
    elif len(text) == 18:
        marker = text[-2]
    else:
        return u'無法識別'
    if marker not in _ASCII_DIGITS:
        return u'無法識別'
    return u'男' if int(marker) % 2 == 1 else u'女'


def birthday(x):
    """Return the birth date in ID *x* as ``YYYY-MM-DD``, or u'無法獲得'."""
    digits = _birth_digits(x)
    if digits is None:
        return u'無法獲得'
    return "-".join((digits[:4], digits[4:6], digits[6:8]))


def age(x):
    """Return current year minus birth year for ID *x*, as a string.

    Month and day are ignored, matching the original calculation.  Returns
    u'無法獲得' when no birth date can be read.
    """
    digits = _birth_digits(x)
    if digits is None:
        return u'無法獲得'
    return str(datetime.datetime.now().year - int(digits[:4]))


def _connect(db):
    """Open a MySQL connection to database *db* using the module settings."""
    # Imported here so the decoding helpers can be imported and tested on a
    # machine without the MySQL driver installed.
    import MySQLdb

    return MySQLdb.connect(host=DB_HOST, port=DB_PORT, user=DB_USER,
                           passwd=DB_PASSWD, db=db,
                           use_unicode=True, charset="utf8")


def main():
    """Run the extract / decode / load job end to end."""
    region_table = pd.read_excel(REGION_XLSX)
    print(region_table.head())
    # Key the lookup on the first six characters of the code column.
    id_dict = dict(zip(region_table['BM'].astype(str).str[:6],
                       region_table['DQ']))

    con = _connect("analyse")
    try:
        con_dev = _connect("analyse_dev")
        try:
            table_id_decode = pd.read_sql("select * from table_id", con)
            # ``.str`` leaves missing IDs as NaN, which then fall through to
            # the fillna label instead of raising.
            table_id_decode['cert_address'] = (
                table_id_decode['cert_id'].str[:6]
                .map(id_dict)
                .fillna(u'無法匹配')
            )

            # NOTE(review): pause carried over from the original script; its
            # purpose is not evident from the code.
            time.sleep(0.5)

            for column, decoder in (('zodiac', zodiac),
                                    ('constellation', constellation),
                                    ('sex', sex),
                                    ('birthday', birthday),
                                    ('age', age)):
                table_id_decode[column] = (
                    table_id_decode['cert_id'].apply(decoder))

            cert_address = table_id_decode.drop('usr_name', axis=1)
            # NOTE(review): ``flavor`` and a raw MySQLdb connection are only
            # accepted by old pandas releases; newer ones expect an SQLAlchemy
            # connectable -- confirm against the installed pandas version.
            cert_address.to_sql("cert_info", con_dev, flavor="mysql",
                                if_exists='replace', index=False)
            con.commit()
            con_dev.commit()
        finally:
            con_dev.close()
    finally:
        con.close()
    print("suceess")


if __name__ == "__main__":
    main()