用spark分析北京積分落戶資料,整理北京積分落戶名單資料成csv格式
阿新 • • 發佈:2018-12-15
讀取json檔案格式資料,整理匯出成csv格式
import csv
import json


def loadData():
    """Convert ``jifenluohu.json`` into ``jifenluohu.csv``.

    Reads the list stored under the top-level ``"rows"`` key of
    ``jifenluohu.json`` (current working directory) and writes one CSV line
    per record, preceded by a header line.  Only the seven columns listed in
    ``fieldnames`` are copied; any other keys in a record are ignored.

    Returns:
        None.  The result is the file written to disk.

    Raises:
        FileNotFoundError: if ``jifenluohu.json`` does not exist.
        KeyError: if ``"rows"`` or one of the seven columns is missing.
    """
    # Column names of the output table, in output order.
    fieldnames = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]

    # Explicit UTF-8: the data contains Chinese text and the platform default
    # encoding is not UTF-8 everywhere.
    with open('jifenluohu.json', 'r', encoding='utf-8') as f:
        rows = json.load(f)['rows']

    # newline='' is required by the csv module; without it every record is
    # followed by an empty line on platforms that translate "\n" to "\r\n".
    with open("jifenluohu.csv", "w", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()  # header line
        for row in rows:
            writer.writerow({name: row[name] for name in fieldnames})
    print("寫csv完成")


if __name__ == "__main__":
    # Guarded so that importing this module does not touch the file system.
    # loadData() returns nothing, so there is no result worth printing.
    loadData()
後面又增加了年齡、生肖、星座、省份、城市等屬性,為後續進一步分析做準備。
import csv
import json
from datetime import datetime

# Zodiac animals indexed by ``year % 12``; index 0 is the Monkey
# (e.g. 2016 % 12 == 0).
_ZODIAC_ANIMALS = u'猴雞狗豬鼠牛虎兔龍蛇馬羊'

# Columns copied verbatim from each JSON record.
_BASE_FIELDS = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]
# Columns derived from the ID-card number.
_DERIVED_FIELDS = ["province", "city", "provincename", "cityname",
                   "birthday", "age", "zoo", "star"]


def chinese_zodiac(year):
    """Return the Chinese zodiac animal for the Gregorian ``year``.

    Only the calendar year is considered; a birth date that falls before the
    lunar new year is not shifted to the previous animal.
    """
    return _ZODIAC_ANIMALS[year % 12]


def get_constellation(month, date):
    """Return the Western zodiac sign for a birth ``month`` and day ``date``.

    Args:
        month: month number, 1-12.
        date: day of the month.

    Returns:
        The sign name in Traditional Chinese, without the "座" suffix.
    """
    # dates[m - 1] is the day in month m on which the next sign begins.
    dates = (21, 20, 21, 21, 22, 22, 23, 24, 24, 24, 23, 22)
    # Capricorn appears at both ends because it spans December and January.
    constellations = ("摩羯", "水瓶", "雙魚", "白羊", "金牛", "雙子", "巨蟹",
                      "獅子", "處女", "天秤", "天蠍", "射手", "摩羯")
    if date < dates[month - 1]:
        return constellations[month - 1]
    return constellations[month]


def citydict():
    """Load the region lookup table from ``city.csv``.

    Each usable line has the form ``code,name``.  ``loadData`` looks the
    table up with both the 2-character and the 6-character prefix of an
    ID-card number.

    Returns:
        dict mapping region code (str) to region name (str).
    """
    citys = {}
    # NOTE(review): assumes city.csv is UTF-8 (with or without BOM) - confirm.
    with open("city.csv", encoding="utf-8-sig", newline="") as file:
        for record in csv.reader(file):
            # Skip blank lines and lines without a name column; the original
            # ``line == ","`` test never matched because of the trailing
            # newline, and a blank line raised IndexError.
            if len(record) < 2:
                continue
            code = record[0].strip()
            if not code:
                continue
            citys[code] = record[1].strip()
    return citys


def _id_card_fields(id_card, citys, reference_year):
    """Derive the region and birth columns from one ID-card number."""
    province = id_card[0:2]   # first two characters
    city = id_card[0:6]       # first six characters
    year = id_card[6:10]
    month = id_card[10:12]
    day = id_card[12:14]
    return {
        'province': province,
        'city': city,
        'provincename': citys.get(province),
        'cityname': citys.get(city),
        'birthday': year + '-' + month + '-' + day,
        'age': reference_year - int(year),
        'zoo': chinese_zodiac(int(year)),
        'star': get_constellation(int(month), int(day)),
    }


def loadData(reference_year=2018):
    """Convert ``jifenluohu.json`` into an enriched ``jifenluohu.csv``.

    Reads the list stored under the top-level ``"rows"`` key of
    ``jifenluohu.json`` and writes one CSV line per record.  Besides the
    seven original columns, each line carries the region codes and names,
    birthday, age, Chinese zodiac animal and Western zodiac sign derived
    from the record's ``idCard`` value.

    Args:
        reference_year: year against which ``age`` is computed
            (``reference_year - birth year``).  Defaults to 2018, the value
            that was previously hard-coded.

    Returns:
        None.  The result is the file written to disk.

    Raises:
        FileNotFoundError: if ``jifenluohu.json`` or ``city.csv`` is missing.
        KeyError: if ``"rows"`` or one of the seven base columns is missing.
        ValueError: if the birth-date part of an ``idCard`` is not numeric.
    """
    with open('jifenluohu.json', 'r', encoding='utf-8') as f:
        rows = json.load(f)['rows']

    # Load the lookup table before opening the output, so a missing city.csv
    # does not leave a header-only CSV behind.
    citys = citydict()

    # newline='' is required by the csv module; without it every record is
    # followed by an empty line on platforms that translate "\n" to "\r\n".
    with open("jifenluohu.csv", "w", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=_BASE_FIELDS + _DERIVED_FIELDS)
        writer.writeheader()  # header line
        for row in rows:
            newrow = {name: row[name] for name in _BASE_FIELDS}
            newrow["unit"] = row["unit"].strip()
            newrow.update(_id_card_fields(row["idCard"], citys, reference_year))
            writer.writerow(newrow)
    print("寫csv完成")


if __name__ == "__main__":
    # Guarded so that importing this module does not touch the file system.
    # loadData() returns nothing, so there is no result worth printing.
    loadData()
資料包,以及用pyspark分析過程下載
百度雲