1. 程式人生 > >用spark分析北京積分落戶資料,整理北京積分落戶名單資料成csv格式

用spark分析北京積分落戶資料,整理北京積分落戶名單資料成csv格式

讀取json檔案格式資料,整理匯出成csv格式

import json,csv

# Load the data
def loadData():
    """Convert ``jifenluohu.json`` into ``jifenluohu.csv``.

    Reads the JSON document from the current working directory, takes the
    list stored under its ``"rows"`` key and writes one CSV line per entry
    with the columns pxid, id, idCard, name, score, unit and ranking.
    Any other keys present on a row are ignored.

    Returns:
        None. A completion message is printed once the file is written.

    Raises:
        KeyError: if the document has no ``"rows"`` key or a row lacks one
            of the exported columns.
    """
    fieldnames = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]    # CSV column names

    # utf-8-sig reads plain UTF-8 as well as UTF-8 with a BOM, independent of
    # the platform's locale encoding.
    with open('jifenluohu.json', 'r', encoding='utf-8-sig') as f:
        rows = json.load(f)['rows']

    # newline="" is required by the csv module; without it each row ends in
    # "\r\r\n" on Windows. The explicit encoding keeps the Chinese text
    # portable across platforms.
    with open("jifenluohu.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()    # header row
        for row in rows:
            # Copy only the exported columns; row[key] raises KeyError when
            # a column is missing, so incomplete data is not hidden.
            writer.writerow({key: row[key] for key in fieldnames})
    print("寫csv完成")
 
# Run the export. loadData() has no return statement, so t is None.
t = loadData()

# Prints "None"; the export's own completion message is printed inside loadData().
print(t)

後來又增加了年齡、生肖、星座、省份、城市等屬性,為後續進一步分析做準備。

import json,csv
from datetime import datetime

# Look up the Chinese zodiac animal for a birth year
def chinese_zodiac(year):
    """Return the zodiac animal character for ``year``.

    The animals are stored in a 12-character string and selected with
    ``year % 12``, so a remainder of 0 maps to the first character.
    """
    animals = '猴雞狗豬鼠牛虎兔龍蛇馬羊'
    position = year % 12
    return animals[position]
    
# Look up the zodiac sign (constellation) for a birth month and day
def get_constellation(month, date):
    """Return the name of the sign for the given month and day of month.

    Each month has a cutoff day: a day before the cutoff gets the sign at
    index ``month - 1``, any other day gets the sign at index ``month``.
    The sign table has 13 entries so December can roll over to the first
    sign again.

    ``month`` is expected to be 1..12. Zero or negative values are not
    rejected (they wrap around through negative indexing) and values
    above 12 raise IndexError.
    """
    cutoff_days = (21, 20, 21, 21, 22, 22, 23, 24, 24, 24, 23, 22)
    signs = ("摩羯", "水瓶", "雙魚", "白羊", "金牛", "雙子", "巨蟹", "獅子", "處女", "天秤", "天蠍", "射手", "摩羯")
    index = month - 1 if date < cutoff_days[month - 1] else month
    return signs[index]

    
# Area-code lookup table
def citydict():
    """Build a code -> name dictionary from ``city.csv``.

    Each usable line has the form ``code,name``; only the first two
    comma-separated fields are read. Blank lines, lines with fewer than two
    fields and lines whose code is empty (such as a bare ``,``) are skipped.
    When a code occurs more than once the last occurrence wins.

    Returns:
        dict mapping the first field of each line to the second.
    """
    citys = {}
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise become part of the first key.
    # NOTE(review): assumes city.csv is UTF-8 encoded — confirm against the data file.
    with open("city.csv", encoding="utf-8-sig") as file:
        for line in file:
            # Remove the line terminator before splitting, so the empty-line
            # checks below work and no "\r" or "\n" ends up in a name.
            fields = line.rstrip("\r\n").split(",")
            if len(fields) < 2 or not fields[0]:
                continue
            citys[fields[0]] = fields[1]
    return citys


# Load the data
def loadData(reference_year=2018):
    """Convert ``jifenluohu.json`` into an enriched ``jifenluohu.csv``.

    Reads the list stored under the ``"rows"`` key of the JSON document and
    writes one CSV line per entry. Besides the original columns (pxid, id,
    idCard, name, score, unit, ranking) each line carries values derived
    from the idCard string: province and city codes, their names looked up
    through citydict(), birthday, age, Chinese zodiac animal and zodiac
    sign. The slices used assume a 6-character area code followed by the
    birth date as YYYYMMDD — confirm against the data.

    Args:
        reference_year: year used to compute ``age`` as
            ``reference_year - birth year``. Defaults to 2018, the value
            that was previously hard-coded.

    Returns:
        None. A completion message is printed once the file is written.

    Raises:
        KeyError: if ``"rows"`` or one of the original columns is missing.
        ValueError: if the year, month or day part of idCard is not numeric.
    """
    base_fields = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]
    derived_fields = ["province", "city", "provincename", "cityname", "birthday", "age", "zoo", "star"]

    # utf-8-sig reads plain UTF-8 as well as UTF-8 with a BOM, independent of
    # the platform's locale encoding.
    with open('jifenluohu.json', 'r', encoding='utf-8-sig') as f:
        rows = json.load(f)['rows']

    # Load the lookup table before opening the output file, so a failure in
    # citydict() does not leave a truncated jifenluohu.csv behind.
    citys = citydict()

    # newline="" is required by the csv module; without it each row ends in
    # "\r\r\n" on Windows.
    with open("jifenluohu.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=base_fields + derived_fields)
        writer.writeheader()    # header row
        for row in rows:
            idCard = row["idCard"]
            province = idCard[0:2]
            city = idCard[0:6]
            year = idCard[6:10]
            month = idCard[10:12]
            day = idCard[12:14]

            newrow = {key: row[key] for key in base_fields}
            newrow["unit"] = row["unit"].strip()
            newrow.update({
                'province': province,
                'city': city,
                # dict.get returns None for unknown codes; csv writes that as
                # an empty cell.
                'provincename': citys.get(province),
                'cityname': citys.get(city),
                'birthday': year + '-' + month + '-' + day,
                'age': reference_year - int(year),
                'zoo': chinese_zodiac(int(year)),
                'star': get_constellation(int(month), int(day)),
            })
            writer.writerow(newrow)
    print("寫csv完成")


# Run the export. loadData() has no return statement, so t is None.
t = loadData()
# Presumably left over from checking the lookup table on its own:
#t = citydict()
# Prints "None"; the export's own completion message is printed inside loadData().
print(t)

資料包以及用 pyspark 進行分析的過程,可從以下連結下載:

百度雲