資料分析:2012聯邦選舉委員會資料庫
阿新 • • 發佈:2019-02-13
# -*- coding: utf-8 -*-
"""Explore the 2012 Federal Election Commission contribution data.

Loads ``fec/P00000001-ALL.csv`` (path relative to the working directory),
tags every contribution with the candidate's party, normalizes a few
occupation/employer labels, and then builds several aggregates:

* ``over_2mm``     - totals by occupation and party (rows above 2,000,000)
* ``normed_sums``  - per-candidate share of each donation-size bucket
* ``percent``      - per-candidate share of donations in each state

The commented-out ``print``/``plot`` lines are optional inspection points.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

fec = pd.read_csv('fec/P00000001-ALL.csv')
# print(fec)
# print(fec.iloc[123456])

# --- Add party affiliation -------------------------------------------------
unique_cands = fec.cand_nm.unique()
# print(unique_cands)

parties = {
    'Bachmann, Michelle': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Romney, Mitt': 'Republican',
    'Santorum, Rick': 'Republican',
}

# Series.map translates values through a mapping; it accepts a function,
# a dict or a Series.
# print(fec.cand_nm[123456:123461].map(parties))
fec['party'] = fec.cand_nm.map(parties)
# print(fec['party'].value_counts())

# The data also contains refunds (non-positive amounts).
# print((fec.contb_receipt_amt > 0).value_counts())

# To keep the analysis simple, restrict it to positive contributions.
# .copy() makes the filtered frame independent so the column assignments
# below do not raise SettingWithCopyWarning.
fec = fec[fec.contb_receipt_amt > 0].copy()
# print(fec)

# --- Donation statistics by occupation and employer ------------------------
# print(fec.contbr_occupation.value_counts()[:10])

occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.': 'CEO',
}

# Fall back to the original value x when no mapping is provided.
f = lambda x: occ_mapping.get(x, x)
fec['contbr_occupation'] = fec['contbr_occupation'].map(f)

# Same treatment for the employer column.
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    # Was 'self-EMPLOYED'; upper-cased to match every other label in these
    # mappings (presumably the data's own spelling - verify against the CSV).
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}

# Fall back to the original value x when no mapping is provided.
f = lambda x: emp_mapping.get(x, x)
fec['contbr_employer'] = fec['contbr_employer'].map(f)

# Obama and Romney are the two main candidates. The subset is taken after
# the label clean-up above so the per-candidate rankings below group on the
# normalized occupation/employer values.
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
# print(fec_mrbo)

# Aggregate by occupation and party, then keep only the occupations whose
# total across parties exceeds 2,000,000.
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party',
                                aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
# print(over_2mm)

# Horizontal bar chart
# over_2mm.plot(kind='barh')


# --- Top occupations and employers donating to Obama and Romney ------------
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest ``contb_receipt_amt`` totals within ``group``.

    Parameters
    ----------
    group : DataFrame
        Contributions to summarize; must contain ``contb_receipt_amt`` and
        the column(s) named by ``key``.
    key : str or list of str
        Column(s) to group by before summing.
    n : int, default 5
        Number of top entries to return.

    Returns
    -------
    Series
        Summed amounts indexed by ``key``, sorted in descending order and
        limited to the first ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # nlargest sorts descending and keeps the first n rows. The previous
    # ``[n:]`` slice discarded exactly the rows that were wanted.
    return totals.nlargest(n)


# Aggregate per candidate by occupation and by employer.
grouped = fec_mrbo.groupby('cand_nm')
# print(grouped.apply(get_top_amounts, 'contbr_occupation', n=7))
# print(grouped.apply(get_top_amounts, 'contbr_employer', n=10))

# --- Bucket the donation amounts -------------------------------------------
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
# print(labels)

# Group by candidate name and bucket label.
grouped = fec_mrbo.groupby(['cand_nm', labels])
# print(grouped.size().unstack(0))

# Normalize within each bucket: each row becomes the candidates' shares of
# that bucket's total.
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
# print(bucket_sums)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
# print(normed_sums)

# The largest buckets are not donations by individuals, so leave them out.
# normed_sums[:-2].plot(kind='barh', stacked=True)

# --- Donation statistics by state ------------------------------------------
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
# Keep only the states whose combined total exceeds 100,000.
totals = totals[totals.sum(axis=1) > 100000]
# print(totals[:10])

# Each candidate's share of the donations within a state.
percent = totals.div(totals.sum(axis=1), axis=0)
# print(percent[:10])