1. 程式人生 > >資料分析:2012聯邦選舉委員會資料庫

資料分析:2012聯邦選舉委員會資料庫

# -*- coding: utf-8 -*-

import pandas as pd
from pandas import DataFrame, Series

# Load the raw contribution records; everything below works on this frame.
fec = pd.read_csv('fec/P00000001-ALL.csv')
# print(fec)
# print(fec.iloc[123456])

# --- Add a party column ---
# Distinct candidate names present in the data.
unique_cands = fec['cand_nm'].unique()
# print(unique_cands)

# Candidate name -> party affiliation.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Romney, Mitt': 'Republican',
    'Santorum, Rick': 'Republican',
}

# Series.map accepts a function, a dict or a Series as the correspondence;
# candidate names missing from `parties` end up as NaN.
# print(fec.cand_nm[12456:123461].map(parties))
fec['party'] = fec['cand_nm'].map(parties)
# print(fec['party'].value_counts())

# Note: the data contains refunds (non-positive contribution amounts).
# print((fec.contb_receipt_amt > 0).value_counts())

# To simplify the analysis, keep only positive contributions.  The explicit
# copy makes `fec` an independent frame, so the column assignments performed
# further down do not trigger pandas' SettingWithCopyWarning.
fec = fec[fec.contb_receipt_amt > 0].copy()
# print(fec)

# Obama and Romney are the two main candidates; keep a subset with only them.
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
# print(fec_mrbo)


# --- Donation statistics by occupation and employer ---
# Many spellings describe the same occupation/employer; collapse the known
# variants onto one canonical label before computing totals.
# print(fec.contbr_occupation.value_counts()[:10])
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.': 'CEO',
}
# Values without an entry in the mapping are returned unchanged.
f = lambda x: occ_mapping.get(x, x)
# Bracket assignment is used on purpose: attribute-style assignment would
# silently create a plain attribute if the column did not exist.
fec['contbr_occupation'] = fec['contbr_occupation'].map(f)

# Same treatment for the employer column.
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    # Canonical label is all upper case like every other label here; the
    # previous 'self-EMPLOYED' spelling would have produced a category that
    # never merges with rows already reading 'SELF-EMPLOYED'.
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}
# Values without an entry in the mapping are returned unchanged.
f = lambda x: emp_mapping.get(x, x)
fec['contbr_employer'] = fec['contbr_employer'].map(f)
# NOTE(review): fec_mrbo was derived before this normalisation, so it still
# carries the raw occupation/employer strings - confirm that is intended.

# Total contribution amount per occupation (rows) and party (columns), then
# keep only occupations whose combined total exceeds 2 million dollars.
# `index=` / `columns=` replace the `rows=` / `cols=` keywords, which no
# longer exist in pandas.
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party', aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
# print(over_2mm)

# Horizontal bar chart of the result.
# over_2mm.plot(kind='barh')

# Occupations and employers with the highest total contributions
# (applied per candidate for Obama and Romney).
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals of ``group`` by ``key``.

    Parameters
    ----------
    group : DataFrame
        Records containing a ``contb_receipt_amt`` column and the ``key``
        column(s).
    key : str or list of str
        Column name(s) to group by.
    n : int, default 5
        Number of top entries to keep.

    Returns
    -------
    Series
        ``contb_receipt_amt`` sums indexed by ``key``, sorted in descending
        order and truncated to the first ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()

    # Sort descending and keep the FIRST n rows.  ``Series.order`` no longer
    # exists in pandas, and the former ``[n:]`` slice dropped the top n
    # entries instead of returning them.
    return totals.sort_values(ascending=False).iloc[:n]
    
# Group the two-candidate subset by candidate name so that the top
# occupations / employers can be computed per candidate.
grouped = fec_mrbo.groupby(by='cand_nm')
# print(grouped.apply(get_top_amounts, 'contbr_occupation', n=7))
# print(grouped.apply(get_top_amounts, 'contbr_employer', n=10))

# --- Bucket the contribution amounts ---
# Bin edges in powers of ten.  A plain list is passed to pd.cut because
# numpy is never imported in this script (the former `np.array(...)` call
# raised NameError); pd.cut accepts any sequence of scalars as bin edges.
bins = [0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
# print(labels)

# Group by candidate name and by bucket label.
grouped = fec_mrbo.groupby(['cand_nm', labels])
# print(grouped.size().unstack(0))

# Total amount per bucket, one column per candidate.
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
# print(bucket_sums)
# Normalise within each bucket: divide every row by its row total so the
# candidates' shares in a bucket add up to 1.
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
# print(normed_sums)

# The two largest buckets are left out of the plot (per the original notes
# they are not donations by individuals).
# normed_sums[:-2].plot(kind='barh', stacked=True)

# --- Donation statistics by state ---
# Sum contributions per (candidate, state), pivot candidates into columns,
# and treat missing candidate/state combinations as 0.
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped['contb_receipt_amt'].sum().unstack(level=0).fillna(0)
# Keep only states whose combined total exceeds 100,000.
totals = totals.loc[totals.sum(axis=1) > 100000]
# print(totals[:10])

# Share of each state's total that went to each candidate.
percent = totals.div(totals.sum(axis=1), axis=0)
# print(percent[:10])