1. 程式人生 > 資料分析——pandas

資料分析——pandas

  1 class Titanic(object):
  2     def __init__(self):
  3         self.data = titanic
  4 
  5     # 1.存活率是多少
  6     def rate_survive(self):
  7         survived = self.data.loc[:, 'Survived'].value_counts()[1]
  8         death = self.data.loc[:, 'Survived'].value_counts()[0]
  9         rate = float(survived) / (float(death) + float(survived))
10 print '總人數:{},存活人數:{},死亡人數:{}'.format(survived + death, survived, death) 11 return u'存活率:' + '%.2f' % rate 12 13 # 2.哪個年齡段存活率最高 14 def max_survive(self): 15 age18_survived = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[1] 16 age18_death = self.data[self.data[u'
Age'] <= 18][u'Survived'].value_counts()[0] 17 age18_rate = float(age18_survived) / (float(age18_survived) + float(age18_death)) 18 19 age1860_survived = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[1] 20 age1860_death = self.data[(self.data[u'
Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[0] 21 age1860_rate = float(age1860_survived) / (float(age1860_survived) + float(age1860_death)) 22 23 age60_survived = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[1] 24 age60_death = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[0] 25 age60_rate = float(age60_survived) / (float(age60_survived) + float(age60_death)) 26 27 rate = [age18_rate, age60_rate, age1860_rate] 28 age_data = ['18歲以下', '18-60歲', '60歲以上'] 29 max_rate = max(rate) 30 age_range = age_data[rate.index(max(rate))] 31 return '存活率最高的年齡段是{},存活率為{}'.format(age_range, max_rate) 32 33 # 3.女性存活率是否高於男性 34 def than_survive(self): 35 male_survied = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[1] 36 male_death = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[0] 37 rate_male = float(male_survied) / (float(male_survied) + float(male_death)) 38 print '男性共有{}人,存活{}人,死亡{}人'.format(male_death + male_survied, male_survied, male_death) 39 female_survied = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[1] 40 female_death = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[0] 41 rate_female = float(female_survied) / (float(female_survied) + float(female_death)) 42 print '女性共有{}人,存活{}人,死亡{}人'.format(female_death + female_survied, female_survied, female_death) 43 if rate_male > rate_female: 44 return u'男性存活率更高,存活率為:%.2f' % rate_male 45 else: 46 return u'女性存活率更高,存活率為:%.2f' % rate_female 47 48 # 4.船上是否出現貧富差距 49 def poor_wealth(self): 50 max_wealth = self.data[u'Fare'].max() 51 max_poor = self.data[u'Fare'].min() 52 if max_wealth - max_poor > 500: 53 return '船上乘客最多消費了{},最少消費了{},存在貧富差距'.format(max_wealth, max_poor) 54 else: 55 return '船上乘客最多消費了{},最少消費了{},不存在貧富差距'.format(max_wealth, max_poor) 56 57 # 5.頭等艙乘客的存活率是否高於經濟艙 58 def pclass_survive(self): 59 pclass1_survived = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[1] 60 
pclass1_death = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[0] 61 pclass1_rate = float(pclass1_survived) / (float(pclass1_survived) + float(pclass1_death)) 62 63 pclass3_survived = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[1] 64 pclass3_death = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[0] 65 pclass3_rate = float(pclass3_survived) / (float(pclass3_survived) + float(pclass3_death)) 66 67 if pclass3_rate > pclass1_rate: 68 return '頭等艙乘客存活率更高,存活率為{}'.format(pclass3_rate) 69 else: 70 return '經濟艙乘客存活率更高,存活率為{}'.format(pclass1_rate) 71 72 # 6.有親屬在船上乘客比率,有親屬是否會影響存活率 73 def family_survive(self): 74 has_family = self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'PassengerId'].count() 75 no_family = self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'PassengerId'].count() 76 rate_family = float(has_family) / (float(has_family) + float(no_family)) 77 78 has_family_survived = \ 79 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[1] 80 has_family_death = \ 81 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[0] 82 has_family_rate = float(has_family_survived) / (float(has_family_survived) + float(has_family_death)) 83 84 no_family_survived = \ 85 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[1] 86 no_family_death = \ 87 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[0] 88 no_family_rate = float(no_family_survived) / (float(no_family_survived) + float(no_family_death)) 89 90 print '船上乘客中有親屬也在船上的有{}人,無親屬在船上的有{}人,有親屬在船上的乘客的比率為{}'.format(has_family, no_family, rate_family) 91 if has_family_rate > no_family_rate: 92 return '有親屬在船上的乘客存活率更高,存活率為{}'.format(has_family_rate) 93 else: 94 return '無親屬在船上的乘客存活率更高,存活率為{}'.format(no_family_rate) 95 96 # 7.從哪個港口登船是否影響獲救 97 def emarked_survive(self): 98 
Embarked_S_survived = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[1] 99 Embarked_S_death = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[0] 100 Embarked_S_rate = float(Embarked_S_survived) / (float(Embarked_S_survived) + float(Embarked_S_death)) 101 102 Embarked_C_survived = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[1] 103 Embarked_C_death = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[0] 104 Embarked_C_rate = float(Embarked_C_survived) / (float(Embarked_C_survived) + float(Embarked_C_death)) 105 106 Embarked_Q_survived = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[1] 107 Embarked_Q_death = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[0] 108 Embarked_Q_rate = float(Embarked_Q_survived) / (float(Embarked_Q_survived) + float(Embarked_Q_death)) 109 110 embarked = ['S港口', 'C港口', 'Q港口'] 111 rate = [Embarked_S_rate, Embarked_C_rate, Embarked_Q_rate] 112 max_rate = max(rate) 113 return '{}存活率最大,為{}'.format(embarked[rate.index(max_rate)], max_rate) 114 115 # 8.不同年齡段女性的獲救率 116 def female_survive(self): 117 female18_survived = \ 118 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 119 female18_death = \ 120 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 121 female18_rate = float(female18_survived) / (float(female18_survived) + float(female18_death)) 122 123 female1850_survived = \ 124 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 125 u'Survived'].value_counts()[1] 126 female1850_death = \ 127 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 128 u'Survived'].value_counts()[0] 129 female1850_rate = float(female1850_survived) / (float(female1850_survived) + float(female1850_death)) 130 131 female50_survived = \ 132 
self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 133 female50_death = \ 134 self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 135 female50_rate = float(female50_survived) / (float(female50_survived) + float(female50_death)) 136 137 return '18歲以下女性存活率:{},18-50歲女性存活率:{},50歲以上女性存活率:{}'.format(female18_rate, female1850_rate, female50_rate) 138 139 140 if __name__ == '__main__': 141 tt = Titanic() 142 # print tt.rate_survive() 143 # print tt.than_survive() 144 # print tt.max_survive() 145 # print tt.poor_wealth() 146 # print tt.pclass_survive() 147 # print tt.family_survive() 148 # print tt.emarked_survive() 149 print tt.female_survive()