【python資料探勘課程】十二.Pandas、Matplotlib結合SQL語句對比圖分析
一. 直方圖四圖對比
資料庫如下所示,包括URL、作者、標題、摘要、日期、閱讀量和評論數等。
執行結果如下所示,其中繪製多個圖的核心程式碼為:
# Select position 1 of a 2x2 subplot grid and keep a handle to its axes.
p1 = plt.subplot(221)
# Draw the bars (x positions `ind`, heights `num1`) in blue.
plt.bar(ind, num1, width, color='b', label='sum num')
# Make p1 the current axes for subsequent pyplot calls.
plt.sca(p1)
完整程式碼如下:
# coding=utf-8
"""Plot monthly row counts of the MySQL table ``csdn_blog`` as four bar charts.

The counting is done in SQL.  One panel is drawn for each of the years
2014, 2015 and 2016 and one for all years combined; the 2x2 figure is
saved to ``ttt.png`` and then shown.
"""
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import pylab
import MySQLdb
from pylab import *

N = 12          # one bar per calendar month
WIDTH = 0.35    # bar width

# (subplot position, year or None for all years, bar color, panel title)
PANELS = [
    (221, 2014, 'b', '2014 Number-12Month'),
    (222, 2015, 'r', '2015 Number-12Month'),
    (223, 2016, 'g', '2016 Number-12Month'),
    (224, None, 'y', 'All Year Number-12Month'),
]


def _monthly_counts(cur, year=None):
    """Return ``(months, counts)`` covering all 12 calendar months.

    cur  -- an open database cursor.
    year -- restrict the count to this year of ``FBTime``; ``None`` counts
            every row of ``csdn_blog``.

    The query returns one row per month that actually has data, in no
    guaranteed order.  Months missing from the result are reported as 0 so
    that ``counts`` always has exactly 12 entries, aligned with
    ``months == [1, ..., 12]``.
    """
    if year is None:
        cur.execute('''select MONTH(FBTime) as mm, count(*) as cnt
                       from csdn_blog group by mm;''')
    else:
        # The year is passed as a query parameter instead of being pasted
        # into the SQL text.
        cur.execute('''select MONTH(FBTime) as mm, count(*) as cnt
                       from csdn_blog where YEAR(FBTime) = %s
                       group by mm;''', (year,))
    per_month = dict((row[0], row[1]) for row in cur.fetchall())
    months = list(range(1, N + 1))
    counts = [per_month.get(month, 0) for month in months]
    return months, counts


def _draw_month_panel(position, months, counts, color, title):
    """Draw one bar-chart panel at subplot ``position`` of the 2x2 grid."""
    ind = np.arange(N)                      # x positions 0..11
    axes = plt.subplot(position)
    plt.bar(ind, counts, WIDTH, color=color, label='sum num')
    # Month numbers as tick labels, rotated 40 degrees.
    plt.xticks(ind + WIDTH / 2, months, rotation=40)
    # Write each count above its bar, rotated 45 degrees.
    for i, count in enumerate(counts):
        plt.text(i, count, str(count), ha='center', va='bottom', rotation=45)
    plt.title(title)
    # Explicitly re-select this panel as the current axes.
    plt.sca(axes)


# Start as None so the cleanup below is safe even if connect() fails.
conn = None
cur = None
try:
    # Connection settings are hard-coded for a local test database.
    conn = MySQLdb.connect(host='localhost', user='root',
                           passwd='123456', port=3306, db='test01')
    cur = conn.cursor()
    # Avoids "UnicodeEncodeError: 'latin-1' codec can't encode character".
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    for position, year, color, title in PANELS:
        months, counts = _monthly_counts(cur, year)
        print(months)
        print(counts)
        _draw_month_panel(position, months, counts, color, title)

    plt.savefig('ttt.png', dpi=400)
    plt.show()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.commit()
        conn.close()
二. Area Plot圖對比
執行效果如下所示,核心程式碼如下:
# Combine the four count lists into one 2-D array (one row per list).
data = np.array([num1, num2, num3, num4])
d = data.T #transpose -> 12*4
# One row per entry of hour1 (x axis), one labelled column per series.
df = DataFrame(d, index=hour1, columns=['All','2014', '2015', '2016'])
df.plot(kind='area', alpha=0.2) #area plot; alpha sets the fill transparency
plt.savefig('csdn.png',dpi=400)
plt.show()
其中需要先將num1~num4合併為一個[4,12]的array,再轉置為[12,4]後繪圖。index用於設定X軸的月份,columns用於設定每一欄(column)資料的名稱。kind='area'表示繪製Area Plot圖,此外還可以設定為'bar'(柱狀圖)、'barh'(橫向柱狀圖)等。
該圖會將資料劃分為等級梯度,基本趨勢相同。
完整程式碼如下所示:
# coding=utf-8
"""Plot monthly row counts of the MySQL table ``csdn_blog`` as an area plot.

The counting is done in SQL.  Four series are drawn: all years combined,
2014, 2015 and 2016.  The figure is saved to ``csdn.png`` and then shown.

By:Eastmount CSDN
"""
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import MySQLdb
from pandas import *

MONTHS = list(range(1, 13))     # calendar months 1..12, used as the x axis


def _monthly_counts(cur, year=None):
    """Return a 12-element list of row counts, one per calendar month.

    cur  -- an open database cursor.
    year -- restrict the count to this year of ``FBTime``; ``None`` counts
            every row of ``csdn_blog``.

    The query returns one row per month that actually has data, in no
    guaranteed order.  Months missing from the result are reported as 0, so
    every series has the same length and can be stacked into one array.
    """
    if year is None:
        cur.execute('''select MONTH(FBTime) as mm, count(*) as cnt
                       from csdn_blog group by mm;''')
    else:
        # The year is passed as a query parameter instead of being pasted
        # into the SQL text.
        cur.execute('''select MONTH(FBTime) as mm, count(*) as cnt
                       from csdn_blog where YEAR(FBTime) = %s
                       group by mm;''', (year,))
    per_month = dict((row[0], row[1]) for row in cur.fetchall())
    return [per_month.get(month, 0) for month in MONTHS]


# Start as None so the cleanup below is safe even if connect() fails.
conn = None
cur = None
try:
    # Connection settings are hard-coded for a local test database.
    conn = MySQLdb.connect(host='localhost', user='root',
                           passwd='123456', port=3306, db='test01')
    cur = conn.cursor()
    # Avoids "UnicodeEncodeError: 'latin-1' codec can't encode character".
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    # All years combined.
    num1 = _monthly_counts(cur)
    print(MONTHS)
    print(num1)

    # One series per year.
    num2 = _monthly_counts(cur, 2014)
    print(num2)
    num3 = _monthly_counts(cur, 2015)
    print(num3)
    num4 = _monthly_counts(cur, 2016)
    print(num4)

    # Key step: stack the four series (4 x 12), then transpose to 12 x 4 so
    # that each row is a month and each column is a series.
    data = np.array([num1, num2, num3, num4])
    print(data)
    d = data.T
    print(d)
    df = pd.DataFrame(d, index=MONTHS, columns=['All', '2014', '2015', '2016'])
    df.plot(kind='area', alpha=0.2)     # alpha sets the fill transparency
    plt.title('Area Plot Blog-Month')
    plt.savefig('csdn.png', dpi=400)
    plt.show()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.commit()
        conn.close()
三. MySQL語句獲取星期資訊
MySQL通過日期獲取星期的語句如下:
-- Return the current timestamp together with the name of its weekday.
-- The CASE maps the dayofweek() value to a label: 1 is Sunday, 7 is Saturday.
select now(), case dayofweek(now())
when 1 then '星期日'
when 2 then '星期一'
when 3 then '星期二'
when 4 then '星期三'
when 5 then '星期四'
when 6 then '星期五'
when 7 then '星期六' end as 'week'
from dual;
輸出如下圖所示。Python對應的程式碼如下,用於獲取所有部落格按星期的分佈:
# coding=utf-8
"""Plot the row counts of the MySQL table ``csdn_blog`` per day of the week.

The counting is done in SQL with one COUNT(CASE ...) column per weekday.
The bar chart is saved to ``01csdn.png`` and then shown.

By:Eastmount CSDN
"""
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import MySQLdb
from pandas import *

# A single result row with seven columns, Sunday (dayofweek 1) first.
WEEKDAY_SQL = '''select
COUNT(case dayofweek(FBTime) when 1 then 1 end) AS '星期日',
COUNT(case dayofweek(FBTime) when 2 then 1 end) AS '星期一',
COUNT(case dayofweek(FBTime) when 3 then 1 end) AS '星期二',
COUNT(case dayofweek(FBTime) when 4 then 1 end) AS '星期三',
COUNT(case dayofweek(FBTime) when 5 then 1 end) AS '星期四',
COUNT(case dayofweek(FBTime) when 6 then 1 end) AS '星期五',
COUNT(case dayofweek(FBTime) when 7 then 1 end) AS '星期六'
from csdn_blog;
'''

# Start as None so the cleanup below is safe even if connect() fails.
conn = None
cur = None
try:
    # Connection settings are hard-coded for a local test database.
    conn = MySQLdb.connect(host='localhost', user='root',
                           passwd='123456', port=3306, db='test01')
    cur = conn.cursor()
    # Avoids "UnicodeEncodeError: 'latin-1' codec can't encode character".
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    cur.execute(WEEKDAY_SQL)
    result = cur.fetchall()
    print(result)
    # Sample output:
    # ((31704L, 43081L, 42670L, 43550L, 41270L, 39164L, 29931L),)
    name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
            'Thursday', 'Friday', 'Saturday']

    # The single result row becomes a 1x7 array; transposing gives the 7x1
    # column that DataFrame needs (one row per weekday).
    data = np.array(result)
    print(data)
    d = data.T
    print(d)

    matplotlib.style.use('ggplot')
    df = pd.DataFrame(d, index=name, columns=['Nums'])
    df.plot(kind='bar')
    plt.title('All Year Blog-Week')
    plt.xlabel('Week')
    plt.ylabel('The number of blog')
    plt.savefig('01csdn.png', dpi=400)
    plt.show()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.commit()
        conn.close()
執行結果如下所示:
四. 星期資料柱狀圖及折線圖對比
下面獲取2008、2011、2016三年的資料進行對比,程式碼如下所示:
核心程式碼如下,注意三個一維陣列轉換為num[7][3]二維陣列的方法。
# d1, d2 and d3 hold the seven weekday counts for 2008, 2011 and 2016.
# Placing them side by side as columns yields the 7x3 array in one call,
# the same result as copying cell by cell into a preallocated 7x3 buffer.
data = np.column_stack([d1, d2, d3])
print(data)
matplotlib.style.use('ggplot')
# 7x3 data: rows are labelled with the weekday names, columns with the years.
df=DataFrame(data, index=name, columns=['2008','2011','2016'])
df.plot(kind='bar')
plt.show()
完整程式碼為:
# coding=utf-8
"""Compare per-weekday row counts of ``csdn_blog`` for 2008, 2011 and 2016.

The counting is done in SQL with one COUNT(CASE ...) column per weekday.
The grouped bar chart is saved to ``03csdn.png`` and then shown.

By:Eastmount CSDN 楊秀璋
"""
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import MySQLdb
from pandas import *

# A single result row with seven columns, Sunday (dayofweek 1) first.
# The year is supplied as a query parameter (%s).
WEEKDAY_BY_YEAR_SQL = '''select
COUNT(case dayofweek(FBTime) when 1 then 1 end) AS '星期日',
COUNT(case dayofweek(FBTime) when 2 then 1 end) AS '星期一',
COUNT(case dayofweek(FBTime) when 3 then 1 end) AS '星期二',
COUNT(case dayofweek(FBTime) when 4 then 1 end) AS '星期三',
COUNT(case dayofweek(FBTime) when 5 then 1 end) AS '星期四',
COUNT(case dayofweek(FBTime) when 6 then 1 end) AS '星期五',
COUNT(case dayofweek(FBTime) when 7 then 1 end) AS '星期六'
from csdn_blog where YEAR(FBTime) = %s;
'''


def _weekday_counts(cur, year):
    """Return the seven weekday counts for ``year`` as a 7x1 NumPy array.

    cur  -- an open database cursor.
    year -- the year of ``FBTime`` to count.

    The query yields a single row of seven counts (a 1x7 array), which is
    transposed into a column so that several years can be placed side by
    side.
    """
    cur.execute(WEEKDAY_BY_YEAR_SQL, (year,))
    return np.array(cur.fetchall()).T


# Start as None so the cleanup below is safe even if connect() fails.
conn = None
cur = None
try:
    # Connection settings are hard-coded for a local test database.
    conn = MySQLdb.connect(host='localhost', user='root',
                           passwd='123456', port=3306, db='test01')
    cur = conn.cursor()
    # Avoids "UnicodeEncodeError: 'latin-1' codec can't encode character".
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
            'Thursday', 'Friday', 'Saturday']

    d1 = _weekday_counts(cur, 2008)
    print(d1)
    d2 = _weekday_counts(cur, 2011)
    print(d2)
    d3 = _weekday_counts(cur, 2016)
    print(d3)

    # Place the three 7x1 columns side by side to get the 7x3 array: the
    # same result as copying cell by cell into a preallocated 7x3 buffer.
    data = np.column_stack([d1, d2, d3])
    print(data)
    print(type(data))

    # Plot: rows are labelled with the weekday names, columns with the years.
    matplotlib.style.use('ggplot')
    df = pd.DataFrame(data, index=name, columns=['2008', '2011', '2016'])
    df.plot(kind='bar')
    plt.title('Comparison Chart Blog-Week')
    plt.xlabel('Week')
    plt.ylabel('The number of blog')
    plt.savefig('03csdn.png', dpi=400)
    plt.show()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.commit()
        conn.close()
其中將程式碼 "df.plot(kind='bar')" 修改為 "df.plot()" 即為折線圖。講到這裡,通過Pandas、Matplotlib、Numpy結合MySQL進行視覺化分析,以及進階的對比圖繪製方法就已經講完了,後面會結合SQL資料庫做一些詞雲WordCloud、顏色圖、Power-law圖等分析。
希望文章對你有所幫助,尤其是結合資料庫做資料分析的人。還是那句話,如果剛好需要這部分知識,你就會覺得非常有幫助,否則只是覺得好玩,這也是線上筆記的作用。如果文章中存在不足或錯誤的地方,還請海涵~