1. 程式人生 > 其它 >豌豆莢排行榜資料視覺化分析

豌豆莢排行榜資料視覺化分析

一、選題背景

  現如今,手機APP品類繁多,不同種類的軟體可以實現不同的功能。為此,本文以豌豆莢應用市場排行榜為對象,依據下載量分析出最受歡迎的軟體與遊戲。

二、爬蟲設計方案

爬蟲名稱:豌豆莢排行榜爬蟲

內容與資料特徵分析:請求豌豆莢排行榜網頁,從回傳的HTML中擷取軟體名、下載數量、軟體大小、分類與簡介等欄位。

設計方案:使用requests發送請求,以lxml的etree解析網頁,透過xpath獲取想要爬取的資料,最後以open()將結果寫入CSV檔案儲存。

技術難點:xpath格式轉換。

三、主題頁面結構特徵分析

頁面的結構特徵分析:內容導航型

HTML頁面分析:

標題、簡介:

下載數量:

軟體大小、分類:

節點查詢、遍歷:

查詢:

# XPath lookups for the coun-th <li> of the #j-top-list ranking list.
# `html` is the parsed page and `coun` the 1-based item index; both are
# defined in the full scraper listing further down.
sf_name = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/h2/a/text()".format( coun ))
sf_download = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun ))
sf_size = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun ))
sf_classify = html.xpath("//*[@id='j-top-list']/li[{}]/a[1]/text()".format( coun ))
sf_synopsis = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[2]/text()".format( coun ))

遍歷:通過迴圈進行提取

四、網路爬蟲程式設計

資料爬取與採集:

  1 import  requests
  2 from bs4 import BeautifulSoup
  3 import time
  4 import random
  5 import sys
  6 import re
  7 from tqdm import tqdm
  8 from lxml import
etree 9 10 USER_AGENTS = [ 11 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36' 12 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36' 13 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36' 14 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36' 15 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36' 16 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36' 17 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36' 18 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36' 19 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36' 20 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36' 21 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36' 22 'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3' 23 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0' 24 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0' 25 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' 26 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0' 27 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0' 28 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0' 29 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0' 30 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0' 31 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) 
Gecko/20130405 Firefox/22.0' 32 'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0' 33 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1' 34 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1' 35 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0' 36 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0' 37 ] 38 39 headers = { 40 'User-Agent':random.choice(USER_AGENTS), 41 # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', 42 'Connection':'keep-alive', 43 'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2' 44 } 45 46 47 # 軟體排行爬蟲 48 def Wdj_sf(): 49 # 建立檔案 50 file = open("sf_pop.csv", "a") 51 file.write( "sf_name" + "," + "sf_download" + "," + "sf_size" + "," + "sf_classify" + "," + "sf_synopsis" + '\n') 52 file = file.close() 53 # 請求訪問 54 url = 'https://www.wandoujia.com/top/app' 55 res = requests.get(url,headers=headers) 56 res.encoding = 'utf-8' 57 # soup = BeautifulSoup(res.text,'lxml') 58 # print(soup) 59 html = etree.HTML(res.text) 60 # print(html) 61 #獲取標籤內容:軟體名sf_name、下次次數sf_download、軟體大小sf_size、軟體分類sf_classify、軟體簡介sf_synopsis 62 coun = 1 63 for i in range(1,25): 64 try: 65 sf_name = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/h2/a/text()".format( coun )) 66 for i in sf_name: 67 sf_name = i 68 sf_download = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun )) 69 for i in sf_download: 70 sf_download = i.strip('萬億人下載') 71 sf_download = float(sf_download) 72 if sf_download > 100: 73 sf_download = sf_download/10000 74 sf_download = round(sf_download, 2) 75 sf_size = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun )) 76 for i in sf_size: 77 sf_size = i 78 sf_classify = html.xpath("//*[@id='j-top-list']/li[{}]/a[1]/text()".format( coun )) 79 for i in sf_classify: 80 
sf_classify = i 81 sf_synopsis = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[2]/text()".format( coun )) 82 for i in sf_synopsis: 83 sf_synopsis = i 84 coun += 1 85 # sum = sf_name + str(sf_download) + sf_size + sf_classify + sf_synopsis 86 # 儲存資料 87 print('軟體名:',sf_name,'\n','下載數量:',sf_download,'億人下載','\n','軟體大小:',sf_size,'\n','軟體分類:',sf_classify,'\n','簡介:',sf_synopsis) 88 print('————————————————————————————————————————————————') 89 with open('sf_pop.csv', "a", encoding='utf-8') as file1: 90 file1.writelines(sf_name + "," + str(sf_download) + "," + sf_size + "," + sf_classify + "," + sf_classify + '\n') 91 # print(sum) 92 except Exception: 93 print(Exception) 94 95 # 遊戲排行爬蟲 96 def Wdj_game(): 97 # 建立檔案 98 file = open("game_pop.csv", "a") 99 file.write( "game_name" + "," + "game_download" + "," + "game_size" + "," + "game_classify" + "," + "game_synopsis" + '\n') 100 file = file.close() 101 url = 'https://www.wandoujia.com/top/game' 102 res = requests.get(url,headers=headers) 103 res.encoding = 'utf-8' 104 # soup = BeautifulSoup(res.text,'lxml') 105 # print(soup) 106 html = etree.HTML(res.text) 107 coun = 1 108 for i in range(1,25): 109 try: 110 game_name = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/h2/a/text()".format( coun )) 111 for i in game_name: 112 game_name = i 113 game_download = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun )) 114 for i in game_download: 115 game_download = i.strip('萬億人下載') 116 game_download = float(game_download) 117 if game_download > 100: 118 game_download = game_download/10000 119 game_download = round(game_download, 2) 120 game_size = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun )) 121 for i in game_size: 122 game_size = i 123 game_classify = html.xpath("//*[@id='j-top-list']/li[{}]/a[1]/text()".format( coun )) 124 for i in game_classify: 125 game_classify = i 126 game_synopsis = 
html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[2]/text()".format( coun )) 127 for i in game_synopsis: 128 game_synopsis = i 129 coun += 1 130 print('軟體名:', game_name, '\n', '下載數量:', game_download, '億人下載', '\n', '軟體大小:', game_size, '\n', '軟體分類:',game_classify, '\n', '簡介:', game_synopsis) 131 print('————————————————————————————————————————————————') 132 with open('game_pop.csv', "a", encoding='utf-8') as file1: 133 file1.writelines(game_name + "," + str(game_download) + "," + game_size + "," + game_classify + "," + game_synopsis + '\n') 134 135 except Exception: 136 print(Exception) 137 138 if __name__ == '__main__': 139 print('—————————————————————————Start————————————————————————') 140 print('軟體爬蟲:') 141 Wdj_sf() 142 print('———————————————————————分界線——————————————————————————') 143 print('遊戲爬蟲:') 144 Wdj_game() 145 print('———————————————————————End————————————————————————————')

資料清洗處理:

匯入:

import pandas as pd
import numpy as np

# Load the two CSV files produced by the scraper.
# The download counts are expressed in units of 億 (hundred million) people.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv')
# Preview the first 20 software rows (only displayed in a notebook/REPL).
sf.head(20)

重複值處理:

# Remove fully duplicated rows from both tables.
sf = sf.drop_duplicates()
game = game.drop_duplicates()

資料視覺化:

import matplotlib.pyplot as plt

# Use a font with CJK glyphs so the Chinese titles and labels render;
# setting it once is enough for every figure below.
plt.rcParams['font.sans-serif'] = ['SimHei']

# One entry per ranking. Each chart reads its data from the entry it is
# drawing, so a "軟體" chart can never accidentally plot the game data
# (which happened when shared x/y variables were rebound half-way).
RANKINGS = (
    {'names': sf['sf_name'], 'downloads': sf['sf_download'], 'subject': "軟體",
     'line': 'c', 'bar': 'yellow', 'barh': 'r', 'scatter': 'b'},
    {'names': game['game_name'], 'downloads': game['game_download'], 'subject': "遊戲",
     'line': 'm', 'bar': 'w', 'barh': 'gray', 'scatter': 'w'},
)


def _finish(title, xlabel, ylabel, rotation=None, legend=False):
    """Apply the shared decorations to the current figure and show it."""
    if rotation is not None:
        plt.xticks(rotation=rotation)
    if legend:
        plt.legend(loc="best")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


# Line charts
for ranking in RANKINGS:
    plt.plot(ranking['names'], ranking['downloads'], '-', color=ranking['line'], label="單位/億")
    _finish(ranking['subject'] + "下載數量趨勢圖", ranking['subject'] + "名", "下載數量",
            rotation=90, legend=True)

# Bar charts
for ranking in RANKINGS:
    plt.bar(ranking['names'], ranking['downloads'], alpha=0.2, width=0.4,
            color=ranking['bar'], edgecolor='red', lw=3)
    _finish(ranking['subject'] + "下載數量柱狀圖", ranking['subject'] + "名", "下載數量",
            rotation=90)

# Horizontal bar charts (names on the y axis)
for ranking in RANKINGS:
    plt.barh(ranking['names'], ranking['downloads'], alpha=0.2, height=0.4,
             color=ranking['barh'], edgecolor='gray', label='單位/億', lw=3)
    _finish(ranking['subject'] + "下載數量水平圖", "下載數量", ranking['subject'] + "名",
            legend=True)

# Scatter charts
for ranking in RANKINGS:
    plt.scatter(ranking['names'], ranking['downloads'], color=ranking['scatter'],
                marker='o', s=40, edgecolor='black', alpha=0.5)
    _finish(ranking['subject'] + "下載數量散點圖", ranking['subject'] + "名", "下載數量",
            rotation=90)

# Pie charts
for ranking in RANKINGS:
    plt.pie(ranking['downloads'], labels=ranking['names'], labeldistance=1.1,
            autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
    plt.title(ranking['subject'] + "下載數量餅狀圖")
    plt.axis("equal")  # keep the pie circular
    plt.show()

總程式碼

  1 import pandas as pd
  2 import numpy as np
  3 import matplotlib.pyplot as plt
  4 
  5 # 下載數量的單位是億人
  6 sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv')
  7 game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv')
  8 sf.head(20)
  9 # 重複值處理
 10 sf = sf.drop_duplicates()
 11 game = game.drop_duplicates()
 12 
 13 # sf資料視覺化分析
 14 x = sf['sf_name']
 15 y = sf['sf_download']
 16 plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
 17 plt.plot(x,y,'-',color = 'c',label="單位/億")
 18 plt.xticks(rotation=90)
 19 plt.legend(loc = "best")#圖例
 20 plt.title("軟體下載數量趨勢圖")
 21 plt.xlabel("軟體名",)#橫座標名字
 22 plt.ylabel("下載數量")#縱座標名字
 23 plt.show()
 24 
 25 # game資料視覺化分析
 26 x = game['game_name']
 27 y = game['game_download']
 28 plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
 29 plt.plot(x,y,'-',color = 'm',label="單位/億")
 30 plt.xticks(rotation=90)
 31 plt.legend(loc = "best")#圖例
 32 plt.title("遊戲下載數量趨勢圖")
 33 plt.xlabel("遊戲名",)#橫座標名字
 34 plt.ylabel("下載數量")#縱座標名字
 35 plt.show()
 36 
 37 # 柱狀圖
 38 plt.bar(x,y,alpha=0.2, width=0.4, color='yellow', edgecolor='red', lw=3)
 39 plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
 40 plt.title("遊戲軟體數量柱狀圖")
 41 plt.xticks(rotation=90)
 42 plt.xlabel("軟體名",)#橫座標名字
 43 plt.ylabel("下載數量")#縱座標名字
 44 plt.show()
 45 
 46 # 柱狀圖
 47 plt.bar(x,y,alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
 48 plt.rcParams['font.sans-serif']=['SimHei'] #用來正常顯示中文標籤
 49 plt.title("遊戲下載數量柱狀圖")
 50 plt.xticks(rotation=90)
 51 plt.xlabel("遊戲名",)#橫座標名字
 52 plt.ylabel("下載數量")#縱座標名字
 53 plt.show()
 54 
 55 # 水平圖
 56 plt.barh(x,y, alpha=0.2, height=0.4, color='r', edgecolor='gray',label='單位/億', lw=3)
 57 plt.title("軟體下載數量水平圖")
 58 plt.legend(loc = "best")#圖例
 59 plt.xlabel("下載數量",)#橫座標名字
 60 plt.ylabel("軟體名")#縱座標名字
 61 plt.show()
 62 
 63 # 水平圖
 64 plt.barh(x,y, alpha=0.2, height=0.4, color='gray', edgecolor='gray',label='單位/億', lw=3)
 65 plt.title("遊戲下載數量水平圖")
 66 plt.legend(loc = "best")#圖例
 67 plt.xlabel("下載數量",)#橫座標名字
 68 plt.ylabel("遊戲名")#縱座標名字
 69 plt.show()
 70 
 71 # 散點圖
 72 plt.scatter(x,y,color='b',marker='o',s=40,edgecolor='black',alpha=0.5)
 73 plt.xticks(rotation=90)
 74 plt.title("軟體下載數量散點圖")
 75 plt.xlabel("軟體名",)#橫座標名字
 76 plt.ylabel("下載數量")#縱座標名字
 77 plt.show()
 78 
 79 # 散點圖
 80 plt.scatter(x,y,color='w',marker='o',s=40,edgecolor='black',alpha=0.5)
 81 plt.xticks(rotation=90)
 82 plt.title("軟體下載數量散點圖")
 83 plt.xlabel("遊戲名",)#橫座標名字
 84 plt.ylabel("下載數量")#縱座標名字
 85 plt.show()
 86 
 87 # 餅狀圖
 88 label_list = x
 89 explode = (0,0,0,0.1,0,0)
 90 plt.rcParams['font.sans-serif']=['SimHei']
 91 plt.xticks(rotation=0)
 92 plt.pie(y,labels=label_list,labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
 93 plt.title("軟體下載數量餅狀圖")
 94 plt.axis("equal")
 95 plt.show()
 96 
 97 # 餅狀圖
 98 label_list = x
 99 explode = (0,0,0,0.1,0,0)
100 plt.rcParams['font.sans-serif']=['SimHei']
101 plt.xticks(rotation=0)
102 plt.pie(y,labels=label_list,labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
103 plt.title("遊戲下載數量餅狀圖")
104 plt.axis("equal")
105 plt.show()

五、總結

  從視覺化結果來看,軟體排行中QQ和微信的下載量最高、最受歡迎;遊戲排行中則以王者榮耀和4399遊戲盒最受歡迎,分析結果達到預期效果。在設計過程中,我收穫了資料處理的程式設計思維方式。不足之處在於繪製散點圖時沒有達到預期的效果。通過此次實驗,我對視覺化分析有了更深的認知,之後會繼續鑽研繪圖方法。