豌豆莢排行榜資料視覺化分析
阿新 • • 發佈:2021-06-22
一、選題背景
現如今,手機APP品類繁多,不同種類的軟體可以實現不同的功能。為此,本文根據豌豆莢應用市場排行榜進行分析,通過下載量找出最受歡迎的軟體。
二、爬蟲設計方案
爬蟲名稱:豌豆莢排行榜爬蟲
內容與資料特徵分析:通過請求排行榜網頁,從頁面中獲取所需的資料。
設計方案:使用requests請求網頁,lxml的etree進行網頁解析,xpath獲取想要爬取的資料,最後以內建的open()將結果寫入CSV檔案儲存。
技術難點:xpath格式轉換。
三、主題頁面結構特徵分析
頁面的結構特徵分析:內容導航型
HTML頁面分析:
標題、簡介:
下載數量:
軟體大小、分類:
節點查詢、遍歷:
查詢:
# XPath lookups for one entry of the ranking list (the <li> children of the
# element whose id is 'j-top-list'). `coun` is the 1-based position of the
# entry; `html` is the parsed page. Each call returns a list of text nodes.
sf_name = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/h2/a/text()".format(coun))
sf_download = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[1]/text()".format(coun))
sf_size = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[1]/span[3]/text()".format(coun))
sf_classify = html.xpath("//*[@id='j-top-list']/li[{}]/a[1]/text()".format(coun))
sf_synopsis = html.xpath("//*[@id='j-top-list']/li[{}]/div[2]/div[2]/text()".format(coun))
遍歷:通過迴圈進行提取
四、網路爬蟲程式設計
資料爬取與採集:
"""Crawler for the Wandoujia top-app and top-game ranking pages.

Each ranking page is fetched once, the entries of the 'j-top-list' element are
read with XPath, and one CSV row per entry is appended to ``sf_pop.csv``
(software) or ``game_pop.csv`` (games).
"""
import csv
import os
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm

# NOTE: sys, time, BeautifulSoup and tqdm are not used below; the imports are
# kept because the original script declared them.

# Pool of desktop browser User-Agent strings. Every entry needs its trailing
# comma: without it Python concatenates adjacent literals into one string.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
]

# Request headers shared by both crawlers; the User-Agent is picked once per run.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

# The original loop was range(1, 25), i.e. the first 24 list entries.
_MAX_ENTRIES = 24
_REQUEST_TIMEOUT = 10  # seconds

# A number optionally followed by a unit character. Both Traditional and
# Simplified forms are accepted: the page presumably serves Simplified text
# (TODO confirm), while the original script spelled the units in Traditional.
_DOWNLOAD_RE = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s*([萬万億亿]?)')


def _parse_download_count(text):
    """Convert a label such as '5.6億人下載' to a float in units of 億 (1e8).

    The unit character decides the scaling instead of a magnitude guess, so
    '50萬' becomes 0.01 rather than being mistaken for 50億.

    Raises:
        ValueError: if ``text`` contains no number.
    """
    match = _DOWNLOAD_RE.search(text)
    if match is None:
        raise ValueError('no download count in {!r}'.format(text))
    value = float(match.group(1))
    unit = match.group(2)
    if unit in ('萬', '万'):
        value /= 10000
    elif not unit:
        # Bare head-count with no unit character.
        value /= 100000000
    return round(value, 2)


def _last_text(node, path):
    """Return the last text node matched by ``path`` under ``node``, or ''.

    The original code looped over the XPath result and kept the final item;
    taking the last element preserves that choice.
    """
    found = node.xpath(path)
    return found[-1] if found else ''


def _scrape_top_list(url, csv_path, columns):
    """Fetch one ranking page and append its entries to ``csv_path``.

    Args:
        url: address of the ranking page.
        csv_path: CSV file to append to; the header row is written only when
            the file does not exist yet or is empty.
        columns: header names, in the order name, download count, size,
            category, synopsis.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    html = etree.HTML(res.text)
    if html is None:
        print('empty page:', url)
        return

    entries = html.xpath("//*[@id='j-top-list']/li")[:_MAX_ENTRIES]
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # newline='' is what the csv module requires for files it writes to.
    with open(csv_path, "a", encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        if needs_header:
            writer.writerow(columns)
        for rank, entry in enumerate(entries, start=1):
            try:
                name = _last_text(entry, "./div[2]/h2/a/text()")
                if not name:
                    raise ValueError('missing name')
                download = _parse_download_count(
                    _last_text(entry, "./div[2]/div[1]/span[1]/text()"))
            except ValueError as exc:
                # Report the actual error and move on to the next entry.
                print('entry', rank, 'skipped:', exc)
                continue
            size = _last_text(entry, "./div[2]/div[1]/span[3]/text()")
            classify = _last_text(entry, "./a[1]/text()")
            synopsis = _last_text(entry, "./div[2]/div[2]/text()")

            print('軟體名:', name, '\n', '下載數量:', download, '億人下載', '\n', '軟體大小:', size, '\n', '軟體分類:', classify, '\n', '簡介:', synopsis)
            print('————————————————————————————————————————————————')
            # csv.writer quotes fields that contain commas or newlines.
            writer.writerow([name, download, size, classify, synopsis])


# Software ranking crawler
def Wdj_sf():
    """Scrape the top-app ranking into sf_pop.csv."""
    _scrape_top_list(
        'https://www.wandoujia.com/top/app',
        'sf_pop.csv',
        ('sf_name', 'sf_download', 'sf_size', 'sf_classify', 'sf_synopsis'),
    )


# Game ranking crawler
def Wdj_game():
    """Scrape the top-game ranking into game_pop.csv."""
    _scrape_top_list(
        'https://www.wandoujia.com/top/game',
        'game_pop.csv',
        ('game_name', 'game_download', 'game_size', 'game_classify', 'game_synopsis'),
    )


if __name__ == '__main__':
    print('—————————————————————————Start————————————————————————')
    print('軟體爬蟲:')
    Wdj_sf()
    print('———————————————————————分界線——————————————————————————')
    print('遊戲爬蟲:')
    Wdj_game()
    print('———————————————————————End————————————————————————————')
資料清洗處理:
匯入:
import pandas as pd
import numpy as np

# Download counts in the CSV files are expressed in units of 億 (100 million) people.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv')
# Preview the first 20 rows (only echoed automatically in a notebook/REPL).
sf.head(20)
重複值處理:
# Remove duplicated rows from both tables.
sf = sf.drop_duplicates()
game = game.drop_duplicates()
資料視覺化:
import matplotlib.pyplot as plt

# Line chart of the software download counts.
x = sf['sf_name']
y = sf['sf_download']
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.plot(x, y, '-', color='c', label="單位/億")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("軟體下載數量趨勢圖")
plt.xlabel("軟體名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Line chart of the game download counts.
x = game['game_name']
y = game['game_download']
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.plot(x, y, '-', color='m', label="單位/億")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("遊戲下載數量趨勢圖")
plt.xlabel("遊戲名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Bar chart of the software download counts.
# The software columns are referenced directly: at this point `x` and `y`
# hold the game data, so plotting them would not match the title.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.bar(sf['sf_name'], sf['sf_download'], alpha=0.2, width=0.4, color='yellow', edgecolor='red', lw=3)
plt.title("軟體下載數量柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("軟體名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Bar chart of the game download counts.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.bar(game['game_name'], game['game_download'], alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
plt.title("遊戲下載數量柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("遊戲名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Horizontal bar chart of the software download counts.
# The software columns are referenced directly: at this point `x` and `y`
# hold the game data, so plotting them would not match the title.
plt.barh(sf['sf_name'], sf['sf_download'], alpha=0.2, height=0.4, color='r', edgecolor='gray', label='單位/億', lw=3)
plt.title("軟體下載數量水平圖")
plt.legend(loc="best")  # legend
plt.xlabel("下載數量")  # x-axis label
plt.ylabel("軟體名")  # y-axis label
plt.show()
# Horizontal bar chart of the game download counts.
plt.barh(game['game_name'], game['game_download'], alpha=0.2, height=0.4, color='gray', edgecolor='gray', label='單位/億', lw=3)
plt.title("遊戲下載數量水平圖")
plt.legend(loc="best")  # legend
plt.xlabel("下載數量")  # x-axis label
plt.ylabel("遊戲名")  # y-axis label
plt.show()
# Scatter plot of the software download counts.
# The software columns are referenced directly: at this point `x` and `y`
# hold the game data, so plotting them would not match the title.
plt.scatter(sf['sf_name'], sf['sf_download'], color='b', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("軟體下載數量散點圖")
plt.xlabel("軟體名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Scatter plot of the game download counts.
plt.scatter(game['game_name'], game['game_download'], color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
# The title previously read "軟體..." although this chart shows the games.
plt.title("遊戲下載數量散點圖")
plt.xlabel("遊戲名")  # x-axis label
plt.ylabel("下載數量")  # y-axis label
plt.show()
# Pie chart of the software download counts.
# The software columns are referenced directly: at this point `x` and `y`
# hold the game data, so plotting them would not match the title.
# The unused `explode` tuple and the tick rotation call were dropped.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.pie(sf['sf_download'], labels=sf['sf_name'], labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
plt.title("軟體下載數量餅狀圖")
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()
# Pie chart of the game download counts.
# The unused `explode` tuple and the tick rotation call were dropped.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese labels render
plt.pie(game['game_download'], labels=game['game_name'], labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
plt.title("遊戲下載數量餅狀圖")
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()
總程式碼
"""Load the crawled rankings, de-duplicate them and draw the charts.

For both data sets (software and games) the script shows, in this order, a
line chart, bar chart, horizontal bar chart, scatter plot and pie chart of the
download counts.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Font used so the Chinese titles and labels render; set once for all charts.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Download counts in the CSV files are expressed in units of 億 (100 million) people.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv')
sf.head(20)

# Remove duplicated rows.
sf = sf.drop_duplicates()
game = game.drop_duplicates()

# Separate names per data set. The original reused `x`/`y`, so every
# "software" chart after the first one silently plotted the game data.
sf_names, sf_downloads = sf['sf_name'], sf['sf_download']
game_names, game_downloads = game['game_name'], game['game_download']


def _show(title, xlabel=None, ylabel=None, *, rotate_ticks=False, legend=False):
    """Apply the shared decorations to the current figure and display it."""
    if rotate_ticks:
        plt.xticks(rotation=90)
    if legend:
        plt.legend(loc="best")
    plt.title(title)
    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.show()


# Line charts
plt.plot(sf_names, sf_downloads, '-', color='c', label="單位/億")
_show("軟體下載數量趨勢圖", "軟體名", "下載數量", rotate_ticks=True, legend=True)

plt.plot(game_names, game_downloads, '-', color='m', label="單位/億")
_show("遊戲下載數量趨勢圖", "遊戲名", "下載數量", rotate_ticks=True, legend=True)

# Bar charts
plt.bar(sf_names, sf_downloads, alpha=0.2, width=0.4, color='yellow', edgecolor='red', lw=3)
_show("軟體下載數量柱狀圖", "軟體名", "下載數量", rotate_ticks=True)

plt.bar(game_names, game_downloads, alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
_show("遊戲下載數量柱狀圖", "遊戲名", "下載數量", rotate_ticks=True)

# Horizontal bar charts
plt.barh(sf_names, sf_downloads, alpha=0.2, height=0.4, color='r', edgecolor='gray', label='單位/億', lw=3)
_show("軟體下載數量水平圖", "下載數量", "軟體名", legend=True)

plt.barh(game_names, game_downloads, alpha=0.2, height=0.4, color='gray', edgecolor='gray', label='單位/億', lw=3)
_show("遊戲下載數量水平圖", "下載數量", "遊戲名", legend=True)

# Scatter plots
plt.scatter(sf_names, sf_downloads, color='b', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("軟體下載數量散點圖", "軟體名", "下載數量", rotate_ticks=True)

plt.scatter(game_names, game_downloads, color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("遊戲下載數量散點圖", "遊戲名", "下載數量", rotate_ticks=True)

# Pie charts (the unused `explode` tuple was dropped)
plt.pie(sf_downloads, labels=sf_names, labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
_show("軟體下載數量餅狀圖")

plt.pie(game_downloads, labels=game_names, labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
_show("遊戲下載數量餅狀圖")
五、總結
從視覺化結果來看,軟體排行榜中QQ和微信的下載量最高、最受歡迎,遊戲排行榜中則以王者榮耀和4399遊戲盒最受歡迎。分析結果達到預期效果。在設計過程中,我學到了資料處理的程式設計思維方式。不足之處在於繪製散點圖時沒有達到自己預期的效果。通過此次實驗,我對視覺化分析有了更深的認識,並會繼續鑽研繪圖。