python爬取資料並可視化展現

阿新 • • 發佈：2020-09-21

#將excel中的資料進行讀取分析
import openpyxl
import matplotlib.pyplot as pit #資料統計用的

# Read the shoe sizes from the sales workbook, count how often each size
# occurs, and save the distribution as a pie chart.
wk = openpyxl.load_workbook('銷售資料.xlsx')
sheet = wk.active  # the active worksheet

# Dimensions of the used area of the sheet.
rows = sheet.max_row
cols = sheet.max_column

lst = []  # shoe sizes, one entry per data row
# Row 1 is skipped as the header row; the size is read from column 3.
for i in range(2, rows + 1):
  size = sheet.cell(i, 3).value
  # Skip empty cells so that a blank row is not counted as a size.
  if size is not None:
    lst.append(size)

# Tally the sales per size in a single pass: size -> number of rows.
dic_size = {}
for item in lst:
  dic_size[item] = dic_size.get(item, 0) + 1

for item in dic_size:
  print(item, dic_size[item])

# Turn the counts into fractions of all rows that were read. The divisor is
# the real number of rows rather than a hard-coded total, so the shares stay
# correct for any workbook. (With no rows the list is simply empty.)
total = len(lst)
lst_total = [[item, count, count / total] for item, count in dic_size.items()]

# Pie chart: one wedge per size.
# str() keeps the label working when the cell holds a number, not text.
labels = [str(item[0]) + '碼' for item in lst_total]
fraces = [item[2] for item in lst_total]
pit.rcParams['font.family'] = ['SimHei']  # font used so the CJK labels render
pit.pie(x=fraces, labels=labels, autopct='%1.1f%%')
# Call pit.show() instead to display the chart interactively.
pit.savefig('圖.jpg')

#本例涉及：用requests爬取資料、用openpyxl儲存資料、資料的清洗與統計，最後用matplotlib進行資料的視覺化
#靜態資料：在開發者工具的Elements中能在html裡找到的內容，是伺服器已經渲染好、直接發給瀏覽器，再由瀏覽器解釋執行的。
#動態資料：點選下一頁（或第2、3頁）時，位址列沒有發生任何變化（只加字尾而前面的位址沒變也算），說明資料是後來才被渲染到html中的，原本的html裡根本沒有這些資料。
#動態資料要在Network中檢視，請求用的url取自Network裡該請求的Headers
#安裝第三方模組：在cmd中輸入 pip install 加模組名，例如 pip install requests
import requests
import re
import time
import json
import openpyxl #用於操作 excel檔案的
# Request headers sent with every HTTP call: a desktop Chrome User-Agent string.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
def get_comments(productId,page):
  """Fetch one page of comments for a product and return the decoded JSON.

  The endpoint answers with JSONP, i.e. the JSON document wrapped in
  ``fetchJSON_comment98( ... );``. The wrapper is removed before decoding.

  Args:
    productId: product identifier inserted into the request URL.
    page: page index inserted into the request URL.

  Returns:
    The object produced by ``json.loads`` for the response payload.

  Raises:
    requests.RequestException: on network failure, timeout or HTTP error.
    json.JSONDecodeError: if the payload is not valid JSON.
  """
  url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page={1}&pageSize=10&isShadowSku=0&fold=1".format(productId,page)
  # Without a timeout a stalled connection would block the crawl forever.
  resp = requests.get(url, headers=headers, timeout=10)
  resp.raise_for_status()

  # Strip the JSONP wrapper only at the two ends of the body. A global
  # str.replace would also delete any ");" that occurs inside comment text.
  prefix = 'fetchJSON_comment98('
  suffix = ');'
  payload = resp.text.strip()
  if payload.startswith(prefix):
    payload = payload[len(prefix):]
  if payload.endswith(suffix):
    payload = payload[:-len(suffix)]
  return json.loads(payload)

#獲取最大頁數
def get_max_page(productId):
  """Return the 'maxPage' value reported for a product's comments.

  Page 0 is requested through get_comments and the 'maxPage' entry of the
  returned dict is handed back unchanged.
  """
  first_page = get_comments(productId, 0)
  max_page = first_page['maxPage']
  return max_page

#進行資料提取

def get_info(productId):
  """Collect the comments of a product page by page and pass them to save().

  Every comment becomes one ``[content, color, size]`` row. The function
  returns nothing; the collected rows are handed to ``save``.

  Args:
    productId: product identifier forwarded to the fetch helpers.
  """
  max_page = get_max_page(productId)
  lst = []  # collected rows, one per comment

  # get_max_page reads page 0 as the first page, so the pages are walked as
  # 0 .. max_page-1. The previous range(1, max_page+1) skipped page 0 and
  # asked for one page past the end.
  # NOTE(review): assumes 'maxPage' is a page count over 0-based indices —
  # confirm against the API.
  for page in range(max_page):
    comments = get_comments(productId, page)
    # Each entry of 'comments' is a dict describing a single comment.
    for item in comments['comments']:
      lst.append([item['content'], item['productColor'], item['productSize']])
    time.sleep(3)  # pause between requests to avoid hammering the server
  save(lst)

def save(lst):
  """Write the collected rows to '銷售資料.xlsx'.

  A new workbook is created, a header row is written to its active sheet,
  each entry of ``lst`` is appended as one further row, and the workbook is
  saved to disk.

  Args:
    lst: iterable of rows, each row being a sequence of cell values.
  """
  workbook = openpyxl.Workbook()
  worksheet = workbook.active

  # Header first, then the data rows in their original order.
  header = ('評論', '顏色', '大小')
  all_rows = [header]
  all_rows.extend(lst)
  for row in all_rows:
    worksheet.append(row)

  workbook.save('銷售資料.xlsx')


if __name__ == '__main__':
  # Script entry point: crawl the comments of one fixed product, then
  # report completion on stdout.
  productId = '66749071789'
  get_info(productId)
  print('ok')

#將excel中的資料進行讀取分析
import openpyxl
import numpy as np
import math
import matplotlib.pyplot as pit

# Read (x, y) samples from the first two columns of the workbook, plot them,
# fit a quadratic curve and report how well the curve matches the samples.
wk = openpyxl.load_workbook('資訊11.xlsx')
sheet = wk.active
rows = sheet.max_row
cols = sheet.max_column

lst1 = []  # column 1: x values
lst2 = []  # column 2: y values
# Reading starts at row 1, so the sheet is expected to hold data only
# (no header row).
for i in range(1, rows + 1):
  size1 = sheet.cell(i, 1).value
  size2 = sheet.cell(i, 2).value
  # Incomplete rows cannot be plotted or fitted; leave them out.
  if size1 is None or size2 is None:
    continue
  lst1.append(size1)
  lst2.append(size2)

# Keyed by x: when an x value repeats, the last y read for it wins.
dic_size = dict(zip(lst1, lst2))
lst_total = [[key, value] for key, value in dic_size.items()]

labels = [item[0] for item in lst_total]  # x values
fraces = [item[1] for item in lst_total]  # y values

pit.rcParams['font.family'] = ['SimHei']  # font used so CJK text renders
pit.scatter(labels, fraces)
pit.plot(labels, fraces, color='green')
pit.bar(labels, fraces, width=5, color='red')

# Least-squares quadratic through the samples.
z1 = np.polyfit(labels, fraces, 2)
p1 = np.poly1d(z1)

# Draw the fitted curve itself. Evaluating p1 (instead of typing rounded
# coefficients in by hand) keeps the curve in step with whatever data the
# workbook contains.
x = np.linspace(0, 500, 50)
y = p1(x)
pit.plot(x, y, color='purple')

# pit.savefig('圖.jpg')  # uncomment to write the figure to disk

# Predictions of the fitted curve at the sample positions.
actual = np.asarray(fraces, dtype=float)
predicted = p1(np.asarray(labels, dtype=float))
yre = [round(float(value), 6) for value in predicted]
print(fraces)
print(yre)

# Error metrics over ALL samples (the count is taken from the data, not
# assumed to be 10). Values are rounded only when printed so that rounding
# errors do not accumulate.
residuals = actual - predicted
n = len(residuals)
a = float(residuals.sum())
mean_residual = a / n
# Residuals centred on their mean, as printed by the original script.
result = [round(float(r - mean_residual), 6) for r in residuals]

mse = float(np.mean(residuals ** 2))      # mean squared error
mae = float(np.mean(np.abs(residuals)))   # mean absolute error
rmse = math.sqrt(mse)                     # root mean squared error

# Coefficient of determination: 1 - SS_res / SS_tot.
ss_res = float(np.sum(residuals ** 2))
ss_tot = float(np.sum((actual - actual.mean()) ** 2))
# SS_tot is zero when every y is identical; R^2 is undefined in that case.
r_squared = 1 - ss_res / ss_tot if ss_tot else float('nan')

print(result)
print('均值', round(mean_residual, 6))
print('均方誤差', round(mse, 6))
print('均方根誤差', round(rmse, 6))
print('平均絕對誤差', round(mae, 6))
print('R平方的數值', r_squared)
print(p1)
# pit.show()  # uncomment to display the figure interactively

Python爬蟲基於lxml解決資料編碼亂碼問題

lxml是python的一個解析庫，支援HTML和XML的解析，支援XPath解析方式，而且解析效率非常高

XPath，全稱XML Path Language，即XML路徑語言，它是一門在XML文件中查詢資訊的語言，它最初是用來搜尋XML文件的，但是它同樣適用於HTML文件的搜尋

XPath的選擇功能十分強大，它提供了非常簡明的路徑選擇表示式，另外，它還提供了超過100個內建函式，用於字串、數值、時間的匹配以及節點、序列的處理等，幾乎所有我們想要定位的節點，都可以用XPath來選擇

XPath於1999年11月16日成為W3C標準，它被設計為供XSLT、XPointer以及其他XML解析軟體使用，更多的文件可以訪問其官方網站：https://www.w3.org/TR/xpath/

問題狀況：

response = requests.get(url=url, headers=headers).text
html = etree.HTML(response)
name = html.xpath("/html/body/div[2]/ul/li[1]/a/p/text()")[0]
print(name)

可以正常獲取資料，但是結果是

å·²éªŒè¯ å®‰å…¨ ç›¾ç‰Œ

這樣子的亂碼

解決方法：

name = html.xpath("/html/body/div[2]/ul/li[1]/a/p/text()")[0].encode('ISO-8859-1').decode('UTF-8')

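這邊的UTF-8根據網頁編碼情況而定。出現亂碼的原因是：回應標頭沒有宣告charset時，requests會預設用ISO-8859-1來解碼.text，所以要先用ISO-8859-1編碼還原成原始位元組，再用網頁實際的編碼解碼；也可以在讀取.text之前，先對回應物件設定 encoding = apparent_encoding。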

以上內容轉自

https://www.jb51.net/article/192191.htm

python爬取資料並可視化展現

Python爬蟲基於lxml解決資料編碼亂碼問題

python爬取資料並可視化展現

Python爬取資料並實現視覺化程式碼解析

Python爬取資料並寫入MySQL資料庫的例項

Python爬取資料並儲存到csv檔案中

Python爬取資料並輸出到資料庫

Python爬取股票資訊，並可視化資料的示例

Python資料分析+視覺化專案教學：分析猛男童年的玩具，並可視化展示商品資料

爬蟲爬取鏈家網資訊並可視化

通過Python的requests庫爬取資料並儲存為csv檔案

利用python進行遙感影像灰度處理並可視化

Python3實現的爬蟲爬取資料並存入mysql資料庫操作示例

監控多條微信公眾號連結閱讀量，並可視化--模擬滑鼠鍵盤

python 爬取小說並下載的示例

獲取瀏覽器歷史記錄並可視化展示

自己動手用Python爬取資料：涉及Selenium、Scrapy、高併發處理

k8s filebeat sidecar模式收集ingress nginx日誌並可視化展示

ESP8266+樹莓派+Splunk製作一個數據採集並可視化的系統

一文教你實現skip-gram模型，訓練並可視化詞向量

Python爬取北京地區蛋殼公寓資料，並進行資料視覺化處理

Python爬取招聘網站資料並做資料視覺化處理

python爬取資料並可視化展現

Python爬蟲基於lxml解決資料編碼亂碼問題

相關推薦