1. 程式人生 > 其它 >【Python爬蟲】查自己部落格每月發帖量的小程式

【Python爬蟲】查自己部落格每月發帖量的小程式

【前提:安裝beautifulsoup4和requests】

pip install beautifulsoup4

pip install requests

【程式碼】

#encoding=utf-8

from bs4 import BeautifulSoup
import requests
import re

# Request headers sent with every page fetch; the User-Agent string is kept
# exactly as the original script sent it.
user_agent = 'Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
headers = {'User-Agent': user_agent}

# Matches the first "YYYY-MM" occurrence in a post description.
YEAR_MONTH_RE = re.compile(r'\d{4}-\d{2}')


def tally_months(descriptions, counts=None):
    """Count posts per year-month from post description strings.

    Args:
        descriptions: Iterable of strings, each the text of one
            class="postDesc2" element.
        counts: Optional dict to update in place, so that tallies can be
            accumulated across several pages. A new dict is created when
            omitted.

    Returns:
        The dict mapping 'YYYY-MM' to the number of descriptions containing
        that year-month. Descriptions without a year-month are skipped
        instead of raising.
    """
    if counts is None:
        counts = {}
    for raw_info in descriptions:
        match = YEAR_MONTH_RE.search(raw_info)
        if match is None:
            # No date in this element; ignore it rather than crash on .group().
            continue
        year_month = match.group()
        # Store the year-month; add one if it is already present.
        counts[year_month] = counts.get(year_month, 0) + 1
    return counts


def fetch_descriptions(page, timeout=30):
    """Fetch one listing page and return the text of each postDesc2 element.

    Args:
        page: 1-based page number appended to the listing URL.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot hang the script forever.

    Returns:
        A list of strings, one per element with class "postDesc2".
    """
    html = requests.get(
        'http://www.cnblogs.com/heyang78/p/?page=' + str(page),
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(html.text, 'html.parser')
    return [desc_div.text for desc_div in soup.find_all(class_="postDesc2")]


def main(first_page=1, last_page=138, output_path=r'output.txt'):
    """Crawl the listing pages, then print and save the per-month post counts.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        output_path: File that receives one "('YYYY-MM', count)" line per
            month, sorted by year-month.
    """
    dic = {}  # maps year-month -> number of posts
    for i in range(first_page, last_page + 1):
        tally_months(fetch_descriptions(i), dic)

    # Turn the dict into a list of (year-month, count) pairs sorted by month.
    monthly_counts = sorted(dic.items(), key=lambda x: x[0])

    # Write the result to the file, echoing each line to stdout.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for item in monthly_counts:
            print(item)
            outfile.write(str(item) + "\n")


if __name__ == '__main__':
    main()

【輸出示例】

('2016-02', 8)
('2016-03', 14)
('2016-05', 1)
('2016-06', 1)
('2016-07', 17)
('2016-08', 12)
('2016-10', 1)
('2017-01', 19)
('2017-02', 3)
('2017-03', 2)
('2017-04', 1)
('2017-05', 1)
('2017-06', 20)
('2017-07', 10)
('2017-08', 16)
('2017-09', 78)
('2017-10', 5)
('2017-11', 32)
('2017-12', 21)
('2018-01', 7)
('2018-03', 19)
('2018-04', 45)
('2018-05', 43)
('2018-06', 2)
('2018-07', 2)
('2019-03', 37)
('2019-04', 1)
('2019-05', 2)
('2019-07', 1)
('2019-08', 17)
('2019-09', 41)
('2019-10', 63)
('2019-11', 73)
('2019-12', 64)
('2020-01', 80)
('2020-02', 42)
('2020-03', 61)
('2020-04', 43)
('2020-05', 68)
('2020-06', 26)
('2020-09', 1)
('2021-08', 39)
('2021-09', 73)
('2021-10', 61)
('2021-11', 42)
('2021-12', 46)
('2022-01', 30)
('2022-02', 63)
('2022-03', 26)

END