1. 程式人生 > 爬取部落格資訊的簡單爬蟲

爬取部落格資訊的簡單爬蟲

        呼叫 getOneBlogDetails() 函式可以獲取並列印目標網頁的博主姓名、個人主頁網址,原創文章、粉絲、喜歡、評論的數量,以及等級、訪問量、積分、排名。

#!/usr/lib/python3.6
# -*- coding: utf-8 -*-
#爬取一個部落格的基本資訊
#本爬蟲僅用於學習,純屬愛好,雖然本爬蟲很簡單,但還是請大家不要濫用

import requests
from bs4 import BeautifulSoup

# Request headers sent with the page fetch: a Firefox-on-Ubuntu User-Agent
# string.
_USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'
headers = {'User-Agent': _USER_AGENT}

# Address of the blog home page to scrape.
url = "https://blog.csdn.net/qq_38737992"

#爬取部落格基本資訊的函式
def getOneBlogDetails(url):
    """Fetch a blog home page and print the blogger's basic details.

    Prints, in order: the blogger name and home-page link taken from the
    ``<a id="uid">`` element, every label/count pair found in the
    ``data-info`` block, and every entry of the ``grade-box`` block.
    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the blog home page to scrape.

    Returns:
        None. The details are written to standard output.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        ValueError: One of the three required page elements is missing.
    """
    # Fetch the page. raise_for_status() turns an error reply into an
    # exception so an error page is never parsed as if it were a blog.
    response = requests.get(url, headers=headers, timeout=100)
    response.raise_for_status()

    # Parse the page. Beautiful Soup documentation:
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
    soup = BeautifulSoup(response.text, "html.parser")

    # A single <a id="uid"> element supplies both the name and the link,
    # so look it up once.
    profile_link = _require_element(soup.find('a', id="uid"), '<a id="uid">')
    name = profile_link.text
    home_url = profile_link.get('href') or ''

    counters_box = _require_element(
        soup.find('div', class_="data-info d-flex item-tiling"),
        '<div class="data-info d-flex item-tiling">',
    )
    # Sample labels: [<dt><a href="https://blog.csdn.net/qq_38737992?t=1">原創</a></dt>,
    #                 <dt>粉絲</dt>, <dt>喜歡</dt>, <dt>評論</dt>]
    labels = counters_box.find_all('dt')
    # Sample counts: [<span class="count">77</span>, <span class="count" id="fan">0</span>,
    #                 <span class="count">0</span>, <span class="count">1</span>]
    counts = counters_box.find_all('span', class_='count')
    # zip() pairs labels and counts by position and stops at the shorter
    # list, so a label/count mismatch cannot raise IndexError.
    counters = {label.text: count.text for label, count in zip(labels, counts)}

    grade_box = _require_element(
        soup.find('div', class_="grade-box clearfix"),
        '<div class="grade-box clearfix">',
    )
    grades = {}
    for entry in grade_box.find_all('dl'):
        label = entry.find('dt')
        value_cell = entry.find('dd')
        if label is None or value_cell is None:
            # Malformed entry without a label or a value cell: skip it.
            continue
        value = value_cell.text.strip()
        if not value:
            # The <dd> has no text, so fall back to the first two
            # characters of the entry's anchor title (empty if absent).
            anchor = entry.find('a')
            title = anchor.get('title') if anchor is not None else None
            value = (title or '')[:2]
        grades[label.text] = value

    print("博主姓名: " + name)
    print("個人主頁: " + home_url)
    for key, value in counters.items():
        print(key + ": " + value)

    # No separator is inserted here, unlike above.
    # NOTE(review): presumably the <dt> text already ends with a colon --
    # confirm against the live page.
    for key, value in grades.items():
        print(key + value)


def _require_element(element, description):
    """Return ``element``, or raise ValueError naming the missing markup."""
    if element is None:
        raise ValueError("expected element not found in page: " + description)
    return element

# Run the scraper for the blog address configured in `url`.
# NOTE(review): this call sits at module top level, so it also executes (and
# performs the HTTP request) whenever this file is imported.
getOneBlogDetails(url)