爬取部落格資訊的簡單爬蟲
阿新 • 發佈：2019-02-06
呼叫 getOneBlogDetails() 函式可以獲取目標網頁的博主姓名、個人主頁網址，原創文章、粉絲、喜歡、評論數量，以及等級、訪問量、積分、排名。
#!/usr/lib/python3.6
# encoding = utf-8
# Scrape the basic profile information of one CSDN blog.
# This scraper is for learning purposes only — please do not abuse it.
import requests
from bs4 import BeautifulSoup

# Request headers: a browser User-Agent so the site serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'
}

# Target blog URL.
url = "https://blog.csdn.net/qq_38737992"


def _parse_blog_details(html):
    """Parse blogger name, homepage URL and statistics from a CSDN profile page.

    BeautifulSoup docs:
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

    Returns a tuple ``(name, my_url, counts, grades)`` where ``counts`` maps
    the labels in the data-info box (原創/粉絲/喜歡/評論) to their numbers and
    ``grades`` maps the labels in the grade box (level, visits, points, rank)
    to their values.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Look up the uid anchor once (the original fetched it twice).
    uid_link = soup.find('a', id="uid")
    name = uid_link.text
    my_url = uid_link.get('href')

    info = soup.find('div', class_="data-info d-flex item-tiling")
    # labels: [<dt><a ...>原創</a></dt>, <dt>粉絲</dt>, <dt>喜歡</dt>, <dt>評論</dt>]
    labels = info.find_all('dt')
    # numbers: [<span class="count">77</span>, <span class="count" id="fan">0</span>, ...]
    numbers = info.find_all('span', class_='count')
    # zip pairs each label with its count; any extra trailing labels are
    # ignored, matching the original range(len(numbers)) index loop.
    counts = {label.text: number.text for label, number in zip(labels, numbers)}

    grade_box = soup.find('div', class_="grade-box clearfix")
    grades = {}
    for entry in grade_box.find_all('dl'):
        key = entry.find('dt').text
        value = entry.find('dd').text.strip()
        if value == '':
            # Some entries (e.g. the level) keep their value in the <a> tag's
            # title attribute instead of the <dd> text; take its first two
            # characters, as the original did.
            value = entry.find('a').get('title')[0:2]
        grades[key] = value
    return name, my_url, counts, grades


def getOneBlogDetails(url):
    """Fetch one CSDN blog profile page and print its basic information.

    Prints the blogger's name, homepage URL, the original-post / fan / like /
    comment counts, and the level / visit / point / rank statistics.
    """
    # Fetch the page; timeout so a dead server cannot hang the script forever.
    r = requests.get(url, headers=headers, timeout=100)
    name, my_url, counts, grades = _parse_blog_details(r.text)

    print("博主姓名: " + name)
    print("個人主頁: " + my_url)
    for key in counts:
        print(key + ": " + counts[key])
    for key in grades:
        # The grade box is printed without a separator, as in the original.
        print(key + grades[key])


if __name__ == "__main__":
    # Guarded so that importing this module no longer fires a network request
    # as a side effect; running the file as a script behaves as before.
    getOneBlogDetails(url)