網路爬蟲-問答對練習
阿新 • • 發佈:2019-02-12
弄了一陣網路爬蟲,使用requests,re,BeautifulSoup,這些包。暫放一段時間,怕忘了,就記下來吧。
按照拇指醫生(muzhi.baidu.com)的網站佈局,只要有一個醫生的ID就可以把屬於這位醫生的問答對全部爬下來。所以,思路是先把所有醫生的ID拿下來儲存到一個檔案,以後按照這個進行爬取。問答網頁使用動態js載入,需要注意下吧。這是程式碼。
import requests
import re
import time
from bs4 import BeautifulSoup
# IDs of every doctor to crawl; filled from SaveDoc.txt below.
doc_num = []

# --- One-time bootstrap (left as an inert string on purpose) -------------
# Running this snippet once crawls the IDs of all doctors and writes them
# to SaveDoc.txt; later runs just read that file back instead.
"""
with open('SaveDoc.txt', 'w') as save_docs:
    for i in range(222):
        url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn={}&rn=5&cid1=127'.format(i)
        request = requests.get(url).json()
        for item in request['data']['list']:
            save_docs.write(item['uid'] + '\n')
            doc_num.append(item['uid'])
            print('Get doc:', item['realname'], ' Company:', item['company'], ' uid:', item['uid'])
"""

# Load the previously crawled doctor IDs, one per line; `with` guarantees
# the handle is closed even if reading raises.
with open('SaveDoc.txt', 'r') as save_docs:
    doc_num = [line.rstrip() for line in save_docs]
def _extract_qa(soup):
    """Pull question and answer text out of a parsed question page.

    Returns a list with up to two entries ([question, answer]).  Either
    lookup can fail with IndexError — e.g. when Baidu serves a CAPTCHA
    page instead of the question — in which case that entry is simply
    skipped and the caller sees a shorter (possibly empty) list.
    """
    qa = []
    try:
        # assumes the text sits in the 3rd child of the 2nd child of
        # <div class="ask-txt"> — brittle, found by trial on the live site
        question = soup.find_all('div', 'ask-txt')[0].contents[1].contents[2].strip()
        qa.append(question)
    except IndexError:
        pass
    try:
        answer = soup.find_all('div', 'pgc-rich line q-content')[0].contents[1].contents[2].strip()
        qa.append(answer)
    except IndexError:
        pass
    return qa


# `with` guarantees both output files are flushed and closed even if the
# crawl dies mid-run (the original never closed them).
with open('docsSeen.txt', 'w') as docs_file, \
        open('quesSeen.txt', 'w', encoding='utf-8') as ques_file:
    for docNum in doc_num:
        seed_doc_url = 'http://muzhi.baidu.com/home/{}'.format(docNum)
        print('Downloading from doc:', docNum)
        # Record this doctor's ID as crawled.
        docs_file.write(docNum + '\n')
        # Fetch the doctor's home page.
        request = requests.get(seed_doc_url)
        html = request.text
        soup = BeautifulSoup(html, 'html.parser')
        # The doctor's numeric uid/cid are embedded in the 3rd inline
        # <script> block of the home page; pull them out with regexes.
        target = soup.find_all('script', type="text/javascript")[2].text
        uid_regex = r"'id':'(\d*)'"
        cid_regex = r"'cid1':'(\d*)'"
        uid = re.search(uid_regex, target).groups()[0]
        cid = re.search(cid_regex, target).groups()[0]  # NOTE(review): cid is extracted but never used below
        for i in range(76):
            questions_page = 'http://muzhi.baidu.com/doctor/list/answer?pn={0}&rn=10&uid={1}'.format(i * 10, uid)
            time.sleep(3)  # throttle to avoid the site blocking our IP
            question_list = requests.get(questions_page).json()['data']['list']
            for item in question_list:
                que_url = 'http://muzhi.baidu.com/question/{}'.format(item['qid'])
                print('Downloading from:', que_url, ' uid:', uid, ' page', i + 1)
                time.sleep(1)
                page = requests.get(que_url)
                # The question pages declare GB2312; forcing the encoding
                # avoids mojibake in the extracted Chinese text.
                page.encoding = 'GB2312'
                QandA = _extract_qa(BeautifulSoup(page.text, 'html.parser'))
                # An empty result usually means Baidu served a CAPTCHA
                # page.  Retry every 5 s until it is solved manually.
                # (Known limitation: loops forever if it never is.)
                while QandA == []:
                    print('Under control! Waiting...Waiting....')
                    time.sleep(5)
                    page = requests.get(que_url)
                    page.encoding = 'GB2312'
                    QandA = _extract_qa(BeautifulSoup(page.text, 'html.parser'))
                print(QandA)
                # One line per pair: "question|answer".
                strQandA = '|'.join(QandA)
                ques_file.write(strQandA + '\n')