
Web Crawler: Question-Answer Pair Practice

I've been tinkering with a web crawler for a while, using the requests, re, and BeautifulSoup packages. I'm setting the project aside for a bit, and since I'm afraid of forgetting the details, I'm writing them down here.

Given how the mu zhi doctor site (Baidu Muzhi) is laid out, a single doctor's ID is enough to crawl every question-answer pair belonging to that doctor. So the plan is: first grab all the doctor IDs and save them to a file, then crawl doctor by doctor from that list. One thing to watch out for: the Q&A listings are loaded dynamically with JavaScript, so the crawler has to call the site's JSON endpoints rather than scrape static HTML. The full script is below.
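But first: when a page builds its listings with JavaScript, the data usually comes from a JSON endpoint that can be called directly, and it is worth probing that endpoint by hand once to confirm the response shape. Here is a minimal sketch, assuming the doctor-list endpoint still returns a payload whose data.list entries carry the uid, realname, and company fields the script below relies on:

import requests

# One page of the doctor list: pn is the page offset, rn the page size, cid1 the category.
url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn=0&rn=5&cid1=127'
payload = requests.get(url).json()

# Print the fields the crawler depends on; a KeyError here means the API changed.
for doc in payload['data']['list']:
    print(doc['uid'], doc['realname'], doc['company'])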

import requests
import re
import time
from bs4 import BeautifulSoup

doc_num = []
"""
Save_docs = open('SaveDoc.txt','w')
for i in range(222):
    url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn={}&rn=5&cid1=127'.format(i)
    request = requests.get(url).json()
    for item in request['data']['list']:
        Save_docs.write(item['uid']+'\n')
        doc_num.append(item['uid'])
        print('Get doc:',item['realname'],'  Company:',item['company'],'   uid:',item['uid'])
Save_docs.close()
"""
# The first run of the block above crawls all the doctor IDs into SaveDoc.txt;
# subsequent runs just read the saved IDs back.
Save_docs = open('SaveDoc.txt', 'r')
doc_num = [line.rstrip() for line in Save_docs]
Save_docs.close()

docs_file = open('docsSeen.txt', 'w')
ques_file = open('quesSeen.txt', 'w', encoding='utf-8')

for docNum in doc_num:
    seed_doc_url = 'http://muzhi.baidu.com/home/{}'.format(docNum)
    print('Downloading from doc:', docNum)
    docs_file.write(docNum + '\n')  # record the IDs of doctors already crawled
    request = requests.get(seed_doc_url)  # fetch the doctor's home page
    # request.encoding = 'GB2312'  # uncomment if Chinese text comes back garbled
    html = request.text
    soup = BeautifulSoup(html, 'html.parser')  # parse the page HTML with BeautifulSoup
    target = soup.find_all('script', type="text/javascript")[2].text
    uid_regex = r"'id':'(\d*)'"
    cid_regex = r"'cid1':'(\d*)'"
    # Regular expressions (Python's re module) make extracting the IDs easy.
    uid = re.search(uid_regex, target).groups()[0]
    cid = re.search(cid_regex, target).groups()[0]
    for i in range(76):
        questions_page = 'http://muzhi.baidu.com/doctor/list/answer?pn={0}&rn=10&uid={1}'.format(i * 10, uid)
        time.sleep(3)  # throttle requests to avoid an IP ban
        # The JSON structure below was worked out by experimenting in the shell.
        answer_list = requests.get(questions_page).json()['data']['list']
        for item in answer_list:
            que_url = 'http://muzhi.baidu.com/question/{}'.format(item['qid'])
            print('Downloading from:', que_url, ' uid:', uid, ' page', i + 1)
            QandA = []
            time.sleep(1)
            request = requests.get(que_url)
            request.encoding = 'GB2312'
            html = request.text
            soup = BeautifulSoup(html, 'html.parser')
            try:
                question = soup.find_all('div', 'ask-txt')[0].contents[1].contents[2].strip()
                QandA.append(question)
            except IndexError:
                pass
            try:
                answer = soup.find_all('div', 'pgc-rich line q-content')[0].contents[1].contents[2].strip()
                QandA.append(answer)
            except IndexError:
                pass
            while QandA == []:
                # Unsolved problem: after a few requests Baidu starts serving a CAPTCHA,
                # which blocks the crawler. This loop retries every 5 seconds until the
                # CAPTCHA is resolved (clearly a serious limitation).
                print('Under control! Waiting...Waiting....')
                time.sleep(5)
                request = requests.get(que_url)
                request.encoding = 'GB2312'
                html = request.text
                soup = BeautifulSoup(html, 'html.parser')
                try:
                    question = soup.find_all('div', 'ask-txt')[0].contents[1].contents[2].strip()
                    QandA.append(question)
                except IndexError:
                    pass
                try:
                    answer = soup.find_all('div', 'pgc-rich line q-content')[0].contents[1].contents[2].strip()
                    QandA.append(answer)
                except IndexError:
                    pass
            print(QandA)
            strQandA = '|'.join(QandA)
            ques_file.write(strQandA + '\n')

docs_file.close()  # close both files so the buffered records actually get flushed
ques_file.close()
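The retry loop above just waits and hopes the CAPTCHA goes away, which the comments admit is a serious limitation. A softer mitigation, sketched below, is to funnel every request through a shared session with a browser-like User-Agent and exponential backoff between failed attempts. This is only an idea for improvement, not part of the original script, and it still cannot solve a CAPTCHA once one is served; the fetch helper, its header value, and the retry parameters are all assumptions.

import time
import requests

session = requests.Session()
# Assumed header value; a browser-like User-Agent can delay rate-limiting.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

def fetch(url, retries=5, base_delay=3):
    # Hypothetical helper: GET with exponential backoff (3s, 6s, 12s, ...).
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            time.sleep(base_delay * 2 ** attempt)
    raise RuntimeError('Giving up on {} after {} attempts'.format(url, retries))

# Drop-in usage in place of requests.get in the crawler above:
# response = fetch(que_url)
# response.encoding = 'GB2312'

Another easy win: since docsSeen.txt already records every doctor that has been processed, reading it back at startup and skipping those IDs would let an interrupted run resume where it left off instead of starting over.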