網路爬蟲:百度百科
阿新 • 發佈:2019-02-13
爬百度百科的詞條
import urllib.request
import re
from bs4 import BeautifulSoup
def main():
    """Fetch a fixed Baidu Baike entry page and print every link whose
    href contains "view", as "<anchor text> --> <absolute url>"."""
    url = "http://baike.baidu.com/view/284853.htm"
    req = urllib.request.Request(url)
    # Context manager guarantees the HTTP response is closed, even on error.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")  # Python's built-in parser
    for each in soup.find_all(href=re.compile("view")):
        # join() is used instead of + because it is generally faster for
        # assembling strings.
        print(each.text, "-->", "".join(["http://baike.baidu.com", each["href"]]))


if __name__ == "__main__":
    main()
深入:使用者希望輸入任意詞條
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
def main():
    """Prompt the user for a keyword, run a Baidu Baike search for it, and
    print every result link whose href contains "view"."""
    word = input("請輸入檢索的關鍵詞:")
    # URL-encode the keyword so non-ASCII input forms a valid query string.
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    # Context manager guarantees the HTTP response is closed, even on error.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    for each in soup.find_all(href=re.compile("view")):
        print(each.text, "-->", "".join(["http://baike.baidu.com", each["href"]]))


if __name__ == "__main__":
    main()
深入:新增副標題
使用者輸入搜尋的關鍵詞,然後爬蟲進入每一條詞條,然後檢查是否有副標題,如果有,將副標題一併列印。
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
def main():
    """Search Baidu Baike for a user-supplied keyword; for each result link,
    fetch the linked entry page and print its title, subtitle (if the page
    has an <h2>), and URL on one line."""
    word = input("請輸入檢索的關鍵詞:")
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    # Context manager guarantees the HTTP response is closed, even on error.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    for each in soup.find_all(href=re.compile("view")):
        content = each.text
        url2 = "".join(["http://baike.baidu.com", each["href"]])
        req2 = urllib.request.Request(url2)
        # Close each per-link response promptly to avoid leaking sockets.
        with urllib.request.urlopen(req2) as response2:
            html2 = response2.read().decode("utf-8")
        soup2 = BeautifulSoup(html2, "html.parser")
        if soup2.h2:
            # Append the subtitle when the entry page provides one.
            content = "".join([content, soup2.h2.text])
        content = "".join([content, "-->", url2])
        print(content)


if __name__ == "__main__":
    main()
深入:先列印10條連結
進一步深入,我們先列印10條連結,然後問使用者“您往下看嗎?”
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
def test_url(soup):
result=soup.find(text=re.compile("百度百科尚未收錄詞條"))
if result:
print(result[0:-1])
#百度這個bitch在最後加了一個“符號,去掉 --{百度百科尚未收錄詞條 “}
return False
else:
return True
def summary(soup):
    """Print the entry's title (plus subtitle, if any) and its abstract.

    soup: BeautifulSoup of a Baidu Baike entry page.
    Prints "None summary" when the page has no abstract block.
    """
    # The title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>.
    # Guard each lookup: the original chained .find() calls raised
    # AttributeError when the container was missing.
    title_container = soup.find("dd", class_="lemmaWgt-lemmaTitle-title")
    title_node = title_container.find("h1") if title_container else None
    title = title_node.get_text() if title_node else ""
    if soup.h2:
        # Include the subtitle when the page provides one.
        title += soup.h2.text
    print(title)
    # The abstract sits in <div class="lemma-summary">; use a placeholder
    # when absent. (Local renamed from "summary" so it no longer shadows
    # this function's own name.)
    summary_node = soup.find('div', class_="lemma-summary")
    if summary_node is None:
        abstract = "None summary"
    else:
        abstract = summary_node.get_text()
    print(abstract)
def get_urls(soup):
    """Yield one formatted line per result link: title, optional subtitle,
    "-->", and the absolute URL.

    Each yielded item fetches the linked entry page (one HTTP request per
    link) to check whether it carries an <h2> subtitle.
    """
    for link in soup.find_all(href=re.compile("view")):
        line = "".join([link.text])
        entry_url = "".join(["http://baike.baidu.com", link["href"]])
        entry_req = urllib.request.Request(entry_url)
        entry_resp = urllib.request.urlopen(entry_req)
        entry_html = entry_resp.read().decode("utf-8")
        entry_soup = BeautifulSoup(entry_html, "html.parser")
        if entry_soup.h2:
            line = "".join([line, entry_soup.h2.text])
        yield "".join([line, "-->", entry_url])
def main():
    """Interactive entry point: search Baidu Baike for a keyword, print the
    entry's summary, then print related links ten at a time until the user
    enters "q" or the links run out."""
    word = input("請輸入檢索的關鍵詞:")
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    # Context manager guarantees the HTTP response is closed, even on error.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    if test_url(soup):
        summary(soup)
        print("下邊列印相關連結:")
        links = get_urls(soup)
        while True:
            try:
                # Print links in batches of ten.
                for _ in range(10):
                    print(next(links))
            except StopIteration:
                # Generator exhausted: no more links to show.
                break
            command = input("請輸入任意字元繼續列印,q退出程式:")
            if command == "q":
                break


if __name__ == "__main__":
    main()
結果為: