Scraping Qiushibaike (糗事百科) with bs4
阿新 • Published: 2018-09-10
This script scrapes Qiushibaike posts and their comments; it does not fetch image information. Fill the User-Agent field with your own browser's value. In the 360 Speed Browser, for example, type about:version into the address bar and press Enter; the long string after "用户代理" (User Agent) is exactly what goes between the quotes. For other browsers, a quick Baidu search will show where to find it.
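For example, once the value is filled in, the request header might look like this (a minimal sketch; the user-agent string below is only an illustrative placeholder copied from a desktop Chrome-style browser, so substitute the one your own browser reports):

from urllib import request

headers = {
    # Placeholder value; replace with the string shown by about:version in your browser
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
req = request.Request(url='https://www.qiushibaike.com/8hr/page/1/', headers=headers)
html = request.urlopen(req).read().decode('utf-8')

The complete script follows below.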
import re
from urllib import request
from bs4 import BeautifulSoup

# 1. Fetch the page source
def get_html(url):
    headers = {
        'User-Agent': '',  # paste your browser's user-agent string here
    }
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req)
    content = response.read().decode('utf-8')
    return content

# Build the comment link for each article on the list page
def get_comment_link(content, comment_url_base):
    soup = BeautifulSoup(content, 'html.parser')
    articleFloor = 1
    for article in soup.find_all('div', class_=re.compile(r"article block untagged mb15")):
        article_id = str(article.get('id')).strip().split("_")[2]
        comment_url = comment_url_base % article_id  # comment page link
        get_comment_content(comment_url, articleFloor)  # fetch the post text and its comments
        articleFloor += 1

# Fetch the post text and its comments
def get_comment_content(comment_url, articleFloor):
    commentPage = get_html(comment_url)
    commentFloor = 1
    soupComment = BeautifulSoup(commentPage, 'html.parser')
    for item in soupComment.find_all('div', class_='content'):
        print(articleFloor, ".", item.get_text().strip())  # post text
    for comment in soupComment.find_all(class_='body'):
        print("   ", commentFloor, "floor reply:", comment.get_text())  # comment text
        commentFloor += 1

def command():
    while True:
        raw = input("Press Enter to view, or type exit to quit. Your choice: ")
        if raw == '' or raw == 'enter':  # accept a bare Enter as well as the word 'enter'
            main()
            break
        else:
            break

def main():
    article_url_base = 'https://www.qiushibaike.com/8hr/page/%d/'  # article list page
    comment_url_base = 'https://www.qiushibaike.com/article/%s'    # article/comment page
    article_url = article_url_base % 2
    content = get_html(article_url)
    get_comment_link(content, comment_url_base)

if __name__ == '__main__':
    command()
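Note that main() above only pulls page 2 of the article list. If you want more, a small variation (a hypothetical helper, not part of the original script) could walk several pages in turn using the same functions:

def main_all_pages(last_page=5):  # hypothetical helper; page count is an assumption
    article_url_base = 'https://www.qiushibaike.com/8hr/page/%d/'
    comment_url_base = 'https://www.qiushibaike.com/article/%s'
    for page in range(1, last_page + 1):
        content = get_html(article_url_base % page)
        get_comment_link(content, comment_url_base)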