Python3 黑板客爬蟲闖關第一關
阿新 • • 發佈:2018-12-14
# coding=utf-8
"""Crawler for level 1 of the heibanke crawler challenge.

Starting from START_URL, each page is fetched, the first run of digits in
its first <h3> element is extracted and appended to START_URL to form the
next page to request.  The walk stops when a page has no such number (or
when a request fails).
"""
import re

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

START_URL = "http://www.heibanke.com/lesson/crawler_ex00/"

# Seconds to wait for the server before giving up on a single request;
# without a timeout requests.get() can block forever on a stalled server.
REQUEST_TIMEOUT = 10

# Compiled once because it is applied to every page in the loop.
_NUMBER_RE = re.compile(r'\d+')


def getHtml(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        including a timeout).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _next_number(html):
    """Return the first run of digits in the page's first <h3>, or None."""
    soup = BeautifulSoup(html, "lxml")
    heading = soup.select_one('h3')
    if heading is None:
        # No <h3> at all: treat it the same as "no number on this page".
        return None
    match = _NUMBER_RE.search(heading.text)
    return match.group() if match else None


def main():
    """Follow the chain of numbered pages until no further number is found."""
    real_url = START_URL
    while True:
        print("當前請求頁面:{}".format(real_url))
        html = getHtml(real_url)
        if html is None:
            # getHtml signals failure with None; stop instead of handing
            # None to BeautifulSoup, which would raise.
            print("請求失敗:{}".format(real_url))
            break
        number = _next_number(html)
        if number is None:
            break
        # START_URL ends with "/", so plain concatenation yields a valid URL.
        real_url = START_URL + number


if __name__ == '__main__':
    main()
考察點:URL 的拼接,以及 BeautifulSoup 庫與正則表示式(re)庫的使用。