python 爬蟲(五)爬取多頁內容
阿新 • • 發佈:2019-01-03
import re
import ssl
import urllib.request

# URL of one listing page; the page number is filled in with str.format.
PAGE_URL = "https://www.qiushibaike.com/text/page/{}/"

# One post: everything between the article container's class attribute and
# the start of its comment-count element. Compiled once, not per iteration.
ARTICLE_RE = re.compile(
    r'''article block untagged mb15(.*?)class="stats-comments''', re.S
)
# Kept deliberately loose so small markup differences still match.
CONTENT_RE = re.compile(r'''class="content".*?<span>(.*?)</span>''', re.S)
NAME_RE = re.compile(r'''<h2>(.*?)</h2>''', re.S)


def ajaxCrawler(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser User-Agent header.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # NOTE(security): this context performs no TLS certificate verification.
    # That is the original script's deliberate choice; do not reuse it for
    # anything that handles sensitive data.
    context = ssl._create_unverified_context()
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req, context=context) as response:
        return response.read().decode("utf-8")


def parse_posts(html):
    """Extract ``{user name: post text}`` from one listing page's HTML.

    Article chunks in which the user name or the content cannot be found
    are skipped instead of aborting the whole page. Captured text is
    returned exactly as it appears in the markup (no stripping). When the
    same user name occurs more than once, the later post replaces the
    earlier one.
    """
    posts = {}
    for chunk in ARTICLE_RE.findall(html):
        name_match = NAME_RE.search(chunk)
        content_match = CONTENT_RE.search(chunk)
        if name_match is None or content_match is None:
            continue
        posts[name_match.group(1)] = content_match.group(1)
    return posts


def main(pages=(1,)):
    """Crawl each page number in *pages* and print every post found.

    The default crawls only page 1; pass e.g. ``range(1, 6)`` to crawl
    pages 1 through 5.
    """
    posts = {}
    for page in pages:
        posts.update(parse_posts(ajaxCrawler(PAGE_URL.format(page))))
    for name, text in posts.items():
        print(name+":說"+text)


if __name__ == "__main__":
    main()