Python-爬蟲-基本庫(requests)使用-抓取貓眼電影Top100榜
阿新 • • 發佈:2018-12-28
spa spi fire tools not agen ext get pytho
# Scrape the Maoyan movie board list at https://maoyan.com/board/4
import requests
import re
from requests.auth import HTTPBasicAuth


# Spider helper class
class SpiderTools:
    """Download one page of the Maoyan board and extract its movie entries."""

    # Captures, per <dd> entry: rank, title, poster URL, cast line, release line.
    # re.S lets ".*?" span the newlines between the tags of one entry.
    # NOTE(review): the pattern expects the <p class="name"> element to appear
    # before the <img data-src=...> element inside each <dd>; verify against
    # the live markup, otherwise fields of adjacent entries may be combined.
    _ITEM_PATTERN = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?<p class="name".*?><a.*?>(.*?)</a>.*?<img data-src="(.*?)".*?>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        super().__init__()
        # Request headers sent with every page download.
        self.headers = {
            'Host': 'maoyan.com',
            # Media types must not contain spaces around "/" or "+".
            'Accept': 'text/html, application/xhtml+xml, */*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        }

    def load_onePage(self, url, timeout=10):
        """Fetch *url* and return the movie entries found in the response.

        Args:
            url: Address of the board page to download.
            timeout: Seconds to wait for the server before requests raises a
                timeout error; without it a stalled connection blocks forever.

        Returns:
            A list of dicts with the keys "order", "name", "imageURL",
            "auth" and "time", one per matched entry. The list is empty when
            nothing in the response body matches the pattern.
        """
        res = requests.get(url, headers=self.headers, timeout=timeout)
        return self._parse_items(res.text)

    @classmethod
    def _parse_items(cls, html):
        """Turn the HTML text of one board page into a list of entry dicts."""
        return [
            {
                "order": order,
                "name": name,
                "imageURL": image_url,
                # strip() removes the newlines and spaces around the cast text
                "auth": star.strip(),
                "time": release_time,
            }
            for order, name, image_url, star, release_time
            in cls._ITEM_PATTERN.findall(html)
        ]


if __name__ == "__main__":
    spider = SpiderTools()
    offset = 0
    while True:
        items = spider.load_onePage('https://maoyan.com/board/4?offset=%d' % offset)
        # The offset advances by 10 per request; an empty page ends the crawl.
        offset += 10
        if not items:
            break
        print(offset, items)
Python-爬蟲-基本庫(requests)使用-抓取貓眼電影Top100榜