python爬蟲練習1:豆瓣電影TOP250
阿新 • • 發佈:2017-07-28
import ria fff python top font beautiful code pen
項目1:實現豆瓣電影TOP250標題爬取:
1 from urllib.request import urlopen 2 from bs4 import BeautifulSoup 3 import re 4 5 class doubanSpider(): 6 def __init__(self): 7 """ 8 初始化, 9 頁碼,URL,存儲數據, 10 """ 11 self.page = 0; 12 # "http://movie.douban.com/top250?start=25&filter=&type=" 第二頁13 # 第一頁 14 self.cur_url = "http://movie.douban.com/top250?start=0&filter=&type=" 15 self.datas = [] 16 17 def claw(self): 18 while self.page<10: 19 self.downloadURL() 20 self.updateURL() 21 self.output() 22 23 def updateURL(self):24 self.page+=1 25 self.cur_url.replace("start=0","start="+str(self.page*25)) 26 27 def downloadURL(self): 28 html = urlopen(self.cur_url) 29 bsObj = BeautifulSoup(html,"html.parser") 30 datas = bsObj.findAll("span", {"class": "title"}) 31 for data indatas: 32 str = data.get_text() 33 if "\xa0/\xa0" not in str: 34 self.datas.append(str) 35 36 def output(self): 37 num = 1 38 for data in self.datas: 39 print("TOP"+str(num)+": " +data) 40 num+=1 41 42 if __name__ == "__main__": 43 print("豆瓣電影TOP250:python抓取") 44 myspider = doubanSpider() 45 myspider.claw()
python爬蟲練習1:豆瓣電影TOP250