1. 程式人生 > 其它 > Python3.8 爬取豆瓣電影TOP250 練手爬蟲

Python3.8 爬取豆瓣電影TOP250 練手爬蟲

 1 #!/usr/bin/env python
 2 # encoding=utf-8
 3 import requests
 4 import re
 5 import codecs
 6 from bs4 import BeautifulSoup
 7 from openpyxl import Workbook
 8 wb = Workbook()
 9 dest_filename = '電影.xlsx'
10 ws1 = wb.active
11 ws1.title = "電影top250"
12 
13 DOWNLOAD_URL = 'http://movie.douban.com/top250/'
14
def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request could block indefinitely.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or
            an HTTP error status (4xx/5xx).
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.content
# Matches the "N people rated" text inside the star block. Both the
# Traditional and the Simplified spelling are accepted.
# NOTE(review): the live site presumably serves Simplified Chinese, in which
# case the Traditional-only pattern never matched -- confirm against a page.
_RATING_COUNT_RE = re.compile('評價|评价')


def get_li(doc):
    """Parse one listing page into parallel per-movie lists.

    Args:
        doc: HTML of a listing page, e.g. the value returned by
            ``download_page``.

    Returns:
        A 6-tuple ``(name, star_con, score, info_list, desc_list, next_url)``.
        The five lists hold one entry per ``<li>`` found in
        ``<ol class="grid_view">``:

        * ``name`` -- text of the first ``span.title`` inside ``div.hd``
        * ``star_con`` -- text node in ``div.star`` matching the rating-count
          pattern, or ``None`` when nothing matches
        * ``score`` -- text of ``span.rating_num``
        * ``info_list`` -- text of ``span.inq`` (short quote), ``''`` if absent
        * ``desc_list`` -- text of the first ``<p>`` with an empty class

        ``next_url`` is ``DOWNLOAD_URL`` plus the href of the link inside
        ``span.next``, or ``None`` when there is no such link. If the page
        has no ``ol.grid_view`` at all, the lists are empty.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []       # movie titles
    star_con = []   # number-of-ratings text
    score = []      # rating values
    info_list = []  # short quotes
    desc_list = []  # description paragraphs

    # A page without the list (empty or unexpected HTML) yields no rows
    # instead of raising AttributeError on None.
    items = ol.find_all('li') if ol is not None else []
    for item in items:
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        level_star = item.find('span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(text=_RATING_COUNT_RE)
        info = item.find('span', attrs={'class': 'inq'})
        desc = item.find('p', attrs={'class': ''})

        desc_list.append(desc.get_text())
        # Not every movie has a short quote; keep the lists aligned.
        info_list.append(info.get_text() if info else '')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # The last page has a span.next without a link; also tolerate the span
    # being absent altogether.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    next_url = DOWNLOAD_URL + page['href'] if page is not None else None
    return name, star_con, score, info_list, desc_list, next_url


def main():
    """Crawl every listing page and save the collected rows to the workbook.

    Starts at ``DOWNLOAD_URL`` and follows the next-page URL returned by
    ``get_li`` until it is ``None``. Each movie becomes one worksheet row
    with columns A-E: name, rating count, score, short quote, description.
    The workbook is saved to ``dest_filename`` once, after all rows are set.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    desc = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, desc_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)
        desc.extend(desc_list)

    # Number rows by position, not by name.index(): a title lookup is O(n)
    # per row and maps duplicate titles onto the same row.
    records = zip(name, star_con, score, info, desc)
    for row, record in enumerate(records, start=1):
        for column, value in zip('ABCDE', record):
            ws1['%s%d' % (column, row)] = value
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()

用完 Python 寫爬蟲,再也不想用 PHP 寫了,方便太多了。PHP 只有無數的正則匹配,效率低,還寫得累……

滴水成冰,世間不存在毫無意義的付出,時間終會給你答案。