web爬蟲01-單執行緒爬蟲
阿新 • • 發佈:2021-10-13
單執行緒爬蟲
目標網站:https://movie.douban.com/top250 豆瓣top250
程式碼:
import codecs
import functools
import time

import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'https://movie.douban.com/top250'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


# Timing decorator
def timer(func):
    """Wrap *func* so that each call prints and returns its duration.

    The wrapped function's own return value is discarded; the wrapper
    returns the elapsed time in seconds instead.
    """
    @functools.wraps(func)
    def inner(*args, **kw):
        # perf_counter() is monotonic, so the measurement cannot be skewed
        # by system clock adjustments during the run.
        start = time.perf_counter()
        func(*args, **kw)
        elapsed = time.perf_counter() - start
        print("-------一共花費時間:{}秒".format(elapsed))
        return elapsed
    return inner


# Download the HTML
def download_page(url):
    """Fetch *url* with a browser-like User-Agent and return the body bytes.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response status.
    """
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an error page instead of handing it to the parser.
    response.raise_for_status()
    return response.content


# Extract the required information
def parse_html(html):
    """Extract movie data and the next-page URL from one listing page.

    Args:
        html: Markup of a listing page, as returned by download_page().

    Returns:
        A ``(movies, next_url)`` tuple. ``movies`` is a flat list holding,
        for every ``<li>`` of the ``ol.grid_view`` list, three consecutive
        strings: title, info paragraph and rating. ``next_url`` is the URL
        of the following page, or ``None`` when there is no next link.

    Raises:
        ValueError: if the page contains no ``ol.grid_view`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Movie list
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        raise ValueError(
            "movie list (<ol class='grid_view'>) not found in page"
        )

    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        movie_name = movie_li.find('span', attrs={'class': 'title'}).get_text()
        movie_info = movie_li.find('div', attrs={'class': 'bd'}).find('p').get_text()
        movie_star = movie_li.find('span', attrs={'class': 'rating_num'}).get_text()
        movie_name_list.extend((movie_name, movie_info, movie_star))

    # Next-page link; either the <span class="next"> or the <a> inside it
    # may be absent, which is treated as "no further page".
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    next_href = next_page.get('href') if next_page is not None else None
    if next_href:
        # NOTE(review): plain concatenation presumes href is a relative
        # query string (e.g. '?start=25') - confirm against the live page.
        return movie_name_list, DOWNLOAD_URL + next_href
    return movie_name_list, None


@timer
def main():
    """Crawl every listing page from DOWNLOAD_URL and save the results.

    Pages are fetched one after another by following the next-page link;
    the strings extracted from each page are written, one per line, to the
    UTF-8 file ``movies`` in the current working directory.
    """
    url = DOWNLOAD_URL
    with codecs.open('movies', 'wb', encoding='utf-8') as f:
        while url is not None:
            html = download_page(url)
            movies, url = parse_html(html)
            f.write('\n'.join(movies) + '\n')


if __name__ == '__main__':
    main()

# Reference: https://www.jianshu.com/p/8a460be5a26e