小菜鳥的第一個爬蟲:豆瓣爬取電影資訊
阿新 • • 發佈:2019-02-09
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import urllib.request
import re
import time
import os
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch the resource at ``url`` and return its raw body.

    Args:
        url: Address to open; handed unchanged to ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError``, a subclass, for HTTP error statuses).
    """
    # The context manager closes the underlying connection even when
    # read() raises, so the response is no longer leaked.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_movie_all(html):
    """Collect the per-movie container elements from a list page.

    Args:
        html: Markup of the page (``str`` or ``bytes``), as accepted by
            ``BeautifulSoup``.

    Returns:
        The result of ``find_all``: one entry for every ``<div>`` whose
        ``class`` attribute is ``bd doulist-subject``, in document order.
        Empty when the page contains no such element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # A class_ value containing a space is compared against the full
    # attribute string, so the two class names must appear in this order.
    movie_list = soup.find_all('div', class_='bd doulist-subject')
    return movie_list
def get_one_movie(movie):
    """Extract title, rating and abstract from one movie element.

    Args:
        movie: A single movie container (a tag returned by
            ``get_movie_all``, or anything whose ``str()`` is its markup).

    Returns:
        A list of strings laid out as
        ``[title string(s)..., rating, abstract]``:

        * the stripped strings found inside ``<div class="title">``
          (a single ``''`` when there is no title text),
        * the rating text, or ``''`` when no rating text is available,
        * the stripped strings of ``<div class="abstract">``, each one
          prefixed with a single space and concatenated (``''`` when the
          element is missing).
    """
    # Parse the fragment once; the tags found below are queried directly
    # instead of being re-serialised and re-parsed.
    soup_all = BeautifulSoup(str(movie), 'html.parser')

    # Title: keep every text fragment, but always provide at least one
    # entry so the rating and abstract keep their expected positions.
    title_tag = soup_all.find('div', class_='title')
    title_parts = []
    if title_tag is not None:
        title_parts = [str(line) for line in title_tag.stripped_strings]
    result = title_parts or ['']

    # Rating: prefer the dedicated rating span; fall back to the second
    # <span> of the fragment, which is what was used positionally before.
    rating_tag = soup_all.find('span', class_='rating_nums')
    if rating_tag is None:
        spans = soup_all.find_all('span')
        if len(spans) > 1:
            rating_tag = spans[1]
    # .string is None when the tag does not hold exactly one text node.
    rating = rating_tag.string if rating_tag is not None else None
    result.append(str(rating) if rating is not None else '')

    # Abstract: every fragment is preceded by one space, so a non-empty
    # abstract starts with a space.
    info_tag = soup_all.find('div', class_='abstract')
    if info_tag is not None:
        result_str = ''.join(' ' + line for line in info_tag.stripped_strings)
    else:
        result_str = ''
    result.append(result_str)

    return result
def save(text, file_name):
    """Append ``text`` to ``file_name`` as UTF-8 encoded bytes.

    The file is opened in binary append mode, so it is created when it
    does not exist, earlier content is kept, and newlines are written
    exactly as given.

    Args:
        text: The string to write.
        file_name: Path of the file to append to.
    """
    with open(file_name, 'ab') as f:
        # UTF-8 is the default of str.encode; it is spelled out here so
        # the on-disk encoding is visible at the call site.
        f.write(text.encode('utf-8'))
if __name__ == '__main__':
    # Script entry point: download one list page and append one line per
    # movie to movie.txt. Only the first page (start=0) is requested.
    url = 'https://www.douban.com/doulist/3516235/?start=0&sort=seq&sub_type='
    html = get_html(url)
    movie_list = get_movie_all(html)
    for movie in movie_list:
        result = get_one_movie(movie)
        # get_one_movie appends the title strings first, then the rating,
        # then the abstract. Taking the last two entries from the end keeps
        # them correct even when the title produced several strings.
        *title_parts, rating, abstract = result
        # The rating may be None when its tag has no single text node.
        rating = rating or ''
        text = ('電影名:' + ' '.join(title_parts) + ' '
                + '評分:' + rating + ' ' + abstract + '\n')
        save(text, 'movie.txt')
目前只爬取了第一頁的內容,程式碼參考了一位大神的寫法。
畢竟小白開始學習是要從模仿開始的嘛~~弄懂思路之後,又自己動手敲了一遍。
慢慢來吧,相信自己不是廢物┭┮﹏┭┮
相關待看
豆瓣電影TOP250爬取