爬取豆瓣TOP250電影
阿新 • • 發佈:2021-07-28
自己跟著視訊學習的第一個爬蟲小程式,裡面有許多不太清楚的地方,比如怎麼找到具體的電影名字的?那麼多級關係,怎麼一下就找到了那個div呢?
諸如此類的問題還有許多,不過先做起來再說吧,後續再去弄懂。
# Scrape the Douban Top 250 movie list and write one line per movie
# (title, rating, details) to a UTF-8 text file.

import re

import bs4
import requests

# Browser-like User-Agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'

# Seconds to wait on the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
DEFAULT_TIMEOUT = 10


def open_url(url, proxies=None, timeout=DEFAULT_TIMEOUT):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        proxies: Optional mapping passed to ``requests.get`` as ``proxies``,
            e.g. ``{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}``.
            ``None`` (the default) sends the request without this mapping.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``requests.Response`` for the GET request. The status code is
        not checked here.
    """
    headers = {'user-agent': USER_AGENT}
    return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)


def find_movies(res):
    """Extract one formatted text line per movie from a list page.

    Titles, ratings and detail strings are collected with three separate
    selectors and then paired by position.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of strings, each the concatenation of title, rating label
        and details, terminated by a newline.

    Raises:
        IndexError: If fewer ratings or detail strings than titles were
            found, since entries are paired by index.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Movie titles: the first <span> inside the link of each div.hd.
    movies = [block.a.span.text for block in soup.find_all('div', class_="hd")]

    # Ratings, prefixed with a label.
    ranks = [
        '評分:{}'.format(tag.text)
        for tag in soup.find_all('span', class_='rating_num')
    ]

    # Details: the second and third text lines of the first <p> in each div.bd.
    messages = []
    for block in soup.find_all("div", class_='bd'):
        try:
            lines = block.p.text.split('\n')
            messages.append(lines[1].strip() + lines[2].strip())
        except (AttributeError, IndexError):
            # This div.bd has no <p> or too few lines of text, so it is
            # skipped rather than treated as a movie entry.
            continue

    # Pair by index so a count mismatch surfaces as IndexError instead of
    # being silently truncated.
    return [
        title + ranks[i] + messages[i] + '\n'
        for i, title in enumerate(movies)
    ]


def find_depth(res):
    """Return the total number of list pages as an integer.

    The value is read from the element two siblings before the
    ``<span class="next">`` element.
    NOTE(review): the first ``previous_sibling`` is presumably a whitespace
    text node between tags -- confirm against the live page markup.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Raises:
        AttributeError: If the ``span.next`` element or its siblings are
            missing from the page.
        ValueError: If the located text is not an integer.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_link = soup.find('span', class_='next')
    last_page = next_link.previous_sibling.previous_sibling
    return int(last_page.text)


def main():
    """Download every list page and save all movie lines to a text file."""
    host = "https://movie.douban.com/top250"
    depth = find_depth(open_url(host))

    result = []
    for page in range(depth):
        # Each page lists 25 movies; ``start`` is the offset of the first one.
        url = host + '/?start=' + str(25 * page)
        result.extend(find_movies(open_url(url)))

    with open("豆瓣TOP250電影.txt", "w", encoding='utf-8') as f:
        f.writelines(result)


if __name__ == "__main__":
    main()