豆瓣top250爬取
阿新 • • 發佈:2021-11-02
一、爬取電影名字
"""Print the title of every movie on the first page of Douban's Top 250."""
import re

URL = "https://movie.douban.com/top250"

# Douban rejects the default python-requests User-Agent, so send a browser one.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
    ),
}

# Each movie is one <li>; its first <span class="title"> holds the main
# title.  re.S lets ".*?" cross line breaks inside the <li>.
TITLE_PATTERN = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>',
    re.S,
)


def fetch_page(url: str = URL, headers: dict = HEADERS, timeout: float = 10) -> str:
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    # Imported here so the parsing helper stays usable (and testable)
    # on machines where the third-party ``requests`` package is missing.
    import requests

    # The context manager closes the connection even if an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as resp:
        # Fail loudly instead of silently parsing an error page.
        resp.raise_for_status()
        return resp.text


def parse_titles(page_content: str) -> list:
    """Return the main title of every movie found in *page_content*."""
    return [match.group("name") for match in TITLE_PATTERN.finditer(page_content)]


def main() -> None:
    """Fetch the first Top 250 page and print one title per line."""
    for name in parse_titles(fetch_page()):
        print(name)


if __name__ == "__main__":
    main()
二、爬取年份
"""Print the title and release year of each movie on Douban's Top 250 page."""
import re

URL = "https://movie.douban.com/top250"

# Douban rejects the default python-requests User-Agent, so send a browser one.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
    ),
}

# The pattern is split over several adjacent raw strings for readability.
# The year is the text between the <br> inside <p class=""> and the first
# "&nbsp;" entity that follows it.  The terminator must be the literal
# entity: a plain space would end the lazy group right after the newline
# that follows <br>, leaving the year empty.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp;',
    re.S,
)


def fetch_page(url: str = URL, headers: dict = HEADERS, timeout: float = 10) -> str:
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    # Imported here so the parsing helper stays usable (and testable)
    # on machines where the third-party ``requests`` package is missing.
    import requests

    # The context manager closes the connection even if an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as resp:
        # Fail loudly instead of silently parsing an error page.
        resp.raise_for_status()
        return resp.text


def parse_movies(page_content: str) -> list:
    """Return a list of ``(name, year)`` tuples found in *page_content*.

    The captured year is surrounded by newlines and indentation in the
    page markup, so it is stripped before being returned.
    """
    return [
        (match.group("name"), match.group("year").strip())
        for match in MOVIE_PATTERN.finditer(page_content)
    ]


def main() -> None:
    """Fetch the first Top 250 page and print each title, then its year."""
    for name, year in parse_movies(fetch_page()):
        print(name)
        print(year)


if __name__ == "__main__":
    main()
三、將資料存入csv中
"""Scrape the first page of Douban's Top 250 and save it to ``data.csv``."""
import csv
import re

URL = "https://movie.douban.com/top250"

# Douban rejects the default python-requests User-Agent, so send a browser one.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
    ),
}

# One match per movie <li>: title, release year, average score and the
# number of ratings.  Two literals must match the page byte-for-byte:
#   * "&nbsp;"  - the entity that follows the year; a plain space would
#                 end the lazy group early and leave the year empty.
#   * "人评价"   - Douban serves Simplified Chinese, so the Traditional
#                 form of this suffix never occurs in the markup.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp;.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def fetch_page(url: str = URL, headers: dict = HEADERS, timeout: float = 10) -> str:
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    # Imported here so the parsing/writing helpers stay usable (and
    # testable) where the third-party ``requests`` package is missing.
    import requests

    # The context manager closes the connection even if an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as resp:
        # Fail loudly instead of silently writing an empty CSV.
        resp.raise_for_status()
        return resp.text


def parse_movies(page_content: str) -> list:
    """Return one dict per movie with keys ``name, year, score, num``.

    The captured year is surrounded by newlines and indentation in the
    page markup, so it is stripped before being returned.
    """
    movies = []
    for match in MOVIE_PATTERN.finditer(page_content):
        movie = match.groupdict()
        movie["year"] = movie["year"].strip()
        movies.append(movie)
    return movies


def write_rows(movies, csv_file) -> None:
    """Write each movie dict as one CSV row to the open text file *csv_file*.

    Columns follow the dict order produced by ``parse_movies``:
    name, year, score, num.
    """
    csv.writer(csv_file).writerows(movie.values() for movie in movies)


def main(path: str = "data.csv") -> None:
    """Fetch the first Top 250 page and write its movies to *path*."""
    movies = parse_movies(fetch_page())
    # newline="" stops the csv module's "\r\n" from being translated into
    # blank rows on Windows; an explicit encoding keeps Chinese titles
    # writable regardless of the locale.  "with" guarantees a flush/close.
    with open(path, mode="w", newline="", encoding="utf-8") as csv_file:
        write_rows(movies, csv_file)


if __name__ == "__main__":
    main()