1. 程式人生 > 其它 > 豆瓣top250爬取

豆瓣top250爬取

一、電影名字爬取

import requests
import re

# Script: download the Douban Top 250 landing page and print the title
# captured for each <li> entry found in the HTML.

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

url = "https://movie.douban.com/top250"

# Capture the text of the first <span class="title"> following each <li>.
# re.S lets "." match newlines, so the lazy ".*?" can span several lines.
obj = re.compile(r'<li>.*?<span class="title">(?P<name>.*?)</span>', re.S)

# Fetch the HTML page. The timeout prevents the script from hanging forever
# on a stalled connection, and the with-block closes the response even when
# an exception is raised.
with requests.get(url, headers=headers, timeout=10) as resp:
    # Fail loudly on an HTTP error status instead of silently parsing an
    # error page and printing nothing.
    resp.raise_for_status()
    page_content = resp.text

# Print every captured title, one per line.
for match in obj.finditer(page_content):
    print(match.group("name"))

二、爬取年份

import requests
import re

# Script: download the Douban Top 250 landing page and print, for each
# matched entry, the title followed by the year text.

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

url = "https://movie.douban.com/top250"

# The pattern is split across adjacent raw-string literals (Python joins
# them at compile time) because it does not fit comfortably on one line.
# re.S lets "." match newlines so the lazy gaps can span several lines.
obj = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp',
    re.S,
)

# Fetch the HTML page. The timeout prevents the script from hanging forever
# on a stalled connection, and the with-block closes the response even when
# an exception is raised.
with requests.get(url, headers=headers, timeout=10) as resp:
    # Fail loudly on an HTTP error status instead of silently parsing an
    # error page and printing nothing.
    resp.raise_for_status()
    page_content = resp.text

for match in obj.finditer(page_content):
    print(match.group("name"))
    # strip() removes the whitespace surrounding the captured year text.
    print(match.group("year").strip())

三、將資料存入 CSV 檔案中

import requests
import re
import csv

# Script: download the Douban Top 250 landing page, extract title, year,
# score and vote count for each matched entry, and write them to data.csv
# (one row per entry, columns in that order).

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

url = "https://movie.douban.com/top250"

# The pattern is split across adjacent raw-string literals for readability.
# re.S lets "." match newlines so the lazy gaps can span several lines.
# The vote-count suffix accepts both the Simplified ("人评价") and the
# Traditional ("人評價") spelling: the original pattern only had the
# Traditional form, while the live page is presumably served in Simplified
# Chinese (TODO confirm against the actual HTML).
obj = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人(?:评价|評價)</span>',
    re.S,
)

# Fetch the HTML page. The timeout prevents the script from hanging forever
# on a stalled connection, and the with-block closes the response even when
# an exception is raised.
with requests.get(url, headers=headers, timeout=10) as resp:
    # Fail loudly on an HTTP error status instead of silently writing an
    # empty CSV file.
    resp.raise_for_status()
    page_content = resp.text

# newline="" is what the csv module requires of files it writes to (without
# it, extra blank rows appear on Windows). An explicit UTF-8 encoding keeps
# Chinese titles intact regardless of the platform's default encoding. The
# with-block guarantees the file is flushed and closed.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)

    for match in obj.finditer(page_content):
        # Named groups as a dict, in pattern order: name, year, score, num.
        dic = match.groupdict()

        # The captured year text is padded with whitespace; trim it.
        dic['year'] = dic['year'].strip()

        # Write one CSV row per matched entry.
        csvwriter.writerow(dic.values())