python 爬蟲爬取豆瓣Top250榜單
阿新 • • 發佈:2021-11-09
python 爬蟲爬取豆瓣Top250榜單
這是一個小作業。
requests 模組
使用 requests.get(url) 可以爬取一個網址的資訊
# 構造合理的HTTP請求頭, 偽裝成瀏覽器, 繞過反爬蟲機制,否則會被反爬蟲機制拒絕(418)。 https://www.kesci.com/home/project/5dd6003700b0b900365feaeb user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36" r = requests.get('http://movie.douban.com/top250?start=225', headers={'User-Agent': user_agent}) print(r.status_code) # 418 表示返回失敗, 200表示返回成功 f = open("1.txt", "w", encoding='utf-8') html = r.text # 列印頁面資訊 print(html, file = f)
BeatifulSoup 模組
詳細參考官方文件
1、安裝
pip install beautifulsoup4
pip list // 檢視安裝的python 模組
2、建立BeautifulSoup4物件
bs = BeautifulSoup(html, "html.parser") #建立beautifulSoup4物件
print(bs.prettify()) # 有縮排地輸出bs所有內容
3、訪問一個標籤內容
html = '''
<!DOCTYPE html>
<!--STATUS OK-->
<html>
<head>
    <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
    <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
    <meta content="always" name="referrer"/>
    <link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
    <title> 百度一下,你就知道 </title>
</head>
<body link="#0000cc">
    <div id="wrapper">
        <div id="head">
            <div class="head_wrapper">
                <div id="u1">
                    <a class="mnav" href="http://news.baidu.com" name="tj_trnews"> 新聞 </a>
                    <a class="mnav" href="https://www.hao123.com" name="tj_trhao123"> hao123 </a>
                    <a class="mnav" href="http://map.baidu.com" name="tj_trmap"> 地圖 </a>
                    <a class="mnav" href="http://v.baidu.com" name="tj_trvideo"> 視訊 </a>
                    <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba"> 貼吧 </a>
                    <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;"> 更多產品 </a>
                </div>
            </div>
        </div>
    </div>
</body>
</html>
'''
# 以以上html頁面為例
bs.title           # 獲得<title>兩個標籤內的所有內容
bs.div             # 獲取兩個<div>標籤之間的所有內容(預設第一個出現的div)
bs.a               # 獲取第一個出現的<a>標籤的內容
bs.head            # 獲取head標籤的內容。
bs.title.name      # 標籤名稱,即title
bs.title.string    # 輸出title標籤內第一個文字部分,或者子標籤內第一個文字部分,如果有多個文字,無法輸出
bs.a.string
bs.get_text()      # 輸出一個標籤以及它的子標籤的所有文字
bs.find_all("a")   # 獲取所有的a標籤,並返回一個list
bs.find_all("div") # 獲取所有的div標籤,返回一個list
bs.find_all(id='u1')     # 返回所有id='u1'的標籤
bs.find_all(id=True)     # 返回所有存在id的標籤
bs.find_all(class_='mnav')               # 返回所有class為mnav的標籤
bs.find_all(attrs={"class":"mnav"})      # 返回class=mnav的所有標籤,此處class不需加底線
bs.find_all(attrs={"name":"tj_trnews"})  # 返回所有name=tj_trnews的標籤
# find_all引數可以使用多個
# 訪問子標籤
tmp = bs.find_all("head")
print(tmp[0].a)    # 注意find返回一個bs4物件,find_all返回一個list
csv 模組
1、讀入
import csv
with open('a.csv','r') as myFile:
lines=csv.reader(myFile)
for line in lines:
print (line)
# 另外的寫法
# f = open("a.csv", "r")
# lines = csv.reader(f)
# lines = csv.reader('a.csv', 'r')  # 注意:這種寫法是錯的,csv.reader 的第一個引數必須是檔案物件(或可迭代的行),不能直接傳檔名
2、寫入
headers = ['class','name','sex','height','year'] rows = [ [1,'xiaoming','male',168,23], [1,'xiaohong','female',162,22], [2,'xiaozhang','female',163,21], [2,'xiaoli','male',158,21] ] with open('test.csv','w', newline='')as f: # 不加newline=''會出現隔行輸出的情況 f_csv = csv.writer(f) f_csv.writerow(headers) # 可以將一個list寫入到一行 f_csv.writerows(rows) # 可以寫入多行
完整程式碼
最後寫入csv存在亂碼的情況,原因為csv檔案對於中文編碼預設為ansi,輸出時為utf-8,修改方式使用記事本開啟csv,點選另存為,然後選擇編碼為ansi,記事本既支援utf-8也支援ansi。另一個更省事的做法:寫入時直接使用 encoding='utf-8-sig'(帶 BOM 的 UTF-8),Excel 便能直接正確顯示,不必手動另存。
# coding=utf-8
# html = '''
# <ol class="grid_view">
# <li>
# <div class="item">
# <div class="pic">
# <em class="">1</em>
# <a href="https://movie.douban.com/subject/1292052/">
# <img width="100" alt="肖申克的救贖" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
# </a>
# </div>
# <div class="info">
# <div class="hd">
# <a href="https://movie.douban.com/subject/1292052/" class="">
# <span class="title">肖申克的救贖</span>
# <span class="title"> / The Shawshank Redemption</span>
# <span class="other"> / 月黑高飛(港) / 刺激1995(臺)</span>
# </a>
# <span class="playable">[可播放]</span>
# </div>
# <div class="bd">
# <p class="">
# 導演: 弗蘭克·德拉邦特 Frank Darabont 主演: 蒂姆·羅賓斯 Tim Robbins /...<br>
# 1994 / 美國 / 犯罪 劇情
# </p>
# <div class="star">
# <span class="rating5-t"></span>
# <span class="rating_num" property="v:average">9.7</span>
# <span property="v:best" content="10.0"></span>
# <span>2476527人評價</span>
# </div>
# <p class="quote">
# <span class="inq">希望讓人自由。</span>
# </p>
# </div>
# </div>
# </div>
# </li>
# </ol>
# ```
import requests
from bs4 import BeautifulSoup
import csv
# Parallel module-level accumulators: one list per CSV column, one entry per
# movie.  getHtml() appends to them page by page; printCsv() reads them back.
# Index i across all six lists describes the same movie.
Name, Name2, Url, Actor, Score, Number = [], [], [], [], [], []
def getHtml(html):
    """Parse one Top250 result page and append each movie's fields to the
    module-level lists (Name, Name2, Url, Actor, Score, Number).

    html: raw HTML text of one https://movie.douban.com/top250 page.
    Returns None; results accumulate in the module-level lists.
    """
    bs = BeautifulSoup(html, "html.parser")
    # The 25 movies of a page sit in <ol class="grid_view"> as <li> items.
    totlist = bs.find_all('ol', class_='grid_view')
    if not totlist:
        # Layout not recognized (e.g. an anti-crawler page) -- the original
        # crashed with IndexError here; skip the page instead.
        return
    for nowMovie in totlist[0].find_all('li'):
        # First <span class="title"> is the Chinese title; a second one,
        # when present, holds "\xa0/\xa0<original title>".
        tmp = nowMovie.find_all('span', class_='title')
        movie_name = tmp[0].string
        if len(tmp) > 1:
            # Drop the 3-character "\xa0/\xa0" separator prefix.
            movie_name2 = tmp[1].string[3:]
        else:
            movie_name2 = ''
        # Detail-page link is the <a> inside <div class="hd">.
        tmp = nowMovie.find('div', class_='hd')
        movie_url = tmp.a.get('href')
        # Director/cast/year blurb is the first <p> of <div class="bd">.
        tmp = nowMovie.find('div', attrs={'class': 'bd'})
        movie_actor = tmp.p.getText()
        tmp = nowMovie.find('span', attrs={'class': 'rating_num'})
        movie_score = tmp.string
        # Among the class-less <span>s of the item, index 1 is the vote
        # count "<N>人評價".
        # NOTE(review): positional lookup -- fragile if the page layout
        # changes; verify against the live markup.
        tmp = nowMovie.find_all('span', class_=False)
        tmpstr = tmp[1].string
        # Remove the exact "人評價" suffix.  The original used
        # strip("人評價"), which strips any of those *characters* from both
        # ends -- it happened to work for pure digits but is the wrong idiom.
        if tmpstr.endswith('人評價'):
            movie_number = tmpstr[:-len('人評價')]
        else:
            movie_number = tmpstr
        Url.append(movie_url)
        Name.append(movie_name)
        Name2.append(movie_name2)
        Actor.append(movie_actor)
        Score.append(movie_score)
        Number.append(movie_number)
def printCsv():
    """Write the collected movie data to a.csv, one row per movie.

    Fixes over the original:
    - utf-8-sig (UTF-8 with BOM) so Excel opens the file without mojibake,
      instead of requiring a manual re-save as ANSI in Notepad;
    - `with` block so the file handle is always closed (the original leaked it);
    - iterates over len(Name) instead of a hard-coded range(0, 250), which
      raised IndexError when fewer than 250 movies were scraped;
    - actually writes all six columns declared in the header row (the
      original wrote only four, leaving the header and data misaligned).
    """
    # newline='' prevents the extra blank line between rows on Windows.
    with open('a.csv', 'w', newline='', encoding='utf-8-sig') as File:
        Print = csv.writer(File)
        Print.writerow(['電影名', '英文名', '評分', '評價人數', '演員', '電影連結'])
        for i in range(len(Name)):
            # Column order matches the header row above.
            Print.writerow([Name[i], Name2[i], Score[i], Number[i],
                            Actor[i], Url[i]])
def main():
    """Fetch all ten Top250 result pages and dump the scraped data to a.csv."""
    # A desktop-browser User-Agent is required: without it Douban's
    # anti-crawler layer answers 418 instead of the page.
    # https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
    targetUrl = 'https://movie.douban.com/top250?start='
    headers = {'User-Agent': user_agent}
    # Ten pages of 25 movies each: start = 0, 25, ..., 225.
    for page in range(10):
        pageUrl = targetUrl + str(page * 25)
        print(pageUrl)
        response = requests.get(pageUrl, headers=headers)
        print(response.status_code)  # 200 = success, 418 = blocked by anti-bot
        getHtml(response.text)
    printCsv()
# Entry-point guard: run the scraper only when executed as a script,
# not when this module is imported.
if __name__=='__main__':
    main()