1. 程式人生 > 其它 >python 爬蟲爬取豆瓣Top250榜單

python 爬蟲爬取豆瓣Top250榜單

python 爬蟲爬取豆瓣Top250榜單

這是一個小作業。


requests模組

使用requests.get(url)可以爬取一個網址的資訊

 # 構造合理的HTTP請求頭, 偽裝成瀏覽器, 繞過反爬蟲機制,否則會被反爬蟲機制拒絕(418)。 https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
 user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
r = requests.get('http://movie.douban.com/top250?start=225', headers={'User-Agent': user_agent})
print(r.status_code) # 418 表示返回失敗, 200表示返回成功

f = open("1.txt", "w", encoding='utf-8')
html = r.text # 列印頁面資訊
print(html, file = f)

BeautifulSoup 模組

詳細參考官方文件
1、安裝

pip install beautifulsoup4
pip list  # 檢視安裝的python 模組

2、建立BeautifulSoup4物件

bs = BeautifulSoup(html, "html.parser")  # build a BeautifulSoup4 object from the page markup
print(bs.prettify())  # pretty-print the whole parse tree with indentation

3、訪問一個標籤內容

# Sample HTML document (a Baidu homepage skeleton) used by the tag-access
# examples below.
html = '''
<!DOCTYPE html>
<!--STATUS OK-->
<html>
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="always" name="referrer"/>
  <link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
  <title>
   百度一下,你就知道 </title>
 </head>
 <body link="#0000cc">
  <div id="wrapper">
   <div id="head">
    <div class="head_wrapper">
     <div id="u1">
      <a class="mnav" href="http://news.baidu.com" name="tj_trnews">
       新聞 </a>
      <a class="mnav" href="https://www.hao123.com" name="tj_trhao123">
       hao123 </a>
      <a class="mnav" href="http://map.baidu.com" name="tj_trmap">
       地圖 </a>
      <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">
       視訊 </a>
      <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">
       貼吧 </a>
      <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">
       更多產品 </a>
     </div>
    </div>
   </div>
  </div>
 </body>
</html>
'''
# 以以上html頁面為例

# Using the HTML page above as an example

bs.title  # the first <title> tag and everything inside it
bs.div    # the first <div> tag with all of its contents (first occurrence)
bs.a      # the first <a> tag and its contents
bs.head   # the <head> tag and its contents


bs.title.name    # the tag's name, i.e. "title"
bs.title.string  # the single text child of <title> (or of its one child); None when several text nodes exist
bs.a.string      # fixed typo: was bs.a.stirng
bs.get_text()    # all text from a tag and its descendants

bs.find_all("a")    # every <a> tag, returned as a list
bs.find_all("div")  # every <div> tag, returned as a list
bs.find_all(id='u1')        # fixed typo: was bs.find_add — every tag with id="u1"
bs.find_all(id=True)        # every tag that has an id attribute
bs.find_all(class_='mnav')  # every tag whose class is "mnav"

bs.find_all(attrs={"class":"mnav"})  # same class query expressed through attrs
bs.find_all(attrs={"name":"tj_trnews"})
# every tag with name="tj_trnews"
# find_all accepts several filters at once

# Accessing a child tag
tmp = bs.find_all("head")
print(tmp[0].a)  # fixed: was print(t[0].a) — note find returns a bs4 object, find_all returns a list

csv 模組

1、讀入

import csv

# Read a CSV file row by row; csv.reader yields each row as a list of strings.
# (Indentation fixed: the original had a stray leading space before `with`.)
with open('a.csv', 'r') as myFile:
    lines = csv.reader(myFile)
    for line in lines:
        print(line)
# Equivalent explicit form (remember to close the handle yourself):
# f = open("a.csv", "r")
# lines = csv.reader(f)
# NOTE: csv.reader('a.csv', 'r') would be wrong — it takes a file object, not a path

2、寫入

# Write a header row followed by several data rows to test.csv.
headers = ['class', 'name', 'sex', 'height', 'year']
rows = [
    [1, 'xiaoming', 'male', 168, 23],
    [1, 'xiaohong', 'female', 162, 22],
    [2, 'xiaozhang', 'female', 163, 21],
    [2, 'xiaoli', 'male', 158, 21],
]
# newline='' keeps the csv module from emitting blank lines between rows on Windows
with open('test.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(headers)  # one list -> one row
    writer.writerows(rows)    # list of lists -> many rows

完整程式碼

最後寫入csv存在亂碼的情況,原因為csv檔案對於中文編碼預設為ansi,輸出時為utf-8,修改方式使用記事本開啟csv,點選另存為,然後選擇編碼為ansi,記事本既支援utf-8也支援ansi。

# coding=utf-8
# html = '''
# <ol class="grid_view">
#         <li>
#             <div class="item">
#                 <div class="pic">
#                     <em class="">1</em>
#                     <a href="https://movie.douban.com/subject/1292052/">
#                         <img width="100" alt="肖申克的救贖" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
#                     </a>
#                 </div>
#                 <div class="info">
#                     <div class="hd">
#                         <a href="https://movie.douban.com/subject/1292052/" class="">
#                             <span class="title">肖申克的救贖</span>
#                                     <span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
#                                 <span class="other">&nbsp;/&nbsp;月黑高飛(港)  /  刺激1995(臺)</span>
#                         </a>


#                             <span class="playable">[可播放]</span>
#                     </div>
#                     <div class="bd">
#                         <p class="">
#                             導演: 弗蘭克·德拉邦特 Frank Darabont&nbsp;&nbsp;&nbsp;主演: 蒂姆·羅賓斯 Tim Robbins /...<br>
#                             1994&nbsp;/&nbsp;美國&nbsp;/&nbsp;犯罪 劇情
#                         </p>

                        
#                         <div class="star">
#                                 <span class="rating5-t"></span>
#                                 <span class="rating_num" property="v:average">9.7</span>
#                                 <span property="v:best" content="10.0"></span>
#                                 <span>2476527人評價</span>
#                         </div>

#                             <p class="quote">
#                                 <span class="inq">希望讓人自由。</span>
#                             </p>
#                     </div>
#                 </div>
#             </div>
#         </li>
       
# </ol>
# ```

import requests
from bs4 import BeautifulSoup
import csv

# Module-level accumulators, one entry per movie, appended to by getHtml()
# and written out by printCsv().
Name = []    # primary (Chinese) title, from the first span.title
Name2 = []   # secondary title from the second span.title, '' when absent
Url = []     # detail-page link
Actor = []   # director / cast / year paragraph text
Score = []   # rating string from span.rating_num
Number = []  # ratings count with the "人評價" suffix stripped

def getHtml(html):
    """Parse one Top-250 listing page and append each movie's fields to the
    module-level lists (Name, Name2, Url, Actor, Score, Number).

    html: page markup of one https://movie.douban.com/top250 page.
    """
    bs = BeautifulSoup(html, "html.parser")
    totlist = bs.find_all('ol', class_='grid_view')
    if not totlist:
        # Request was blocked (e.g. 418 page) or the layout changed;
        # the original indexed totlist[0] unconditionally and crashed here.
        return
    for nowMovie in totlist[0].find_all('li'):
        # Titles: the first <span class="title"> is the main title; a second
        # one, when present, holds "&nbsp;/&nbsp;<alternate title>".
        titles = nowMovie.find_all('span', class_='title')
        movie_name = titles[0].string
        if len(titles) > 1:
            movie_name2 = titles[1].string[3:]  # drop the leading "\xa0/\xa0"
        else:
            movie_name2 = ''
        hd = nowMovie.find('div', class_='hd')
        movie_url = hd.a.get('href')
        bd = nowMovie.find('div', attrs={'class': 'bd'})
        movie_actor = bd.p.getText()  # director / cast / year line
        rating = nowMovie.find('span', attrs={'class': 'rating_num'})
        movie_score = rating.string
        # Class-less spans: [0] is the property="v:best" span, [1] holds
        # "NNN人評價" — strip the suffix characters to keep just the number.
        plain_spans = nowMovie.find_all('span', class_=False)
        movie_number = plain_spans[1].string.strip("人評價")

        Url.append(movie_url)
        Name.append(movie_name)
        Name2.append(movie_name2)
        Actor.append(movie_actor)
        Score.append(movie_score)
        Number.append(movie_number)
    return

def printCsv():
    """Write the collected movie data to a.csv, one row per movie.

    utf-8 is needed because some entries contain Japanese/Korean text
    (the file may need re-saving as ANSI for Excel, per the notes above).
    """
    # with-statement guarantees the file is flushed and closed (the original
    # never closed it); newline='' avoids blank lines between rows on Windows.
    with open('a.csv', 'w', newline='', encoding="utf-8") as File:
        Print = csv.writer(File)
        Print.writerow(['電影名', '英文名', '評分', '評價人數', '演員', '電影連結'])
        # Iterate over whatever was actually collected instead of a hard-coded
        # range(0, 250), so a partial scrape no longer raises IndexError.
        for name, score, number, url in zip(Name, Score, Number, Url):
            # Name2 / Actor columns intentionally omitted, as in the original
            # (encoding concerns noted by the author).
            Print.writerow([name, score, number, url])

def main():
    """Fetch all 10 pages of the Douban Top-250 list, parse each one, and
    write the collected data to a.csv."""
    # A browser-like User-Agent is required: Douban answers 418 to the default
    # python-requests agent.
    # https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
    targetUrl = 'https://movie.douban.com/top250?start='
    for start in range(0, 250, 25):  # pages begin at offsets 0, 25, ..., 225
        url = targetUrl + str(start)
        print(url)
        # timeout keeps the scraper from hanging forever on a stalled
        # connection (the original had none).
        r = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
        print(r.status_code)  # 418 means rejected by anti-crawler, 200 means success
        getHtml(r.text)
    printCsv()
   

# Run the scraper only when executed directly, not when imported.
if __name__=='__main__':
    main()