
IMDB TOP 250 Crawler


For this short semester's Python course project I built a crawler that collects the full information for every film in the IMDB TOP 250. It is the second crawler I have written, and it went much more smoothly than the one from summer training camp. Comments are welcome.
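The whole script repeats one basic pattern: request a page with a browser-like User-Agent header, parse the HTML with BeautifulSoup, and pull out the fields of interest. Here is a minimal, self-contained sketch of just that pattern before the full script; the URL, the shortened User-Agent string, and the link selector are placeholders for illustration, not part of the original code.

import requests
from bs4 import BeautifulSoup

URL = 'http://www.imdb.cn/IMDB250/'          # list page used by the project
HEADERS = {'User-Agent': 'Mozilla/5.0'}      # placeholder UA; pretend to be a normal browser

def fetch(url):
    """Download a page and return its decoded HTML, or None on failure."""
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return None

html = fetch(URL)
if html:
    soup = BeautifulSoup(html, 'html.parser')
    # Example only: list every link on the page (the real script narrows this
    # down to the film-detail links inside a specific <div>)
    for a in soup.find_all('a', href=True):
        print(a['href'])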

'''
************************************************
* Made by 1120162015 李博
*         1120161966 張嘉熙
* Time: 2017.9.11
* Target: all movies' information of IMDB TOP 250
* Resources: http://www.imdb.cn/IMDB250/
* Original work. Please credit the authors 李博 and 張嘉熙 when reposting.
************************************************
'''

import re
import requests
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

num = 1        # movie counter
All_txt = []   # collected information for every movie
headers = {    # browser User-Agent so the site serves normal pages
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 '
                   'SE 2.X MetaSr 1.0')
}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return "錯誤"

# Get the full information from one movie's detail page
def get_all_information(url, page):
    global num, All_txt
    txt = getHTMLText(url)
    if txt != "錯誤":
        print('page' + str(page) + ' NO.' + str(num) + ' Get it!')
        if num == 247:
            print('Finished!!!')
        soup = BeautifulSoup(txt, "html.parser")
        Cname, Ename, Score, title, Actor, Starring, Infor = '', '', '', '', '', '', ''

        # Chinese name & score
        infor_1 = soup.find_all('div', class_='hdd')
        pattern = re.compile(r'<h3>[\s\S]*?</h3>')
        Cname = ''.join(pattern.findall(str(infor_1[0])))
        Cname = Cname.replace('<h3>', '').replace('</h3>', '')

        # Release year (wrapped in fullwidth parentheses in the title) -> time.txt
        pattern = re.compile(r'（[\s\S]*?）')
        time_ = ''.join(pattern.findall(Cname))
        with open('time.txt', 'a', encoding='utf-8') as t:
            year = time_.replace('（', '').replace('）', '')
            t.write((year if year else '0') + '\n')   # '0' marks films with no year found

        # Score
        pattern = re.compile(r'<i>[\s\S]*?</i>')
        Score = ''.join(pattern.findall(str(infor_1[0])))
        Score = Score.replace('<i>', '').replace('</i>', '')

        # The block that holds the rest of the film's details
        now = soup.find_all('div', class_='bdd clear')
        a = BeautifulSoup(str(now[0]), "html.parser")
        many_infor = a.find_all('li')

        # English name
        Ename = (str(many_infor[0]).replace('<li>', '').replace('<i>', '').replace('</i>', '')
                 .replace('</li>', '').replace('<a>', '').replace('</a>', ''))
        # Director
        Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
        Actor = Actor_temp[0].get_text().replace('導演:', '')
        # Starring
        Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
        for i in Starring_temp:
            Starring += i.get_text().replace(' ', '') + ' '

        # Remaining fields (genre, country, runtime, ...)
        for j in range(4, 7):
            Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
            for i in Infor_temp.children:
                Infor += i.get_text().replace(' ', '') + ' '
            Infor += '\n'

        # Synopsis
        content = soup.find_all('div', class_='fk-4 clear')
        soup_con = BeautifulSoup(str(content[0]), "html.parser")
        title = soup_con.find_all('div', class_='hdd')
        title = str(title[0]).replace('<div class="hdd">', '').replace('</div>', '\n')
        content_1 = soup_con.find_all('div', class_='bdd clear')
        content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">', '').replace('</div>', '')
        content_1 = content_1.replace('<!-- <p><a href="#">更多劇情 >></a></p> -->', '').replace('<br/>', '\n')

        # Save everything for this film
        All_txt.append('NO.' + str(num) + '\n')
        All_txt.append(Cname + '\n')
        All_txt.append('【英文名】' + Ename + '\n')
        All_txt.append('【評分】' + Score + '\n')
        All_txt.append('【導演】' + Actor + '\n')
        All_txt.append('【主演】' + Starring + '\n')
        All_txt.append(Infor + '\n')
        All_txt.append(title + '\n' + content_1 + '\n')
        All_txt.append('\n')
        num += 1

# Collect the URLs of all movies on the current list page
def getin_one(url, page):
    txt = getHTMLText(url)
    soup = BeautifulSoup(txt, "html.parser")
    temp = soup.find_all('div', class_="ss-3 clear")
    pattern = re.compile(r'<a href="[\s\S]*?">')
    All_url = pattern.findall(str(temp[0]))
    for i in range(len(All_url)):
        temp_url = 'http://www.imdb.cn' + All_url[i].replace('<a href="', '').replace('">', '')
        get_all_information(temp_url, page)

# Tally the release years of all movies and draw a bar chart
def Analyze_some_infor():
    plt.rc('font', family='SimHei', size=13)   # SimHei so the Chinese labels render
    a, b, c, d, e, f = 0, 0, 0, 0, 0, 0
    with open('time.txt') as file:
        for line in file:
            year = int(line)
            if year == 0:
                f += 1
            elif 1920 <= year < 1940:
                a += 1
            elif 1940 <= year < 1960:
                b += 1
            elif 1960 <= year < 1980:
                c += 1
            elif 1980 <= year < 2000:
                d += 1
            else:
                e += 1
    times = [a, b, c, d, e, f]
    range_time = ['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-現在', '無信息']
    idx = np.arange(len(range_time))
    width = 0.5
    plt.bar(idx, times, width, color='green')
    plt.xticks(idx + width / 2, range_time, rotation=40)
    plt.xlabel('電影年代')
    plt.ylabel('數目')
    plt.savefig('time_pic.jpg')
    plt.show()

def main():
    global All_txt
    getin_one('http://www.imdb.cn/IMDB250/', 1)
    for i in range(2, 10):
        getin_one('http://www.imdb.cn/imdb250/' + str(i), i)
    # Overwrite any previous output, then write all collected information
    with open('All_infor.txt', 'w', encoding='utf-8') as x:
        for i in All_txt:
            x.write(i)
    Analyze_some_infor()

main()
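">
When it finishes, the script leaves three files in the working directory: time.txt (one release year per film, with 0 for films where no year could be extracted), All_infor.txt (the collected details for all 250 films), and time_pic.jpg (the bar chart of how many films fall into each year range).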

Author: LB919
Source: http://www.cnblogs.com/L1B0/
This article is original work that LB919 put time and effort into;
if you repost it, I would be honored; please remember to credit the source.
