
IMDB TOP 250 Crawler


For this short semester's Python course project I built a crawler that collects the full information for every film in the IMDB TOP 250. It is the second crawler I have written, and it went much more smoothly than the one from summer training camp. Comments are welcome.
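The whole script repeats one basic pattern: request a page with a browser-like User-Agent header, parse the HTML with BeautifulSoup, and pull out the fields of interest. Here is a minimal, self-contained sketch of just that pattern before the full script; the URL, the shortened User-Agent string, and the link selector are placeholders for illustration, not part of the original code.

import requests
from bs4 import BeautifulSoup

URL = 'http://www.imdb.cn/IMDB250/'          # list page used by the project
HEADERS = {'User-Agent': 'Mozilla/5.0'}      # placeholder UA; pretend to be a normal browser

def fetch(url):
    """Download a page and return its decoded HTML, or None on failure."""
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return None

html = fetch(URL)
if html:
    soup = BeautifulSoup(html, 'html.parser')
    # Example only: list every link on the page (the real script narrows this
    # down to the film-detail links inside a specific <div>)
    for a in soup.find_all('a', href=True):
        print(a['href'])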

'''
************************************************
* Made by 1120162015 李博
*         1120161966 張嘉熙
* Time: 2017.9.11
* Target: all movies' information of IMDB TOP 250
* Resources: http://www.imdb.cn/IMDB250/
* Original work. Please credit the authors 李博 and 張嘉熙 when reposting.
************************************************
'''

import re
import requests
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

num = 1        # movie counter
All_txt = []   # collected information for every movie
headers = {    # browser User-Agent so the site serves normal pages
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 '
                   'SE 2.X MetaSr 1.0')
}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return "錯誤"

# Get the full information from one movie's detail page
def get_all_information(url, page):
    global num, All_txt
    txt = getHTMLText(url)
    if txt != "錯誤":
        print('page' + str(page) + ' NO.' + str(num) + ' Get it!')
        if num == 247:
            print('Finished!!!')
        soup = BeautifulSoup(txt, "html.parser")
        Cname, Ename, Score, title, Actor, Starring, Infor = '', '', '', '', '', '', ''

        # Chinese name & score
        infor_1 = soup.find_all('div', class_='hdd')
        pattern = re.compile(r'<h3>[\s\S]*?</h3>')
        Cname = ''.join(pattern.findall(str(infor_1[0])))
        Cname = Cname.replace('<h3>', '').replace('</h3>', '')

        # Release year (wrapped in fullwidth parentheses in the title) -> time.txt
        pattern = re.compile(r'（[\s\S]*?）')
        time_ = ''.join(pattern.findall(Cname))
        with open('time.txt', 'a', encoding='utf-8') as t:
            year = time_.replace('（', '').replace('）', '')
            t.write((year if year else '0') + '\n')   # '0' marks films with no year found

        # Score
        pattern = re.compile(r'<i>[\s\S]*?</i>')
        Score = ''.join(pattern.findall(str(infor_1[0])))
        Score = Score.replace('<i>', '').replace('</i>', '')

        # The block that holds the rest of the film's details
        now = soup.find_all('div', class_='bdd clear')
        a = BeautifulSoup(str(now[0]), "html.parser")
        many_infor = a.find_all('li')

        # English name
        Ename = (str(many_infor[0]).replace('<li>', '').replace('<i>', '').replace('</i>', '')
                 .replace('</li>', '').replace('<a>', '').replace('</a>', ''))
        # Director
        Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
        Actor = Actor_temp[0].get_text().replace('導演:', '')
        # Starring
        Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
        for i in Starring_temp:
            Starring += i.get_text().replace(' ', '') + ' '

        # Remaining fields (genre, country, runtime, ...)
        for j in range(4, 7):
            Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
            for i in Infor_temp.children:
                Infor += i.get_text().replace(' ', '') + ' '
            Infor += '\n'

        # Synopsis
        content = soup.find_all('div', class_='fk-4 clear')
        soup_con = BeautifulSoup(str(content[0]), "html.parser")
        title = soup_con.find_all('div', class_='hdd')
        title = str(title[0]).replace('<div class="hdd">', '').replace('</div>', '\n')
        content_1 = soup_con.find_all('div', class_='bdd clear')
        content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">', '').replace('</div>', '')
        content_1 = content_1.replace('<!-- <p><a href="#">更多劇情 >></a></p> -->', '').replace('<br/>', '\n')

        # Save everything for this film
        All_txt.append('NO.' + str(num) + '\n')
        All_txt.append(Cname + '\n')
        All_txt.append('【英文名】' + Ename + '\n')
        All_txt.append('【評分】' + Score + '\n')
        All_txt.append('【導演】' + Actor + '\n')
        All_txt.append('【主演】' + Starring + '\n')
        All_txt.append(Infor + '\n')
        All_txt.append(title + '\n' + content_1 + '\n')
        All_txt.append('\n')
        num += 1

# Collect the URLs of all movies on the current list page
def getin_one(url, page):
    txt = getHTMLText(url)
    soup = BeautifulSoup(txt, "html.parser")
    temp = soup.find_all('div', class_="ss-3 clear")
    pattern = re.compile(r'<a href="[\s\S]*?">')
    All_url = pattern.findall(str(temp[0]))
    for i in range(len(All_url)):
        temp_url = 'http://www.imdb.cn' + All_url[i].replace('<a href="', '').replace('">', '')
        get_all_information(temp_url, page)

# Tally the release years of all movies and draw a bar chart
def Analyze_some_infor():
    plt.rc('font', family='SimHei', size=13)   # SimHei so the Chinese labels render
    a, b, c, d, e, f = 0, 0, 0, 0, 0, 0
    with open('time.txt') as file:
        for line in file:
            year = int(line)
            if year == 0:
                f += 1
            elif 1920 <= year < 1940:
                a += 1
            elif 1940 <= year < 1960:
                b += 1
            elif 1960 <= year < 1980:
                c += 1
            elif 1980 <= year < 2000:
                d += 1
            else:
                e += 1
    times = [a, b, c, d, e, f]
    range_time = ['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-現在', '無信息']
    idx = np.arange(len(range_time))
    width = 0.5
    plt.bar(idx, times, width, color='green')
    plt.xticks(idx + width / 2, range_time, rotation=40)
    plt.xlabel('電影年代')
    plt.ylabel('數目')
    plt.savefig('time_pic.jpg')
    plt.show()

def main():
    global All_txt
    getin_one('http://www.imdb.cn/IMDB250/', 1)
    for i in range(2, 10):
        getin_one('http://www.imdb.cn/imdb250/' + str(i), i)
    # Overwrite any previous output, then write all collected information
    with open('All_infor.txt', 'w', encoding='utf-8') as x:
        for i in All_txt:
            x.write(i)
    Analyze_some_infor()

main()
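">
When it finishes, the script leaves three files in the working directory: time.txt (one release year per film, with 0 for films where no year could be extracted), All_infor.txt (the collected details for all 250 films), and time_pic.jpg (the bar chart of how many films fall into each year range).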

Author: LB919
Source: http://www.cnblogs.com/L1B0/
This article is original work that LB919 put time and effort into;
if you repost it, I would be honored; please remember to credit the source.
