電影天堂爬蟲實戰
阿新 • • 發佈:2021-08-29
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2021/8/28 22:38
# @author: Mrwhite
# @File:電影天堂爬蟲.py
# @DESC:
import re
import urllib.request,urllib.error #制定URL,獲取網頁資料
import xlwt
from bs4 import BeautifulSoup
def main():
    """Crawl the Movie Heaven (dytt8) index page and save results to Excel.

    Side effects: network requests via getData/askURl and an .xls file
    written to the working directory by saveData.
    """
    # Movie Heaven index URL
    baseurl = "https://dy.dytt8.net/index.htm"
    # 1. Scrape the index page for movie names, links and update dates;
    #    each detail link is then followed for director / cast / Douban
    #    score / magnet links / summary.
    datalist = getData(baseurl)
    # 2. Save the collected data into an Excel workbook.
    saveData(datalist, "電影天堂電影.xls")
# Pre-compiled regular expressions (string patterns) used by getData below.
findLink = re.compile(r'・\[<a href="/html/gndy/.*<a href="(.*?)">') # matches a movie detail-page link
findMovieName = re.compile( r'・\[<a href="/html/gndy/.*">(.*?)</a><br/> ' ) # matches a movie title
findUpDateTime = re.compile( r'<td class="inddline" width="15%"><font color="#FF0000">(.*?)</font></td>' ) # matches the update date
findDirect = re.compile( r'<br />◎導 演 (.*?)<br />' ) # matches the director field
findActor = re.compile( r'<br />◎主 演 (.*?)<br /><br />◎標 籤' ) # matches the cast field
findScore = re.compile( r' <br />◎豆瓣評分 (.*?) from' ) # matches the Douban score
findDownloadLink = re.compile( r'<a target="_blank" href="(.*?)">' ) # matches download (magnet) links
findInfo = re.compile( r'◎簡 介<br /><br /> (.*?)<br />' ) # matches the plot summary
def getData(baseurl):
    """Scrape the index page at *baseurl* and every movie detail page.

    Returns a list of 7 parallel lists:
    [titles, updateTimes, directs, actors, scores, downloadLinks, infos]
    where downloadLinks[i] is itself a list of href strings found on the
    i-th detail page.
    """
    titles, links, updateTimes = [], [], []
    directs, actors, scores, downloadLinks, infos = [], [], [], [], []
    # 1. Fetch the index page
    html = askURl(baseurl)
    # 2. Parse it. NOTE: CSS :nth-child is not supported by soupsieve's
    #    select() here; :nth-of-type is the working equivalent.
    soup = BeautifulSoup(html, "html.parser")
    item = soup.select("div:nth-of-type(2) > div:nth-of-type(1) > div > div > div.co_content8")
    item = str(item)
    titles = re.findall(findMovieName, item)        # movie titles
    # Relative detail-page hrefs -> absolute URLs
    links = [f'https://dy.dytt8.net{link}' for link in re.findall(findLink, item)]
    updateTimes = re.findall(findUpDateTime, item)  # update dates
    # 3. Visit each detail page: director / cast / score / links / summary
    for link in links:
        html = askURl(link)
        directUnSet = re.findall(findDirect, html)
        if not directUnSet:
            directs.append("")
        else:
            directs.append(directUnSet[0].replace(" ", ""))
        actorsUnset = re.findall(findActor, html)
        if not actorsUnset:
            actors.append("")
        else:
            # Strip ASCII and fullwidth spaces, keep at most three actors.
            actorList = actorsUnset[0].replace(" ", "").replace("\u3000", "").split("<br />")[0:3]
            actors.append("/".join(actorList))
        scoresUnset = re.findall(findScore, html)
        if not scoresUnset:
            scores.append("無評分")
        else:
            # "8.1/10 from ..." -> keep the part before the slash
            scores.append(scoresUnset[0].split("/")[0])
        downloadLink = re.findall(findDownloadLink, html)  # all hrefs on the page
        downloadLinks.append(downloadLink)
        infosUnSet = re.findall(findInfo, html)
        if not infosUnSet:
            infos.append("")
        else:
            infos.append(infosUnSet[0].replace(" ", "").replace("“", ""))
    dataList = [titles, updateTimes, directs, actors, scores, downloadLinks, infos]
    return dataList
# Fetch the page content of a single URL.
def askURl(url):
    """Fetch *url* and return its body decoded as gb2312.

    Returns "" when the request fails (fixes a bug where `html` was
    unbound after a URLError, causing a NameError on return).
    """
    html = ""
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
        # dytt8 serves gb2312; skip undecodable bytes rather than crash.
        html = response.read().decode("gb2312", errors='ignore')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html
# Save the scraped data.
def saveData(datalist, savepath):
    """Write the 7 parallel lists in *datalist* to an .xls file at *savepath*.

    Row 0 is the header; record k goes to row k+1. Fixes two defects:
    the old `range(1, len(...))` loop skipped the first record of every
    column, and xlwt cannot write a list cell (the download-link column),
    so list values are joined with newlines.
    """
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("from電影天堂", cell_overwrite_ok=True)
    col = ('標題', "更新時間", "導演", "主演", "豆瓣評分", "磁力連結", "簡介")
    i = j = 0  # keep defined for the error message if writing fails early
    try:
        for j in range(7):  # j = column, i = row
            sheet.write(0, j, col[j])  # header row
            for i in range(len(datalist[j])):
                value = datalist[j][i]
                if isinstance(value, list):
                    value = "\n".join(value)  # xlwt rejects list cells
                sheet.write(i + 1, j, value)
        book.save(savepath)
    except Exception as e:
        print("datalist的", i, "行", j, "列的資料為:", datalist[j][i], "寫入失敗")
        print(e)
# Script entry point: run the crawler, then report completion.
if __name__ == "__main__":
    main()
    print("爬取完畢")
展示效果如下。後續可在此基礎上繼續優化爬蟲的執行效率。