用requests獲取貓眼電影排行榜前一百頁電影的資訊
阿新 • • 發佈:2020-07-21
這次任務是獲取貓眼電影排行前一百頁電影的資訊,其中涉及滑塊驗證和動態加密。
滑塊驗證
貓眼的滑塊驗證其實很簡陋:只要先手動滑動通過一次驗證,之後帶上header訪問就可以通過了。不過過一段時間還會需要重新驗證,想要一勞永逸,可能還要用pyautogui寫一段自動滑動的程式。
可以在發出請求之後,檢查回應的url與想要訪問的url是否一致,以此判斷是否需要驗證:
import requests

# Listing page of the ranking; the page offset is appended to this prefix.
url = 'https://maoyan.com/films?showType=3&sortId=3&offset='
# Browser-like User-Agent sent with every request.
h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

# Once the slider check has been passed manually, requests carrying this
# header are no longer challenged (for a while).
req = requests.get(url, headers=h)
# Print the URL the response finally came from so it can be compared with the
# URL that was requested.
print(req.url)
如果打印出來的url與需要訪問的一致,則說明訪問成功,否則手動滑塊一下即可。
字型加密
本實驗中需要獲取一部電影的票房、評分、評分人數等,發現這些欄位是經過動態加密的,加密方法是字型加密,即每次重新整理都返回一種新的字型,這些新的字型實際上是對基本模板的細微改動,改動後導致字元的編碼發生改變,從而實現動態加密。
這裡採用的處理方法是借鑑https://zhuanlan.zhihu.com/p/84358858
因為還要完成其他任務,現在先不詳細寫,簡單複述一下對引文所使用方法的理解:首先從網頁中獲取一份字型,解析字型檔案後可以看到字型中各個數字字元的座標,這些座標可以唯一地確定該數字,這就相當於一把鑰匙,有了它就可以對網頁返回的其他字型進行解密;解密的方法就是將其他字型中字元的座標與模板字型中字元的座標進行比對,計算待解密字元與模板中各字元的相似度,相似度最高的那個模板字元就是我們所需的答案。
後續有空再好好寫吧,這裡就先貼一下程式碼:
from fontTools.ttLib import TTFont
from math import sqrt
import re

import requests

# NOTE: besides the imports above, this excerpt uses the header dict `h` and
# the response object `req`, neither of which is defined inside it.


# --- Scraping the font-encrypted fields ---

def get_font_file(req, file_name):
    """Download the web font referenced by a page and write it to disk.

    req: response object whose ``.text`` holds the page source.
    file_name: path the downloaded font bytes are written to.
    Raises IndexError when the page has fewer than three ``url('...')`` entries.
    """
    # The third url('...') entry of the page is taken as the font address
    # (presumably the .woff source of the @font-face rule -- verify on the page).
    font_url = re.findall(r'url\(\'(.*?)\'\)', req.text, re.S)[2]
    # The captured address has no scheme, so one is prepended.
    font_url = 'https:' + font_url
    # Download the font file.
    req_font = requests.get(font_url, headers=h)
    with open(file_name, 'wb') as f:
        f.write(req_font.content)


# Template: glyph name in the reference font -> digit that glyph renders.
template_data = {
    'uniE387': 3,
    'uniEDA2': 9,
    'uniE667': 5,
    'uniF350': 0,
    'uniF8BF': 6,
    'uniF4DA': 8,
    'uniE40E': 2,
    'uniF079': 7,
    'uniE295': 4,
    'uniF82F': 1,
}
# Reference font whose glyphs are the keys of template_data.
template_font = TTFont('stonefont.woff')


# --- Identify a glyph by comparing it with the template glyphs ---

def cal_distance(list):
    """Return the mean Euclidean distance over a sequence of point pairs.

    list: sequence of ``(p, q)`` pairs; each point is indexable as
        ``[0]`` (x) and ``[1]`` (y). The parameter name shadows the builtin
        but is kept so existing callers are unaffected.
    Returns ``float('inf')`` for an empty sequence, so that "nothing to
    compare" never wins a nearest-match search.
    """
    if not list:
        return float('inf')
    total = 0
    for p, q in list:
        dx = p[0] - q[0]
        dy = p[1] - q[1]
        # Both squared terms belong under the square root.
        total += sqrt(dx * dx + dy * dy)
    return total / len(list)


def deciphering(req, check_file):
    """Map every font-encoded entity of a page to the digit it displays.

    req: response object whose ``.text`` contains ``&#x....;`` entities.
    check_file: path of the font file downloaded for this page.
    Returns a dict ``{'&#x<code>;': digit}``.
    Raises KeyError when a glyph of the page font matches no template glyph.
    """
    # Collect the distinct hex codes of the entities that appear in the page.
    encrypted_codes = set(re.findall(r'&#x(.*?);', req.text))

    # Template glyph names; the first two entries of the glyph order are
    # skipped (presumably non-digit placeholder glyphs -- TODO confirm).
    template_names = template_font.getGlyphOrder()[2:]
    template_glyf = template_font['glyf']

    # Font served together with this page.
    target_font = TTFont(check_file)
    target_glyf = target_font['glyf']
    target_names = set(target_font.getGlyphOrder())

    correct_num = {}
    for code in encrypted_codes:
        glyph_name = 'uni' + code.upper()
        # Entities that are not glyphs of this font are ordinary HTML
        # character references, not encoded digits: leave them alone.
        if glyph_name not in target_names:
            continue
        target = target_glyf[glyph_name].coordinates

        best_name = None
        best_dist = float('inf')
        for name in template_names:
            template = template_glyf[name].coordinates
            # The same digit has a different number of points in different
            # fonts; 10 is a hand-picked tolerance for that difference.
            if abs(len(target) - len(template)) < 10:
                dist = cal_distance(list(zip(target, template)))
                if dist < best_dist:
                    best_dist = dist
                    best_name = name

        if best_name is None:
            raise KeyError('no template glyph matches ' + glyph_name)
        correct_num['&#x' + code + ';'] = template_data[best_name]
    return correct_num


get_font_file(req, 'check.woff')
correct = deciphering(req, 'check.woff')
# Replace the entity codes in the page source with the decoded digits.
html = req.text
for key, digit in correct.items():
    html = html.replace(key, str(digit))
完整程式碼
import requests
import re
import pandas
from fontTools.ttLib import TTFont
from math import sqrt

# Listing page of the ranking; the page offset is appended to this prefix.
url = 'https://maoyan.com/films?showType=3&sortId=3&offset='
# Browser-like User-Agent sent with every request.
h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

# Once the slider check has been passed manually, requests carrying this
# header are no longer challenged (for a while).
req = requests.get(url, headers=h)
print(req.url)


def get_movie_url():
    """Collect the film ids of the first 100 listing pages into film_code.txt.

    Each listing page is requested with an offset that grows in steps of 30;
    every id found in an ``<a href="/films/...">`` link is written on its own
    line (ids are de-duplicated per page only).
    """
    with open('film_code.txt', 'w') as f:
        for i in range(100):
            url_ = url + str(i * 30)
            req_ = requests.get(url_, headers=h)
            film_code = set(re.findall(r'<a href="/films/(.*?)" target', req_.text))
            for code in film_code:
                f.write(code + '\n')
    print('成功獲取前一百頁電影的url')


def parse(url):
    """Scrape one film detail page and return its fields as a dict.

    url: address of the film page.
    Returns a dict with the keys name, name_eng, genres, release_time,
    country, produce_time, rating, rate_num, box_office, script and cast.
    Raises IndexError when a field extracted with ``[0]`` is missing from
    the page.
    """
    req = requests.get(url, headers=h)
    print(req.url)
    text = req.text

    name = re.findall('<h1 class="name">(.*?)</h1>', text)[0]
    name_eng = re.findall('<div class="ename ellipsis">(.*?)</div>', text)[0]
    genres = re.findall('<a class="text-link".*target="_blank"> (.*) </a>', text)
    release_time = re.findall('<li class="ellipsis">(.*?)</li>', text)[0]
    # '.' does not match a newline by default; re.S is needed because this
    # pattern spans several lines of the page source.
    # https://blog.csdn.net/qq_38486203/article/details/85245011
    country_and_time = re.findall('</a>.*</li>.*<li class="ellipsis">(.*?)</li>.*<li class="ellipsis">.*</li>', text, re.S)[0]
    country_and_time = country_and_time.replace(' ', '').replace('\n', '').split('/')
    country = country_and_time[0]
    # The second part is only used when the field splits into exactly two parts.
    produce_time = country_and_time[1] if len(country_and_time) == 2 else ''
    script = re.findall('<span class="dra">(.*?)</span>', text)
    cast = re.findall('<a href="/films/celebrity/.*?" target="_blank" class="name">(.*?)</a>', text, re.S)
    cast_ = [c.strip() for c in cast]

    # Decode the font-encrypted digits before reading the numeric fields.
    get_font_file(req, 'check.woff')
    correct = deciphering(req, 'check.woff')
    html = text
    for key, digit in correct.items():
        html = html.replace(key, str(digit))

    # Fields that are only readable after decoding.
    rating = re.findall(r'<div class="movie-index-content score normal-score".*?<span class="index-left info-num ">.*?<span class="stonefont">(.*?)</span>.*?</span>', html, re.S)
    # NOTE(review): the label is matched in both Simplified and Traditional
    # characters; the live site presumably uses the Simplified form -- confirm.
    rate_num = re.findall(r'<span class=\'score-num\'><span class="stonefont">(.*?)</span>人(?:评分|評分)</span>', html)
    box_office = re.findall(r'<div class="movie-index-content box">.*<span class="stonefont">(.*?)</span><span class="unit">(.*?)</span>.*</div>', html, re.S)

    d = {
        'name': name,
        'name_eng': name_eng,
        'genres': genres,
        'release_time': release_time,
        'country': country,
        'produce_time': produce_time,
        'rating': rating,
        'rate_num': rate_num,
        'box_office': box_office,
        'script': script,
        'cast': cast_,
    }
    return d


# --- Scraping the font-encrypted fields ---

def get_font_file(req, file_name):
    """Download the web font referenced by a page and write it to disk.

    req: response object whose ``.text`` holds the page source.
    file_name: path the downloaded font bytes are written to.
    Raises IndexError when the page has fewer than three ``url('...')`` entries.
    """
    # The third url('...') entry of the page is taken as the font address
    # (presumably the .woff source of the @font-face rule -- verify on the page).
    font_url = re.findall(r'url\(\'(.*?)\'\)', req.text, re.S)[2]
    # The captured address has no scheme, so one is prepended.
    font_url = 'https:' + font_url
    # Download the font file.
    req_font = requests.get(font_url, headers=h)
    with open(file_name, 'wb') as f:
        f.write(req_font.content)


# Template: glyph name in the reference font -> digit that glyph renders.
template_data = {
    'uniE387': 3,
    'uniEDA2': 9,
    'uniE667': 5,
    'uniF350': 0,
    'uniF8BF': 6,
    'uniF4DA': 8,
    'uniE40E': 2,
    'uniF079': 7,
    'uniE295': 4,
    'uniF82F': 1,
}
# Reference font whose glyphs are the keys of template_data.
template_font = TTFont('stonefont.woff')


# --- Identify a glyph by comparing it with the template glyphs ---

def cal_distance(list):
    """Return the mean Euclidean distance over a sequence of point pairs.

    list: sequence of ``(p, q)`` pairs; each point is indexable as
        ``[0]`` (x) and ``[1]`` (y). The parameter name shadows the builtin
        but is kept so existing callers are unaffected.
    Returns ``float('inf')`` for an empty sequence, so that "nothing to
    compare" never wins a nearest-match search.
    """
    if not list:
        return float('inf')
    total = 0
    for p, q in list:
        dx = p[0] - q[0]
        dy = p[1] - q[1]
        # Both squared terms belong under the square root.
        total += sqrt(dx * dx + dy * dy)
    return total / len(list)


def deciphering(req, check_file):
    """Map every font-encoded entity of a page to the digit it displays.

    req: response object whose ``.text`` contains ``&#x....;`` entities.
    check_file: path of the font file downloaded for this page.
    Returns a dict ``{'&#x<code>;': digit}``.
    Raises KeyError when a glyph of the page font matches no template glyph.
    """
    # Collect the distinct hex codes of the entities that appear in the page.
    encrypted_codes = set(re.findall(r'&#x(.*?);', req.text))

    # Template glyph names; the first two entries of the glyph order are
    # skipped (presumably non-digit placeholder glyphs -- TODO confirm).
    template_names = template_font.getGlyphOrder()[2:]
    template_glyf = template_font['glyf']

    # Font served together with this page.
    target_font = TTFont(check_file)
    target_glyf = target_font['glyf']
    target_names = set(target_font.getGlyphOrder())

    correct_num = {}
    for code in encrypted_codes:
        glyph_name = 'uni' + code.upper()
        # Entities that are not glyphs of this font are ordinary HTML
        # character references, not encoded digits: leave them alone.
        if glyph_name not in target_names:
            continue
        target = target_glyf[glyph_name].coordinates

        best_name = None
        best_dist = float('inf')
        for name in template_names:
            template = template_glyf[name].coordinates
            # The same digit has a different number of points in different
            # fonts; 10 is a hand-picked tolerance for that difference.
            if abs(len(target) - len(template)) < 10:
                dist = cal_distance(list(zip(target, template)))
                if dist < best_dist:
                    best_dist = dist
                    best_name = name

        if best_name is None:
            raise KeyError('no template glyph matches ' + glyph_name)
        correct_num['&#x' + code + ';'] = template_data[best_name]
    return correct_num


# Collect the ids of the film detail pages.
get_movie_url()

# Scrape every film page listed in film_code.txt.
columns = ['name', 'name_eng', 'genres', 'release_time', 'country', 'produce_time', 'rating', 'rate_num', 'box_office', 'script', 'cast']
rows = []
with open('film_code.txt', 'r') as f:
    lines = f.readlines()
print(len(lines))
for line in lines:
    try:
        movie_url = 'https://maoyan.com/films/' + str(line.strip())
        d = parse(movie_url)
        print(d)
        rows.append(d)
    except Exception as e:
        # Best-effort crawl: report the failure and go on with the next film.
        print(e)
    finally:
        # Rewrite the spreadsheet after every film so progress is kept.
        # Rows are accumulated in a list because DataFrame.append and the
        # `encoding` argument of to_excel are gone in pandas 2.x.
        pandas.DataFrame(rows, columns=columns).to_excel('data.xlsx')