字型反爬
總體思路
破解字型反爬:
1. 獲取字型檔案,
2.對字型檔案進行base64解碼,儲存為ttf檔案,
3.開啟ttf檔案,檢視對映關係建立對映字典(字形與實際字型之間的關係)(或者可以設定自動識別)
4.通過code與name的關係,再找出name與字形之間的關係,從而得到code(需要先轉為十六進位制)與字形之間的關係
5.第二次獲取響應時,直接通過code與字形之間的關係,得到實際的字型,並將code替換成實際字型
破解過程
可以明顯的看到顯示正常但html中的是亂碼。
1.獲取字型檔案
如上所示,字型檔案隱藏在css中,點選右上角的style。
上圖中那一段密文就是我們需要的ttf檔案(這一段密文是可變的,但其中的對映關係不變)
獲取那段程式碼然後儲存為ttf檔案,檢視字形與實際字型的對應關係
進行base64解碼(為什麼是base64呢,因為font-face中標明瞭是base64)並儲存為ttf檔案,使用FontCreator開啟,可以看到字形與實際字型的對應關係,也就是對映
因為是自定義的字形,且個數不多,手動建立對映表
這裡我手動建:
# Glyph shape name -> the digit that shape actually draws.
# The shapes are numbered 1..10 and draw the digits 0..9 in order, so shape
# "glyph<k>" stands for the digit k - 1.
font_dict = {f"glyph{index}": str(index - 1) for index in range(1, 11)}
2.將ttf檔案儲存為xml檔案,通過xml檔案分析code,name,字形之間的關係
import base64
import io
import re

import requests
from fontTools.ttLib import TTFont


def get_ttf():
    """Download the listing page and dump its embedded font to ``wuba.xml``.

    The page carries its font inline as a base64 payload (matched by the
    ``charset=utf-8;base64,...'`` pattern below).  The payload is decoded in
    memory, loaded with fontTools and written out as XML so that the
    code -> name -> glyph relations can be inspected by hand.

    Raises:
        ValueError: if the page contains no base64-embedded font.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, which would turn the dict into a query string
    # and leave the User-Agent unset.
    page_text = requests.get(url, headers=headers, timeout=10).text

    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no base64-embedded font found in the page")

    # Keep the decoded font in memory as bytes instead of writing a .ttf first.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    font.saveXML("wuba.xml")


def main():
    """Script entry point: fetch the page and dump the font XML."""
    get_ttf()


if __name__ == '__main__':
    main()
分析xml檔案
回顧一下上文中提到的三個關係:code ——>name,name——>字形,字形——>實際字型(值)
前面說過:code與name是變化的,但字形與實際值是不變的,然後我們分析一下name與字形之間的關係
name:glyph0001
字形:glyph1
很明顯中間多了幾個零而已,處理一下,直接讓code與字形對應起來,下一次請求過來時,直接通過code找到字形,最終找到值做code替換。
處理轉換一下資料:
梳理一下:當獲取到響應的html時,我們會先拿到ttf檔案,然後通過fonttools模組處理code,轉換name為字形,通過對映字典自動轉換為值,這樣code與值就直接對應上了
到這裡我們還需要處理一下code,因為現在得到的code與html中的文字是有一點小差別的
可以看到:十六進位制code開頭的"0"在html中變成了"&#",末尾還多了一個";"(例如0x993c在html中寫作&#x993c;)
接下來,讓code直接對映到值,然後替換html中的文字
# -*- coding: utf-8 -*-
# __author__ = "maple"
import base64
import io
import re

import requests
from fontTools.ttLib import TTFont


def _entity_for(code):
    """Return the HTML character reference for a code point (0x993c -> '&#x993c;')."""
    # Format the number directly; rewriting the text of hex(code) would also
    # mangle any zero digit inside the code point itself.
    return f"&#x{code:x};"


def _shape_for(name):
    """Strip the zero padding from a cmap glyph name ('glyph00008' -> 'glyph8')."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        return name
    return match.group(1) + str(int(match.group(2)))


def get_ttf():
    """Fetch the listing page and replace its obfuscated digits with real ones.

    The page embeds a font as a base64 payload.  The font's cmap gives
    code -> name, the name is normalised to a glyph shape, and ``font_dict``
    maps the shape to the real digit.  Every character reference for a mapped
    code point in the HTML is then replaced by that digit.

    Returns:
        The page HTML with the obfuscated character references decoded.

    Raises:
        ValueError: if the page contains no base64-embedded font.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Glyph shape -> real digit.
    font_dict = {
        'glyph1': '0', 'glyph2': '1', 'glyph3': '2', 'glyph4': '3', 'glyph5': '4',
        'glyph6': '5', 'glyph7': '6', 'glyph8': '7', 'glyph9': '8', 'glyph10': '9',
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, not `headers`.
    page_text = requests.get(url, headers=headers, timeout=10).text

    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no base64-embedded font found in the page")

    # Keep the decoded font in memory as bytes.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    code_name = font.getBestCmap() or {}
    for code, name in code_name.items():
        code_value = font_dict.get(_shape_for(name))
        if code_value is None:
            # Glyph that is not one of the ten known digit shapes: leave it alone.
            continue
        page_text = page_text.replace(_entity_for(code), code_value)
    return page_text


def main():
    """Script entry point."""
    get_ttf()


if __name__ == '__main__':
    main()
3.資料解析
最後寫入資料庫或者redis即可
程式碼:
# -*- coding: utf-8 -*-
# __author__ = "maple"
import base64
import io
import re

import requests
from fontTools.ttLib import TTFont
from lxml import etree


def _entity_for(code):
    """Return the HTML character reference for a code point (0x993c -> '&#x993c;')."""
    # Format the number directly; rewriting the text of hex(code) would also
    # mangle any zero digit inside the code point itself.
    return f"&#x{code:x};"


def _shape_for(name):
    """Strip the zero padding from a cmap glyph name ('glyph00008' -> 'glyph8')."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        return name
    return match.group(1) + str(int(match.group(2)))


def _parse_listing(li):
    """Extract the printed fields from one listing ``<li>`` element.

    Returns:
        A tuple ``(title, check_status, infor, gongyu, price, up_date,
        type_room, area_room)``.

    Raises:
        IndexError: if the element lacks one of the required nodes.
    """
    # Title
    title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
    # Room type and area, separated by non-breaking spaces (\xa0)
    room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split("\xa0")
    # Verification status
    check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
    # Location
    infor = (
        "".join(li.xpath("./div[@class='des']/p[@class='infor']//text()"))
        .replace("\xa0", "")
        .replace("\n", "")
        .replace(" ", "")
        .replace(":", "\\:")
    )
    # Source
    gongyu = (
        "".join(li.xpath("./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()"))
        .replace(" ", "")
        .replace("\n", " ")
    )
    # Price
    price = (
        "".join(li.xpath('./div[@class="list-li-right"]/div[@class="money"]//text()'))
        .replace(" ", "")
        .replace("\n", " ")
    )
    # Publication time
    up_date = (
        "".join(li.xpath('./div[@class="list-li-right"]/div[@class="send-time"]//text()'))
        .replace(" ", "")
        .replace("\n", " ")
    )
    type_room = room[0]  # room layout, e.g. bedrooms / living rooms
    area_room = room[-1].strip("\n")  # area in square metres
    return title, check_status, infor, gongyu, price, up_date, type_room, area_room


def get_ttf():
    """Fetch the listing page, decode its obfuscated digits and print each listing.

    The embedded base64 font is loaded in memory; its cmap (code -> name) is
    combined with ``font_dict`` (glyph shape -> digit) to replace every
    obfuscated character reference in the HTML before it is parsed with lxml.

    Raises:
        ValueError: if the page contains no base64-embedded font.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Glyph shape -> real digit.
    font_dict = {
        'glyph1': '0', 'glyph2': '1', 'glyph3': '2', 'glyph4': '3', 'glyph5': '4',
        'glyph6': '5', 'glyph7': '6', 'glyph8': '7', 'glyph9': '8', 'glyph10': '9',
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, not `headers`.
    page_text = requests.get(url, headers=headers, timeout=10).text

    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no base64-embedded font found in the page")

    # Keep the decoded font in memory as bytes.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    code_name = font.getBestCmap() or {}
    for code, name in code_name.items():
        code_value = font_dict.get(_shape_for(name))
        if code_value is None:
            # Glyph that is not one of the ten known digit shapes: leave it alone.
            continue
        page_text = page_text.replace(_entity_for(code), code_value)

    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            fields = _parse_listing(li)
        except IndexError:
            # Not a regular listing entry (a required node is missing): skip it.
            continue
        print(*fields)


def main():
    """Script entry point."""
    get_ttf()


if __name__ == '__main__':
    main()
使用多工非同步協程
# -*- coding: utf-8 -*-
# __author__ = "maple"
import asyncio
import base64
import io
import json
import re

import aiohttp
import pymysql
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# Database connection shared by every storage() call.
# NOTE(review): credentials are hard-coded; move them to configuration before real use.
conn = pymysql.connect(host='localhost', user='root', password='123456', db='wuba', charset='utf8')


async def get_ttf(url):
    """Download one listing page and extract its embedded font.

    :param url: address of the listing page to fetch
    :return: tuple ``(font, page_text)`` - the decoded font bytes and the page HTML
    :raises ValueError: if the page contains no base64-embedded font
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }
    async with aiohttp.ClientSession() as sess:
        async with sess.get(url=url, headers=headers) as response:
            page_text = await response.text()

    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError(f"no base64-embedded font found in {url}")
    return base64.b64decode(match.group(1)), page_text


def _entity_for(code):
    """Return the HTML character reference for a code point (0x993c -> '&#x993c;')."""
    # Format the number directly; rewriting the text of hex(code) would also
    # mangle any zero digit inside the code point itself.
    return f"&#x{code:x};"


def _shape_for(name):
    """Strip the zero padding from a cmap glyph name ('glyph00008' -> 'glyph8')."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        return name
    return match.group(1) + str(int(match.group(2)))


def _parse_listing(li):
    """Build the item dict for one listing ``<li>`` element.

    :raises IndexError: if the element lacks one of the required nodes
    """
    # Title
    title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
    # Room type and area, separated by non-breaking spaces (\xa0)
    room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split("\xa0")
    # Verification status
    check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
    # Location.  The old ":" -> "\:" escaping is not applied any more: it only
    # existed for the string-built SQL, and storage() now binds parameters.
    infor = (
        "".join(li.xpath("./div[@class='des']/p[@class='infor']//text()"))
        .replace("\xa0", "")
        .replace("\n", "")
        .replace(" ", "")
    )
    # Source
    gongyu = (
        "".join(li.xpath("./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()"))
        .replace(" ", "")
        .replace("\n", " ")
    )
    # Price
    price = (
        "".join(li.xpath('./div[@class="list-li-right"]/div[@class="money"]//text()'))
        .replace(" ", "")
        .replace("\n", " ")
    )
    # Publication time
    up_date = (
        "".join(li.xpath('./div[@class="list-li-right"]/div[@class="send-time"]//text()'))
        .replace(" ", "")
        .replace("\n", " ")
    )
    return {
        "title": title,
        "check_status": check_status,
        "infor": infor,
        "gongyu": gongyu,
        "price": price,
        "up_date": up_date,
        "type_room": room[0],  # room layout, e.g. bedrooms / living rooms
        "area_room": room[-1].strip("\n"),  # area in square metres
    }


def page_source(task):
    """Done-callback: decode the obfuscated digits, parse the listings, store them.

    :param task: finished task whose result is the ``(font, page_text)`` tuple
        returned by get_ttf()
    :return: None
    """
    # Glyph shape -> real digit.
    font_dict = {
        'glyph1': '0', 'glyph2': '1', 'glyph3': '2', 'glyph4': '3', 'glyph5': '4',
        'glyph6': '5', 'glyph7': '6', 'glyph8': '7', 'glyph9': '8', 'glyph10': '9',
    }
    # A failed download must not raise out of the callback; report it and stop.
    if task.cancelled():
        return
    error = task.exception()
    if error is not None:
        print(error)
        return

    font, page_text = task.result()
    font = TTFont(io.BytesIO(font))  # read from memory, nothing is written to disk
    for code, name in (font['cmap'].getBestCmap() or {}).items():
        # e.g. code 38006 (0x993c) with name glyph00002
        real_text = font_dict.get(_shape_for(name))
        if real_text is None:
            # Glyph that is not one of the ten known digit shapes: leave it alone.
            continue
        page_text = page_text.replace(_entity_for(code), real_text)

    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            items = _parse_listing(li)
        except IndexError:
            # Not a regular listing entry (a required node is missing): skip it.
            continue
        # Persist the item.
        storage(items, conn)


def storage(items, conn):
    """Insert one parsed listing into the ``chuzu`` table.

    :param items: dict with the keys title, type_room, area_room, infor,
        gongyu, price, check_status and up_date
    :param conn: open pymysql connection
    :return: None
    """
    columns = ("title", "type_room", "area_room", "infor",
               "gongyu", "price", "check_status", "up_date")
    values = tuple(items[column] for column in columns)
    # Bound parameters: the values are scraped text and may contain quotes.
    sql = (
        "insert into chuzu(title,type_room,area_room,infor,gongyu,price,check_status,up_date) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    try:
        # The context manager closes the cursor once the statement has run.
        with conn.cursor() as ex:
            ex.execute(sql, values)
        conn.commit()
    except pymysql.MySQLError as e:
        print(e)
        conn.rollback()


async def _crawl(urls):
    """Schedule one download task per URL and wait until all of them finish."""
    task_list = []
    for url in urls:
        # Wrap the coroutine in a task and bind the parsing callback to it.
        task = asyncio.ensure_future(get_ttf(url))
        task.add_done_callback(page_source)
        task_list.append(task)
    await asyncio.wait(task_list)


def main():
    """Crawl listing pages 1-70 concurrently."""
    urls = [f'https://cd.58.com/chuzu/pn{num}/' for num in range(1, 71)]
    # asyncio.run() creates the event loop, runs the crawl and closes the loop.
    asyncio.run(_crawl(urls))


if __name__ == '__main__':
    main()
成果: