1. 程式人生 > 實用技巧 >字型反爬

字型反爬

總體思路

破解字型反爬:

  1. 獲取字型檔案,

  2.對字型檔案進行base64解碼,儲存為ttf檔案,

  3.開啟ttf檔案,檢視對映關係建立對映字典(字形與實際字型之間的關係)(或者可以設定自動識別)

  4.通過code與name的關係,尋找name與字形之間的關係,得到code(需要做hex)與字形之間的關係

  5.第二次獲取響應時,直接通過code與字形之間的關係,得到實際的字型,並將code替換成實際字型

破解過程

可以明顯的看到顯示正常但html中的是亂碼。

1.獲取字型檔案

如上所示,字型檔案隱藏在css中,點選右上角的style。

上圖中那一段密文就是我們需要的ttf檔案(這一段密文是可變的,但其中的對映關係不變)

獲取那段程式碼然後儲存為ttf檔案,檢視字形與實際字型的對應關係

進行base64解碼(為什麼是base64呢,因為font-face中標明瞭是base64)並儲存為ttf檔案,使用Fontcreator開啟,可以看到字形與實際字型的對應關係,也就是對映

因為是自定義的字形,且個數不多,手動建立對映表

這裡我手動建:

    # Hand-built table: glyph shape (as seen in the font editor) -> the digit
    # that shape actually draws.
    font_dict = {
        "glyph1": "0",
        "glyph2": "1",
        "glyph3": "2",
        "glyph4": "3",
        "glyph5": "4",
        "glyph6": "5",
        "glyph7": "6",
        "glyph8": "7",
        "glyph9": "8",
        "glyph10": "9",
    }

2.將ttf檔案儲存為xml檔案,通過xml檔案分析code,name,字形之間的關係

import re
import requests
import base64
from fontTools.ttLib import TTFont
import io

def get_ttf():
    """Download the listing page and dump its embedded font as XML.

    The page inlines a base64-encoded TTF font in its CSS.  The font is
    decoded in memory and written to ``wuba.xml`` so the code -> glyph-name
    table can be inspected by hand.

    Raises:
        ValueError: if no inline base64 font is found in the response.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params`` (the query string), not ``headers``.
    page_text = requests.get(url, headers=headers).text
    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no inline base64 font found in the response")
    # Wrap the decoded bytes in a file-like object so the font itself never
    # has to be written to disk.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    font.saveXML("wuba.xml")


def main():
    """Script entry point: dump the font table once."""
    get_ttf()


if __name__ == '__main__':
    main()

分析xml檔案

回顧一下上文中提到的三個關係:code ——>name,name——>字形,字形——>實際字型(值)

前面說過:code與name是變化的,但字形與實際值是不變的,然後我們分析一下name與字形之間的關係

name:glyph0001

字形:glyph1

很明顯中間多了幾個零而已,處理一下,直接讓code與字形對應起來,下一次請求過來時,直接通過code找到字形,最終找到值做code替換。

處理轉換一下資料:

梳理一下:當獲取到響應的html時,我們會先拿到ttf檔案,然後通過fonttools模組處理code,轉換name為字形,通過對映字典自動轉換為值,這樣code與值就直接對應上了

到這裡我們還需要處理一下code,因為現在得到的code與html中的文字是有一點小差別的

可以看到hex之後開頭的"0"變成了"&#",末尾還多了一個";"(例如0x993c在html中是&#x993c;)

接下來,code直接對映值,然後替換html中的文字

# -*- coding: utf-8 -*-
# __author__ = "maple"
import re
import requests
import base64
from fontTools.ttLib import TTFont
import io

def _glyph_key(name):
    """Turn a cmap glyph name such as ``glyph00008`` into ``glyph8``."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        # Not of the "<prefix><digits>" form; leave it untouched.
        return name
    return match.group(1) + str(int(match.group(2)))


def _decode_page(page_text, font, font_dict):
    """Replace every obfuscated ``&#x....;`` entity in *page_text* by its digit."""
    for code, name in font.getBestCmap().items():
        digit = font_dict.get(_glyph_key(name))
        if digit is None:
            # Glyph that is not part of the hand-built table: nothing to map.
            continue
        # Build the entity from the code point directly.  Replacing "0" in
        # hex(code) would also rewrite zero digits inside the number itself.
        page_text = page_text.replace("&#x%x;" % code, digit)
    return page_text


def get_ttf():
    """Fetch the listing page and substitute the real digits into its HTML.

    Returns:
        The page text with every font-obfuscated entity replaced by the
        digit it renders as.

    Raises:
        ValueError: if no inline base64 font is found in the response.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Glyph shape -> the digit that shape actually draws.
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params`` (the query string), not ``headers``.
    page_text = requests.get(url, headers=headers).text
    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no inline base64 font found in the response")
    # Keep the decoded font in memory as a file-like object.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    return _decode_page(page_text, font, font_dict)
        


def main():
    """Script entry point: run the page fetch and decode once."""
    get_ttf()


if __name__ == "__main__":
    main()

3.資料解析

最後寫入資料庫或者redis即可

程式碼:

# -*- coding: utf-8 -*-
# __author__ = "maple"
import re
import requests
import base64
from fontTools.ttLib import TTFont
from lxml import etree
import io

def _glyph_key(name):
    """Turn a cmap glyph name such as ``glyph00008`` into ``glyph8``."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        # Not of the "<prefix><digits>" form; leave it untouched.
        return name
    return match.group(1) + str(int(match.group(2)))


def _decode_page(page_text, font, font_dict):
    """Replace every obfuscated ``&#x....;`` entity in *page_text* by its digit."""
    for code, name in font.getBestCmap().items():
        digit = font_dict.get(_glyph_key(name))
        if digit is None:
            # Glyph that is not part of the hand-built table: nothing to map.
            continue
        # Build the entity from the code point directly.  Replacing "0" in
        # hex(code) would also rewrite zero digits inside the number itself.
        page_text = page_text.replace("&#x%x;" % code, digit)
    return page_text


def _joined_text(node, path):
    """Concatenate all text matched by *path* under *node*, minus plain spaces."""
    return "".join(node.xpath(path)).replace(" ", "")


def _parse_listing(li):
    """Extract the fields of one listing ``<li>`` in the order they are printed.

    Raises:
        IndexError: if the title, room or verification node is missing.
    """
    # Title
    title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
    # Room type and area, separated by a non-breaking space (\xa0)
    room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split("\xa0")
    # Verification status
    check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
    # Location; "\\:" is the same two characters the original "\:" produced,
    # written without the invalid escape sequence.
    infor = (
        _joined_text(li, "./div[@class='des']/p[@class='infor']//text()")
        .replace("\xa0", "")
        .replace("\n", "")
        .replace(":", "\\:")
    )
    # Source
    gongyu = _joined_text(
        li, "./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()"
    ).replace("\n", " ")
    # Price
    price = _joined_text(
        li, './div[@class="list-li-right"]/div[@class="money"]//text()'
    ).replace("\n", " ")
    # Publication time
    up_date = _joined_text(
        li, './div[@class="list-li-right"]/div[@class="send-time"]//text()'
    ).replace("\n", " ")

    type_room = room[0]  # room layout
    area_room = room[-1].strip("\n")  # area
    return title, check_status, infor, gongyu, price, up_date, type_room, area_room


def get_ttf():
    """Fetch the listing page, decode its obfuscated digits and print each row.

    Raises:
        ValueError: if no inline base64 font is found in the response.
    """
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Glyph shape -> the digit that shape actually draws.
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params`` (the query string), not ``headers``.
    page_text = requests.get(url, headers=headers).text
    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        raise ValueError("no inline base64 font found in the response")
    # Keep the decoded font in memory as a file-like object.
    font = TTFont(io.BytesIO(base64.b64decode(match.group(1))))
    page_text = _decode_page(page_text, font, font_dict)

    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            row = _parse_listing(li)
        except IndexError:
            # Best effort: an <li> without the expected nodes is skipped
            # instead of aborting the whole page.
            continue
        print(*row)
def main():
    """Script entry point: scrape and print the listing page once."""
    get_ttf()


if __name__ == "__main__":
    main()

使用多工非同步協程

# -*- coding: utf-8 -*-
# __author__ = "maple"

# -*- coding: utf-8 -*-
# __author__ = "maple"
import io
import re
import requests
import base64
import pymysql
import json
import asyncio
import aiohttp
from fontTools.ttLib import TTFont
from lxml import etree
# Database connection, opened once at import time and handed to storage()
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='wuba',
    charset='utf8',
)


async def get_ttf(url):
    """
    Download one listing page and extract the font embedded in it.

    The glyph-shape -> real-character relation inside the font stays the same
    between responses, even though the embedded font data itself changes.

    :param url: address of the listing page to download
    :return: tuple ``(font, page_text)``: the decoded font bytes and the response text
    :raises ValueError: if the response carries no inline base64 font
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }
    async with aiohttp.ClientSession() as sess:
        # sess.get() is itself an async context manager; no extra await needed.
        async with sess.get(url=url, headers=headers) as response:
            page_text = await response.text()
    match = re.search(r"charset=utf-8;base64,(.*?)\'", page_text)
    if match is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'group'".
        raise ValueError(f"no inline base64 font found in the response for {url}")
    font = base64.b64decode(match.group(1))
    return font, page_text


def _glyph_key(name):
    """Turn a cmap glyph name such as ``glyph00008`` into ``glyph8``."""
    match = re.fullmatch(r"(\D*)(\d+)", name)
    if match is None:
        # Not of the "<prefix><digits>" form; leave it untouched.
        return name
    return match.group(1) + str(int(match.group(2)))


def _decode_page(page_text, font, font_dict):
    """Replace every obfuscated ``&#x....;`` entity in *page_text* by its digit."""
    for code, name in font['cmap'].getBestCmap().items():
        real_text = font_dict.get(_glyph_key(name))
        if real_text is None:
            # Glyph that is not part of the hand-built table: nothing to map.
            continue
        # e.g. code point 0x993c appears in the HTML as "&#x993c;".  Build the
        # entity from the code point directly: replacing "0" in hex(code)
        # would also rewrite zero digits inside the number itself.
        page_text = page_text.replace(f"&#x{code:x};", real_text)
    return page_text


def _joined_text(node, path):
    """Concatenate all text matched by *path* under *node*, minus plain spaces."""
    return "".join(node.xpath(path)).replace(" ", "")


def _parse_listing(li):
    """
    Build the item dict for one listing ``<li>`` element.

    :raises IndexError: if the title, room or verification node is missing
    """
    # Title
    title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
    # Room type and area, separated by a non-breaking space (\xa0)
    room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split("\xa0")
    # Verification status
    check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
    # Location; "\\:" is the same two characters the original "\:" produced,
    # written without the invalid escape sequence.
    infor = (
        _joined_text(li, "./div[@class='des']/p[@class='infor']//text()")
        .replace("\xa0", "")
        .replace("\n", "")
        .replace(":", "\\:")
    )
    # Source
    gongyu = _joined_text(
        li, "./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()"
    ).replace("\n", " ")
    # Price
    price = _joined_text(
        li, './div[@class="list-li-right"]/div[@class="money"]//text()'
    ).replace("\n", " ")
    # Publication time
    up_date = _joined_text(
        li, './div[@class="list-li-right"]/div[@class="send-time"]//text()'
    ).replace("\n", " ")

    return {
        "title": title,
        "check_status": check_status,
        "infor": infor,
        "gongyu": gongyu,
        "price": price,
        "up_date": up_date,
        "type_room": room[0],  # room layout
        "area_room": room[-1].strip("\n"),  # area
    }


def page_source(task):
    """
    Done-callback: decode the page text, parse every listing and store it.

    :param task: finished task whose result is the ``(font, page_text)`` pair
                 returned by ``get_ttf``
    :return: None
    """
    # Glyph shape --> the real character it draws
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    # A failed or cancelled download has no result to parse; report and stop
    # rather than letting task.result() re-raise inside the callback.
    if task.cancelled():
        return
    error = task.exception()
    if error is not None:
        print(error)
        return

    font_bytes, page_text = task.result()
    font = TTFont(io.BytesIO(font_bytes))  # read from memory, nothing saved to disk
    page_text = _decode_page(page_text, font, font_dict)

    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            items = _parse_listing(li)
        except IndexError:
            # Best effort: an <li> without the expected nodes is skipped
            # instead of aborting the whole page.
            continue
        # Persist the parsed row
        storage(items, conn)

def storage(items, conn):
    """
    Persist one parsed listing as a row of the ``chuzu`` table.

    The values are bound as query parameters rather than interpolated into
    the SQL text, so quotes or backslashes in scraped text can neither break
    nor alter the statement; each value is stored exactly as given.

    :param items: dict with the keys title, type_room, area_room, infor,
                  gongyu, price, check_status and up_date
    :param conn: open database connection; committed on success, rolled back
                 on failure
    :return: None
    :raises KeyError: if *items* lacks one of the expected keys
    """
    columns = (
        "title",
        "type_room",
        "area_room",
        "infor",
        "gongyu",
        "price",
        "check_status",
        "up_date",
    )
    params = tuple(items[column] for column in columns)
    sql = (
        "insert into chuzu(title,type_room,area_room,infor,gongyu,price,check_status,up_date) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    # The with-block closes the cursor whether or not the insert succeeds.
    with conn.cursor() as cursor:
        try:
            cursor.execute(sql, params)
            conn.commit()
        except Exception as e:
            # Best effort: report the failed row and keep the crawl going.
            print(e)
            conn.rollback()



def main():
    """
    Crawl listing pages 1-70 concurrently.

    Every page is fetched by its own ``get_ttf`` task; ``page_source`` runs
    as that task's done-callback to parse and store the rows.

    :return: None
    """
    # Create the loop explicitly.  Scheduling tasks through
    # asyncio.ensure_future()/get_event_loop() while no loop is running is
    # deprecated and fails on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        task_list = []
        for num in range(1, 71):
            url = f'https://cd.58.com/chuzu/pn{num}/'
            # Wrap the coroutine in a task bound to this loop
            task = loop.create_task(get_ttf(url))
            # Parse and store as soon as the download finishes
            task.add_done_callback(page_source)
            task_list.append(task)
        # Run until every page has been fetched and handled
        loop.run_until_complete(asyncio.wait(task_list))
    finally:
        asyncio.set_event_loop(None)
        loop.close()


if __name__ == '__main__':
    main()

成果: