1. 程式人生 > >Python-Requests-瓜子二手車資訊抓取

Python-Requests-瓜子二手車資訊抓取

#首先,來看下瓜子二手車網站,我們需要抓取的部分。

  • 1)列表頁 image.png
  • 2)詳情頁, 就來抓取下這些欄位吧: 1.標題 2.上牌時間 3.公里數 4.上牌地 5.排量 6.變速 7.價格 image.png

#思路:

  • 1.構造翻頁連結,可以看出一共有117頁,構造117個連結進行抓取 image.png

  • 2.獲取每頁的每個車的詳情連結 image.png

  • 3.進入詳情頁抓取關鍵欄位

  • 4.存入資料庫

#注意

  • 1)請求頭部必須要有cookie,或者另外設定cookie。cookie裡面有好多值,測試後發現只需要一個,就是antipas; image.png image.png image.png 比較一下,就是這麼的噁心。但是第一個裡面你會發現一個關鍵字:它就是cookie需要傳的值antipas。複製出來看看,script標籤裡面包著一段js程式碼,噁心的要死;js還是我的弱項,不知道怎麼去分析它,就暫且跳過吧。 image.png
  • 2)再來看個有意思的圖片,什麼意思!!!!

2018-06-15_160428.png

  • 3)原來: 可能是因為頻繁訪問的原因,被限制了 image.png
  • 4) 那怎麼辦!!! – 換cookie antipas的值,如果懂js的話值得研究下… – 用隨機代理?? 測試:從庫裡找了個可用的代理,換了個antipas,也不行 image.png

#最後貼上我的程式碼:

# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
from guazi.execute_script import excuteScript

# Browser-like request headers that GuaZiCrawler installs on its requests
# session. The User-Agent string identifies as desktop Chrome 66.
# NOTE(review): 'br' is advertised in Accept-Encoding -- confirm that the HTTP
# stack in use can actually decode Brotli-compressed responses.
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.139 Safari/537.36'
    ),
}


class GuaZiCrawler():
    """Crawler for the guazi.com used-car listings under ``/qd/buy/``.

    Workflow: obtain the ``antipas`` anti-bot cookie, find out how many list
    pages exist, collect the detail-page links from every list page and
    scrape a handful of fields from each detail page.

    NOTE(review): the selectors keep the ``[@class="..."]`` attribute form of
    the original code -- confirm the installed pyquery/cssselect accepts it.
    """

    def __init__(self):
        self.baseurl = 'https://www.guazi.com'
        self.sess = requests.Session()
        # Merge into the session's own header mapping rather than replacing
        # it with the module-level dict, which would otherwise be shared
        # (and mutable) across every crawler instance.
        self.sess.headers.update(headers)
        self.start_url = 'https://www.guazi.com/qd/buy/'

    @staticmethod
    def _repair_mojibake(text):
        """Undo UTF-8-read-as-Latin-1 mojibake; return *text* unchanged if it is not mojibake."""
        try:
            return text.encode('ISO-8859-1').decode('utf-8')
        except UnicodeError:
            # Text was already decoded correctly (or is not UTF-8 at all).
            return text

    def _absolute_url(self, href):
        """Return *href* as an absolute URL, prefixing ``self.baseurl`` to relative links."""
        if href.startswith('http'):
            return href
        return self.baseurl + href

    def _page_urls(self, page_count):
        """Build the list-page URLs ``<start_url>o1/`` .. ``<start_url>o<page_count>/``."""
        return ['{}o{}/'.format(self.start_url, i)
                for i in range(1, page_count + 1)]

    def anti_value(self):
        """
        Fetch the landing page and extract the two arguments of the embedded
        ``anti('<value>','<key>')`` call needed to compute the antipas cookie.

        :return: tuple ``(value, key)`` of strings
        :raises IndexError: if the page does not contain the ``anti(...)`` call
        """
        response = self.sess.get(self.baseurl)
        # Decode the raw bytes directly: the previous
        # ``.text.encode('ISO-8859-1')`` round trip raised whenever requests
        # had already decoded the body with a non-Latin-1 charset.
        content = response.content.decode('utf-8', errors='replace')
        match = re.search(r"value=anti\('(.*?)','(.*?)'\)", content)
        if match is None:
            raise IndexError(
                'antipas challenge not found in response from {}'.format(
                    self.baseurl))
        return match.groups()

    def caculate_antipas(self):
        """
        Compute the antipas value and store it as a cookie on the session.

        :return: None
        """
        value, key = self.anti_value()
        self.sess.cookies.set('antipas', excuteScript(value, key))

    def page_url(self):
        """
        Collect the URLs of all list pages.

        Sets the antipas cookie first, then reads the pagination bar of
        ``self.start_url`` to find the highest page number.

        :return: list of list-page URLs, one per page
        :raises ValueError: if no numeric pagination link is found
        """
        self.caculate_antipas()
        content = pq(self.sess.get(self.start_url).text)
        labels = (
            each.text().strip()
            for each in content('ul[@class="pageLink clearfix"]  > li > a').items()
        )
        # Only purely numeric labels are page numbers; labels such as
        # "next" or "12>" are ignored instead of crashing int().
        page_numbers = [int(label) for label in labels
                        if re.fullmatch(r'\d+', label)]
        if not page_numbers:
            raise ValueError(
                'no pagination links found on {}'.format(self.start_url))
        return self._page_urls(max(page_numbers))

    def index_page(self, start_url):
        """
        Yield the detail-page URL of every car on one list page.

        :param start_url: URL of the list page to scan
        :return: generator of absolute detail-page URLs
        """
        content = pq(self.sess.get(start_url).text)
        for each in content('ul[@class="carlist clearfix js-top"]  > li > a').items():
            url = each.attr.href
            if not url:
                # Anchor without an href: nothing to follow.
                continue
            # Yield every link; previously links that were already absolute
            # were dropped because the yield sat inside the "relative" branch.
            yield self._absolute_url(url)

    def detail_page(self, detail_url):
        """
        Scrape the fields of one car from its detail page.

        :param detail_url: absolute URL of the detail page
        :return: dict with keys title, bordingdate, km, address,
                 displacement, gearbox, price
        """
        content = pq(self.sess.get(detail_url).text)
        spec_items = content('ul[@class="assort clearfix"]').find('li')

        data_dict = {
            'title': content('h2.titlebox').text().strip(),
            'bordingdate': content('ul[@class="assort clearfix"] li[@class="one"] span').text(),
            'km': content('ul[@class="assort clearfix"] li[@class="two"] span').text(),
            'address': spec_items.eq(2).find('span').text(),
            'displacement': spec_items.eq(3).find('span').text(),
            # Was a copy of the 'km' selector. Following the positional
            # pattern of the two entries above, gearbox is the fifth <li>.
            # NOTE(review): confirm against the live page markup.
            'gearbox': spec_items.eq(4).find('span').text(),
            'price': content('span[@class="pricestype"]').text(),
        }
        if not data_dict['title']:
            # No title found: dump the page for inspection, repairing
            # mojibake when possible without letting the dump itself crash.
            print(self._repair_mojibake(str(content)))

        return data_dict

    def run(self):
        """Crawl every list page and print each scraped record."""
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                result = self.detail_page(detail_url)
                print(result)
            print('*'*200)


if __name__ == '__main__':
    # Script entry point: build a crawler and run the whole crawl.
    gzcrawler = GuaZiCrawler()
    gzcrawler.run()


js部分

# -*- coding: utf-8 -*-
import execjs


def excuteScript(value, key):
    """
    Compute the ``antipas`` cookie value sent when crawling guazi.com.

    Pure-Python equivalent of the ``anti(string, key)`` JavaScript routine
    taken from the site's first response. That routine is a stock MD5
    implementation followed by a randomising step, so it is reproduced here
    with the standard library instead of being run through a JS runtime:

    1. take the hex MD5 digest of ``value`` encoded as UTF-8;
    2. replace every letter of the digest with either nothing or one random
       ASCII letter (each outcome equally likely); digits are kept as-is.

    :param value: first argument of the page's ``anti('<value>','<key>')`` call
    :param key: second argument of that call; the JS routine never reads it,
                so it is accepted only to keep the signature unchanged
    :return: the antipas string (randomised, so it differs between calls)
    """
    # Local imports keep this function self-contained.
    import hashlib
    import random
    import string

    # Same alphabet, in the same order, as the JS ``possible`` string.
    possible = string.ascii_uppercase + string.ascii_lowercase
    digest = hashlib.md5(value.encode('utf-8')).hexdigest()

    pieces = []
    for ch in digest:
        if ch in possible:
            # JS uid(): length is floor(random() * 2), i.e. 0 or 1 letters.
            pieces.append(random.choice(possible) if random.randrange(2) else '')
        else:
            pieces.append(ch)
    return ''.join(pieces)
  • JS引數破解部分程式碼直接用了大神的程式碼:提取到引數計算部分的js程式碼,使用python的js執行模組execjs執行對應程式碼,獲取antipas

  • 來看下結果 image.png

  • 警告:訪問太頻繁 啊哈哈哈啊哈哈哈哈,具體解決辦法嘛

  • 可以搞一些Ip代理

總結:

  • 程式碼部分更新(引數antipas部分)
  • 另外感謝大神的指導
  • 希望可以多多的交流,互相學習