Python-Requests-瓜子二手車資訊抓取
阿新 • • 發佈:2018-12-20
#首先,來看下瓜子二手車網站,我們需要抓取的部分。
- 1)列表頁
- 2)詳情頁, 就來抓取下這個欄位吧: 1.標題 2.上牌時間 3.公里數 4.上牌地 5.排量 6.變速 7.價格
#思路:
-
1.構造翻頁連結,可以看出一共有117頁,構造117個連結進行抓取
-
2.獲取每頁的每個車的詳情連結
-
3.進入詳情頁抓取關鍵欄位
-
4.存入資料庫
#注意
- 1)頭部必須要有cookie或者另外設定cookie
cookie 裡面有好多值,測試後發現只需要一個,就是antipas;
比較下 就是這麼的噁心
但是第一個裡面你會發現一個關鍵字:
他就是 cookie需要傳的值antipas,複製出來看看,script標籤裡面包著一段js程式碼,噁心的要死,
js還是我的弱項,不知道怎麼去分析他,就暫且跳過吧,
- 2)再來看個有意思的圖片,什麼意思!!!!
- 3)原來: 可能是因為頻繁訪問的原因,被限制了
- 4) 那怎麼辦!!! – 換cookie antipas的值,如果懂js的話值得研究下… – 用隨機代理?? 測試:從庫裡找了個可用的代理,換了個antipas 也不行
#最後貼上我的程式碼:
# -*- coding: utf-8 -*-
import re

import requests
from pyquery import PyQuery as pq

from guazi.execute_script import excuteScript

# Browser-like request headers sent with every request of the crawler session.
headers = {
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,image/apng,*/*;q=0.8'),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'),
}


class AntiValueNotFoundError(IndexError):
    """The landing page did not contain the ``anti('...','...')`` call.

    Subclasses ``IndexError`` so callers that caught the error raised by the
    previous ``re.findall(...)[0]`` lookup keep working.
    """


class GuaZiCrawler:
    """Crawl the Guazi used-car listing pages and print each car's details."""

    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.baseurl = 'https://www.guazi.com'
        self.sess = requests.Session()
        # Merge into the session's own header mapping instead of replacing it
        # with the shared module-level dict.
        self.sess.headers.update(headers)
        self.start_url = 'https://www.guazi.com/qd/buy/'

    def _get(self, url):
        """GET *url* through the shared session with a timeout."""
        return self.sess.get(url, timeout=self.REQUEST_TIMEOUT)

    def anti_value(self):
        """Fetch the two arguments of the ``anti(...)`` call on the base page.

        :return: tuple ``(value, key)`` of the two captured strings
        :raises AntiValueNotFoundError: if the page has no ``anti(...)`` call
        """
        resp = self._get(self.baseurl)
        # Decode the raw bytes directly; re-encoding ``resp.text`` as
        # ISO-8859-1 fails whenever the text was already decoded correctly.
        content = resp.content.decode('utf-8', errors='replace')
        match = re.search(r"value=anti\('(.*?)','(.*?)'\)", content)
        if match is None:
            raise AntiValueNotFoundError(
                'anti(value, key) call not found in {}'.format(self.baseurl))
        return match.groups()

    def caculate_antipas(self):
        """Compute the ``antipas`` value and store it as a session cookie."""
        params = self.anti_value()
        antipas = excuteScript(params[0], params[1])
        self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        """Build the list of listing-page URLs.

        Sets the ``antipas`` cookie first, then reads the largest page number
        from the pagination links of ``self.start_url``.

        :return: list of listing-page URLs, from page 1 to the last page
        """
        self.caculate_antipas()
        content = pq(self._get(self.start_url).text)
        page_numbers = []
        for each in content('ul[class="pageLink clearfix"] > li > a').items():
            label = each.text().strip()
            # Only purely numeric labels are page numbers.
            if re.fullmatch(r'\d+', label):
                page_numbers.append(int(label))
        # Without any numeric pagination link, fall back to a single page.
        page_num_max = max(page_numbers, default=1)
        return ['https://www.guazi.com/qd/buy/o{}/'.format(i)
                for i in range(1, page_num_max + 1)]

    def index_page(self, start_url):
        """Yield the detail-page URL of every car listed on one listing page.

        :param start_url: URL of a listing page
        :return: generator of absolute detail-page URLs
        """
        content = pq(self._get(start_url).text)
        selector = 'ul[class="carlist clearfix js-top"] > li > a'
        for each in content(selector).items():
            url = each.attr.href
            if not url:
                # Anchor without an href: nothing to follow.
                continue
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        """Scrape the key fields of one car detail page.

        :param detail_url: URL of a car detail page
        :return: dict with title, bordingdate, km, address, displacement,
            gearbox and price
        """
        resp = self._get(detail_url)
        content = pq(resp.text)
        assort_items = content('ul[class="assort clearfix"]').find('li')
        data_dict = {
            'title': content('h2.titlebox').text().strip(),
            'bordingdate': content(
                'ul[class="assort clearfix"] li[class="one"] span').text(),
            'km': content(
                'ul[class="assort clearfix"] li[class="two"] span').text(),
            'address': assort_items.eq(2).find('span').text(),
            'displacement': assort_items.eq(3).find('span').text(),
            # The gearbox follows the displacement entry; the previous
            # selector repeated the one used for 'km'.
            'gearbox': assort_items.eq(4).find('span').text(),
            'price': content('span[class="pricestype"]').text(),
        }
        if not data_dict['title']:
            # No title was parsed: dump the page that was actually served.
            print(resp.content.decode('utf-8', errors='replace'))
        return data_dict

    def run(self):
        """Crawl every listing page and print the details of every car."""
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                result = self.detail_page(detail_url)
                print(result)
                print('*' * 200)


if __name__ == '__main__':
    gzcrawler = GuaZiCrawler()
    gzcrawler.run()
js部分
# -*- coding: utf-8 -*-
import functools

import execjs

# JavaScript source that defines ``anti(string, key)``. ``anti`` hex-digests
# ``string`` via ``hex`` and passes the digest through ``charRun``.
_ANTI_JS = '''
function safeAdd(x, y) {
    var lsw = (x & 0xFFFF) + (y & 0xFFFF);
    var msw = (x >> 16) + (y >> 16) + (lsw >> 16);
    return (msw << 16) | (lsw & 0xFFFF)
}
function bitRotateLeft(num, cnt) {
    return (num << cnt) | (num >>> (32 - cnt))
}
function cmn(q, a, b, x, s, t) {
    return safeAdd(bitRotateLeft(safeAdd(safeAdd(a, q), safeAdd(x, t)), s), b)
}
function ff(a, b, c, d, x, s, t) {
    return cmn((b & c) | ((~b) & d), a, b, x, s, t)
}
function gg(a, b, c, d, x, s, t) {
    return cmn((b & d) | (c & (~d)), a, b, x, s, t)
}
function hh(a, b, c, d, x, s, t) {
    return cmn(b ^ c ^ d, a, b, x, s, t)
}
function ii(a, b, c, d, x, s, t) {
    return cmn(c ^ (b | (~d)), a, b, x, s, t)
}
function binl(x, len) {
    x[len >> 5] |= 0x80 << (len % 32);
    x[(((len + 64) >>> 9) << 4) + 14] = len;
    var i;
    var olda;
    var oldb;
    var oldc;
    var oldd;
    var a = 1732584193;
    var b = -271733879;
    var c = -1732584194;
    var d = 271733878;
    for (i = 0; i < x.length; i += 16) {
        olda = a;
        oldb = b;
        oldc = c;
        oldd = d;
        a = ff(a, b, c, d, x[i], 7, -680876936);
        d = ff(d, a, b, c, x[i + 1], 12, -389564586);
        c = ff(c, d, a, b, x[i + 2], 17, 606105819);
        b = ff(b, c, d, a, x[i + 3], 22, -1044525330);
        a = ff(a, b, c, d, x[i + 4], 7, -176418897);
        d = ff(d, a, b, c, x[i + 5], 12, 1200080426);
        c = ff(c, d, a, b, x[i + 6], 17, -1473231341);
        b = ff(b, c, d, a, x[i + 7], 22, -45705983);
        a = ff(a, b, c, d, x[i + 8], 7, 1770035416);
        d = ff(d, a, b, c, x[i + 9], 12, -1958414417);
        c = ff(c, d, a, b, x[i + 10], 17, -42063);
        b = ff(b, c, d, a, x[i + 11], 22, -1990404162);
        a = ff(a, b, c, d, x[i + 12], 7, 1804603682);
        d = ff(d, a, b, c, x[i + 13], 12, -40341101);
        c = ff(c, d, a, b, x[i + 14], 17, -1502002290);
        b = ff(b, c, d, a, x[i + 15], 22, 1236535329);
        a = gg(a, b, c, d, x[i + 1], 5, -165796510);
        d = gg(d, a, b, c, x[i + 6], 9, -1069501632);
        c = gg(c, d, a, b, x[i + 11], 14, 643717713);
        b = gg(b, c, d, a, x[i], 20, -373897302);
        a = gg(a, b, c, d, x[i + 5], 5, -701558691);
        d = gg(d, a, b, c, x[i + 10], 9, 38016083);
        c = gg(c, d, a, b, x[i + 15], 14, -660478335);
        b = gg(b, c, d, a, x[i + 4], 20, -405537848);
        a = gg(a, b, c, d, x[i + 9], 5, 568446438);
        d = gg(d, a, b, c, x[i + 14], 9, -1019803690);
        c = gg(c, d, a, b, x[i + 3], 14, -187363961);
        b = gg(b, c, d, a, x[i + 8], 20, 1163531501);
        a = gg(a, b, c, d, x[i + 13], 5, -1444681467);
        d = gg(d, a, b, c, x[i + 2], 9, -51403784);
        c = gg(c, d, a, b, x[i + 7], 14, 1735328473);
        b = gg(b, c, d, a, x[i + 12], 20, -1926607734);
        a = hh(a, b, c, d, x[i + 5], 4, -378558);
        d = hh(d, a, b, c, x[i + 8], 11, -2022574463);
        c = hh(c, d, a, b, x[i + 11], 16, 1839030562);
        b = hh(b, c, d, a, x[i + 14], 23, -35309556);
        a = hh(a, b, c, d, x[i + 1], 4, -1530992060);
        d = hh(d, a, b, c, x[i + 4], 11, 1272893353);
        c = hh(c, d, a, b, x[i + 7], 16, -155497632);
        b = hh(b, c, d, a, x[i + 10], 23, -1094730640);
        a = hh(a, b, c, d, x[i + 13], 4, 681279174);
        d = hh(d, a, b, c, x[i], 11, -358537222);
        c = hh(c, d, a, b, x[i + 3], 16, -722521979);
        b = hh(b, c, d, a, x[i + 6], 23, 76029189);
        a = hh(a, b, c, d, x[i + 9], 4, -640364487);
        d = hh(d, a, b, c, x[i + 12], 11, -421815835);
        c = hh(c, d, a, b, x[i + 15], 16, 530742520);
        b = hh(b, c, d, a, x[i + 2], 23, -995338651);
        a = ii(a, b, c, d, x[i], 6, -198630844);
        d = ii(d, a, b, c, x[i + 7], 10, 1126891415);
        c = ii(c, d, a, b, x[i + 14], 15, -1416354905);
        b = ii(b, c, d, a, x[i + 5], 21, -57434055);
        a = ii(a, b, c, d, x[i + 12], 6, 1700485571);
        d = ii(d, a, b, c, x[i + 3], 10, -1894986606);
        c = ii(c, d, a, b, x[i + 10], 15, -1051523);
        b = ii(b, c, d, a, x[i + 1], 21, -2054922799);
        a = ii(a, b, c, d, x[i + 8], 6, 1873313359);
        d = ii(d, a, b, c, x[i + 15], 10, -30611744);
        c = ii(c, d, a, b, x[i + 6], 15, -1560198380);
        b = ii(b, c, d, a, x[i + 13], 21, 1309151649);
        a = ii(a, b, c, d, x[i + 4], 6, -145523070);
        d = ii(d, a, b, c, x[i + 11], 10, -1120210379);
        c = ii(c, d, a, b, x[i + 2], 15, 718787259);
        b = ii(b, c, d, a, x[i + 9], 21, -343485551);
        a = safeAdd(a, olda);
        b = safeAdd(b, oldb);
        c = safeAdd(c, oldc);
        d = safeAdd(d, oldd)
    }
    return [a, b, c, d]
}
function binl2rstr(input) {
    var i;
    var output = '';
    var length32 = input.length * 32;
    for (i = 0; i < length32; i += 8) {
        output += String.fromCharCode((input[i >> 5] >>> (i % 32)) & 0xFF)
    }
    return output
}
function rstr2binl(input) {
    var i;
    var output = [];
    output[(input.length >> 2) - 1] = undefined;
    for (i = 0; i < output.length; i += 1) {
        output[i] = 0
    }
    var length8 = input.length * 8;
    for (i = 0; i < length8; i += 8) {
        output[i >> 5] |= (input.charCodeAt(i / 8) & 0xFF) << (i % 32)
    }
    return output
}
function rstr(s) {
    return binl2rstr(binl(rstr2binl(s), s.length * 8))
}
function rstrHMAC(key, data) {
    var i;
    var bkey = rstr2binl(key);
    var ipad = [];
    var opad = [];
    var hash;
    ipad[15] = opad[15] = undefined;
    if (bkey.length > 16) {
        bkey = binl(bkey, key.length * 8)
    }
    for (i = 0; i < 16; i += 1) {
        ipad[i] = bkey[i] ^ 0x36363636;
        opad[i] = bkey[i] ^ 0x5C5C5C5C
    }
    hash = binl(ipad.concat(rstr2binl(data)), 512 + data.length * 8);
    return binl2rstr(binl(opad.concat(hash), 512 + 128))
}
function rstr2hex(input) {
    var hexTab = '0123456789abcdef';
    var output = '';
    var x;
    var i;
    for (i = 0; i < input.length; i += 1) {
        x = input.charCodeAt(i);
        output += hexTab.charAt((x >>> 4) & 0x0F) + hexTab.charAt(x & 0x0F)
    }
    return output
}
function str2rstrUTF8(input) {
    return unescape(encodeURIComponent(input))
}
function raw(s) {
    return rstr(str2rstrUTF8(s))
}
function hex(s) {
    return rstr2hex(raw(s))
}
function uid() {
    var text = "";
    var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    var len = Math.floor(Math.random() * 2);
    for (var i = 0; i < len; i++) {
        text += possible.charAt(Math.floor(Math.random() * possible.length))
    }
    return text
}
function charRun(s) {
    s = s.replace(/[a-zA-Z]/g, '#');
    var arr = s.split('');
    for (var i = 0; i < arr.length; i++) {
        if (arr[i] == '#') {
            arr[i] = uid()
        }
    }
    return arr.join('')
}
function anti(string, key) {
    var estring = hex(string);
    return charRun(estring)
}
'''


@functools.lru_cache(maxsize=1)
def _anti_context():
    """Compile the JavaScript source once and reuse the resulting context."""
    return execjs.compile(_ANTI_JS)


def excuteScript(value, key):
    """Compute the ``antipas`` cookie value for the Guazi used-car site.

    Runs the JavaScript ``anti`` function (extracted from the first page the
    site serves) through execjs.

    :param value: first argument of the page's ``anti(...)`` call; this is
        the string that gets hashed
    :param key: second argument of the page's ``anti(...)`` call; it is
        passed through, but the JavaScript ``anti`` function does not use it
    :return: the hex digest of ``value`` with every letter replaced by zero
        or one random ASCII letter (``uid`` relies on ``Math.random``), so
        repeated calls with the same input may return different strings
    """
    return _anti_context().call('anti', value, key)
-
JS引數破解部分的程式碼直接用了大神的程式碼:提取出引數計算部分的js程式碼,使用python的js執行模組execjs執行對應程式碼,獲取antipas
-
來看下結果
-
警告:訪問太頻繁 啊哈哈哈啊哈哈哈哈,具體解決辦法嘛
-
可以搞一些Ip代理
總結:
- 程式碼部分更新(引數antipas部分)
- 另外感謝大神的指導
- 希望可以多多的交流,互相學習