1. 程式人生 > 實用技巧 >爬取汽車之家車型配置檔案

爬取汽車之家車型配置檔案

一、需求

獲取指定品牌的所有車型配置資訊,並儲存到 Excel(.xls)檔案中。

流程大致思路:

1.獲取品牌id:brand_id

2.通過品牌id獲取車型id:series_id

3.獲取車型配置頁面

4.解析配置頁面內容(這步最複雜,使用了之前一些大神的程式碼)

二、程式碼

測試完美執行

import requests
import json
import xlwt
from bs4 import BeautifulSoup
import re
from urllib import parse
from selenium import webdriver


class Car_home_config(object):
    """Scrape per-series configuration tables from car.autohome.com.cn.

    Workflow (see ``check``):

    1. ``get_brand_id``  - read the left-hand menu and collect brand ids.
    2. ``get_series_id`` - for each selected brand, collect its series ids.
    3. ``get_config_content`` / ``car_info`` - download a series' config page
       and cut out the ``var config`` / ``var option`` / ``var bag`` blobs.
    4. ``write_html`` - the page hides some words behind CSS ``::before``
       rules generated by JavaScript; run that JavaScript in PhantomJS and
       substitute the real words back into the blobs.
    5. ``save`` - write the decoded table to an ``.xls`` workbook.

    NOTE(review): relies on ``webdriver.PhantomJS`` and
    ``find_element_by_tag_name``, which only exist in older Selenium
    releases - confirm the pinned Selenium version.
    """

    # Endpoint that serves both the brand list and the per-brand series list.
    MENU_URL = r"https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"

    def __init__(self):
        self.session = requests.Session()
        self.params = None       # query parameters of the next menu request
        self.brand_dict = {}     # brand id -> brand link text
        self.series_dict = {}    # series id -> series link text
        self.brand_name = None   # substring filter applied to brand link text

    def get_header(self):
        """Build ``self.headers`` for a menu request from ``self.params``."""
        self.headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/AsLeftMenu/As_LeftListNew.ashx?%s" % parse.urlencode(self.params),
            "scheme": "https",
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
        }

    @staticmethod
    def _first_id(href):
        """Return the first run of digits in *href*, or None if there is none."""
        found = re.search(r"\d+", href or "")
        return found.group(0) if found else None

    def _fetch_menu_soup(self):
        """GET the menu endpoint with the current params/headers and parse it."""
        res = self.session.get(url=self.MENU_URL, headers=self.headers, params=self.params)
        res.encoding = res.apparent_encoding
        return BeautifulSoup(res.text, 'lxml')

    def get_brand_id(self):
        """Collect the id of every brand into ``self.brand_dict`` and return it.

        Entries without a link, or whose link carries no number, are skipped.
        """
        self.params = {
            "typeId": "1",
            "brandId": "0",
            "fctId": "0",
            "seriesId": "0",
        }
        self.get_header()
        soup = self._fetch_menu_soup()
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                anchor = li.find("a")
                if anchor is None:
                    continue
                brand_id = self._first_id(anchor.attrs.get('href'))
                if brand_id is None:
                    continue
                self.brand_dict[brand_id] = anchor.text
        return self.brand_dict

    def get_AsLeftMenu(self):
        """Add the series listed for the brand in ``self.params`` to ``self.series_dict``.

        Expects ``self.params`` and ``self.headers`` to be set by the caller.
        """
        soup = self._fetch_menu_soup()
        for dd in soup.find_all("dd"):
            for anchor in dd.find_all("a"):
                a_href = anchor.attrs.get('href')
                a_text = anchor.text
                print(a_href)
                print(a_text)
                series_id = self._first_id(a_href)
                if series_id is None:
                    continue
                self.series_dict[series_id] = a_text

    def get_series_id(self):
        """Collect series ids for the selected brand(s) and return ``self.series_dict``.

        When ``self.brand_name`` is set, only brands whose link text contains
        it are visited; otherwise every brand is visited.
        """
        # Start clean so a reused instance does not carry series over from a
        # previous query.
        self.series_dict = {}
        self.get_brand_id()
        for brand_id, brand_text in self.brand_dict.items():
            if self.brand_name and self.brand_name not in brand_text:
                continue
            self.params = {
                "typeId": "1",
                "brandId": brand_id,
                "fctId": "0",
                "seriesId": "0",
            }
            self.get_header()
            self.get_AsLeftMenu()
        return self.series_dict

    def get_config_content(self, series_id):
        """Download the configuration page of *series_id* and return its HTML text."""
        headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/config/series/{}.html".format(series_id),
            "scheme": "https",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "referer": "https://www.autohome.com.cn/",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-site",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
        }
        # TLS verification is left at the default (enabled), like the menu
        # requests that go to the same host.
        res = self.session.get(
            r"https://car.autohome.com.cn/config/series/{}.html".format(series_id),
            headers=headers,
        )
        return res.content.decode("utf-8")

    def car_info(self, html):
        """Cut the ``var config``, ``var option`` and ``var bag`` statements out of *html*.

        Returns the three statements concatenated in that order, or ``""``
        when any of them is missing.
        """
        config = re.search("var config = (.*?)};", html)  # vehicle parameters
        option = re.search("var option = (.*?)};", html)  # active/passive safety equipment
        bag = re.search("var bag = (.*?)};", html)        # optional packages
        if config and option and bag:
            return config.group(0) + option.group(0) + bag.group(0)
        return ""

    def write_html(self, js_list, car_info):
        """Replace the obfuscated ``<span class='...'></span>`` words in *car_info*.

        The scripts in *js_list* register CSS rules of the form
        ``.<class>::before { content:"<word>" }``.  They are run in PhantomJS
        against a stub ``document`` that records each inserted rule, and every
        placeholder span in *car_info* is then replaced by the word of its
        class.

        Returns the decoded string, or ``None`` when the browser could not be
        run or produced no rules.
        """
        # Minimal fake DOM: just enough for the page's scripts to run and for
        # every rule passed to insertRule to be appended to `rules`.
        dom = ("var rules = '2';"
               "var document = {};"
               "function getRules(){return rules}"
               "document.createElement = function() {"
               " return {"
               " sheet: {"
               " insertRule: function(rule, i) {"
               " if (rules.length == 0) {"
               " rules = rule;"
               " } else {"
               " rules = rules + '#' + rule;"
               " }"
               " }"
               " }"
               " }"
               "};"
               "document.querySelectorAll = function() {"
               " return {};"
               "};"
               "document.head = {};"
               "document.head.appendChild = function() {};"
               "var window = {};"
               "window.decodeURIComponent = decodeURIComponent;")
        dom += "".join(js_list)
        html_type = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body> <script type='text/javascript'>"
        # Wrap everything into a page that prints the collected rules.
        page = html_type + dom + " document.write(rules)</script></body></html>"
        # Mode "w" overwrites the file left behind by a previous run.
        with open("original.html", "w", encoding="utf-8") as f:
            f.write(page)

        driver = None
        text = None
        try:
            driver = webdriver.PhantomJS(executable_path=r"phantomjs.exe")
            driver.get("original.html")
            # The body text is the '#'-joined list of CSS rules.
            text = driver.find_element_by_tag_name('body').text
        except Exception as e:
            # Best effort: report the problem and skip this series.
            print(e)
        finally:
            # quit() also terminates the PhantomJS process; guard against the
            # constructor itself having failed.
            if driver is not None:
                driver.quit()
        if not text:
            return None

        # Attribute part of every placeholder, e.g. " class='hs_kw7_optionZl'".
        span_list = re.findall("<span(.*?)></span>", car_info)
        # dict.fromkeys de-duplicates while keeping order; one replace() call
        # already covers every occurrence of a class.
        for span in dict.fromkeys(span_list):
            info = re.search("'(.*?)'", span)
            if not info:
                continue
            class_name = info.group(1)
            # e.g. hs_kw7_optionZl::before { content:(.*?)}
            rule = re.search(re.escape(class_name) + "::before { content:(.*?)}", text)
            if rule is None:
                continue  # no rule recorded for this class; leave the span as is
            word = re.search("\"(.*?)\"", rule.group(1))
            if word is None:
                continue
            car_info = car_info.replace("<span class='" + class_name + "'></span>", word.group(1))
        return car_info

    @staticmethod
    def _collect_items(car_info):
        """Parse the config/option JSON in *car_info* into ``{item name: [values]}``.

        *car_info* must have the layout produced by ``car_info()``:
        ``var config = {...};var option = {...};var bag = {...};``.

        Raises:
            ValueError: if that layout is not found.
        """
        # Anchor on the statement boundaries instead of the first ';' so that
        # a semicolon inside a value does not truncate the JSON.
        found = re.search(r"var config = (.*);var option = (.*);var bag = ", car_info, re.S)
        if found is None:
            raise ValueError("car_info does not contain the config/option/bag statements")
        config_re = json.loads(found.group(1))
        option_re = json.loads(found.group(2))

        items = []
        for group in config_re['result']['paramtypeitems']:
            items += group['paramitems']
        for group in option_re['result']['configtypeitems']:
            items += group['configitems']

        car_item = {}
        for item in items:
            car_item[item['name']] = [value['value'] for value in item['valueitems']]
        return car_item

    def save(self, car_info, car_name, save_path):
        """Write the decoded configuration of one series to ``<save_path>/<car_name>.xls``.

        Row 0 holds the item names; each following row holds one trim level.

        Raises:
            ValueError: if *car_info* lacks the config/option/bag statements.
        """
        car_item = self._collect_items(car_info)

        workbook = xlwt.Workbook(encoding='ascii')
        worksheet = workbook.add_sheet('汽車之家')

        # Header row: one column per configuration item.
        for col_num, name in enumerate(car_item):
            worksheet.write(0, col_num, name)

        # One row per trim.  Prefer the length of the trim-name column; fall
        # back to the longest column when that key is absent.
        # NOTE(review): the site presumably returns this key in Simplified
        # Chinese, in which case the fallback is what actually runs - confirm.
        trims = car_item.get('車型名稱')
        if trims is not None:
            row_count = len(trims)
        else:
            row_count = max((len(values) for values in car_item.values()), default=0)

        for row in range(row_count):
            for col_num, values in enumerate(car_item.values()):
                # Columns shorter than the trim count are padded with "".
                con = str(values[row]) if row < len(values) else ""
                worksheet.write(row + 1, col_num, con)

        # Series names may contain characters that are not valid in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", car_name)
        workbook.save('{}/{}.xls'.format(save_path, safe_name))

    def check(self, brand_name=None, save_path="./"):
        """Scrape and save the configuration of every series of a brand.

        Args:
            brand_name: substring of the brand's menu text; ``None`` or ``""``
                processes every brand.
            save_path: directory that receives one ``.xls`` file per series.
        """
        self.brand_name = brand_name
        self.get_series_id()
        for series_id, car_name in self.series_dict.items():
            print(series_id, car_name)
            html = self.get_config_content(series_id)
            car_info = self.car_info(html)
            if not car_info:
                # No config data on the page: nothing to decode or save.
                continue
            js_list = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', html)
            car_info = self.write_html(js_list, car_info)
            if car_info:
                self.save(car_info, car_name, save_path)


if __name__ == "__main__":
    car = Car_home_config()
    # NOTE(review): the site presumably lists brand names in Simplified
    # Chinese; this Traditional-Chinese literal may match no brand - confirm.
    car.check("奧迪")

phantomjs.exe 下載地址:https://phantomjs.org/download.html (程式碼以相對路徑 `phantomjs.exe` 啟動它,請將其放在執行腳本的目錄下。)

感謝以下作者:
https://www.cnblogs.com/kangz/p/10011348.html
https://www.cnblogs.com/pontoon/p/10459471.html