1. 程式人生 > >爬蟲1.6-selenium+HeadlessChrome

爬蟲1.6-selenium+HeadlessChrome

目錄

爬蟲-selenium+HeadlessChrome

之前的筆記已經提到過用selenium+chromedriver爬取以Ajax技術載入的資料,但這種方式過於笨重,原因在於,每開啟一個頁面,都需要瀏覽器解析資料、渲染介面,但實際上我們的爬蟲不需要這些操作。所以一個沒有介面、但又可以完全模擬瀏覽器行為並獲取與瀏覽器完全相同資料的工具就非常有意義。過去爬蟲界流行的PhantomJS已經停止更新,並且新版的selenium目前已停止支援PhantomJS,所以現在的替代方案為headless-chrome。

headless-chrome(無頭瀏覽器)

不多BB,谷歌大法好。

1. 瀏覽器處理步驟

1)處理HTML指令碼,生成DOM樹
2)處理CSS指令碼,生成CSSOM樹 (DOM和CSSOM是獨立的資料結構)
3)將DOM樹和CSSOM樹合併為渲染樹
4)對渲染樹中的內容進行佈局,計算每個節點的幾何外觀
5)將渲染樹中的每個節點繪製到螢幕中

無頭瀏覽器實際上節省了第4、5兩個步驟。另外headless-chrome還可以方便地實現併發,不多說了,直接上實戰。

2. headless-chrome初體驗

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time

# Configure Chrome to run without a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")  # basic headless setting
chrome_options.add_argument('--disable-gpu')
base_url = "https://www.baidu.com/"
# options must be passed, otherwise a browser window is still opened.
driver = webdriver.Chrome(executable_path=r'C:\Users\helloworld\Desktop\python_test\chromedriver.exe',
                          options=chrome_options)

try:
    driver.get(base_url)
    driver.find_element_by_id('kw').send_keys('python')
    click = driver.find_element_by_id('su')
    # Trigger the click through JavaScript instead of element.click().
    driver.execute_script('arguments[0].click()', click)
    time.sleep(3)  # wait 3 seconds: the results need some time to load after the click
    driver.save_screenshot('baidu.png')  # open baidu.png to confirm the search for "python" succeeded
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # whereas close() only closes the current window. Running it in a
    # finally block also cleans up when one of the steps above raises.
    driver.quit()

因為現在爬蟲的速度很快,前端的元素結構往往反應不過來,所以執行click操作時嵌入了JS指令碼比較穩妥。

3. 實戰爬取淘寶鎮、街道資訊

實際上無頭瀏覽器的程式碼操作與有介面的時候是完全一樣的,唯一不同的是對無頭瀏覽器的某些操作最好嵌入js程式碼執行,以防出現速度過快找不到元素。

# encoding: utf-8

'''
Created on 2018年1月5日

@author: [email protected]部落格
@date: 2018-1-5
'''

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql


def init_db():
    """Open the MySQL connection and store it in the global ``CONNECTION``.

    The host, user, password and database strings look like placeholders
    that must be replaced with real settings before running.
    """
    global CONNECTION
    # Keyword arguments are used because PyMySQL 1.0+ no longer accepts
    # positional arguments in connect(); the values and their meaning
    # (host, user, password, database) are unchanged.
    CONNECTION = pymysql.connect(host="地址", user="使用者名稱", password="密碼", database="資料庫",
                                 use_unicode=True, charset="utf8")


def init_web_driver():
    """Start a headless Chrome session and store it in the global ``DRIVER``."""
    global DRIVER
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Pass the settings through ``options=``: the ``chrome_options=`` keyword
    # is deprecated in Selenium, and ``options=`` is what the first example
    # in this file already uses.
    DRIVER = webdriver.Chrome(executable_path=r'C:\Users\helloworld\Desktop\python_test\chromedriver.exe',
                              options=chrome_options)


def close_db():
    """Close the database connection held in the global ``CONNECTION``."""
    CONNECTION.close()


def close_web_driver():
    """Quit the browser session held in the global ``DRIVER``."""
    DRIVER.quit()


def login_taobao(username, password):
    """Load the Taobao login page and submit the given credentials.

    Args:
        username: Text typed into the ``TPL_username_1`` field.
        password: Text typed into the ``TPL_password_1`` field.
    """
    login_url = "https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d9fOuJW4&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F"
    DRIVER.get(login_url)

    # Choose the login method: click the first link inside the login-links div.
    login_mode_link = DRIVER.find_element_by_xpath("//div[@class='login-links']/a[1]")
    login_mode_link.click()

    # Fill in the user name, clearing the field first.
    user_field = DRIVER.find_element_by_xpath("//*[@id=\"TPL_username_1\"]")
    user_field.clear()
    user_field.send_keys(username)

    # Fill in the password and submit the form.
    password_field = DRIVER.find_element_by_xpath("//*[@id=\"TPL_password_1\"]")
    password_field.send_keys(password)

    submit_button = DRIVER.find_element_by_xpath("//*[@id=\"J_SubmitStatic\"]")
    submit_button.click()

    # Short pause after submitting before the caller continues.
    time.sleep(0.5)


def get_data():
    """Open the address selector and start collecting from the province level."""
    # Click the "city-title" element through JavaScript rather than
    # element.click() to open the address selector.
    DRIVER.execute_script('arguments[0].click();', DRIVER.find_element_by_id("city-title"))

    get_province_and_sub()


def get_province_and_sub():
    """Store every listed province, then collect the cities below each one.

    Each link inside the "city-province" element is read for its
    ``attr-id`` and ``title`` attributes; entries whose id is "-1" are
    skipped. Every remaining province is inserted into
    ``region_province_t``, clicked, and handed to ``get_city_and_sub``
    before returning to the province tab.
    """
    # Fetch the province list.
    province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")

    # Placeholders let the database driver escape the scraped values, so a
    # quote inside a name can neither break nor inject into the statement.
    sql = "insert into region_province_t (province_id,province) values(%s,%s)"

    for province_item in province_items:
        pid = province_item.get_attribute("attr-id")
        pname = province_item.get_attribute("title")
        if pid == "-1":
            print("continue province")
            continue

        params = (pid, pname)
        print(sql, params)
        # The with-block closes the cursor once the statement has run.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()

        # Click through JavaScript instead of province_item.click().
        DRIVER.execute_script('arguments[0].click();', province_item)
        time.sleep(0.5)

        get_city_and_sub(pid)
        back_tab(0)


def get_city_and_sub(pid):
    """Store every listed city of a province, then collect its districts.

    Each link inside the "city-city" element is read for its ``attr-id``
    and ``title`` attributes; entries whose id is "-1" are skipped. Every
    remaining city is inserted into ``region_city_t``, clicked, and handed
    to ``get_area_and_sub`` before returning to the city tab.

    Args:
        pid: Province id stored alongside each city row.
    """
    # Fetch the city list.
    city_items = DRIVER.find_element_by_class_name("city-city").find_elements_by_tag_name("a")

    # Placeholders let the database driver escape the scraped values, so a
    # quote inside a name can neither break nor inject into the statement.
    sql = "insert into region_city_t (city_id,city,province_id) values(%s,%s,%s)"

    for city_item in city_items:
        cid = city_item.get_attribute("attr-id")
        cname = city_item.get_attribute("title")
        if cid == "-1":
            print("continue city")
            continue

        params = (cid, cname, pid)
        print(sql, params)
        # The with-block closes the cursor once the statement has run.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()

        # Click through JavaScript instead of city_item.click().
        DRIVER.execute_script('arguments[0].click();', city_item)
        time.sleep(1)

        get_area_and_sub(cid)
        back_tab(1)


def get_area_and_sub(cid):
    """Store every listed district of a city, then collect its towns.

    Each link inside the "city-district" element is read for its
    ``attr-id`` and ``title`` attributes; entries whose id is "-1" are
    skipped. Every remaining district is inserted into ``region_area_t``,
    clicked, and handed to ``get_town_and_sub`` before returning to the
    district tab.

    Args:
        cid: City id stored alongside each district row.
    """
    # Fetch the county/district list.
    area_items = DRIVER.find_element_by_class_name("city-district").find_elements_by_tag_name("a")

    # Placeholders let the database driver escape the scraped values, so a
    # quote inside a name can neither break nor inject into the statement.
    sql = "insert into region_area_t (area_id,area,city_id) values(%s,%s,%s)"

    for area_item in area_items:
        aid = area_item.get_attribute("attr-id")
        aname = area_item.get_attribute("title")
        if aid == "-1":
            print("continue area")
            continue

        params = (aid, aname, cid)
        print(sql, params)
        # The with-block closes the cursor once the statement has run.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()

        # Click through JavaScript instead of area_item.click().
        DRIVER.execute_script('arguments[0].click();', area_item)
        time.sleep(0.5)

        get_town_and_sub(aid)
        back_tab(2)


def get_town_and_sub(aid):
    """Store every listed town/street of a district.

    Each link inside the "city-street" element is read for its ``attr-id``
    and ``title`` attributes; entries whose id is "-1" are skipped. Every
    remaining town is inserted into ``region_town_t``.

    Args:
        aid: District id stored alongside each town row.
    """
    # Fetch the town list.
    town_items = DRIVER.find_element_by_class_name("city-street").find_elements_by_tag_name("a")

    # Placeholders let the database driver escape the scraped values, so a
    # quote inside a name can neither break nor inject into the statement.
    sql = "insert into region_town_t (town_id,town,area_id) values(%s,%s,%s)"

    for town_item in town_items:
        tid = town_item.get_attribute("attr-id")
        tname = town_item.get_attribute("title")
        if tid == "-1":
            print("continue town")
            continue

        params = (tid, tname, aid)
        print(sql, params)
        # The with-block closes the cursor once the statement has run.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()


def back_tab(index):
    """Click the tab link at position ``index`` in the "city-select-tab" element.

    Args:
        index: Position of the link to click among the tab element's ``a`` tags.
    """
    tab_bar = DRIVER.find_element_by_class_name("city-select-tab")
    tab_links = tab_bar.find_elements_by_tag_name("a")
    target_tab = tab_links[index]

    # Click through JavaScript, then pause briefly before the caller continues.
    DRIVER.execute_script('arguments[0].click();', target_tab)
    time.sleep(0.5)


if __name__ == '__main__':
    # Each resource is released in a finally block, so a failure while
    # logging in or scraping no longer leaves the database connection or
    # the headless browser open.
    init_db()
    try:
        init_web_driver()
        try:
            login_taobao("使用者名稱", "密碼")
            get_data()
        finally:
            close_web_driver()
    finally:
        close_db()