
Python Web Scraping: XPath Parsing

Preface

XPath is arguably the most common, convenient, and efficient way to parse a page, and it is also highly versatile.

Environment Setup

pip install lxml

How XPath Parsing Works

1. Instantiate an etree object and load the page source to be parsed into it.
2. Call the etree object's xpath method with an XPath expression to locate tags and capture their content (see the sketch below).
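In code, the two steps look like this. A minimal sketch, with a made-up HTML string standing in for real page source:

from lxml import etree

# Step 1: instantiate an etree object and load the page source into it.
# Use etree.HTML() for a source string, or etree.parse() for a local file.
html = '<html><body><div class="intro"><a href="/home">hello</a></div></body></html>'
tree = etree.HTML(html)

# Step 2: call xpath() with an expression; the result is always a list.
print(tree.xpath('//div[@class="intro"]/a/text()'))  # ['hello']
print(tree.xpath('//div[@class="intro"]/a/@href'))   # ['/home']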

XPath Tutorial

   For a more detailed tutorial, follow the link here; the most common expressions are sketched below for quick reference.
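The expressions used in the exercises below boil down to a handful of patterns. A minimal sketch over a made-up document:

from lxml import etree

html = '''<html><body>
<div class="list"><ul>
  <li><a href="/1">one</a></li>
  <li><a href="/2">two</a></li>
</ul></div>
</body></html>'''
tree = etree.HTML(html)

tree.xpath('/html/body/div')         # absolute path from the root
tree.xpath('//li')                   # // matches at any depth
tree.xpath('//div[@class="list"]')   # [@attr="value"] filters by attribute
tree.xpath('//ul/li[2]')             # [n] indexes children (1-based, not 0-based)
tree.xpath('//li/a/text()')          # text() extracts text -> ['one', 'two']
tree.xpath('//li/a/@href')           # @attr extracts attributes -> ['/1', '/2']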

XPath in Practice

XPath Practice 1: Scraping Second-Hand Housing Titles from 58.com
"""
XPath實踐1:爬取58二手房資訊
"""

import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent so the request looks like a normal browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL
    url = 'https://bj.58.com/ershoufang/'
    # Fetch the page
    page = requests.get(url=url, headers=headers)
    page.encoding = 'utf-8'
    # Parse the data
    tree = etree.HTML(page.text)
    title_list = tree.xpath('//section[@class="list"]/div/a/div[2]/div[1]/div[1]/h3/text()')
    print(title_list)
    with open("../data2/58二手.text", 'wb') as fp:
        for title in title_list:
            # 這裡似乎是位元組型別的
            fp.write(bytes(title, 'utf-8'))
            fp.write(bytes('\n','utf-8'))
        print("寫入結束!")
XPath Practice 2: Scraping Full-Resolution Wallpapers
"""
Xpath實踐2:爬取高清圖片
之前我們使用過正則爬取過wallhaven中的縮圖,看著不怎麼清洗,這裡我們爬取清晰的圖片
"""

import os
import requests
from lxml import etree


# Validate the page range: the start page must be positive and not past the end page
def judge_num(num1, num2):
    return 0 < num1 <= num2


if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Base URL
    base_url = 'https://wallhaven.cc/'

    # Search keyword
    search_words = input("Enter the tag you want to search for (e.g. WLOP):\n")
    # Build the search URL
    # '&sorting=views&order=desc' sorts results by view count;
    # you can drop it, but the crowd's taste is usually a safe bet
    # '&categories=111&purity=111' appears to control the category/purity filters
    search_url = base_url + 'search?q=' + search_words + '&categories=111&purity=111' + '&sorting=views&order=desc'
    # Page range to scrape (start and end pages, inclusive)
    num1 = int(input("Enter the first page to scrape:\n"))
    num2 = int(input("Enter the last page to scrape:\n"))
    while not judge_num(num1, num2):
        print("Invalid input!!!\n")
        num1 = int(input("Re-enter the first page to scrape:\n"))
        num2 = int(input("Re-enter the last page to scrape:\n"))
    # Set up the output directories
    if not os.path.exists('../data2/wallhaven'):  # a wallhaven folder under the data directory
        os.mkdir('../data2/wallhaven')
    path = '../data2/wallhaven/' + search_words
    if not os.path.exists(path):  # a subfolder named after the keyword for this run's wallpapers
        os.mkdir(path)

    # Collect the detail-page URL of every image
    img_href_list = []
    for i in range(num1, num2 + 1):  # +1 so the end page is included
        # Listing page for this page number
        url = search_url + '&page=' + str(i)
        # Fetch the current page
        page = requests.get(url=url, headers=headers)
        page.encoding = 'utf-8'
        # Parse it with XPath
        tree = etree.HTML(page.text)
        img_href_list += tree.xpath('//a[@class="preview"]/@href')
        # print(img_href_list)

    # From each detail page, extract the full-resolution image URL
    img_src_list = []
    for img_url in img_href_list:
        img_page = requests.get(url=img_url, headers=headers)
        img_page.encoding = 'utf-8'
        tree = etree.HTML(img_page.text)
        img_src_list += tree.xpath('//img[@id="wallpaper"]/@src')
        # print(img_src_list)

    # Download and save the full-resolution images
    img_num = 0
    for img_src in img_src_list:
        # Request the raw image bytes
        img_data = requests.get(url=img_src, headers=headers).content
        # Build a file name (you could also keep the original name from the URL)
        img_num += 1
        # Take the extension from the URL, since wallhaven serves both .jpg and .png
        img_name = search_words + str(img_num) + os.path.splitext(img_src)[1]
        with open(path + '/' + img_name, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded!')
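Two details worth noting before moving on: requests' .content returns the raw response bytes (unlike .text, which decodes to a string), which is why the image files are opened in 'wb' mode; and every image is a separate HTTP request, so large page ranges will take a while to download.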
XPath Practice 3: Scraping the Names of Cities Across China
"""
XPath實踐3:爬取全國城市名稱
"""
import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL
    url = 'https://www.aqistudy.cn/historydata/'
    page = requests.get(url=url, headers=headers)
    # Parse the page
    tree = etree.HTML(page.text)
    hot_city_list = tree.xpath('//div[@class="row"]/div[1]/div[@class="hot"]/div[2]/ul/li/a/text()')
    all_city_list = tree.xpath('//div[@class="row"]/div[1]/div[@class="all"]/div[2]/ul/div[2]/li/a/text()')
    # Notice that we needed two separate queries above.
    # Can we get everything in a single query? Yes: the | operator
    # takes the union of two XPath expressions.
    all_city_list2 = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')

    print(hot_city_list)
    print(all_city_list)
    print(all_city_list2)
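The | operator is worth remembering beyond this page: it returns the union of the node-sets matched by its two sides, so one query can cover several alternative layouts. For instance, a single expression keyed on the hot/all class names seen above could read as follows (a sketch; whether it returns exactly the same lists depends on the page's markup):

    cities = tree.xpath('//div[@class="hot"]//a/text() | //div[@class="all"]//a/text()')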
XPath Practice 4: Scraping Free Resume Templates from 站長素材
"""
XPath實踐4:爬取站長素材上的免費簡歷模板
"https://sc.chinaz.com"
url = "https://aspx.sc.chinaz.com/query.aspx?keyword=免費&classID=864"
"""
import os
import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL (first page of the free-template search results)
    url = 'https://aspx.sc.chinaz.com/query.aspx?keyword=免費&classID=864&page=1'
    page = requests.get(url=url, headers=headers)
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    jianjie_list = tree.xpath('//div[@id="container"]/div/a/@href')
    # The hrefs are protocol-relative, so prepend the scheme
    jianjie_list = ['https:' + i for i in jianjie_list]

    # Collect the download link from each detail page
    download_list = []
    for detail_jianjie in jianjie_list:
        detail_page = requests.get(url=detail_jianjie, headers=headers)
        detail_page.encoding = 'utf-8'
        tree = etree.HTML(detail_page.text)
        # li[4] is just one of several mirror download links listed on the page
        download_list += tree.xpath('//div[@id="down"]/div[2]/ul/li[4]/a/@href')
        # print(download_list)

    # Persist to disk
    if not os.path.exists('../data2/jianjie'):
        os.mkdir('../data2/jianjie')
    download_num = 0
    for download in download_list:
        download_num += 1
        jianjie_data = requests.get(url=download, headers=headers).content
        jianjie_name = 'jianjie_' + str(download_num) + '.rar'
        with open('../data2/jianjie/' + jianjie_name, 'wb') as fp:
            fp.write(jianjie_data)
        print(jianjie_name, 'downloaded!')
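As with the previous exercises, every XPath expression here is tied to the page structure at the time of writing; when a site redesigns, it is the expressions, not the overall recipe (fetch the page, build an etree, query with xpath, persist the result), that need updating.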