
Web Scraping Basics 02: First Steps with the XPath Library

XPath overview:

1. What is XPath?
   The most commonly used and most efficient parsing method [the first choice for data parsing].
2. How do you use it?
   1. Instantiate an etree object => load the data
   2. Call its API => 1. locate tags 2. parse data
      xxx.xpath('xpath expression')
      => An XPath expression can do:
         1. tag location
         2. data parsing
            1. the tag's text
            2. the tag's attributes
3. API usage
   1. tag location
   2. data parsing

pip install lxml
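
Before the examples, here is a minimal sketch of the two common ways to instantiate an etree object (the file name and HTML string below are placeholders for illustration):

from lxml import etree

# 1. From a local file (placeholder path): pass an HTMLParser, since
#    real-world HTML is rarely well-formed XML.
# root = etree.parse("lol.html", etree.HTMLParser())

# 2. From an HTML string, e.g. the .text of a requests response.
root = etree.HTML("<html><head><title>demo</title></head></html>")

# xpath() always returns a list, even when only one node matches.
print(root.xpath('//title/text()'))  # ['demo']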

XPath scraping examples

1. Using XPath offline, on a local file


from lxml import etree

if __name__ == '__main__':
    # 1. Instantiate an etree object from the local file.
    # Use a raw string for the Windows path, and pass an HTMLParser so that
    # not-quite-well-formed HTML still parses (etree.parse defaults to XML).
    root = etree.parse(r"D:\ \python-sk\data\lol.html", etree.HTMLParser())

    # 2. Call the API
    '''
        Writing XPath expressions:
        1. tag location
        2. data parsing
    '''

    '''
    1. Tag location:
        1. absolute path
        2. relative path
        3. locating by attribute
        4. locating by index
    About the scraping exam:
        I expected it to go really deep,
        but the exam turned out to be just one xpath
        plus basic script creation.
    '''


    # api:1 => absolute path => returns a list of Elements
    t_info = root.xpath('/html/head/title')
    print(t_info)

    # api:2 => relative path => can locate from anywhere in the document
    t_info1 = root.xpath('//title')
    print(t_info1)
    t_info1 = root.xpath('/html//title')
    print(t_info1)
   
    # api:3 => locating by attribute => tag[@attribute="xxx"]
    div = root.xpath('//div')
    print(div)

    div_adc = root.xpath('//div[@class="adc"]')
    print(div_adc)

    li_list = root.xpath('//div[@class="adc"]//ul//li')
    print(li_list)

    # api:4 => locating by index => XPath indexes start at 1: tag[index]
    # However the XPath is written, xpath() returns a list; Python-side
    # subscripting (e.g. [0]) is what yields a single Element instead.
    li_list1 = root.xpath('//div[@class="adc"]//ul//li[1]')
    print(li_list1)

    li_list1 = root.xpath('//div[@class="adc"]/ul/li[1]/a')
    print(li_list1)

    # li_list2 = root.xpath('//div[@class="adc"]//ul//li')[0]
    # print(li_list2)

    # 2. Data parsing
        # 1. tag text
        # 2. tag attributes
    # text directly under the <a>
    a_text = root.xpath('//div[@class="adc"]//li[1]/a/text()')
    print(a_text)
    # all text under the <li>
    li1_text = root.xpath('//div[@class="adc"]//li[1]//text()')
    print(li1_text)

    # 2. attribute parsing => tag/@attribute-name
    img_info = root.xpath('//div[@class="top"]//img')
    print(img_info)

    img = root.xpath('//div[@class="top"]//img/@src')
    print(img)
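
A detail worth isolating: a/text() returns only the text nodes directly inside the <a>, while //text() collects every descendant text node, whitespace included. A small sketch of flattening the latter into one string (the HTML fragment here is made up for illustration):

from lxml import etree

root = etree.HTML('<div class="adc"><ul><li> <a href="#">Garen</a> top </li></ul></div>')

direct = root.xpath('//li[1]/a/text()')    # only text directly inside <a>
all_text = root.xpath('//li[1]//text()')   # every descendant text node
print(direct)                              # ['Garen']
print(all_text)                            # [' ', 'Garen', ' top ']
# Join and strip to turn the fragments into one clean string.
print("".join(all_text).strip())           # 'Garen top'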

2. Scraping second-hand housing listings with XPath

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a regular browser
    headers = {
        "User-Agent": "your UA string"
    }
    url = "https://dl.58.com/ershoufang"

    # 1. Generic crawler => fetch the whole page
    page_info = requests.get(url, headers=headers)

    # 2. Data parsing => instantiate etree from the response text
    root = etree.HTML(page_info.text)

    # 3. Tag location
    div_list = root.xpath('//section[@class="list"]/div')
    print(div_list)

    fp = open("D:\ \python-sk\data\二手房.txt","w",encoding="utf-8")
    for div in div_list:
        # 標籤定位,要加[0]得到裡面的資料,不然都是一個一個的list
        title = div.xpath('./a/div[@class="property-content"]/div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()')[0]
        fp.write(title+"\n")
        print(title, "=>爬蟲ok")
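
The open()/close() pairing above is fragile: if an exception fires mid-loop the file is never closed. A sketch of the same loop with a context manager, which closes the file in all cases (selector and path reused from above):

with open(r"D:\ \python-sk\data\二手房.txt", "w", encoding="utf-8") as fp:
    for div in div_list:
        # xpath() may return [] for rows with a different layout, so guard it.
        hits = div.xpath('./a/div[@class="property-content"]/div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()')
        if hits:
            fp.write(hits[0] + "\n")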

3. Scraping 58.com rental listings

'''
Exercise
    1. Scrape 58.com rentals
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a regular browser
    headers = {
        "User-Agent": "your UA string"
    }
    url = "https://dl.58.com/chuzu"

    # 1. Generic crawler => fetch the whole page
    page_info = requests.get(url=url, headers=headers)

    # 2. Data parsing => instantiate etree from the response text
    root = etree.HTML(page_info.text)

    # 3. Tag location
    house_list = root.xpath('//div[@class="list-wrap"]/div[@class="list-box"]/ul/li')
    print(house_list)

    fp = open("D:\ \python-sk\data\租房.txt","w",encoding="utf-8")
    # 4.資料解析,try,except丟擲異常
    for el in house_list:
        try:
            message = el.xpath('./div[@class="des"]/h2/a/text()')[0]
            # print(message + "\n")
            fp.write(message)
            print(message,"爬取ok")
        except  BaseException as e:
            print(e)
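
The [0]-then-catch pattern repeats in every example; a tiny helper (first_or_none is a hypothetical name, not part of lxml) makes the intent explicit and avoids the exception entirely:

def first_or_none(node, expr):
    """Return the first XPath match, or None instead of raising IndexError."""
    hits = node.xpath(expr)
    return hits[0] if hits else None

# Usage inside the loop above:
# message = first_or_none(el, './div[@class="des"]/h2/a/text()')
# if message is not None:
#     fp.write(message + "\n")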

4. Scraping 58.com used cars [a trap, beware, don't scrape this one]

'''
Exercise
    2. Scrape 58.com used cars
    There is a trap here and it doesn't really work: the prices are rendered
    as a UI component, and 58 blocks you constantly; one request and you're done.
    You can only read the code now. I'm already blocked, and requests come back
    with empty data.
'''

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a regular browser
    headers = {
        "User-Agent": "your UA string"
    }
    url = "https://dl.58.com/ershouche"

    # 1. Generic crawler => fetch the whole page
    page_info = requests.get(url=url, headers=headers)

    # 2. Data parsing
    root = etree.HTML(page_info.text)

    # 3. Tag location
    car_list = root.xpath('//div[@class="list-wrap"]/ul/li')
    print(car_list)

    # 4. Data parsing (left commented out; once blocked, the page comes back empty)
    # for el in car_list:
    #     message = el.xpath('./div[@class="info--wrap"]/a/div')
    #     print(message)
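
When a site starts blocking you, the request often still "succeeds" but the body is empty or a stub page. A rough sanity check before parsing (the 500-byte threshold is an arbitrary guess, not a rule):

page_info = requests.get(url=url, headers=headers, timeout=10)
# A blocked request frequently returns 200 with a near-empty body,
# so check the payload size as well as the status code.
if page_info.status_code != 200 or len(page_info.text) < 500:
    print("likely blocked:", page_info.status_code, len(page_info.text))
else:
    root = etree.HTML(page_info.text)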

5. Scraping images

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a regular browser
    headers = {
        "User-Agent": "your UA string"
    }

    url = "https://pic.netbian.com/4kmeinv/"

    # url = "https://pic.netbian.com/4kdongman"

    # 1. Generic crawler => fetch the whole page
    page_info = requests.get(url=url, headers=headers)
    # This site serves GBK; set the encoding before reading .text
    page_info.encoding = "gbk"
    print(page_info)
    # 2. Data parsing
    root = etree.HTML(page_info.text)

    # 3. Tag location
    li_list = root.xpath('//div[@class="slist"]/ul/li')

    for el in li_list:
        # src on the page is a relative path, so prepend the site root.
        img_url = "https://pic.netbian.com" + el.xpath('./a/img/@src')[0]
        img_title = el.xpath('./a/img/@alt')[0]
        print(img_url)

        # 1. Generic crawler => fetch the binary image data
        img_response = requests.get(url=img_url, headers=headers)
        img_data = img_response.content
        with open(rf"D:\ \python-sk\data\img\{img_title}.jpg", "wb") as fp:
            fp.write(img_data)
            print(img_title, "scraped ok")
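
Two things can break the image loop: the target folder may not exist yet, and scraped alt text can contain characters Windows forbids in file names. A small sketch (the regex is deliberately simple, not exhaustive):

import os
import re

save_dir = r"D:\ \python-sk\data\img"
os.makedirs(save_dir, exist_ok=True)  # create the folder once, up front

def safe_name(title):
    # Replace characters that are illegal in Windows file names.
    return re.sub(r'[\\/:*?"<>|]', "_", title)

# Usage inside the loop above:
# path = os.path.join(save_dir, safe_name(img_title) + ".jpg")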

6. Scraping city names and de-duplicating them

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a regular browser
    headers = {
        "User-Agent": "your UA string"
    }
    url = "http://www.air-level.com/"

    # 1. Generic crawler => fetch the whole page
    page_info = requests.get(url=url, headers=headers)

    # 2. Data parsing
    root = etree.HTML(page_info.text)

    # 3. Tag location
    a_list = root.xpath('//div[@id="citylist"]/div[@class="citynames"]/a')
    # print(a_list)

    fp = open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8")


    # Dedup method 1: use a set
    s1 = set()
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        s1.add(a_text)
    fp.write(str(s1))

    # Dedup method 2: one big string plus in / not in
    # Note: substring matching can misfire when one name is contained in another.
    st = ""
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        print(a_text)
        if a_text not in st:
            st = st + a_text
            fp.write(a_text)
            print("wrote one city name")
        else:
            print("name already present")

    fp.close()
    print("successful")

    '''
    Homework: dedup cityname
        1. Drop the major-cities block first
        2. Or scrape everything first, then dedup [recommended]
    '''
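
For the "scrape everything, then dedup" route recommended above, dict.fromkeys() drops duplicates while keeping first-seen order, which a set does not guarantee. A minimal sketch reusing a_list from the code above:

# Collect every city name first, then dedup in one pass, order preserved.
names = [a.xpath('./text()')[0] for a in a_list]
unique_names = list(dict.fromkeys(names))

with open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(unique_names))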