1. 程式人生 > >xpath基本操作用法

xpath基本操作用法

查詢 Python lxml 庫用法:https://lxml.de/

from lxml import etree
# Sample HTML markup: a <div> wrapping a <ul> with five <li> link entries.
# The empty first and last entries reproduce the newline at each end of the
# document, so the joined string starts and ends with '\n'.
text = '\n'.join((
    '',
    '<div>',
    '    <ul>',
    '        <li class="item-0"><a href="link1.html">first item</a></li>',
    '        <li class="item-1"><a href="link2.html">second item</a></li>',
    '        <li class="item-inactive"><a href="link3.html">third item</a></li>',
    '        <li class="item-1"><a href="link4.html">fourth item</a></li>',
    '        <li class="item-0"><a href="link5.html">fifth item</a></li>',
    '    </ul>',
    '</div>',
    '',
))
# Initialise from an HTML string: build an object that XPath can query.
# html = etree.HTML(text)
# tostring() serialises the corrected HTML as bytes.
# result = etree.tostring(html)
# print(result.decode('utf-8'))

# Read a local HTML file.
# html = etree.parse('./res.html',etree.HTMLParser())
# result = etree.tostring(html)
# print(result.decode('utf-8'))

# Select all nodes.
# html = etree.parse('./res.html',etree.HTMLParser())
# result = html.xpath('//*')
# print(result)

# Document queried by every example below: parsed from a local file with the
# HTML parser (not from the `text` string).
html = etree.parse(
    './2.html',
    parser=etree.HTMLParser(),
)
# 5. All nodes: select every <li> node in the local HTML file.
# result = html.xpath('//li')
# print(result[0])
# 6. Child nodes: select the <a> nodes directly inside <li>.
# result = html.xpath('//li/a')
# 7. Parent node: class attribute of the parent of the <a> node whose href
#    is link3.html.
# result = html.xpath('//a[@href="link3.html"]/../@class')
# 8. Attribute matching: select <li> nodes whose class is item-0.
# result = html.xpath('//li[@class="item-0"]')
# 9. Text extraction: text of the <a> nodes under every <li>.
# result = html.xpath('//li/a/text()')
# result = html.xpath('//li[@class="item-0"]//text()')
# 10. Attribute extraction.
# result = html.xpath('//li/a/@href')
# 11. Matching one value of a multi-valued attribute.
# result = html.xpath('//li[contains(@class,"class_1")]/a/text()')
# 12. Matching on several attributes at once.
# result = html.xpath('//li[contains(@class,"class_1") and @name="item"]/a/text()')
# 13. Selecting by position.
# result1 = html.xpath('//li[1]/a/text()')
# result2 = html.xpath('//li[last()]/a/text()')
# result3 = html.xpath('//li[position()<3]/a/text()')
# result4 = html.xpath('//li[last()-2]/a/text()')
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# 14. Axis selection.
# NOTE: each query below rebinds `result`, so only the last value survives;
# print or inspect `result` right after the query of interest.
# ancestor::* -- all ancestor nodes.
result = html.xpath('//li[1]/ancestor::*')
# ancestor::div -- only the <div> ancestors.
result = html.xpath('//li[1]/ancestor::div')
# attribute::* -- all attributes of the node.  The axis must be spelled
# "attribute"; the earlier spelling "atrribute" is not an XPath axis, so the
# expression was invalid and the script could not get past this line.
result = html.xpath('//li[1]/attribute::*')
# child:: -- direct children, here restricted to <a href="link1.html">.
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
# descendant:: -- all descendants, here restricted to <span> elements.
result = html.xpath('//li[1]/descendant::span')
# following:: -- all nodes after the current node, here restricted to the
# second following element.
result = html.xpath('//li[1]/following::*[2]')
# following-sibling:: -- all later siblings of the current node.
result = html.xpath('//li[1]/following-sibling::*')