解析庫--XPath

阿新 • • 發佈：2020-10-13

from lxml import etree

# Walk-through of common XPath queries with lxml. Each section parses a small
# HTML snippet and prints the result of one or more xpath() calls.

# Sample document used by the basic selection examples.
text = '''
<div>
<ul>
<li class = "item-0"><a href = "link1.html">first item</a></li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-inactive"><a href = "link3.html">third item</a></li>
<li class = "item-1"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifth item</a></li>
</ul>
</div>

'''
html = etree.HTML(text)
# Serialized form of the parsed (and auto-completed) document; kept for
# inspection only, it is not printed below.
result = etree.tostring(html)

code_all = html.xpath("//*")  # every element node in the document
code_li = html.xpath("//li")  # every <li> element
code_a = html.xpath("//li/a")  # <a> elements that are direct children of <li>
# Starting from a known child node, step up with ".." and read the parent's
# class attribute.
code_p = html.xpath("//a[@href = 'link4.html']/../@class")
print(code_p)

print(code_li)
print("///")
print(code_all)
print("///")
print(code_a)

# Attribute matching: keep only <li> elements whose class equals "item-0".
attribute = html.xpath("//li[@class = 'item-0']")
print(attribute)

# Text retrieval: text() selects only text nodes that are direct children of
# <li>. The text here lives inside the nested <a>, so this yields an empty list.
# Stored under its own name so the HTML source in `text` is not overwritten.
li_text = html.xpath("//li/text()")
print(li_text)

# Attribute retrieval: collect the href value of every <a> under an <li>.
attribute_get = html.xpath("//li/a/@href")
print(attribute_get)

# Matching an attribute that holds several values: contains() tests for a
# substring, so it still matches when class is "li li-fist".
text1 = """
<li class = "li li-fist"><a href = "link.html">first item</a></li>
"""
html1 = etree.HTML(text1)
attribute_number = html1.xpath("//li[contains(@class,'li')]/a/text()")
print(attribute_number)

# Matching on several attributes at once: combine predicates with "and".
text2 = """
<li class = "li li-first" name = "name"><a href = "link.html">first item</a></li>
"""
html2 = etree.HTML(text2)
attribute_text2 = html2.xpath("//li[contains(@class,'li') and @name = 'name']/a/text()")
print(attribute_text2)

# Selecting by position: when a query matches several nodes but only one of
# them is wanted, put an index (or a positional function) inside the square
# brackets to pick nodes by their order.
text3 = '''
<div>
<ul>
<li class = "item-0"><a href = "link1.html">first item</a></li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-inactive"><a href = "link3.html">third item</a></li>
<li class = "item-1"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifth item</a></li>
</ul>
</div>
'''
html3 = etree.HTML(text3)
result = html3.xpath("//li[1]/a/text()")  # first <li> (XPath positions start at 1)
print(result)
result = html3.xpath("//li[last()]/a/text()")  # last <li>
print(result)
result = html3.xpath("//li[position() < 3]/a/text()")  # <li> nodes at positions 1 and 2
print(result)

# Axis selection.
result = html3.xpath("//li[1]/ancestor::*")  # all ancestors; "*" matches any element
print(result)
result = html3.xpath("//li[1]/ancestor::div")  # only the <div> ancestor
print(result)
result = html3.xpath("//li[1]/attribute::*")  # all attribute values of the first <li>
print(result)
 #執行結果
    ['item-1']
[<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489c948>, <Element li at 0x7f72f489c9c8>, <Element li at 0x7f72f489ca08>, <Element li at 0x7f72f489ca88>]
///
[<Element html at 0x7f72f489c808>, <Element body at 0x7f72f489c788>, <Element div at 0x7f72f489c748>, <Element ul at 0x7f72f489c848>, <Element li at 0x7f72f489c888>, <Element a at 0x7f72f489c908>, <Element li at 0x7f72f489c948>, <Element a at 0x7f72f489c988>, <Element li at 0x7f72f489c9c8>, <Element a at 0x7f72f489c8c8>, <Element li at 0x7f72f489ca08>, <Element a at 0x7f72f489ca48>, <Element li at 0x7f72f489ca88>, <Element a at 0x7f72f489cac8>]
///
[<Element a at 0x7f72f489c908>, <Element a at 0x7f72f489c988>, <Element a at 0x7f72f489c8c8>, <Element a at 0x7f72f489ca48>, <Element a at 0x7f72f489cac8>]
[<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489ca88>]
[]
['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
['first item']
['first item']
['first item']
['fifth item']
['first item', 'second item']
[<Element html at 0x7f72f489cdc8>, <Element body at 0x7f72f489cec8>, <Element div at 0x7f72f489cf48>, <Element ul at 0x7f72f489cf08>]
[<Element div at 0x7f72f489cf48>]
['item-0']