
Basic crawler framework: xpath (Part 1) --- xpath basics


Introduction to xpath

  • lxml is a third-party library for parsing and processing xml documents (an html document is a special kind of xml document)

  • xpath is a syntax for extracting target elements or attributes from an xml document based on its document structure; lxml is the basic tool it relies on here

  • etree is lxml's parsing tool; it turns an html document into a node-tree structure
  • 1. Parse the local test.html file into a node-tree object
from lxml import etree
html_tree = etree.parse("./test.html")

print(html_tree)
# Output: <lxml.etree._ElementTree object at 0x0000028A81E566C8>

In xpath syntax, "/" selects the children of the current node and "//" selects its descendants; a path that begins with "/" starts the search from the root node. The xpath() function takes one string argument, the xpath path, which locates the target nodes; its return value is a list of the nodes it located. [Note] In xpath syntax counting starts at 1; there is no index 0 and no negative indices.
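To make the examples below concrete, here is a minimal, self-contained sketch; the HTML snippet and its id/class values are hypothetical stand-ins, since the actual test.html is not shown:

from lxml import etree

# a hypothetical snippet standing in for test.html
doc = """<html><body>
<ol>
  <li id="first">one</li>
  <li class="dudu">two</li>
</ol>
</body></html>"""

tree = etree.HTML(doc)                # parse an HTML string instead of a file
print(tree.xpath("//li[1]/text()"))   # ['one'] -- counting starts at 1
print(tree.xpath("//li[2]/@class"))   # ['dudu']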
  • 2. Getting nodes
ret = html_tree.xpath("/html/body/ol/li[1]")
ret = html_tree.xpath("/html/body/div/div[1]/a")    # the xpath path inside the string locates the target node
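Worth keeping straight: the index inside an xpath expression is 1-based, while the Python list that xpath() returns is 0-based. A minimal sketch, assuming the same html_tree as above:

ret = html_tree.xpath("/html/body/ol/li")           # a list of Element objects
first = ret[0]                                      # Python list: index 0 is the first match
same = html_tree.xpath("/html/body/ol/li[1]")[0]    # xpath: [1] is the first match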
  • 3. Extracting a node's attributes and content
ret = html_tree.xpath("/html/body/div/div[1]/a/text()")    # extract the tag's text content

ret = html_tree.xpath("/html/body/div/div[1]/a/@href")     # extract the href attribute; [note] in xpath syntax every attribute name must be prefixed with "@"
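Note that text() only returns the text sitting directly inside the selected tag; to collect text scattered across nested children, select with //text() and join the pieces (the full example further down relies on this). A sketch with hypothetical markup, assuming etree is imported as above:

frag = etree.HTML('<div class="title"><a href="#">Hello <b>world</b></a></div>')
print(frag.xpath("//a/text()"))              # ['Hello '] -- direct text only
print("".join(frag.xpath("//a//text()")))    # 'Hello world' -- all descendant text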
  • 4. Locating
# locating by hierarchy
ret = html_tree.xpath("/html/body//li/text()")    # get every li on the page

# locating by attribute
ret = html_tree.xpath("/html/body//li[@id]/text()")                       # every li on the page that has an id attribute
ret = html_tree.xpath("/html/body//li[@class='dudu']/text()")             # every li whose class attribute is dudu
ret = html_tree.xpath("/html/body//li[@class='tanshui taohua']/text()")   # the attribute value must be written out in full
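Standard XPath 1.0 position predicates also work in lxml alongside attribute predicates; a small sketch:

ret = html_tree.xpath("/html/body//li[last()]/text()")          # the last li under each parent
ret = html_tree.xpath("/html/body//li[position()<=2]/text()")   # the first two li under each parent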
  • 5. Fuzzy matching
ret = html_tree.xpath("/html/body//li[contains(@class,'he')]/text()")     # contains: every li whose class value contains "he"
ret = html_tree.xpath("/html/body//li[starts-with(@class,'h')]/text()")   # starts-with: every li whose class value starts with "h"
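contains() and starts-with() are not limited to attributes; they can test text content as well. A sketch (the substring here is hypothetical):

ret = html_tree.xpath("/html/body//li[contains(text(),'he')]/text()")     # every li whose text contains "he"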
  • 6. Logical matching
ret = html_tree.xpath("/html/body//li[@class and @id]/text()")     # and: every li that has both an id attribute and a class attribute
ret = html_tree.xpath("//li[@class='nene' or @id='hh']/text()")    # or: every li whose id is hh or whose class is nene
print(ret)
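not() rounds out the logic by negating a predicate, which makes it easy to select nodes that lack an attribute; a sketch:

ret = html_tree.xpath("/html/body//li[not(@class)]/text()")    # every li that has no class attribute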
  • 7. Relative positioning
ol = html_tree.xpath("//ol[2]")[0]    # find the second ol
print(ol)

# extract the li from the ol found above; "." means the current node, ".." means its parent
ret = ol.xpath("//li/text()")     # absolute path: no matter which object calls xpath(), the search starts from the document root
ret = ol.xpath(".//li/text()")    # relative path: the search starts from the object that calls xpath()

print(ret)
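This distinction matters most when iterating over a node list: inside the loop, prefix the path with "." so the query stays within the current element, because a bare "//" restarts from the document root on every pass (the example below does exactly this with .//div[...]). A sketch:

for ol in html_tree.xpath("//ol"):
    inside = ol.xpath(".//li/text()")      # only the li inside this ol
    everywhere = ol.xpath("//li/text()")   # every li in the whole document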

Example

from urllib import request, parse
from time import sleep
from lxml import etree
import re
import json
import csv
import redis

# 1. [Fetching the data]
def request_from(url, page, city):
    page_url = url % (city, page)
    req = request.Request(headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"}, url=page_url)
    return req

def get_pages(url, start, end, city):
    # build a request object for every page
    for page in range(start, end + 1):
        req = request_from(url=url, page=page, city=city)
        # send the request
        res = request.urlopen(req)
        sleep(1)
        html = res.read().decode("utf-8")

        yield html

# 2. [Parsing the data]
def analyze_data(pages):
    for page in pages:
        # turn the page into a node tree with etree
        page_tree = etree.HTML(page)
        house_list = page_tree.xpath("//ul[@class='sellListContent']/li")
        # print(house_list)
        # iterate over each li (one listing per li)
        for house in house_list:
            # extract the content
            # build an item dict that gathers one listing's fields
            item = {}
            item["title"] = house.xpath(".//div[@class='title']//a/text()")[0]
            item["houseInfo"] = "".join(house.xpath(".//div[@class='houseInfo']//text()"))
            item["positionInfo"] = "".join(house.xpath(".//div[@class='positionInfo']//text()"))
            item["unitPrice"] = re.findall(pattern=r"[0-9]+", string=house.xpath(".//div[@class='unitPrice']//text()")[0])[0]
            item["totalPrice"] = house.xpath(".//div[@class='totalPrice']//text()")[0]
            item["picUrl"] = house.xpath(".//img[@class='lj-lazy']/@data-original")[0]

            yield item

# 3. [Storing the data]
def write_to_json(houses):
    # assemble the json payload
    # a dict that holds all the listings
    hd = {}
    # a list that stores each listing
    hl = []
    for house in houses:
        hl.append(house)
    hd["house"] = hl
    # print(hd)
    with open("house.json", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(hd))

def write_to_redis(houses):
    # open the redis connection
    rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=6)
    for house in houses:
        # redis values must be str/bytes/numbers, so serialize the dict first
        rds.lpush("ershoufang", json.dumps(house))

def write_to_csv(houses):
    # open a csv file
    fp = open("ershoufang.csv", "a+", encoding="utf-8", newline="")
    # create a writer object
    writer = csv.writer(fp)
    # write the header row
    writer.writerow(["title", "houseInfo", "positionInfo", "unitPrice", "totalPrice", "picUrl"])
    for house in houses:
        # each row of the csv table is a list
        values = []
        for k, v in house.items():
            values.append(v)
        writer.writerow(values)
    fp.close()

if __name__ == "__main__":
    url = "https://%s.lianjia.com/ershoufang/pg%d/"
    city = input("Enter the city abbreviation: ")
    start = int(input("Enter the start page: "))
    end = int(input("Enter the end page: "))
    pages = get_pages(url=url, city=city, start=start, end=end)
    # print(pages)
    houses = analyze_data(pages)
    # save to csv
    write_to_csv(houses)
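To persist the results as JSON or to Redis instead, swap the final call. Note that houses is a generator, so it can be consumed by only one writer per run; a sketch of the alternative endings:

houses = analyze_data(pages)
write_to_json(houses)      # saves everything into house.json
# write_to_redis(houses)   # would see nothing if called after write_to_json, since the generator is already exhausted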
