爬蟲基礎框架 之xpath(一) --- xpath基礎
阿新 • • 發佈:2019-05-02
join read [1] tco ext from ans 註意 csv
xpath簡介
-
lxml是一個第三方框架,用於對xml文件進行格式化操作(html文件是一種特殊xml文件)
-
xpath是一種基於xml文件,根據xml文件的文檔結構來提取目標元素或者屬性的語法,它的基本依賴工具就是lxml
- etree是lxml中的一種格式化工具,用於將html文件格式化成一個節點樹結構
- 1、將本地的test.html文件格式化成一個節點樹對象
1 from lxml import etree 2 html_tree = etree.parse("./test.html") 3 4 print(html_tree) 5 輸出結果: <lxml.etree._ElementTree object at 0x0000028A81E566C8> 6 7 xpath語法中"/"代表當前節點的子節點 "//"代表當前節點的後代節點 如果以"/"開頭,代表從根節點開始查找 8 xpath函數,傳入一個字符串參數,代表的是xpath路徑,用於定位目標節點,返回值是一個列表,列表中是定位到的那些節點 9 「註意」在xpath語法中數字都是從1開始數,沒有0序號也沒有負數
- 2、獲取節點
1 ret = html_tree.xpath("/html/body/ol/li[1]") 2 ret = html_tree.xpath("/html/body/div/div[1]/a") # 裏面用xpath路徑來定位目標節點
- 3、提取節點的屬性和內容
1 ret = html_tree.xpath("/html/body/div/div[1]/a/text()") # 提取標簽的內容 2 3 ret = html_tree.xpath("/html/body/div/div[1]/a/@href") # 提取href屬性,【註意】xpath語法中所有的節點屬性要在前面加上“@ ”符號
- 4、定位
1 層級定位 2 ret = html_tree.xpath("/html/body//li/text()") # 獲取頁面上的所有的li
1 屬性定位 2 3 ret = html_tree.xpath("/html/body//li[@id]/text()") # 查找頁面上所有帶有id屬性的li 4 ret = html_tree.xpath("/html/body//li[@class='dudu']/text()") # 查找頁面上所有的class屬性為dudu的li 5 ret = html_tree.xpath("/html/body//li[@class='tanshui taohua']/text()") # 屬性的值一定寫全
- 5、模糊匹配
1 ret = html_tree.xpath("/html/body//li[contains(@class,'he')]/text()") # 包含:查找所有class值中包含he的li 2 ret = html_tree.xpath("/html/body//li[starts-with(@class,'h')]/text()") # 開頭:查找所有的class值以h開頭的li
- 6、邏輯匹配
1 ret = html_tree.xpath("/html/body//li[@class and @id]/text()") # 與:查找所有的包含id屬性和class屬性的那些li 2 ret = html_tree.xpath("//li[@class='nene' or @id='hh']/text()") # 或:查找所有的id值為hh,或者class值為nene的li 3 print(ret)
- 7、相對定位
1 ol = html_tree.xpath("//ol[2]")[0] print(ol) # 查找第二個ol 2 3 從上面查找到的ol中提取li "."代表當前 ".."代表當前的上一級 4 ret = ol.xpath("//li/text()") # 用絕對路徑來提取,無論xpath函數前面用誰調用,都是從文檔的根節點來提取 5 ret = ol.xpath("..//li/text()") # 用相對路徑來提取,從xpath函數前面的調用對象開始查找 6 7 print(ret)
實例
"""Lianjia second-hand housing crawler.

Fetches listing pages for a city, parses each listing with XPath,
and persists the results to JSON, Redis, or CSV.
"""
from urllib import request, parse  # parse kept from original import line (unused here)
from time import sleep
import re
import json
import csv

# NOTE(review): `redis` and `lxml` are third-party dependencies; they are
# imported lazily inside the functions that need them so the module can be
# imported (and partially used) without them installed.


# 1. [Data acquisition]
def request_from(url, page, city):
    """Build a urllib Request for one listing page.

    url  -- format string with two placeholders: city abbreviation (%s)
            and page number (%d), e.g. "https://%s.lianjia.com/ershoufang/pg%d/"
    page -- 1-based page number
    city -- city abbreviation used as the subdomain (e.g. "bj")
    """
    page_url = url % (city, page)
    # Browser-like User-Agent so the site does not reject the request.
    req = request.Request(
        url=page_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/71.0.3578.80 Safari/537.36'
        },
    )
    return req


def get_pages(url, start, end, city):
    """Generator yielding decoded HTML for pages start..end (inclusive)."""
    for page in range(start, end + 1):
        req = request_from(url=url, page=page, city=city)
        # fix: context manager closes the HTTP response (original leaked it)
        with request.urlopen(req) as res:
            html = res.read().decode("utf-8")
        sleep(1)  # be polite: pause between requests
        yield html


# 2. [Data parsing]
def anylasis_data(pages):
    """Generator yielding one dict of fields per house listing.

    (Function name kept for backward compatibility; "analysis" is misspelled.)
    """
    from lxml import etree  # deferred: only needed when actually parsing

    for page in pages:
        # Turn the page into a node tree and take one <li> per listing.
        page_tree = etree.HTML(page)
        house_list = page_tree.xpath("//ul[@class='sellListContent']/li")
        for house in house_list:
            # Aggregate the fields of one listing into a dict.
            item = {}
            item["title"] = house.xpath(".//div[@class='title']//a/text()")[0]
            item["houseInfo"] = "".join(house.xpath(".//div[@class='houseInfo']//text()"))
            item["positionInfo"] = "".join(house.xpath(".//div[@class='positionInfo']//text()"))
            # Keep only the digits of the unit price (e.g. "单价54321元/平米" -> "54321").
            item["unitPrice"] = re.findall(r'[0-9]+', house.xpath(".//div[@class='unitPrice']//text()")[0])[0]
            item["totalPrice"] = house.xpath(".//div[@class='totalPrice']//text()")[0]
            # Lazily-loaded images keep the real URL in data-original.
            item["picUrl"] = house.xpath(".//img[@class='lj-lazy']/@data-original")[0]
            yield item


# 3. [Data storage]
def write_to_json(houses):
    """Dump all houses into house.json under the top-level key "house"."""
    hd = {"house": list(houses)}
    with open("house.json", 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps Chinese text human-readable in the file
        json.dump(hd, fp, ensure_ascii=False)


def write_to_redis(houses):
    """Push each house onto the Redis list "ershoufang"."""
    import redis  # deferred: optional third-party dependency

    rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=6)
    for house in houses:
        # fix: redis cannot store a dict value directly -- serialize to JSON
        rds.lpush("ershoufang", json.dumps(house, ensure_ascii=False))


def write_to_csv(houses):
    """Append all houses to ershoufang.csv (header row written on every call)."""
    # fix: newline='' as required by the csv module; utf-8 for Chinese text;
    # context manager replaces the manual open/close pair.
    with open("ershoufang.csv", "a+", newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(["title", "houseInfo", "positionInfo",
                         "unitPrice", "totalPrice", "picUrl"])
        for house in houses:
            # Each CSV row is the listing's values in insertion order.
            writer.writerow(list(house.values()))


if __name__ == '__main__':
    url = "https://%s.lianjia.com/ershoufang/pg%d/"
    city = input("請輸入城市簡稱:")
    start = int(input("請輸入起始頁:"))
    end = int(input("請輸入終止頁:"))
    pages = get_pages(url=url, city=city, start=start, end=end)
    houses = anylasis_data(pages)
    # Persist to CSV (write_to_json / write_to_redis are alternatives).
    write_to_csv(houses)
爬蟲基礎框架 之xpath(一) --- xpath基礎