爬蟲入門 02 xpath庫初步使用
阿新 • • 發佈:2022-04-05
xpath概述:
1.xpath:
最常用、最高效的一種解析方式【資料解析首選】
2.如何使用?
1.例項化 etree => 載入資料
2.呼叫 api => 1.標籤定位 2.資料解析
xxx.xpath('')
=> xpath表示式:
1.標籤定位
2.資料解析
1.標籤的文字
2.標籤的屬性
3.api使用
1.標籤定位
2.資料解析
pip install lxml
xpath爬蟲案例
1.離線方式使用xpath
import requests
from lxml import etree

if __name__ == '__main__':
    # 1. Instantiate an element tree from a local (offline) HTML file.
    # Raw string fix: "\p" and "\l" in the original plain string were
    # invalid escape sequences (DeprecationWarning, SyntaxWarning on 3.12+);
    # the raw string keeps exactly the same characters.
    root = etree.parse(r"D:\ \python-sk\data\lol.html")

    # 2. Call the xpath API. XPath expressions do two jobs:
    #    1. tag location: absolute path / relative path / attribute / index
    #    2. data extraction: a tag's text and a tag's attributes

    # api 1: absolute path => returns a list of Elements
    t_info = root.xpath('/html/head/title')
    print(t_info)

    # api 2: relative path => can locate a tag starting from anywhere
    t_info1 = root.xpath('//title')
    print(t_info1)
    t_info1 = root.xpath('/html//title')
    print(t_info1)

    # api 3: attribute location => tag[@attr="value"]
    div = root.xpath('//div')
    print(div)
    div_adc = root.xpath('//div[@class="adc"]')
    print(div_adc)
    # (the original ran this exact query and print twice; once is enough)
    li_list = root.xpath('//div[@class="adc"]//ul//li')
    print(li_list)

    # api 4: index location => tag[index]; XPath indexes start at 1.
    # xpath() always returns a list; subscripting the *result* is what
    # turns it into a single Element.
    li_list1 = root.xpath('//div[@class="adc"]//ul//li[1]')
    print(li_list1)
    li_list1 = root.xpath('//div[@class="adc"]/ul/li[1]/a')
    print(li_list1)
    # li_list2 = root.xpath('//div[@class="adc"]//ul//li')[0]
    # print(li_list2)

    # Data extraction:
    # 1. tag text — text directly under the first <a>
    a_text = root.xpath('//div[@class="adc"]//li[1]/a/text()')
    print(a_text)
    # all text anywhere under the first <li>
    li1_text = root.xpath('//div[@class="adc"]//li[1]//text()')
    print(li1_text)
    # 2. tag attribute => tag/@attr
    img_info = root.xpath('//div[@class="top"]//img')
    print(img_info)
    img = root.xpath('//div[@class="top"]//img/@src')
    print(img)
2.xpath爬取二手房資訊
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => simulate a browser.
    headers = {
        "User-Agent": "你的ua"
    }
    url = "https://dl.58.com/ershoufang"
    # 1. Generic crawl: fetch the second-hand-housing listing page.
    page_info = requests.get(url, headers=headers)
    # 2. Parse the response body into an element tree.
    root = etree.HTML(page_info.text)
    # 3. Locate one <div> per listing.
    div_list = root.xpath('//section[@class="list"]/div')
    print(div_list)
    # Fix: the original opened this file and never closed it (resource
    # leak); "with" guarantees the handle is flushed and closed.
    # Raw string fix: "\p" in the plain-string path was an invalid escape.
    with open(r"D:\ \python-sk\data\二手房.txt", "w", encoding="utf-8") as fp:
        for div in div_list:
            # xpath() returns a list; take [0] to get the text node itself.
            title = div.xpath('./a/div[@class="property-content"]/div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()')[0]
            fp.write(title + "\n")
            print(title, "=>爬蟲ok")
3.爬取58同城租房資訊
'''
Exercise 1: scrape 58.com rental (chuzu) listings.
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => simulate a browser.
    headers = {
        "User-Agent": "你的ua"
    }
    url = "https://dl.58.com/chuzu"
    # 1. Generic crawl.
    page_info = requests.get(url=url, headers=headers)
    # 2. Parse the response body into an element tree.
    root = etree.HTML(page_info.text)
    # 3. Locate one <li> per rental listing.
    house_list = root.xpath('//div[@class="list-wrap"]/div[@class="list-box"]/ul/li')
    print(house_list)
    # Fix: "with" closes the file on every path (original leaked the
    # handle); raw string avoids the invalid "\p" escape in the path.
    with open(r"D:\ \python-sk\data\租房.txt", "w", encoding="utf-8") as fp:
        # 4. Extract each title. Some <li> carry no title, so the [0]
        # lookup can raise IndexError — catch exactly that, not
        # BaseException (which would also swallow KeyboardInterrupt
        # and SystemExit).
        for el in house_list:
            try:
                message = el.xpath('./div[@class="des"]/h2/a/text()')[0]
                # print(message + "\n")
                fp.write(message)
                print(message, "爬取ok")
            except IndexError as e:
                print(e)
4.爬取58同城二手車【坑,慎入,別爬】
'''
Exercise 2: scrape 58.com used cars (ershouche).

Pitfall — do not actually run this: the listing price is rendered by a
UI/font component rather than plain text, and 58 blocks the crawler after
roughly one request, returning empty data afterwards. Kept for reading only.
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => simulate a browser.
    headers = {
        "User-Agent": "你的ua"
    }
    url = "https://dl.58.com/ershouche"
    # 1. Generic crawl.
    page_info = requests.get(url=url, headers=headers)
    # 2. Parse the response body into an element tree.
    root = etree.HTML(page_info.text)
    # 3. Locate one <li> per car listing (empty once 58 starts blocking).
    car_list = root.xpath('//div[@class="list-wrap"]/ul/li')
    print(car_list)
    # 4. Per-item extraction intentionally left out: the price markup is
    # not plain text, so there is nothing reliable to parse here.
    # (The original's duplicated query and dead commented-out loop were
    # removed.)
5.爬取圖片
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => simulate a browser.
    headers = {
        "User-Agent": "你的ua"
    }
    url = "https://pic.netbian.com/4kmeinv/"
    # url = "https://pic.netbian.com/4kdongman"
    # 1. Generic crawl of the gallery page.
    page_info = requests.get(url=url, headers=headers)
    # The site serves GBK, not the encoding requests guesses by default.
    page_info.encoding = "gbk"
    print(page_info)
    # 2. Parse the response body into an element tree.
    root = etree.HTML(page_info.text)
    # 3. Locate one <li> per thumbnail.
    li_list = root.xpath('//div[@class="slist"]/ul/li')
    for el in li_list:
        # @src is site-relative, so prefix the host.
        img_url = "https://pic.netbian.com" + el.xpath('./a/img/@src')[0]
        img_title = el.xpath('./a/img/@alt')[0]
        print(img_url)
        # 1. Generic crawl again: fetch the binary image payload.
        # (typo fix: local name was "img_reponse")
        img_response = requests.get(url=img_url, headers=headers)
        img_data = img_response.content
        # Raw f-string fix: "\p" in the plain f-string path was an
        # invalid escape sequence; rf"..." keeps the same characters.
        with open(rf"D:\ \python-sk\data\img\{img_title}.jpg", "wb") as fp:
            fp.write(img_data)
            print(img_title, "爬取ok")
6.爬取城市資訊並去重
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => simulate a browser.
    headers = {
        "User-Agent": "你的ua"
    }
    url = "http://www.air-level.com/"
    # 1. Generic crawl.
    page_info = requests.get(url=url, headers=headers)
    # 2. Parse the response body into an element tree.
    root = etree.HTML(page_info.text)
    # 3. Locate one <a> per city.
    a_list = root.xpath('//div[@id="citylist"]/div[@class="citynames"]/a')
    # print(a_list)
    # Fix: "with" closes the file (original leaked the handle); raw string
    # avoids the invalid "\c" escape in the plain-string path.
    with open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8") as fp:
        # De-dup method 1: a set. (Original built an empty set with the
        # {""} + pop() trick; set() expresses the same thing directly —
        # {} would create a dict.)
        s1 = set()
        for a in a_list:
            a_text = a.xpath('./text()')[0]
            s1.add(a_text)
        fp.write(str(s1))
        # De-dup method 2: one accumulator string plus membership tests.
        # NOTE(review): "in" on a string is a *substring* test, so a city
        # whose name is contained inside an already-written name would be
        # wrongly skipped — the set approach above is the reliable one.
        st = ""
        for a in a_list:
            a_text = a.xpath('./text()')[0]
            print(a_text)
            if a_text not in st:  # "(...) == True" was redundant
                st = st + a_text
                fp.write(a_text)
                print("寫入一個字串")
            else:
                print("該字串已存在")
    print("successful")

# Homework: cityname de-dup
#   1. drop the "key cities" section first, or
#   2. scrape everything, then de-dup [recommended]