1. 程式人生 > >scrapy解析庫之Xpath( Selectors)

scrapy解析庫之Xpath( Selectors)

#1 //與/
#2 text
#3、extract與extract_first:從selector物件中解出內容
#4、屬性:xpath的屬性加字首@
#4、巢狀查詢
#5、設定預設值
#4、按照屬性查詢
#5、按照屬性模糊查詢
#6、正則表示式
#7、xpath相對路徑
#8、帶變數的xpath
response.selector.css()
response.selector.xpath()
可簡寫為
response.css()
response.xpath()

#1 //與/
response.xpath('//body/a')
response.css('div a::text')

>>> response.xpath('//body/a') #開頭的//代表從整篇文件中尋找,body之後的/代表body的兒子
[]
>>> response.xpath('//body//a') #開頭的//代表從整篇文件中尋找,body之後的//代表body的子子孫孫
[<Selector xpath='//body//a' data='<a href="image1.html">Name: My image 1 <'>,
 <Selector xpath='//body//a' data='<a href="image2.html">Name: My image 2 <'>,
 <Selector xpath='//body//a' data='<a href="image3.html">Name: My image 3 <'>,
 <Selector xpath='//body//a' data='<a href="image4.html">Name: My image 4 <'>,
 <Selector xpath='//body//a' data='<a href="image5.html">Name: My image 5 <'>]

#2 text
>>> response.xpath('//body//a/text()')
>>> response.css('body a::text')

#3、extract與extract_first:從selector物件中解出內容
>>> response.xpath('//div/a/text()').extract()
['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
>>> response.css('div a::text').extract()
['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
>>> response.xpath('//div/a/text()').extract_first()
'Name: My image 1 '
>>> response.css('div a::text').extract_first()
'Name: My image 1 '

#4、屬性:xpath的屬性加字首@
>>> response.xpath('//div/a/@href').extract_first()
'image1.html'
>>> response.css('div a::attr(href)').extract_first()
'image1.html'

#4、巢狀查詢
>>> response.xpath('//div').css('a').xpath('@href').extract_first()
'image1.html'

#5、設定預設值
>>> response.xpath('//div[@id="xxx"]').extract_first(default="not found")
'not found'

#4、按照屬性查詢
response.xpath('//div[@id="images"]/a[@href="image3.html"]/text()').extract()
response.css('#images a[href="image3.html"]::text').extract()

#5、按照屬性模糊查詢
response.xpath('//a[contains(@href,"image")]/@href').extract()
response.css('a[href*="image"]::attr(href)').extract()
response.xpath('//a[contains(@href,"image")]/img/@src').extract()
response.css('a[href*="image"] img::attr(src)').extract()
response.xpath('//*[@href="image1.html"]')
response.css('*[href="image1.html"]')

#6、正則表示式
response.xpath('//a/text()').re(r'Name: (.*)')
response.xpath('//a/text()').re_first(r'Name: (.*)')

#7、xpath相對路徑
>>> res=response.xpath('//a[contains(@href,"3")]')[0]
>>> res.xpath('img')
[<Selector xpath='img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('./img')
[<Selector xpath='./img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('.//img')
[<Selector xpath='.//img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('//img') #這就是從頭開始掃描
[<Selector xpath='//img' data='<img src="image1_thumb.jpg">'>,
 <Selector xpath='//img' data='<img src="image2_thumb.jpg">'>,
 <Selector xpath='//img' data='<img src="image3_thumb.jpg">'>,
 <Selector xpath='//img' data='<img src="image4_thumb.jpg">'>,
 <Selector xpath='//img' data='<img src="image5_thumb.jpg">'>]

#8、帶變數的xpath
>>> response.xpath('//div[@id=$xxx]/a/text()',xxx='images').extract_first()
'Name: My image 1 '
>>> response.xpath('//div[count(a)=$yyy]/@id',yyy=5).extract_first() #求有5個a標籤的div的id
'images'

https://docs.scrapy.org/en/latest/topics/selectors.html