爬蟲--Lxml簡單案例
阿新 • • 發佈:2018-11-11
1.以爬取簡書首頁標題為例
import requests
from lxml import etree


class LxmlSpider:
    """Minimal scraper that prints article titles from the Jianshu home page."""

    def __init__(self):
        # One shared Session so repeated requests reuse connections and cookies.
        self.session = requests.Session()

    def jian_shu_spider(self, url, headers, timeout=10):
        """Fetch ``url`` and print the text of every title link found in it.

        Args:
            url: Address of the page to download.
            headers: Mapping of HTTP request headers sent with the request
                (the caller below supplies Referer and User-Agent).
            timeout: Seconds to wait for the server before giving up.
                Requests applies no timeout by default, so without this a
                stalled server would block the script forever.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        # Go through the session created in __init__ rather than the
        # module-level requests.get(), which would ignore it entirely.
        response = self.session.get(url, headers=headers, timeout=timeout).text
        result = etree.HTML(response)
        # XPath for the title anchors: <a class="title"> directly inside a <div>.
        title_list = result.xpath("//div/a[@class='title']")
        for title in title_list:
            print("文章標題:%s" % title.text)


if __name__ == '__main__':
    lxml_soup = LxmlSpider()
    lxml_soup.jian_shu_spider(
        "http://www.jianshu.com",
        {
            "Referer": "https://www.jianshu.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
        }
    )
2.爬取結果