實戰:利用 XPath 爬取網頁資料
阿新 • 發佈:2019-02-12
#coding=utf-8
#step1 匯入模組
import re
import requests
from lxml import etree
# Fetch the tutorial index page, collect the link of every chapter listed in
# it, then download each chapter and print the text of its paragraphs.

# Index page whose <li> entries link to the individual chapters.
url = 'http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'
# Browser-like User-Agent sent with every request.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}

# Scheme and host that the site-relative chapter links are appended to.
BASE_URL = 'http://www.liaoxuefeng.com'
# Seconds to wait for a response; requests does not time out on its own.
REQUEST_TIMEOUT = 10
# Captures the href of the first <a> that follows each <li id=...> tag.
# re.S lets '.' match newlines, since the markup may span several lines.
CHAPTER_LINK_RE = re.compile(r'<li id=.*?>.*?<a href="(.*?)">.*?</a>', re.S)


def fetch_html(page_url):
    """Return the response body of *page_url* as text.

    The request carries the module-level ``header`` and gives up after
    ``REQUEST_TIMEOUT`` seconds instead of waiting indefinitely.
    """
    response = requests.get(page_url, headers=header, timeout=REQUEST_TIMEOUT)
    return response.text


def extract_chapter_paths(index_html):
    """Return the href values of the chapter links found in *index_html*.

    The result is a list of strings in document order; it is empty when the
    markup contains no ``<li id=...>`` entry followed by a link.
    """
    return CHAPTER_LINK_RE.findall(index_html)


def build_chapter_url(path):
    """Return the absolute URL for a site-relative chapter *path*."""
    return BASE_URL + path


def extract_paragraphs(page_html):
    """Return the leading text of each <p> directly under the content element.

    Only paragraphs that are children of an element whose class attribute is
    exactly ``x-wiki-content`` are considered.
    """
    # Nothing to parse; avoid handing an empty document to the HTML parser.
    if not page_html.strip():
        return []
    tree = etree.HTML(page_html)
    if tree is None:
        return []
    paragraphs = tree.xpath('//*[@class="x-wiki-content"]/p')
    # .text is None when a paragraph starts with a child element rather than
    # text; skip those instead of printing the literal "None".
    return [node.text for node in paragraphs if node.text is not None]


def main():
    """Print the paragraph text of every chapter linked from the index page."""
    for path in extract_chapter_paths(fetch_html(url)):
        chapter_html = fetch_html(build_chapter_url(path))
        # Each chapter is parsed exactly once; the original looped over the
        # characters of the URL string and repeated the output for each one.
        for paragraph in extract_paragraphs(chapter_html):
            # Single-argument parenthesised form is valid on Python 2 and 3.
            print(paragraph)


if __name__ == '__main__':
    main()