xpath爬取簡書 攝影專題裡的最新收錄 文章的詳情及圖片 完整程式碼
阿新 • • 發佈:2018-12-11
import requests
from lxml import etree #etree
import urllib.parse
import re
# Request headers passed to every requests.get() call in this script.
# The User-Agent is a desktop Chrome string, so requests identify as a browser.
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.106 Safari/537.36"
    ),
}
def suibian(url):
    """Crawl one listing page and process every article linked from it.

    Fetches ``url``, finds each ``<li>`` inside ``<ul class="note-list">``,
    resolves the article link found in its ``<a class="title">`` element and
    hands the absolute article URL to ``jiexi``.

    Args:
        url: Address of the listing page to crawl.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=header, timeout=10)
    page = etree.HTML(response.text)
    for item in page.xpath('//ul[@class="note-list"]/li'):
        print(item)
        hrefs = item.xpath('.//a[@class="title"]/@href')
        if not hrefs:
            # List entry without a title link: skip it instead of raising
            # IndexError and aborting the rest of the page.
            continue
        # hrefs are site-relative; resolve against the final (post-redirect) URL.
        article_url = urllib.parse.urljoin(response.url, hrefs[0])
        print(article_url)
        jiexi(article_url)
def jiexi(url):
    """Download one article page and save its images and text.

    The article title (``<h1 class="title">``) is used as the base name of
    the output files. Every image referenced by a ``data-original-src``
    attribute inside ``<div class="show-content">`` is passed to ``img``, and
    every text node inside that div is appended to the article's text file
    through ``writes``.

    Args:
        url: Absolute address of the article page.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=header, timeout=10)
    page = etree.HTML(response.text)

    # xpath() returns a list; formatting the list itself would yield file
    # names such as "['Title'].txt", so pick the first title string.
    titles = page.xpath('//h1[@class="title"]/text()')
    title = titles[0].strip() if titles else 'untitled'
    # Characters that are path separators or otherwise illegal in file names
    # would make open() fail, so neutralise them.
    title = re.sub(r'[\\/:*?"<>|]', '_', title)

    image_sources = page.xpath(
        '//div[@class="show-content"]//img/@data-original-src')
    text_fragments = page.xpath('//div[@class="show-content"]//text()')

    for source in image_sources:
        # Sources are protocol-relative ("//host/path"). urljoin adds the
        # page's scheme and leaves already-absolute URLs intact, unlike a
        # blanket replace('//', 'https://').
        img(urllib.parse.urljoin(response.url, source), title)
    for fragment in text_fragments:
        writes(fragment, title)
def img(url, b):
    """Download an image and store it as ``<b><suffix>.jpg``.

    The suffix is the (at most 15 character) tail of the final image URL
    that follows its last hyphen; it keeps several images of the same
    article from overwriting each other.

    Args:
        url: Absolute address of the image.
        b: File-name prefix (the article title); it is inserted into the
            output path unchanged.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=header, timeout=10)
    final_url = response.url

    match = re.search(r".*-(.{1,15})", final_url, re.S)
    if match:
        suffix = match.group(1)
    else:
        # No hyphen in the URL: fall back to the last path segment instead
        # of raising IndexError.
        suffix = urllib.parse.urlsplit(final_url).path.rsplit('/', 1)[-1][:15]
    # The URL tail may contain "/", "?" or similar; those are not usable in
    # a file name, so replace anything outside word characters, "." and "-".
    suffix = re.sub(r'[^\w.-]', '_', suffix) or 'image'

    with open('{}{}.jpg'.format(b, suffix), 'wb') as f:
        f.write(response.content)
def writes(t, b):
    """Append a piece of text to the file ``<b>.txt``.

    Args:
        t: Text to append.
        b: Base name (or path prefix) of the target file, without the
            ``.txt`` extension.
    """
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII article text and would raise UnicodeEncodeError.
    with open('{}.txt'.format(b), 'a', encoding='utf-8') as f:
        f.write(t)
if __name__ == '__main__':
    # Entry point: crawl the first page of the collection listing, ordered
    # by the time the articles were added.
    url = "https://www.jianshu.com/c/7b2be866f564?order_by=added_at&page=1"
    suibian(url)