robots檢測外掛編寫
阿新 • • 發佈:2020-09-09
首先把 url 用 / 分割
# Split the full search URL on '/' so the scheme and host become list items.
url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
ends = "robots.txt"
url = url.split('/')
print(url)
輸出
['https:', '', 'www.baidu.com', 's?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875']
再做資料清洗:取出前 3 個元素,用 / 連接起來,最後利用 urljoin 把 ends 拼接上去
from urllib.parse import urljoin

# Keep the first three pieces ('https:', '', host), glue them back together,
# then let urljoin attach the robots.txt filename.
url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
ends = "robots.txt"
base = '/'.join(url.split('/')[:3])
url = urljoin(base, ends)
print(url)
輸出
https://www.baidu.com/robots.txt
現在得到了目標url再獲取robots.txt內容
匯入再打印出來
# Download the robots.txt we just built the URL for and show its body.
import requests
html = requests.get(url)  # NOTE(review): relies on `url` built in the previous snippet
print(html.text)
列印結果為
G:\python3.8\python.exe "F:/python post/code/RobotsTest.py"
User-agent: Baiduspider
Disallow: /baidu
Disallow: /s?
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Googlebot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: MSNBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Baiduspider-image
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: YoudaoBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou web spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou inst spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou spider2
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou blog
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou News Spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sogou Orion spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: ChinasoSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Sosospider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: yisouspider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: EasouSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: *
Disallow: /
Process finished with exit code 0
現在把內容寫入 robots.txt,再讀取出來,判斷我們請求頭中的 user-agent 是否命中 robots 裡面的禁止列表
現在我們假設頭為 Googlebot,來判斷是否存在
# Pretend our crawler identifies itself as Googlebot.
headers = {'user-agent': 'Googlebot'}
# Persist the fetched robots.txt body, then read it back line by line.
# NOTE(review): relies on `html` from the previous snippet.
with open('robots.txt', 'w', encoding='utf-8') as f:
    f.write(html.text)
with open('robots.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    # Bug fix: the original discarded the stripped result (strings are
    # immutable); keep it. strip() already removes the trailing '\n'.
    line = line.strip()
現在內容就在 line 裡面了,再來判斷 user-agent 是否在 line 裡面;若在,則獲取其後的 Disallow 值,存入一個新的列表裡面,這裡用一個標誌 flag 記錄狀態
# Scan the lines for our user-agent's section and collect its Disallow rules.
# Bug fix: the original called f.readlines() again here, but `f` was already
# closed by the `with` block above (ValueError); reuse the `lines` list.
domain = []
flag = False
for line in lines:
    line = line.strip()  # bug fix: original discarded the stripped value
    if headers['user-agent'] in line:
        flag = True  # we entered the section for our agent
        continue
    elif line.startswith('Disallow'):
        if flag is True:
            domain.append(line.replace('Disallow', ''))
    elif line == '':
        # Bug fix: a str read from a file is never None; a blank line marks
        # the end of a User-agent section, so stop once ours is finished.
        if flag is True:
            break
這裡來封裝一下完整程式碼
from urllib.parse import urljoin
import requests
# url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
# ends = "robots.txt"
class Robots:
    """Fetch a site's robots.txt and check whether a URL is allowed for an agent.

    Parameters
    ----------
    url : str
        The page URL we intend to crawl (kept in ``self.Baseurl``).
    Agent : str
        The user-agent name to look up in the robots.txt sections.
    """

    def __init__(self, url, Agent):
        self.Baseurl = url   # original URL we want to crawl
        self.url = url       # rewritten to scheme://host/robots.txt by Dourl()
        self.headers = {'user-agent': Agent}
        self.ends = 'robots.txt'
        self.Dourl()

    def Dourl(self):
        """Reduce self.url to its scheme://host part and append robots.txt."""
        parts = self.url.split('/')
        base = '/'.join(parts[:3])  # ['https:', '', host] -> 'https://host'
        self.url = urljoin(base, self.ends)

    def getRobots(self):
        """Download robots.txt and return True when self.Baseurl is allowed.

        Side effect: writes the fetched body to a local 'robots.txt' file.
        Returns False (and prints a notice) when any Disallow rule of our
        agent's section is a substring of self.Baseurl.
        """
        # Bug fix: identify as the agent we are checking rules for, and bound
        # the request so an unresponsive host cannot hang the script.
        html = requests.get(self.url, headers=self.headers, timeout=10)
        with open('robots.txt', 'w', encoding='utf-8') as f:
            f.write(html.text)
        domain = []
        flag = False
        # Parse the fetched text directly instead of re-reading the file we
        # just wrote — same lines, one less round-trip.
        for line in html.text.splitlines():
            line = line.strip()
            if self.headers['user-agent'] in line:
                flag = True  # entered our agent's section
                continue
            elif line.startswith('Disallow'):
                if flag:
                    # Bug fix: take everything after the first colon, so the
                    # rule parses whether or not a space follows 'Disallow:'
                    # (the original replace('Disallow: ', '') required one).
                    domain.append(line.split(':', 1)[1].strip())
            elif line == '':
                # Bug fix: file lines are never None; a blank line closes a
                # User-agent section, so stop once ours has been collected.
                if flag:
                    break
        for d in domain:
            if d in self.Baseurl:
                print("網站禁止爬取")
                return False
        return True
if __name__ == '__main__':
    # Prompt for the target URL and the crawler identity, then run the check
    # and print True (allowed) or False (disallowed).
    target = input('url is >>')
    agent = input('agent is >>')
    checker = Robots(target, agent)
    print(checker.getRobots())
F:\python post\code>python RobotsTest.py
url is >>https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875
agent is >>Googlebot
網站禁止爬取
False
F:\python post\code>