[Baidu Crawler Series III] Deep Search (collect all URLs from a given page)
阿新 · Published: 2019-01-10
Goal
Given a URL and an output file, collect every URL found on that page; the output file and its write mode can be specified.
Approach
Parse the fetched page with the lxml parsing library and look at where the URLs sit in the markup. An href attribute falls into one of three cases (a short demonstration follows the list):
- a well-formed URL obtained directly from href, usable as-is;
- an href beginning with "javascript", which is skipped;
- a relative path beginning with "/", to which the base URL must be prepended.
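To make the three cases concrete, here is a minimal, self-contained demonstration of the parsing step. The HTML snippet is invented purely for illustration; it assumes only that lxml is installed:

# -*- coding: utf-8 -*-
# Minimal demonstration of extracting and classifying href values with lxml.
# The HTML snippet below is invented purely for illustration.
from lxml import etree

html = """
<html><body>
<a href="http://news.baidu.com/guonei">absolute URL</a>
<a href="javascript:void(0)">javascript pseudo-link</a>
<a href="/guoji">relative path</a>
</body></html>
"""

selector = etree.HTML(html)
for href in selector.xpath('//a/@href'):
    if href.startswith("javascript"):   # case 2: pseudo-link, skip it
        print("skip: " + href)
    elif href.startswith("/"):          # case 3: relative, needs the base URL prepended
        print("relative: " + href)
    else:                               # case 1: usable as-is
        print("absolute: " + href)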
Code
#coding:utf-8
# Web URL-collecting crawler: given a URL and an output file, collect every
# URL on that page; the file write mode can be specified.
# [email protected] 許娜
# os: ubuntu16.04
# python: python2
from __future__ import print_function  # make print() behave identically on python2 and python3
import requests
import time
from lxml import etree

def Redirect(url):
    """Follow redirects and return the final URL; on failure return the input unchanged."""
    try:
        res = requests.get(url, timeout=10)
        url = res.url
    except Exception as e:
        print("Redirect failed:", e)
        time.sleep(1)
    return url

def requests_for_url(url, save_file_name, file_model):
    """
    url:            the page to fetch
    save_file_name: file the collected URLs are written to
    file_model:     open() mode for the file, e.g. "a" to append
    """
    headers = {
        'pragma': "no-cache",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
    }
    try:
        response = requests.request("GET", url, headers=headers)
        selector = etree.HTML(response.text, parser=etree.HTMLParser(encoding='utf-8'))
    except Exception as e:
        print("Failed to load page:", e)
        return set()  # without this early return, selector would be undefined below
    return_set = set()
    with open(save_file_name, file_model) as f:
        try:
            context = selector.xpath('//a/@href')
            for i in context:
                try:
                    if not i or i.startswith("javascript"):
                        continue  # skip empty hrefs and javascript pseudo-links
                    if i.startswith("/"):
                        # Relative path: prepend the base URL. (The original
                        # i.replace("/", "") stripped every slash from the
                        # path, which breaks links such as /a/b.html.)
                        i = url.rstrip("/") + i
                    f.write(i)
                    f.write("\n")
                    return_set.add(i)
                    print(len(context), i)
                except Exception as e:
                    print("Failed to handle link:", e)
        except Exception as e:
            print("XPath extraction failed:", e)
    return return_set

if __name__ == '__main__':
    # Given a URL and an output file, collect every URL on the page.
    url = "http://news.baidu.com/"
    save_file_name = "save_url_2.txt"
    return_set = requests_for_url(url, save_file_name, "a")  # "a": append to the file
    print(len(return_set))
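The listing collects links only one page deep, yet requests_for_url returns a set, which invites going deeper, as the series title suggests. Below is a minimal sketch of a depth-limited crawl built on top of it; this is an extension under stated assumptions, not part of the original post. It assumes Redirect and requests_for_url from the listing above are in scope, uses the standard-library urljoin to resolve any remaining relative links (more robust than plain string concatenation), and puts the otherwise-unused Redirect helper to work on the start URL. deep_crawl and its depth parameter are hypothetical names introduced here.

# Hedged sketch: a depth-limited "deep search" on top of requests_for_url.
# Assumes Redirect and requests_for_url from the listing above are in scope.
try:
    from urlparse import urljoin        # python2
except ImportError:
    from urllib.parse import urljoin    # python3

def deep_crawl(start_url, save_file_name, depth=2):
    seen = set()                        # pages already fetched
    frontier = {Redirect(start_url)}    # resolve redirects on the seed first
    for _ in range(depth):
        next_frontier = set()
        for page in frontier:
            if page in seen:
                continue
            seen.add(page)
            for link in requests_for_url(page, save_file_name, "a"):
                # urljoin handles every relative form, not only paths that
                # start with "/", and leaves absolute URLs untouched.
                next_frontier.add(urljoin(page, link))
        frontier = next_frontier
    return seen

# usage: deep_crawl("http://news.baidu.com/", "save_url_deep.txt", depth=2)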