最全Python爬蟲總結(轉載)

阿新 • • 發佈：2018-04-06

其中網頁 -i 變量 oba cati nod style 應該

[html] view plain copy

最近總是要爬取一些東西，索性就把Python爬蟲的相關內容都總結起來了，自己多動手還是好。

（1）普通的內容爬取
（2）保存爬取的圖片/視頻和文件和網頁
（3）普通模擬登錄
（4）處理驗證碼登錄
（5）爬取js網站
（6）全網爬蟲
（7）某個網站的站內所有目錄爬蟲
（8）多線程
（9）爬蟲框架Scrapy

一，普通的內容爬取
[html] view plain copy

#coding=utf-8
"""Fetch one page with custom request headers and print its body (Python 2)."""
import urllib
import urllib2

url = 'http://www.dataanswer.top'
headers = {
    'Host': 'www.dataanswer.top',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
    # Optional headers; enable the ones the target site insists on.
    # 'Accept': 'application/json, text/javascript, */*; q=0.01',
    # 'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
    # 'Accept-Encoding': 'gzip,deflate',
    # 'Referer': 'http://www.dataanswer.top'
}

request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
try:
    page = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()
print(page)

二，保存爬取的圖片/視頻和文件和網頁
#圖片/視頻和文件和網頁的地址抓取下來後，利用模塊urllib裏的urlretrieve()方法下載下來：
[html] view plain copy

#coding=utf-8
"""Download one resource into memory and save it under ./doc (Python 2)."""
import urllib
import urllib2
import os


def getPage(url):
    """Return the raw response body of *url*."""
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


url = 'http://www.dataanswer.top/'
result = getPage(url)
file_name = 'test.doc'
file_path = 'doc'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
# Binary mode: the body is raw bytes and may be an image or a video, so it
# must be written without any newline translation.
with open(local, "wb") as f:
    f.write(result)
#coding=utf-8
"""Download one resource straight to disk with urllib.urlretrieve (Python 2)."""
import urllib
import urllib2
import os


def getPage(url):
    """Return the raw response body of *url* (not needed by the download below)."""
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


# Change this address to the image / file / video / page to download.
url = 'http://www.dataanswer.top/'
file_name = 'test.doc'
file_path = 'doc'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
# urlretrieve takes the remote URL first and the local target second; it
# performs the download itself, so no separate fetch is needed beforehand.
urllib.urlretrieve(url, local)

三,普通模擬登錄
[html] view plain copy

#coding=utf-8
"""Log in with a POST request, persist the cookies, then reuse them (Python 2)."""
import urllib
import urllib2
import cookielib

filename = 'cookie.txt'
# MozillaCookieJar keeps the cookies in memory and can write them to `filename`.
cookie = cookielib.MozillaCookieJar(filename)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
postdata = urllib.urlencode({
    'name': '春天裏',
    'pwd': '1222222',
})

# Login URL
loginUrl = 'http://www.dataanswer.top/LoginService?action=tologin'
# Passing a data argument makes this a POST; cookies set by the response
# are captured by the cookie jar attached to the opener.
result = opener.open(loginUrl, postdata)
# Save the cookies to cookie.txt, including session and expired ones.
cookie.save(ignore_discard=True, ignore_expires=True)

# Request another page through the same opener so the cookies are sent along.
gradeUrl = 'http://www.dataanswer.top/LoginService?action=myHome'
result = opener.open(gradeUrl)
print(result.read())

四，處理驗證碼登錄
#先把驗證碼圖片下載下來保存，再人工讀入
[html] view plain copy

#coding=utf-8
"""Log in to a site protected by a captcha (Python 2).

The captcha image is downloaded to v.jpg and the user types the text in by
hand; the answer is then posted together with the login form.
"""
import sys, time, os, re
import urllib, urllib2, cookielib

loginurl = 'https://www.douban.com/accounts/login'
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
params = {
    "form_email": "13161055481",
    "form_password": "wwwwwww",
    "source": "index_nav",  # login does not succeed without this field
}

# Open the login page first.
response = opener.open(loginurl)
print(response.geturl())
# Continue only if we really ended up on the login page.
if response.geturl() == "https://www.douban.com/accounts/login":
    html = response.read()
    print(html)
    # Address of the captcha image (open question from the author: what if
    # the image address is obfuscated?).
    imgurl = re.search('<img id="captcha_image" src="(.+?)" alt="captcha" class="captcha_image"/>', html)
    print(imgurl)
    if imgurl:
        url = imgurl.group(1)
        # Save the captcha image next to the script.
        res = urllib.urlretrieve(url, 'v.jpg')
        # The hidden captcha-id field has to be posted back with the answer.
        captcha = re.search('<input type="hidden" name="captcha-id" value="(.+?)"/>', html)
        if captcha:
            vcode = raw_input('請輸入圖片上的驗證碼：')
            params["captcha-solution"] = vcode
            params["captcha-id"] = captcha.group(1)
            # NOTE(review): this value may have been converted between Chinese
            # scripts when the article was republished; confirm it matches the
            # label the login form actually submits.
            params["user_login"] = "登錄"
            # Submit the form together with the captcha answer.
            response = opener.open(loginurl, urllib.urlencode(params))
            # A successful login redirects to the home page.
            if response.geturl() == "https://www.douban.com/":
                print('login success ! ')
                print('準備進行發帖')
                addtopicurl = "http://www.douban.com/group/python/new_topic"
                res = opener.open(addtopicurl)
                html = res.read()
            else:
                print("Fail3")
        else:
            print("Fail2")
    else:
        print("Fail1")
else:
    print("Fail0")

五，爬取js網站
#利用selenium模擬瀏覽器，結合html的解析
[html] view plain copy

#coding=utf-8
"""Render a JavaScript-driven page in a real browser and print its HTML."""
# Setup:
# 1. install pip:       sudo apt-get install python-pip
# 2. install selenium:  sudo pip install -U selenium
from selenium import webdriver

driver = webdriver.Firefox()
try:
    driver.get('http://www.newsmth.net/nForum/#!article/Intern/206790')
    # page_source is the HTML of the page as currently rendered by the browser.
    html = driver.page_source.encode('utf-8', 'ignore')
    print(html)
finally:
    # quit() ends the whole WebDriver session; it runs even if loading fails.
    driver.quit()

六，全網爬蟲
#廣度優先，模擬爬取隊列
[html] view plain copy

#coding=utf-8
"""
Crawl every link reachable from the seed page, external links included
(breadth-first).
"""
import urllib2
import re
from bs4 import BeautifulSoup
import time
#Time at which the crawl started
t=time.time()
#Number of crawled URLs after which the crawl stops
N_STOP=10
#URLs that have already been crawled
CHECKED_URL=[]
#URLs waiting to be crawled
CHECKING_URL=[]
#URLs whose page fetch failed
FAIL_URL=[]
#URLs that could not be connected to at all
ERROR_URL=[]
#Number of connection attempts allowed per URL
RETRY=3
#Connection timeout handed to urllib2.urlopen
TIMEOUT=20
class url_node:
    """One URL of the crawl together with the page content fetched for it."""

    def __init__(self, url):
        """Remember *url*; the page content stays empty until run() fetches it.

        :param url: String, the URL this node represents
        """
        self.url = url
        self.content = ''

    def __is_connectable(self):
        """Return True if the URL can be opened within RETRY attempts."""
        for _ in range(RETRY):
            try:
                response = urllib2.urlopen(self.url, timeout=TIMEOUT)
            except Exception:
                # Any failure counts as one used-up attempt.
                continue
            response.close()
            return True
        return False

    def get_next(self):
        """Queue every link found in the fetched page that has not been seen yet."""
        soup = BeautifulSoup(self.content)
        # ---- parse whatever else you need from the page at this point ----
        for link in soup.findAll('a'):
            tmp_url = link.get('href')
            # <a> tags without a usable href would otherwise be queued as None.
            if not tmp_url:
                continue
            # The URL is queued without checking that it is actually reachable.
            if tmp_url not in CHECKED_URL and tmp_url not in CHECKING_URL:
                CHECKING_URL.append(tmp_url)

    def run(self):
        """Fetch the page and queue its links, recording failures in the global lists."""
        if not self.url:
            print("所給的初始url有問題！")
            return
        if not self.__is_connectable():
            # Could not connect at all.
            ERROR_URL.append(self.url)
            return
        try:
            # Download the whole page, then harvest its links.
            self.content = urllib2.urlopen(self.url, timeout=TIMEOUT).read()
            self.get_next()
        except Exception:
            # Connected once, but fetching or parsing failed.
            FAIL_URL.append(self.url)
            print('[!]Connect Failed')
if __name__ == '__main__':
    # Seed the queue with the start URL.
    CHECKING_URL.append('http://www.36dsj.com/')
    # NOTE(review): the file is opened (and therefore created/truncated) but
    # nothing is ever written to it; kept as in the original.
    ff = open("Mytest.txt", 'w')
    try:
        i = 0
        # Take URLs from the front of the queue until it is empty. Popping
        # instead of removing inside a for-loop avoids skipping entries.
        while CHECKING_URL:
            url = CHECKING_URL.pop(0)
            # Mark as crawled before fetching so the page cannot re-queue itself.
            CHECKED_URL.append(url)
            url_node(url).run()
            i += 1
            if i == N_STOP:
                # Print the URL we stopped at; it can seed the next run.
                print(url)
                print("爬取過的列表長度：%d" % len(CHECKED_URL))
                print("待爬取的列表長度：%d" % len(CHECKING_URL))
                print("連接失敗的列表長度：%d" % len(FAIL_URL))
                print("不能連接的列表長度：%d" % len(ERROR_URL))
                break
    finally:
        ff.close()
    print("time:%d s" % (time.time() - t))

七，某個網站的站內所有目錄爬蟲
#把縮寫的站內網址還原
[html] view plain copy

#coding=utf-8
"""
Crawl every URL that belongs to a single site; external links are skipped.
"""
import urllib2
import re
from bs4 import BeautifulSoup
import time

# Time at which the crawl started
t = time.time()
# Host name of the site being crawled; assigned before the crawl starts
HOST = ''
# Normalised forms (query values replaced) of URLs already accepted for crawling
CHECKED_URL = []
# URLs discovered so far; the crawl loop iterates over this list as it grows
CHECKING_URL = []
# URLs accepted for crawling, in their original form
RESULT = []
# Number of connection attempts allowed per URL
RETRY = 3
# Connection timeout handed to urllib2.urlopen
TIMEOUT = 20
class url_node:
    """One URL of the site crawl together with the page content fetched for it."""

    def __init__(self, url):
        """Normalise *url*; self.url becomes None when it should not be crawled.

        :param url: String, the URL this node represents
        """
        self.url = self.handle_url(url, is_next_url=False)
        self.next_url = []
        self.content = ''

    def handle_url(self, url, is_next_url=True):
        """Bring *url* into canonical absolute form and register it.

        With is_next_url=True the URL is a link found on a page and is added
        to CHECKING_URL. With is_next_url=False it is the URL about to be
        crawled: it is recorded in CHECKED_URL/RESULT unless a URL differing
        only in its query values was already recorded.

        Returns the canonical URL, or None when the link is unusable, points
        to another host, or has already been checked.
        """
        # <a> tags without href, in-page anchors and non-HTTP pseudo links
        # cannot be crawled.
        if not url or url.startswith(('#', 'javascript:', 'mailto:')):
            return None
        # Drop one trailing '/'.
        if url.endswith('/'):
            url = url[:-1]
        if HOST not in url:
            if url.startswith('http'):
                # Absolute URL on a different host: external link.
                return None
            # Relative link: prefix it with the site's host.
            if url.startswith('/'):
                url = 'http://' + HOST + url
            else:
                url = 'http://' + HOST + '/' + url
        elif not url.startswith('http'):
            url = 'http://' + url
        if is_next_url:
            # Link for a later round: queue it once.
            if url not in CHECKING_URL:
                CHECKING_URL.append(url)
        else:
            # Replace every query value by 1 so that URLs differing only in
            # their parameter values are crawled a single time.
            rule = re.compile(r'=.*?\&|=.*?$')
            result = re.sub(rule, '=1&', url)
            if result in CHECKED_URL:
                # Already crawled: report "nothing to do" so run() skips it.
                return None
            CHECKED_URL.append(result)
            RESULT.append(url)
        return url

    def __is_connectable(self):
        """Return True if the URL can be opened within RETRY attempts."""
        print("進入__is_connectable()函數")
        for _ in range(RETRY):
            try:
                response = urllib2.urlopen(self.url, timeout=TIMEOUT)
            except Exception:
                # Any failure counts as one used-up attempt.
                continue
            response.close()
            return True
        return False

    def get_next(self):
        """Register every link found in the fetched page."""
        soup = BeautifulSoup(self.content)
        for link in soup.findAll('a'):
            self.handle_url(link.get('href'))

    def run(self):
        """Fetch the page and register its links; do nothing for a skipped URL."""
        if not self.url:
            return
        if not self.__is_connectable():
            return
        try:
            self.content = urllib2.urlopen(self.url, timeout=TIMEOUT).read()
            self.get_next()
        except Exception:
            print('[!]Connect Failed')
# Alternative entry point that also accepts start URLs beginning with https
class Poc:
    """Derive HOST from a start URL and crawl the site from there."""

    def run(self, url):
        """Validate *url*, set the global HOST from it and crawl the queue."""
        global HOST
        url = check_url(url)
        # Strip the scheme: 'https://' is 8 characters, 'http://' is 7.
        if url.startswith('https'):
            HOST = url[8:]
        else:
            HOST = url[7:]
        # Seed the queue; without this the loop below has nothing to visit.
        if url not in CHECKING_URL:
            CHECKING_URL.append(url)
        # The list grows while it is iterated: links registered by each page
        # are appended and visited in later iterations.
        for url in CHECKING_URL:
            print(url)
            url_node(url).run()
def check_url(url):
    """Return *url* in absolute form once it has been opened successfully.

    'http://' is prepended when no scheme is present and one trailing '/' is
    removed. The URL is tried up to RETRY times.

    Raises:
        Exception: "Connect error" when every attempt fails.
    """
    if not url.startswith('http'):
        url = 'http://' + url
    if url.endswith('/'):
        url = url[:-1]
    for _ in range(RETRY):
        try:
            response = urllib2.urlopen(url, timeout=TIMEOUT)
        except Exception:
            # Try again until the attempts are used up.
            continue
        response.close()
        return url
    raise Exception("Connect error")
if __name__ == '__main__':
    HOST = 'www.dataanswer.com'
    CHECKING_URL.append('http://www.dataanswer.com/')
    # The with-block closes the output file even if the crawl raises.
    with open('36大數據', 'w') as f:
        # The list grows while it is iterated: links registered by each page
        # are appended and visited in later iterations.
        for url in CHECKING_URL:
            f.write(url + '\n')
            print(url)
            url_node(url).run()
    print(RESULT)
    print("URL num:" + str(len(RESULT)))
    print("time:%d s" % (time.time() - t))

八，多線程
#隊列和線程的結合
[html] view plain copy

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
A simple multi-threaded Python crawler that collects the titles of all
movies in the Douban Top 250 list.
"""
import urllib2, re, string
import threading, Queue, time
import sys

# Python 2 only: switch the interpreter's default encoding to UTF-8.
# Presumably needed because pages are decoded to unicode and later written
# to a plain file, which relies on implicit encoding.
reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []  # one list of titles per processed page
FILE_LOCK = threading.Lock()  # not used by the code shown in this script
SHARE_Q = Queue.Queue()  # task queue without a size limit
_WORKER_THREAD_NUM = 3  # number of worker threads
class MyThread(threading.Thread):
    """Thread that runs the callable handed to its constructor."""

    def __init__(self, func):
        """Store *func*, a callable taking no arguments, as the thread body."""
        super(MyThread, self).__init__()  # initialise the Thread base class
        self.func = func

    def run(self):
        """Invoke the stored callable; called by start() in the new thread."""
        self.func()
def worker():
    """Process URLs from SHARE_Q until the queue is empty.

    Each URL is fetched with get_page and its titles are extracted with
    find_title.
    """
    while True:
        try:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() can hang when another thread takes the last task in between.
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            break
        try:
            my_page = get_page(url)
            find_title(my_page)  # collect the movie titles of this page
            time.sleep(1)  # pause between requests
        finally:
            # Always mark the task done so SHARE_Q.join() cannot block forever.
            SHARE_Q.task_done()
def get_page(url):
    """Fetch the HTML of *url*.

    Args:
        url: address of the page to fetch

    Returns:
        The page decoded from UTF-8 as unicode text, or an empty unicode
        string when the request fails with a URLError (the error is printed).
    """
    try:
        return urllib2.urlopen(url).read().decode("utf-8")
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach a server. Please check your url and read the Reason")
            print("Reason: %s" % e.reason)
    # Reached only after an error: return empty text instead of referencing
    # a variable that was never assigned.
    return u""
def find_title(my_page):
    """Extract the movie titles from one page and append them to _DATA.

    Args:
        my_page: HTML text of the page to search

    Every <span class="title"> whose text contains no space is kept; the
    resulting list (possibly empty) is appended to _DATA as one entry.
    """
    movie_items = re.findall(r'<span.*?class="title">(.*?)</span>', my_page, re.S)
    # NOTE(review): the filter string may originally have been an HTML entity
    # that was rendered as a space when the article was republished; confirm.
    temp_data = [item for item in movie_items if item.find(" ") == -1]
    _DATA.append(temp_data)
def main():
    """Queue the ten list pages, crawl them with worker threads, save the titles."""
    threads = []
    douban_url = "http://movie.douban.com/top250?start={page}&filter=&type="
    # Fill the queue up front; a real crawler would keep feeding it tasks.
    for index in xrange(10):
        SHARE_Q.put(douban_url.format(page=index * 25))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()  # the thread starts taking tasks immediately
        print("第%s個線程開始工作" % i)
        threads.append(thread)
    for thread in threads:
        thread.join()
    # Wait until every queued task has been marked done.
    SHARE_Q.join()
    with open("movie.txt", "w+") as my_file:
        for page in _DATA:
            for movie_name in page:
                my_file.write(movie_name + "\n")
    print("Spider Successful!!!")


if __name__ == '__main__':
    main()

九,爬蟲框架Scrapy

items.py：用來定義需要保存的變量，其中的變量用Field來定義，有點像python的字典
pipelines.py：用來將提取出來的Item進行處理，處理過程按自己需要進行定義
spiders：定義自己的爬蟲

爬蟲的類型也有好幾種：
　　1）spider:最基本的爬蟲，其他的爬蟲一般是繼承了該最基本的爬蟲類，提供訪問url，返回response的功能，會默認調用parse方法
　　2）CrawlSpider：繼承spider的爬蟲，實際使用比較多，設定rule規則進行網頁的跟進與處理。注意點：編寫爬蟲的規則時避免使用parse作為回調方法名，因為這會覆蓋繼承自spider的parse方法而造成錯誤。其中比較重要的是Rule規則的編寫，要對具體網頁的情況進行分析。
　　3）XMLFeedSpider 與 CSVFeedSpider

(1)打開命令行，執行：scrapy startproject tutorial（項目名稱）
(2)scrapy.cfg是項目的配置文件,用戶自己寫的spider要放在spiders目錄下面
(3)解析：name屬性很重要，不同spider不能使用相同的name
start_urls是spider抓取網頁的起始點，可以包括多個url
parse方法是spider抓到一個網頁以後默認調用的callback，避免使用這個名字來定義自己的方法。
當spider拿到url的內容以後，會調用parse方法，並且傳遞一個response參數給它，response包含了抓到的網頁的內容，在parse方法裏，你可以從抓到的網頁裏面解析數據。
(3)開始抓取，進入生成的項目根目錄tutorial/，執行 scrapy crawl dmoz， dmoz是spider的name。
(4)保存對象：在items.py中添加一些類，這些類用來描述我們要保存的數據

from scrapy.item import Item, Field


class DmozItem(Item):
    """Container for the data saved per scraped entry."""

    # Each Field() declares one value the spider may store on the item.
    title = Field()
    link = Field()
    desc = Field()
（5）執行scrapy crawl dmoz --set FEED_URI=items.json --set FEED_FORMAT=json後得到保存的文件
（6）讓scrapy自動抓取網頁上的所有鏈接

在parse方法裏面提取我們需要的鏈接，然後構造一些Request對象，並且把他們返回，scrapy會自動的去抓取這些鏈接

最全Python爬蟲總結(轉載)

其中網頁 -i 變量 oba cati nod style 應該 [html] view plain copy 最近總是要爬取一些東西，索性就把Python爬蟲的相關內容都總結起來了，自己多動手還是好。（1）普通的內容爬取（2）保存爬取的圖片/

最全Python爬蟲總結(轉載)

[html] view plain copy

最全Python爬蟲總結(轉載)

2019最全Python爬蟲教程+書籍分享

最全反爬蟲技術介紹

python爬蟲總結

最全python全棧工程師視頻教程

史上最全Python從入門到資深書籍資料分享！

python 爬蟲總結

深度學習最全優化方法總結比較（SGD，Adagrad，Adadelta，Adam，Adamax，Nadam）

某課最全Python flask構建微信小程式訂餐系統

史上最全Python資料型別詳解

史上最全Python資料分析學習路徑圖

史上最全python面試題詳解（二）（附帶詳細答案（關注、持續更新））

史上最全python面試題詳解（三）（附帶詳細答案（關注、持續更新））

python爬蟲總結: 網頁內容需要分類爬取

整理的最全 python常見面試題（基本必考）

AJAX跨域最全解決方案（轉載）

史上最全Python基礎合集！集合用法、檔案操作字元轉換、函式

整理的最全 python常見面試題（基本必考）① ②③④⑤⑥⑦⑧⑨⑩

史上最全 python常見面試題

這應該是你見過的最全前端下載總結

最全Python爬蟲總結(轉載)

[html] view plain copy

相關推薦