2018/10/06 Python 3 Scrapy crawler basics
Crawl Douban and store the data in MongoDB.
Create a new Scrapy project: scrapy startproject doubantest
Generate a basic spider: scrapy genspider doubanmovie "movie.douban.com"
Now for the code.
First, define the fields you want to extract in items.py:
import scrapy


class DoubantestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # movie title
    title = scrapy.Field()
    # rating
    score = scrapy.Field()
    # leading actors
    star = scrapy.Field()
    # one-line description
    info = scrapy.Field()
Next, write the spider:
import scrapy
# we defined the DoubantestItem class in doubantest/items.py; import it here
from doubantest.items import DoubantestItem


class DoubanmovieSpider(scrapy.Spider):
    # spider name
    name = 'doubanmovie'
    # allowed domains
    allowed_domains = ['movie.douban.com']
    start = 0
    url = 'https://movie.douban.com/top250?start='
    # the first URL to crawl
    start_urls = [url + str(start)]

    # a basic spider must implement parse()
    def parse(self, response):
        # pack the scraped data into a DoubantestItem object
        item = DoubantestItem()
        # match the root node of every movie entry
        movies = response.xpath('//div[@class="info"]')
        for each in movies:
            # extract the wanted fields from each root node;
            # xpath() returns a selector list, and extract()
            # turns it into a list of unicode strings
            item['title'] = each.xpath('.//span[@class="title"]/text()').extract()[0]
            item['score'] = each.xpath('.//span[@class="rating_num"]/text()').extract()[0]
            item['star'] = each.xpath('.//p[@class=""]/text()').extract()[0]
            item['info'] = each.xpath('.//span[@class="inq"]/text()').extract()[0]
            # hand the item over to the pipelines
            yield item
        # the last page of the Top 250 starts at 225
        if self.start < 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start), callback=self.parse)
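Before running the whole spider, you can sanity-check the XPath expressions interactively in the Scrapy shell (this assumes the site lets the default User-Agent through; otherwise pass one with -s USER_AGENT=...):

scrapy shell "https://movie.douban.com/top250?start=0"
>>> movies = response.xpath('//div[@class="info"]')
>>> movies[0].xpath('.//span[@class="title"]/text()').extract()[0]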
Modify settings.py:
BOT_NAME = 'doubantest'

SPIDER_MODULES = ['doubantest.spiders']
NEWSPIDER_MODULE = 'doubantest.spiders'

# the robots.txt protocol; just set it to False here
ROBOTSTXT_OBEY = False

# disable cookies
COOKIES_ENABLED = False

# enable the item pipeline
ITEM_PIPELINES = {
    'doubantest.pipelines.DoubantestPipeline': 300,
}

# MongoDB settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'Douban'
MONGODB_DOCNAME = 'Doubanmovie'
Write the pipeline file, pipelines.py:
import pymongo
from scrapy.utils.project import get_project_settings


class DoubantestPipeline(object):
    def __init__(self):
        # read the MongoDB settings
        # (scrapy.conf is deprecated; get_project_settings() replaces it)
        settings = get_project_settings()
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        docname = settings['MONGODB_DOCNAME']
        # connect to the MongoDB server
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        mdb = client[dbname]
        # select the collection
        self.post = mdb[docname]

    def process_item(self, item, spider):
        # save the data to the database
        # (insert() is deprecated in pymongo 3; use insert_one())
        self.post.insert_one(dict(item))
        # process_item() must return the item
        return item
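Reading the settings in __init__ works, but Scrapy also provides the from_crawler() hook, which hands a pipeline the running crawler's settings directly. A minimal sketch of that variant, using the same setting names as above (process_item() stays unchanged):

class DoubantestPipeline(object):
    def __init__(self, host, port, dbname, docname):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[dbname][docname]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, so the
        # pipeline reads its settings instead of the globals
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s.getint('MONGODB_PORT'),
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])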
If you run this program from PyCharm, you also need to write a small launcher script, start.py:
from scrapy import cmdline
cmdline.execute("scrapy crawl doubanmovie".split())
With the code above, the scraped data can now be saved to the local MongoDB database.
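To double-check that the documents actually arrived, you can run a quick pymongo query against the database and collection names configured in settings.py above:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['Douban']['Doubanmovie']
# number of stored movies, plus one sample document
# (count_documents() needs pymongo >= 3.7)
print(collection.count_documents({}))
print(collection.find_one())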
Many websites have anti-crawling measures in place, and most of them block crawlers based on the User-Agent header and the client IP. So next we counter that by rotating the User-Agent and the IP with downloader middlewares.
# first register your own downloader middleware classes in settings.py,
# together with the User-Agent list and the proxy IP pool
DOWNLOADER_MIDDLEWARES = {
    'doubantest.middlewares.RandomUserAgent': 100,
    'doubantest.middlewares.RandomProxy': 200,
}
USER_AGENTS = [
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]
PROXIES = [
{"ip_port":"106.75.164.15:3128","user_passwd":""},
{"ip_port":"61.135.217.7:80","user_passwd":""},
{"ip_port":"118.190.95.35:9001","user_passwd":""}
]
Write the downloader middlewares in middlewares.py:
import random
import base64

from doubantest.settings import USER_AGENTS
from doubantest.settings import PROXIES


# pick a random User-Agent for every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", useragent)


# pick a random proxy for every request
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if not proxy['user_passwd']:
            # a proxy without account authentication
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # base64-encode the credentials
            # (b64encode() wants bytes on Python 3)
            base64_userpasswd = base64.b64encode(
                proxy['user_passwd'].encode()).decode()
            # pass them in the header format the proxy server expects
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = "http://" + proxy['ip_port']
Why does the HTTP proxy use base64 encoding?
The principle of an HTTP proxy is simple: the client connects to the proxy server over HTTP, and the request carries the IP and port of the remote host to connect to; if authentication is needed, authorization information is added as well. The proxy server first verifies the credentials, then establishes the connection to the remote host, and on success returns 200 to the client to signal that the tunnel is ready. The concrete request format looks like this:
CONNECT 59.64.128.198:21 HTTP/1.1
Host: 59.64.128.198:21
Proxy-Authorization: Basic bGV2I1TU5OTIz
User-Agent: OpenFetion
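The value after "Basic" is just base64("user:password"). A quick illustration in Python (the credentials here are made up):

import base64

# hypothetical credentials, for illustration only
userpasswd = "user:password"
token = base64.b64encode(userpasswd.encode()).decode()
print('Proxy-Authorization: Basic ' + token)
# Proxy-Authorization: Basic dXNlcjpwYXNzd29yZA==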