Batch-crawling Weiyi gallery (mmonly.cc) images with Python + Scrapy, saved by series
阿新 · Published: 2019-01-29
Life is short, I use Python!
I taught myself Scrapy in my spare time, and my skills are limited, so please point out anything I got wrong.
Development and runtime environment
CentOS Linux release 7.4.1708 + PyCharm 2018.1.3
Python 2.7.5 + Scrapy 1.5.0
I won't repeat how to install the development and runtime environments here. Scrapy is a very powerful framework; this example uses only a small part of what it can do.
The good stuff
MMspider.py
The spider's main parsing program. For details on parsing the site's HTML and on XPath syntax, please Baidu it yourself, or leave a comment.
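If you want to try the XPath expressions yourself before reading the spider, Scrapy's interactive shell is handy. A quick illustrative session against one of the album detail pages (substitute a real URL; the selectors mirror the ones used below):

# Start the shell from the project directory:
#   scrapy shell 'http://www.mmonly.cc/mmtp/<some-album-page>.html'
>>> response.xpath('//h1/text()').extract_first()                      # series title
>>> response.xpath('//a[@class="down-btn"]/@href').extract_first()     # full-size image URL
>>> response.xpath('//span[@class="nowpage"]/text()').extract_first()  # image number within the series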
# -*- coding: utf-8 -*-
import os
import datetime

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from mmonly.items import mmonlyItem


class Myspider(CrawlSpider):
    name = 'mmspider'
    base = r'/home/yinchong/Downloads/mmtp/'  # base directory for saved images
    allowed_domains = ['mmonly.cc']
    start_urls = [
        'http://www.mmonly.cc/mmtp/',
    ]

    # Crawl rules for the list pages: keep following "next page" ('下一頁')
    # links to dig deeper; every album link that matches is handed to
    # parse_item to extract the full-size image URL.
    rules = (
        Rule(LinkExtractor(allow=(''),
                           restrict_xpaths=(u"//a[contains(text(),'下一頁')]")),
             follow=True),
        Rule(LinkExtractor(allow=('http://www.mmonly.cc/(.*?).html'),
                           restrict_xpaths=(u"//div[@class='ABox']")),
             callback="parse_item", follow=False),
    )

    def parse_item(self, response):
        item = mmonlyItem()
        item['siteURL'] = response.url
        item['title'] = response.xpath('//h1/text()').extract_first()  # series title
        item['path'] = self.base + item['title']  # one directory per series
        path = item['path']
        if not os.path.exists(path):
            os.makedirs(path)  # create the directory if it does not exist yet
        # Full-size image URL
        item['detailURL'] = response.xpath('//a[@class="down-btn"]/@href').extract_first()
        print item['detailURL']
        # Number of this image within the series
        num = response.xpath('//span[@class="nowpage"]/text()').extract_first()
        item['fileName'] = item['path'] + '/' + str(num) + '.jpg'  # full file name
        print datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), item['fileName'], u'parsed successfully!'
        yield item
        # If the detail page we were handed has a next page itself, keep
        # feeding it back into parse_item.
        next_page = response.xpath(u"//a[contains(text(),'下一頁')]/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse_item)
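One caveat about parse_item above: the series title is used verbatim as a directory name, so a title containing a path-hostile character such as '/' would trip up os.makedirs. A small hypothetical helper (my own addition, not in the original spider) could sanitize it first; parse_item would then build the path as self.base + safe_dirname(item['title']):

# Hypothetical helper, not part of the original spider: replace characters
# that are awkward in file system paths before using a title as a directory.
import re

def safe_dirname(title):
    return re.sub(r'[\\/:*?"<>|\s]+', '_', title or u'untitled').strip('_')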
items.py
# -*- coding: utf-8 -*-
import scrapy
class mmonlyItem(scrapy.Item):
    siteURL = scrapy.Field()    # URL of the page the image was found on
    detailURL = scrapy.Field()  # URL of the original full-size image
    title = scrapy.Field()      # name of the image series
    fileName = scrapy.Field()   # full save path of the image file
    path = scrapy.Field()       # save directory of the series
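A hedged aside (not from the original post): a scrapy.Item behaves much like a dict, which is why both the spider and the pipeline can read and write its fields with subscript syntax:

>>> item = mmonlyItem(title=u'some-series')  # only declared fields are allowed
>>> item['title']
u'some-series'
>>> item['path'] = u'/home/yinchong/Downloads/mmtp/' + item['title']
>>> dict(item)
{'path': u'/home/yinchong/Downloads/mmtp/some-series', 'title': u'some-series'}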
pipelines.py
The download handler.
# -*- coding: utf-8 -*-
import datetime

import requests


class mmonlyPipeline(object):
    def process_item(self, item, spider):
        count = 0
        detailURL = item['detailURL']
        fileName = item['fileName']
        while True:
            try:
                print datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), u'saving image:', detailURL
                print u'file:', fileName
                # Download the full-size image from the URL parsed into the
                # item; the timeout keeps a stalled download from hanging.
                image = requests.get(detailURL, timeout=30)
                with open(fileName, 'wb') as f:
                    f.write(image.content)  # write the image bytes to disk
            except Exception, e:
                print fileName, 'other fault:', e
                count += 1
                if count >= 3:  # give up after a few failures rather than retrying forever
                    break
            else:
                print datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), fileName, u'saved successfully!'
                break
        return item
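As a design note: Scrapy also ships its own ImagesPipeline (it requires Pillow), which already handles downloading, retries, and deduplication. Below is a sketch of how this project might use it instead, overriding file_path to keep the per-series layout; the class name and wiring are my own illustration, not part of the original code:

# -*- coding: utf-8 -*-
# Hypothetical alternative to the hand-rolled pipeline above. settings.py
# would need, e.g.:
#   ITEM_PIPELINES = {'mmonly.pipelines.SeriesImagesPipeline': 100}
#   IMAGES_STORE = '/home/yinchong/Downloads/mmtp/'
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class SeriesImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Attach the item so file_path can read the series title.
        yield scrapy.Request(item['detailURL'], meta={'item': item})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        # Mirror the original layout: one directory per series.
        return u'%s/%s' % (item['title'], request.url.split('/')[-1])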
settings.py
Scrapy settings. Since the target site's anti-scraping measures are lax, no random User-Agent rotation or IP proxies are used.
# -*- coding: utf-8 -*-
# Scrapy settings for the mmonly project

BOT_NAME = 'mmonly'

SPIDER_MODULES = ['mmonly.spiders']
NEWSPIDER_MODULE = 'mmonly.spiders'

FEED_EXPORT_ENCODING = 'utf-8'
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests (the default is 16)
CONCURRENT_REQUESTS = 32

# Download delay
# DOWNLOAD_DELAY = 0.1

COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

ITEM_PIPELINES = {'mmonly.pipelines.mmonlyPipeline': 100}

# Log level and log file
LOG_LEVEL = 'INFO'
LOG_FILE = '/tmp/log.txt'
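The post skips User-Agent rotation since the site tolerates the default headers. For reference, here is a minimal sketch of what such a downloader middleware could look like; the class name and UA list are my own illustration:

# Hypothetical middleware: rotate the User-Agent on every request.
# Enable in settings.py with, e.g.:
#   DOWNLOADER_MIDDLEWARES = {'mmonly.middlewares.RandomUserAgentMiddleware': 400}
import random


class RandomUserAgentMiddleware(object):
    # Illustrative list; a real deployment would carry many more entries.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)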
main.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'mmspider'])
Run scrapy crawl mmspider on the command line, or python main.py.
Or run main.py from PyCharm.
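main.py simply shells out to the scrapy CLI. A roughly equivalent programmatic entry point, sketched with Scrapy's CrawlerProcess API, would be:

# Hypothetical alternative to main.py using CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl('mmspider')  # spider name as defined in MMspider.py
process.start()            # blocks until the crawl finishes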
Results
Running main.py starts the crawl. It fetched over 200,000 images in total, so expect a very long run time and a lot of disk space; be prepared. If you have a good multi-threading scheme, feel free to leave a comment and discuss.
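On that closing question: Scrapy is already asynchronous on top of Twisted, so throughput is usually tuned through its concurrency settings rather than by adding threads. The values below are illustrative, not benchmarked:

# Possible settings.py tuning; numbers are examples only.
CONCURRENT_REQUESTS = 64             # global cap on in-flight requests
CONCURRENT_REQUESTS_PER_DOMAIN = 32  # per-domain cap (Scrapy's default is 8)
REACTOR_THREADPOOL_MAXSIZE = 20      # Twisted thread pool (DNS lookups etc.)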