Python crawler: saving the Douban TOP250 movie posters and renaming the files
1. Spider code: note that the XPaths for title and star differ from the one for the poster — the first two live under the info div, while the image lives under the pic div. The for loop walks the item divs; for each item (one movie) it extracts the title, star rating, and image URL, then yields the item once so the pipelines can process it. After the items are done, the spider picks up the next-page link and requests it with parse as the callback again:
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class Douban250Spider(scrapy.Spider):
    name = 'douban250'
    # allowed_domains = ['https://movie.douban.com/']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        for sel in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            item['title'] = sel.xpath('div[@class="info"]/div[@class="hd"]/a/span/text()').extract()[0]
            item['star'] = sel.xpath('div[@class="info"]/div[@class="bd"]/div[@class="star"]'
                                     '/span[@class="rating_num"]/text()').extract()[0]
            item['image_urls'] = sel.xpath('div[@class="pic"]/a/img/@src').extract()
            yield item

        # the "next" link lives outside the item divs, so query it on the
        # response after the loop; extract_first() returns None on the last
        # page instead of raising IndexError
        nextPage = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').extract_first()
        if nextPage:
            next_url = 'https://movie.douban.com/top250' + nextPage.strip()
            yield scrapy.http.Request(next_url, callback=self.parse, dont_filter=True)
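Assuming the standard Scrapy project layout (a project named douban, with this spider under douban/spiders/), the crawl is started from the project root with:

scrapy crawl douban250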
2. settings file: register the pipelines — there are two, one for the text and one for the images — and set a random User-Agent:
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

FEED_EXPORT_ENCODING = 'utf-8'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

UA = random.choice(user_agent_list)
USER_AGENT = UA

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

IMAGES_STORE = 'D:\\python project\\douban\\images'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.DoubanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 100,
    'douban.pipelines.SaveNameScore': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
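Because random.choice runs once when the settings module is imported, the whole crawl reuses a single User-Agent. For a fresh User-Agent per request, a minimal downloader-middleware sketch (a hypothetical RandomUserAgentMiddleware in douban/middlewares.py, not part of the original post) could look like this:

import random


class RandomUserAgentMiddleware(object):
    # hypothetical addition: picks a new User-Agent for every outgoing request

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # assumes the pool is exposed as a setting, e.g.
        # USER_AGENT_LIST = user_agent_list in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

It would be enabled with DOWNLOADER_MIDDLEWARES = {'douban.middlewares.RandomUserAgentMiddleware': 400} in settings.py.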
3. Pipelines file: define the two pipelines, one for text and one for images. The image pipeline subclasses ImagesPipeline: get_media_requests is overridden to turn each image URL into a Request, and file_path is overridden so that each movie's title and rating become the file name. Note that ImagesPipeline needs PIL support, so install Pillow.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline  # the old scrapy.contrib path has been removed
from scrapy.exceptions import DropItem


class DoubanPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # one download request per poster URL; the item rides along in meta
        for image_url in item['image_urls']:
            yield Request(url=image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # recover the item passed through meta above
        item = request.meta['item']
        # "<title>_<star>.<suffix>"; the split yields the extension (jpg, png, ...)
        image_guid = item['title'] + '_' + item['star'] + '.' + request.url.split('/')[-1].split('.')[-1]
        # the returned path is relative to IMAGES_STORE
        filename = u'full/{0}'.format(image_guid)
        return filename

    def item_completed(self, results, item, info):
        # drop items whose poster failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
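For reference, the results argument that Scrapy passes to item_completed is a list of (success, info) tuples, one per requested image; for a successfully downloaded poster it looks roughly like this (URL and checksum values are illustrative):

results = [(True, {
    'url': 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg',
    'path': 'full/肖申克的救贖_9.7.jpg',  # whatever file_path returned
    'checksum': '2b00042f7481c7b056c4b410d28f33cf',
})]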
For the title-and-rating pipeline, write both to a text file. Since DoubanPipeline runs first (priority 100 vs. 200 in ITEM_PIPELINES), an item dropped there for missing images never reaches SaveNameScore:
class SaveNameScore(object):
    def __init__(self):
        self.file = open('douban_top250.txt', mode='w')

    def process_item(self, item, spider):
        line = 'The top250 movie list:'
        title = item['title']
        star = item['star']
        line = line + ' ' + title + ' '
        line = line + star + '\n'
        self.file.write(line)
        return item  # pipelines must return the item for later stages

    def close_spider(self, spider):
        self.file.close()
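Each movie then becomes one line of douban_top250.txt, for example (title and rating are illustrative):

The top250 movie list: 肖申克的救贖 9.7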
The items file is written like this:

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    star = scrapy.Field()
    image_urls = scrapy.Field()  # ImagesPipeline reads the poster URLs from here
    images = scrapy.Field()      # ...and records the download results here

Result:
Source code: https://github.com/xzxin/douban_scrapy