Python crawler: saving the Douban TOP250 movie posters and renaming the files
1. Spider code: note that the XPaths for title and star differ from the one for the poster — the first two live under the info div, while the image lives under the pic div. The for loop walks the item divs; for each item (one movie) it extracts the title, star rating, and image URL, then yields the item once so the pipelines can process it. After the items are done, the spider picks up the next-page link and requests it with parse as the callback again:
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class Douban250Spider(scrapy.Spider):
    name = 'douban250'
    # allowed_domains = ['https://movie.douban.com/']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        for sel in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            item['title'] = sel.xpath('div[@class="info"]/div[@class="hd"]/a/span/text()').extract()[0]
            item['star'] = sel.xpath('div[@class="info"]/div[@class="bd"]/div[@class="star"]'
                                     '/span[@class="rating_num"]/text()').extract()[0]
            item['image_urls'] = sel.xpath('div[@class="pic"]/a/img/@src').extract()
            yield item

        # the "next" link lives outside the item divs, so query it on the
        # response after the loop; extract_first() returns None on the last
        # page instead of raising IndexError
        nextPage = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').extract_first()
        if nextPage:
            next_url = 'https://movie.douban.com/top250' + nextPage.strip()
            yield scrapy.http.Request(next_url, callback=self.parse, dont_filter=True)
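Assuming the standard Scrapy project layout (a project named douban, with this spider under douban/spiders/), the crawl is started from the project root with:

scrapy crawl douban250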
2. settings file: register the pipelines — there are two, one for the text and one for the images — and set a random User-Agent:
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

FEED_EXPORT_ENCODING = 'utf-8'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

UA = random.choice(user_agent_list)
USER_AGENT = UA

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

IMAGES_STORE = 'D:\\python project\\douban\\images'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.DoubanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 100,
    'douban.pipelines.SaveNameScore': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
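Because random.choice runs once when the settings module is imported, the whole crawl reuses a single User-Agent. For a fresh User-Agent per request, a minimal downloader-middleware sketch (a hypothetical RandomUserAgentMiddleware in douban/middlewares.py, not part of the original post) could look like this:

import random


class RandomUserAgentMiddleware(object):
    # hypothetical addition: picks a new User-Agent for every outgoing request

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # assumes the pool is exposed as a setting, e.g.
        # USER_AGENT_LIST = user_agent_list in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

It would be enabled with DOWNLOADER_MIDDLEWARES = {'douban.middlewares.RandomUserAgentMiddleware': 400} in settings.py.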
3. Pipelines file: define the two pipelines, one for text and one for images. The image pipeline subclasses ImagesPipeline: get_media_requests is overridden to turn each image URL into a Request, and file_path is overridden so that each movie's title and rating become the file name. Note that ImagesPipeline needs PIL support, so install Pillow.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline  # the old scrapy.contrib path has been removed
from scrapy.exceptions import DropItem


class DoubanPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # one download request per poster URL; the item rides along in meta
        for image_url in item['image_urls']:
            yield Request(url=image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # recover the item passed through meta above
        item = request.meta['item']
        # "<title>_<star>.<suffix>"; the split yields the extension (jpg, png, ...)
        image_guid = item['title'] + '_' + item['star'] + '.' + request.url.split('/')[-1].split('.')[-1]
        # the returned path is relative to IMAGES_STORE
        filename = u'full/{0}'.format(image_guid)
        return filename

    def item_completed(self, results, item, info):
        # drop items whose poster failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
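For reference, the results argument that Scrapy passes to item_completed is a list of (success, info) tuples, one per requested image; for a successfully downloaded poster it looks roughly like this (URL and checksum values are illustrative):

results = [(True, {
    'url': 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg',
    'path': 'full/肖申克的救贖_9.7.jpg',  # whatever file_path returned
    'checksum': '2b00042f7481c7b056c4b410d28f33cf',
})]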
For the title-and-rating pipeline, write both to a text file. Since DoubanPipeline runs first (priority 100 vs. 200 in ITEM_PIPELINES), an item dropped there for missing images never reaches SaveNameScore:
class SaveNameScore(object):
    def __init__(self):
        self.file = open('douban_top250.txt', mode='w')

    def process_item(self, item, spider):
        line = 'The top250 movie list:'
        title = item['title']
        star = item['star']
        line = line + ' ' + title + ' '
        line = line + star + '\n'
        self.file.write(line)
        return item  # pipelines must return the item for later stages

    def close_spider(self, spider):
        self.file.close()
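Each movie then becomes one line of douban_top250.txt, for example (title and rating are illustrative):

The top250 movie list: 肖申克的救贖 9.7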
The items file is written like this:

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    star = scrapy.Field()
    image_urls = scrapy.Field()  # ImagesPipeline reads the poster URLs from here
    images = scrapy.Field()      # ...and records the download results here

Result:
Source code: https://github.com/xzxin/douban_scrapy