Python crawler (16): crawling the Dingdian novel site with the Scrapy framework
阿新 · Published: 2019-01-03
This post uses the Scrapy framework to crawl the novels on the entire Dingdian novel site (23us.com).
1. Installing Scrapy
There are plenty of installation tutorials online, so I won't repeat them here.
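In most environments a plain pip install is enough (assuming pip belongs to the interpreter you plan to run the spider with):
pip install scrapy
scrapy version    # quick check that the install worked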
2. About Scrapy
Scrapy is an excellent framework: it crawls asynchronously, which saves a great deal of time. Strictly speaking, this crawl could also have been done with the plain sequential approach used in earlier posts in this series, but that felt far too slow given the amount of data involved (see the settings sketch below).
For details of the framework itself, please look for examples online as well.
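The speed-up mainly comes from Scrapy keeping many requests in flight at once. How aggressive it is can be tuned in settings.py; the values below are only illustrative, not the ones used in this project:
# settings.py – illustrative values only
CONCURRENT_REQUESTS = 32             # requests Scrapy keeps in flight at the same time
CONCURRENT_REQUESTS_PER_DOMAIN = 16
DOWNLOAD_DELAY = 0.25                # small pause between requests to the same site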
3. The implementation
Create the project with
scrapy startproject dingdian
then add the spider file; the final layout of the code looks like this:
├── dingdian
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mydingdian.py
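spiders/mydingdian.py is the file added by hand; if you prefer, Scrapy can also generate a skeleton for it (the domain argument below is simply the site crawled in this post):
scrapy genspider mydingdian 23us.com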
The main program:
mydingdian.py
#coding:utf-8
import scrapy
import re
from scrapy.http import Request
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem


class Myspider(scrapy.Spider):
    name = "dingdian"
    allowed_domains = ["23us.com"]
    bash_url = "http://www.23us.com/class/"
    bashurl = '.html'

    def start_requests(self):
        #for i in range(1,11):
        for i in range(7,8):
            url = self.bash_url+str(i)+"_1"+self.bashurl
            yield Request(url,self.parse)

    def parse(self, response):
        baseurl = response.url  # URL of the form http://www.23us.com/class/*_1.html
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # highest page number of the current category
        print max_num
        baseurl = baseurl[:-7]
        #for num in xrange(1,int(max_num)+1):
        for num in xrange(1,3):
            newurl = baseurl+"_"+str(num)+self.bashurl
            # The behaviour differs with and without dont_filter: with dont_filter the first page
            # gets crawled as well, without it it does not. Scrapy de-duplicates request URLs
            # (RFPDupeFilter); dont_filter tells it not to de-duplicate this URL.
            yield Request(newurl,dont_filter=True,callback=self.get_name)  # hand the new listing page to get_name

    def get_name(self,response):
        for nameinfo in response.xpath('//tr'):
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel URL
            name = nameinfo.xpath('td[1]/a/text()').extract_first()     # novel title
            if novelurl:
                yield Request(novelurl,dont_filter=True,callback=self.get_novelcontent,meta={'name':name})
        '''
        # Alternative: grab the novel details from the listing page itself
        #print nameinfo
        name = nameinfo.xpath('td[1]/a/text()').extract_first()        # novel title
        author = nameinfo.xpath('td[3]/text()').extract_first()        # author
        novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()     # novel URL
        serialstatus = nameinfo.xpath('td[6]/text()').extract_first()  # serialisation status
        serialnumber = nameinfo.xpath('td[4]/text()').extract_first()  # word count
        if novelurl:
            targentcontent['novel_name'] = name
            targentcontent['author'] = author
            targentcontent['novelurl'] = novelurl
            targentcontent['serialstatus'] = serialstatus
            targentcontent['serialnumber'] = serialnumber
            #print name,author,novelurl,serialstatus,serialnumber
            yield Request(novelurl,callback=self.get_novelcontent,meta={'targentcontent':targentcontent})
        The novel details do not have to be passed along at this point.
        '''

    def get_novelcontent(self,response):
        #targentcontent=response.meta['targentcontent']
        #print targentcontent['novelurl'],targentcontent['name']
        #title = response.xpath('//dd[1]/h1/text()').extract_first()
        novel_name = response.meta['name']                                          # novel title
        author = response.xpath('//tr[1]/td[2]/text()').extract_first()             # author
        novelurl = response.url                                                     # novel URL
        serialstatus = response.xpath('//tr[1]/td[3]/text()').extract_first()       # status
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()       # word count
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()         # category
        name_id = novelurl[-5:]                                                     # novel ID
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()    # total clicks
        #chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
        chapterlisturl = response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()  # chapter list URL
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()                    # synopsis

        targentcontent = DingdianItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['name_id'] = name_id
        targentcontent['collect_num_total'] = collect_num_total
        targentcontent['click_num_total'] = click_num_total
        targentcontent['novel_breif'] = novel_breif
        #yield targentcontent
        #print novel_name,author,novelurl,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl
        yield Request(chapterlisturl,dont_filter=True,callback=self.get_charaterurl,meta={'targentcontent':targentcontent})

    def get_charaterurl(self,response):
        #print response.url
        item = response.meta['targentcontent']
        for contents in response.xpath('//table/tr'):
            for content in contents.xpath('td'):
                if content.xpath('a/text()').extract_first():
                    #print content.xpath('a/text()').extract_first()
                    item['chapterurl'] = response.url+content.xpath('a/@href').extract_first()  # chapter URL
                    item['chaptername'] = content.xpath('a/text()').extract_first()             # chapter title
                    yield item
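XPath expressions like the ones above are easiest to verify interactively with scrapy shell before wiring them into the spider (the URL below is just one of the category pages this spider visits):
scrapy shell "http://www.23us.com/class/7_1.html"
>>> response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()
>>> response.xpath('//tr/td[1]/a/@href').extract()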
The fields to store are defined in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    novel_name = scrapy.Field()          # novel title
    author = scrapy.Field()              # author
    novelurl = scrapy.Field()            # novel URL
    serialstatus = scrapy.Field()        # serialisation status
    serialnumber = scrapy.Field()        # word count
    category = scrapy.Field()            # category
    name_id = scrapy.Field()             # novel ID
    collect_num_total = scrapy.Field()   # total bookmarks
    click_num_total = scrapy.Field()     # total clicks
    novel_breif = scrapy.Field()         # synopsis
    chapterurl = scrapy.Field()          # chapter URL
    chaptername = scrapy.Field()         # chapter title
The settings, settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dingdian'
SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'
# custom setting: root folder where the pipeline stores the downloaded novels
PAGE_STORGE = "novels"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'dingdian.pipelines.DingdianPipeline': 100,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Finally, the data processing and storage:
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from dingdian import settings
import os
import urllib2
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem
from bs4 import BeautifulSoup as bs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dir_path = "%s/%s" % (settings.PAGE_STORGE, spider.name)
        if not os.path.exists(dir_path):
            # print "dir_path is %s",dir_path
            os.makedirs(dir_path)
        if isinstance(item, DingdianItem):
            # one folder per novel
            novelpath = dir_path+'/'+item['novel_name']
            print novelpath
            if not os.path.exists(novelpath):
                os.makedirs(novelpath)
            # write the novel's metadata/synopsis file once
            novelbreif = item['novel_name']+"_簡介"
            novelbreifpath = novelpath+'/'+novelbreif+'.txt'
            if not os.path.exists(novelbreifpath):
                with open(novelbreifpath,'wb') as novel_write:
                    novel_write.write(item['novel_name'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['author'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['novelurl'])
                    novel_write.write('\n')
                    novel_write.write(item['serialstatus'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['serialnumber'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['category'])
                    novel_write.write('\n')
                    novel_write.write(item['name_id'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['collect_num_total'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['click_num_total'])
                    novel_write.write('\n')
                    novel_write.write(item['novel_breif'])
                    # the with-statement closes the file automatically
            # fetch the chapter page and save its text as one file per chapter
            titlename = item['chaptername']
            titlenamepath = novelpath+'/'+titlename+'.txt'
            print titlenamepath
            chapterurl = item['chapterurl']
            html = urllib2.urlopen(chapterurl).read()
            soup1 = bs(html,'lxml')
            if not os.path.exists(titlenamepath):
                with open(titlenamepath,'wb') as file_write:
                    cont = soup1.find("dd",attrs={"id":"contents"}).getText()
                    #print cont
                    file_write.write(cont)
        return item
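One design note on this pipeline: every chapter is downloaded synchronously with urllib2, so those requests bypass Scrapy's asynchronous downloader and the HTTP cache configured above. A minimal alternative sketch, not what this post does, would be to let the spider itself request each chapter page and keep only the file writing in the pipeline; the chaptertext field and the get_chaptertext callback below are hypothetical additions, not part of the original code:
# hypothetical variant of get_charaterurl in mydingdian.py (Request/DingdianItem already imported there)
def get_charaterurl(self, response):
    item = response.meta['targentcontent']
    for content in response.xpath('//table/tr/td'):
        href = content.xpath('a/@href').extract_first()
        if href:
            item['chaptername'] = content.xpath('a/text()').extract_first()
            item['chapterurl'] = response.url + href
            # let Scrapy's downloader fetch the chapter body instead of urllib2 in the pipeline
            yield Request(item['chapterurl'], callback=self.get_chaptertext,
                          meta={'item': dict(item)})

def get_chaptertext(self, response):
    item = DingdianItem(response.meta['item'])
    # chaptertext would have to be declared as an extra Field in DingdianItem as well
    item['chaptertext'] = ''.join(response.xpath('//dd[@id="contents"]//text()').extract())
    yield item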
The -o books.csv option tells Scrapy to export the scraped items to a CSV file.
Besides CSV, Scrapy can also export JSON and XML.
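For example (run from the project root; the output file names are just placeholders):
scrapy crawl dingdian -o books.csv
scrapy crawl dingdian -o books.json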
Then run
scrapy crawl dingdian
If nothing errors out, wait a few hours and you will find a whole pile of novels sitting on your machine.